In [1]:
from PyPDF2 import PdfReader
import pandas as pd
from datetime import datetime as dt
from IPython.display import display, HTML
import os

# Funktion um PDF-Metadaten auszulesen
def get_pdf_meta_data(minute):
    """Return the metadata object of one protocol PDF from the evaluation sample.

    Parameters
    ----------
    minute : str
        Protocol identifier used as the file stem, e.g. ``"01020"``.

    Returns
    -------
    The ``metadata`` attribute of a ``PdfReader`` opened on
    ``../data/evaluation/pdf/<minute>.pdf``.
    """
    pdf_path = "../data/evaluation/pdf/" + minute + ".pdf"
    return PdfReader(pdf_path).metadata

# Build the table of protocol metadata (one row per sampled protocol).
# NOTE(review): the XML modification time is converted to UTC explicitly, but the
# PDF dates are parsed with their timezone offset cut off ([:16]), so they are
# whatever local time the PDF producer wrote - confirm before reading them as UTC.
meta_data_columns = ['Protokoll', 'PDF erstellt', 'PDF zuletzt geändert', 'PDF Anwendung', 'PDF erstellt mit', 'XML zuletzt geändert']

# Protocols ("minutes") that make up the evaluation sample.
minute_sample = ["01020", "01229", "02087", "02202", "03018", "03103", "04015", "04184", "05101", "05158",
                 "06017", "06185", "07118", "07140", "08025", "08080", "09014", "09126", "10061", "10213",
                 "11102", "11124", "12037", "12152", "13176", "13217", "14145", "14150", "15058", "15077",
                 "16002", "16133", "17036", "17248", "18014", "18088", "19021", "19180"]


def _parse_pdf_date(raw_date):
    """Parse a PDF date string (``D:YYYYMMDDHHMMSS...``) into a naive datetime.

    Apostrophes are stripped and only the first 16 characters ("D:" plus 14
    digits) are parsed; anything after that, such as a timezone offset, is ignored.
    """
    return dt.strptime(raw_date.replace("'", "")[:16], "D:%Y%m%d%H%M%S")


# Collect plain rows first and build the DataFrame once; growing a frame with
# pd.concat inside the loop copies all previous rows on every iteration.
meta_data_rows = []
for minute in minute_sample:
    pdf_info = get_pdf_meta_data(minute)
    xml_mtime = int(os.path.getmtime("../data/evaluation/raw/" + minute + ".xml"))
    meta_data_rows.append([
        minute,
        # Dict-style access and the creator/producer properties replace the
        # deprecated getText(), which raises in PyPDF2 3.x.
        _parse_pdf_date(pdf_info.get("/CreationDate")),
        _parse_pdf_date(pdf_info.get("/ModDate")),
        pdf_info.creator,
        pdf_info.producer,
        dt.utcfromtimestamp(xml_mtime).strftime('%Y-%m-%d %H:%M:%S'),
    ])

df_meta_data = pd.DataFrame(meta_data_rows, columns=meta_data_columns)

# "overflow-y: scroll" is the valid CSS property; the former "overflow:scroll-y"
# was ignored by browsers, so the 500px box never became scrollable.
display(HTML("<div style='height: 500px; overflow-y: scroll'>" + df_meta_data.to_html() + "</div>"))

Unnamed: 0,Protokoll,PDF erstellt,PDF zuletzt geändert,PDF Anwendung,PDF erstellt mit,XML zuletzt geändert
0,1020,2012-02-09 20:41:01,2014-01-27 11:10:30,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:40:16
1,1229,2012-02-15 21:46:30,2012-07-25 07:56:08,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:40:30
2,2087,2012-01-30 10:27:27,2012-07-21 06:33:38,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:40:52
3,2202,2012-01-31 14:02:47,2012-07-21 06:54:26,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:41:00
4,3018,2012-01-20 10:24:49,2012-07-18 20:39:41,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:41:04
5,3103,2012-01-23 15:38:17,2012-07-19 08:19:19,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:41:10
6,4015,2012-01-14 01:04:46,2012-07-18 09:46:20,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:41:24
7,4184,2012-01-13 14:33:40,2012-07-18 09:41:44,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:41:40
8,5101,2012-01-11 07:10:13,2012-07-16 13:22:42,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:42:08
9,5158,2011-12-20 08:35:59,2012-07-16 13:02:12,OmniPage CSDK 16,OmniPage 17,2015-07-09 07:42:10


In [2]:
import os, os.path
import pandas as pd
from lxml import etree
import datetime
from IPython.display import display, HTML

# Build an overview table with one row per protocol XML under ../data/raw/<period>/.
minutes_columns = ['period','nr','date','len', 'date_anomaly', 'datum_duplicate']


def _protocol_numbers(path, period_string):
    """Return the sorted protocol numbers that have an XML file in `path`.

    A file counts when it is a regular file named
    ``<period_string><number zero-padded to 3 digits>.xml`` with number >= 1,
    which is the naming scheme the loading loop below builds its paths from.
    Reading the numbers from the file names (instead of counting files and
    iterating 1..count) keeps the highest-numbered protocols when the numbering
    has gaps or the directory holds unrelated files.
    """
    numbers = []
    for name in os.listdir(path):
        stem, extension = os.path.splitext(name)
        number_part = stem[len(period_string):]
        if (extension == '.xml'
                and stem.startswith(period_string)
                and number_part.isdecimal()
                and int(number_part) >= 1
                and number_part == str(int(number_part)).zfill(3)
                and os.path.isfile(os.path.join(path, name))):
            numbers.append(int(number_part))
    return sorted(numbers)


# Collect rows in a list and create the DataFrame once; pd.concat inside the
# loop would copy the whole table for every protocol.
minute_rows = []

# Iterate over all periods.
# NOTE(review): only periods 1-18 are read, while the evaluation sample in this
# notebook also lists period-19 protocols - confirm the range is intended.
for period in range(1, 19):
    period_string = str(period).zfill(2)
    path = '../data/raw' + '/' + period_string

    # Iterate over all protocols found for this period, in ascending number order.
    for number in _protocol_numbers(path, period_string):
        file_path = path + '/' + period_string + str(number).zfill(3) + '.xml'

        root = etree.parse(file_path).getroot()
        date = datetime.datetime.strptime(root.find('DATUM').text, '%d.%m.%Y').date()
        len_xml = len(root.find('TEXT').text)

        # Both flag columns start as False; they are filled in by a later cell.
        minute_rows.append([period, number, date, len_xml, False, False])

minutes = pd.DataFrame(minute_rows, columns=minutes_columns)

In [3]:
# Flag protocols whose date is out of order or equal to a neighbour's date.
# Each row is compared with the rows directly before and after it in table order.
session_dates = pd.to_datetime(minutes['date'])
date_previous = session_dates.shift(1)
date_next = session_dates.shift(-1)

# The first row has no predecessor and the last row no successor; shift() fills
# those slots with NaT, and every comparison against NaT is False. Edge rows are
# therefore still checked against the one neighbour they have, so a same-day pair
# at the very start or end of the table gets both members flagged.
minutes['date_anomaly'] = (session_dates < date_previous) | (session_dates > date_next)
minutes['datum_duplicate'] = (session_dates == date_previous) | (session_dates == date_next)

# Show only the rows flagged as date anomalies.
minutes_anomaly_filtered = minutes[minutes['date_anomaly']]
display(HTML(minutes_anomaly_filtered.to_html()))

Unnamed: 0,period,nr,date,len,date_anomaly,datum_duplicate
193,1,194,1952-02-20,258922,True,False
194,1,195,1951-02-21,312238,True,False
456,2,175,1956-11-30,246221,True,False
457,2,176,1956-11-05,378133,True,False
2954,14,20,1999-02-23,583483,True,False
2955,14,21,1999-02-14,688475,True,False
3215,15,28,2003-12-20,521920,True,False
3216,15,29,2003-02-21,269037,True,False


In [4]:
# Keep every session flagged as sharing its date with a neighbouring session,
# report how many there are, and preview the first five.
is_duplicate = minutes['datum_duplicate'] == True
minutes_duplicates_filtered = minutes[is_duplicate]
duplicate_count = len(minutes_duplicates_filtered)
print('Anzahl von Sitzung, die am gleichen Tag wie eine andere Sitzung stattfand (die Zahl zählt beide): ', duplicate_count)
display(HTML(minutes_duplicates_filtered.head(5).to_html()))

Anzahl von Sitzung, die am gleichen Tag wie eine andere Sitzung stattfand (die Zahl zählt beide):  90


Unnamed: 0,period,nr,date,len,date_anomaly,datum_duplicate
2,1,3,1949-09-15,6521,False,True
3,1,4,1949-09-15,17267,False,True
19,1,20,1949-12-02,144998,False,True
20,1,21,1949-12-02,143193,False,True
23,1,24,1949-12-16,139918,False,True


In [5]:
# Pair up sessions that share a date and keep the pairs whose text lengths are
# nearly identical.
#
# The selection is derived from `minutes` directly and nothing is mutated in
# place, so the cell can be re-run safely. The earlier version inserted columns
# into `minutes_duplicates_filtered` and then rebound that name, so a second
# run failed because the columns already existed.
same_day_sessions = minutes[minutes['datum_duplicate'] == True]

# Group by day: the first row of a day supplies period / nr / len, the last row
# supplies nr_duplicate / len_duplicate. The trailing column selection restores
# the established column order.
duplicate_pairs = (
    same_day_sessions
    .groupby('date', as_index=False)
    .agg(
        period=('period', 'first'),
        nr=('nr', 'first'),
        nr_duplicate=('nr', 'last'),
        len=('len', 'first'),
        len_duplicate=('len', 'last'),
    )
    [['period', 'nr', 'nr_duplicate', 'date', 'len', 'len_duplicate']]
)

# 'diff' is the length ratio first / last; 1.0 means both texts have equal length.
duplicate_pairs['diff'] = duplicate_pairs['len'] / duplicate_pairs['len_duplicate']

# Keep only pairs whose ratio lies strictly within 1 +/- threshold.
threshold = 0.025
within_threshold = (duplicate_pairs['diff'] < 1 + threshold) & (duplicate_pairs['diff'] > 1 - threshold)
minutes_duplicates_filtered = duplicate_pairs[within_threshold]
minutes_duplicates_filtered

Unnamed: 0,period,nr,nr_duplicate,date,len,len_duplicate,diff
1,1,20,21,1949-12-02,144998,143193,1.012605
3,1,41,42,1950-02-24,219451,216375,1.014216
5,1,75,76,1950-07-14,323715,320151,1.011132
7,1,149,150,1951-06-08,260737,257737,1.01164
8,1,223,224,1952-07-16,638199,627758,1.016632
9,1,280,281,1953-07-03,893683,877757,1.018144
12,2,219,220,1957-06-29,295928,290819,1.017568
13,2,225,226,1957-07-06,173908,171244,1.015557
14,3,97,98,1960-01-27,297161,297345,0.999381
17,4,66,67,1963-03-15,187034,187034,1.0


In [6]:
# Save the variables for the next chapter (IPython %store magic).
%store minutes minutes_duplicates_filtered

Stored 'minutes' (DataFrame)
Stored 'minutes_duplicates_filtered' (DataFrame)
