In [79]:
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import holidays

# Parameters

In [111]:
# Netherlands holiday calendar from the `holidays` package; generate_stats()
# tests dates against it when building the missing-dates report.
nl_holidays = holidays.NL()

# Functions

In [112]:
def load_files(path):
    """Read every ``*.tsv`` file in ``path`` and return them as one DataFrame.

    Each file is read as tab-separated text. Rows that are repeated header
    lines (``date`` column containing ``'date'`` or ``ocr`` column containing
    ``'objecttype'``) are dropped. Two columns are derived when a file does
    not already provide them:

    * ``len`` - number of whitespace-separated tokens in ``ocr``.
    * ``id``  - first run of nine digits found in ``ocr_url``.

    Parameters
    ----------
    path : str
        Folder holding the ``.tsv`` files.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in sorted file-name order. The
        original per-file row labels are kept, so the index is not unique.

    Raises
    ------
    ValueError
        If ``path`` contains no ``.tsv`` files.
    """
    # Sort so the row order of the result does not depend on the OS listing.
    allFiles = sorted(glob.glob(path + '/*.tsv'))
    if not allFiles:
        raise ValueError('No .tsv files found in {!r}'.format(path))

    bigFile = []
    for f in tqdm(allFiles):
        df = pd.read_csv(f, delimiter='\t')
        df['ocr'] = df['ocr'].astype(str)

        # Cast `date` to str first: a NaN or non-string date would otherwise
        # make the boolean negation below fail.
        header_in_date = df['date'].astype(str).str.contains('date', regex=False)
        header_in_ocr = df['ocr'].str.contains('objecttype', regex=False)
        # .copy() so the column assignments below act on an independent frame
        # rather than on a slice of the one just read.
        df = df[~(header_in_date | header_in_ocr)].copy()

        if 'len' not in df.columns:
            df['len'] = df['ocr'].str.split().str.len()
        if 'id' not in df.columns:
            # expand=False yields a Series, which is what a single column needs.
            df['id'] = df['ocr_url'].astype(str).str.extract(r'(\d{9})', expand=False)
        bigFile.append(df)
    return pd.concat(bigFile)

def generate_stats(df, title, freq='Y', type_=None):
    """Plot length and volume statistics for one title and list its missing days.

    Three files are written, each named after ``title`` and ``type_``:

    * ``../figures/{title}_{type_}_length.png`` - mean of ``len`` per period,
      with error bars reaching from the 5th to the 95th percentile.
    * ``../figures/{title}_{type_}_count.png`` - number of ``ocr`` entries per
      period.
    * ``../missing_data/{title}_{type_}_missing.csv`` - days (formatted
      ``dd-mm-YYYY``) between the first and last page-1 row that have no
      page-1 row, excluding Sundays and dates present in the notebook-level
      ``nl_holidays`` calendar.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``date``, ``len``, ``ocr`` and ``page``.
        The frame is left untouched.
    title : str
        Used in the output file names.
    freq : str, default 'Y'
        pandas offset alias used to bucket both plots.
    type_ : str, optional
        Used in the output file names. When omitted, the notebook-level
        variable ``type_`` is used, which is what earlier versions of this
        function always did.
    """
    import os  # local import: only needed to create the output folders

    if type_ is None:
        type_ = globals()['type_']  # legacy fallback to the notebook variable

    # Work on a small date-indexed view instead of re-indexing the caller's
    # frame in place; an interrupted run can then no longer leave `df`
    # without its `date` column.
    stamps = pd.DatetimeIndex(pd.to_datetime(df['date']), name='date')
    stats = df[['len', 'ocr', 'page']].set_index(stamps)
    stats = stats.astype({'len': int, 'page': int})

    os.makedirs('../figures', exist_ok=True)
    os.makedirs('../missing_data', exist_ok=True)

    # --- article length per period -------------------------------------
    len_by_period = stats['len'].groupby(pd.Grouper(freq=freq))
    mean_x = len_by_period.mean()
    p05 = len_by_period.quantile(0.05)
    p95 = len_by_period.quantile(0.95)
    # With a heavily skewed period the mean can fall outside the 5-95 band;
    # error-bar sizes must not be negative, so clamp them at zero.
    below = (mean_x - p05).clip(lower=0)
    above = (p95 - mean_x).clip(lower=0)

    # A dedicated figure per plot: drawing on the implicit current figure let
    # the previous title's count plot bleed into the next title's length plot.
    fig, ax = plt.subplots()
    ax.plot(mean_x)
    ax.errorbar(mean_x.index, mean_x, yerr=[below, above], linestyle='')
    ax.set_title('Mean Length {}'.format(stats['len'].mean()))
    ax.set_ylabel('Total number of words')
    ax.set_xlabel('Date')
    fig.savefig('../figures/{}_{}_length.png'.format(title, type_))
    plt.close(fig)

    # --- article count per period --------------------------------------
    article_count = stats['ocr'].groupby(pd.Grouper(freq=freq)).count()
    fig, ax = plt.subplots()
    ax.plot(article_count)
    ax.set_title('Mean Count {}'.format(np.mean(article_count)))
    ax.set_ylabel('Articles per year')
    ax.set_xlabel('Date')
    fig.savefig('../figures/{}_{}_count.png'.format(title, type_))
    plt.close(fig)

    # --- days without a front page -------------------------------------
    # Daily bins with no page-1 row get a NaN mean; those are the gaps.
    front_pages = stats.loc[stats['page'] == 1, 'page']
    daily = front_pages.groupby(pd.Grouper(freq='D')).mean()
    gaps = daily[daily.isnull()].index
    gaps = gaps[gaps.day_name() != 'Sunday']
    missingdates = pd.Series(
        [day.strftime('%d-%m-%Y') for day in gaps if day not in nl_holidays]
    )
    missingdates.to_csv(
        '../missing_data/{}_{}_missing.csv'.format(title, type_), header=['dates']
    )
    
def split_years(df, path=None, title=None):
    """Write one tab-separated file per calendar year present in ``df``.

    Files are named ``{path}{title}_{year}.tsv`` and carry the parsed ``date``
    as their first column. Years without any rows produce no file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``date`` column parseable by ``pandas.to_datetime``.
        The frame is left untouched.
    path : str, optional
        Output prefix, concatenated directly with the file name (so it should
        end with a path separator). Defaults to the notebook-level ``path``.
    title : str, optional
        File-name stem. Defaults to the notebook-level ``title``.
    """
    # Legacy fallbacks: earlier versions always read these notebook variables.
    if path is None:
        path = globals()['path']
    if title is None:
        title = globals()['title']

    dated = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
    # Group on the year itself rather than a yearly Grouper: only years that
    # actually have rows are produced, and no offset alias is involved.
    for year, group in dated.groupby(dated.index.year):
        group.to_csv('{}{}_{}.tsv'.format(path, title, year), sep='\t')
    

# Main

In [108]:
# Defaults for a single-title run: which newspaper, which record type, and
# the folder that holds its .tsv files.
title, type_ = 'vv', 'articles'
path = '/'.join(['..', title, type_, ''])

In [None]:
# Batch run: load each newspaper's article files and produce its statistics.
type_ = 'articles'
for title in (
    'algemeen_handelsblad',
    'leeuwarder_courant',
    'limburgs_dagblad',
    'nrc',
    'parool',
    'trouw',
    'vk',
    'vv',
):
    path = '../{}/{}/'.format(title, type_)
    df = load_files(path)
    generate_stats(df, title)
    

In [None]:
# Single-title run: loads from whatever `path` is currently bound in the
# kernel and names the outputs after the current `title`.
df = load_files(path)
generate_stats(df, title)





  0%|          | 0/51 [00:00<?, ?it/s][A[A[A


  2%|▏         | 1/51 [00:31<26:10, 31.40s/it][A[A[A


  4%|▍         | 2/51 [01:07<26:45, 32.78s/it][A[A[A


  6%|▌         | 3/51 [01:10<19:03, 23.82s/it][A[A[A


  8%|▊         | 4/51 [01:13<13:55, 17.77s/it][A[A[A


 10%|▉         | 5/51 [01:14<09:37, 12.55s/it][A[A[A


 12%|█▏        | 6/51 [01:22<08:26, 11.26s/it][A[A[A


 14%|█▎        | 7/51 [02:41<23:02, 31.42s/it][A[A[A


 16%|█▌        | 8/51 [03:01<20:07, 28.08s/it][A[A[A


 18%|█▊        | 9/51 [03:08<15:19, 21.89s/it][A[A[A


 20%|█▉        | 10/51 [03:12<11:12, 16.41s/it][A[A[A


 22%|██▏       | 11/51 [03:13<07:57, 11.93s/it][A[A[A


 24%|██▎       | 12/51 [03:15<05:43,  8.81s/it][A[A[A


 25%|██▌       | 13/51 [03:38<08:20, 13.18s/it][A[A[A


 27%|██▋       | 14/51 [04:51<19:13, 31.17s/it][A[A[A


 29%|██▉       | 15/51 [04:58<14:19, 23.87s/it][A[A[A


 31%|███▏      | 16/51 [05:08<11:25, 19.58s/it][A[A[A


 33%|███▎   

In [110]:
# Output split by years: split_years() holds this logic, so call it instead of
# keeping a second copy inline. It uses the current `path` and `title`.
split_years(df)

KeyboardInterrupt: 

In [96]:
# Moves the current index of `df` back into a regular column, in place.
# NOTE(review): this looks like manual recovery for a frame left date-indexed
# by an interrupted generate_stats() run; confirm it is still needed.
df.reset_index(inplace=True)