In [None]:
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import holidays

# Parameters

In [None]:
# Public-holiday calendar for the Netherlands. generate_stats() tests dates
# against it so that holidays are left out of the missing-dates report.
nl_holidays = holidays.NL()

# Functions

In [None]:
def load_files(path, quick=False):
    """Read every ``*.tsv`` file in ``path`` and return them as one DataFrame.

    Parameters
    ----------
    path : str
        Directory to scan; ``'/*.tsv'`` is appended to it for the glob.
    quick : bool, default False
        If true, read only the ``date``, ``page``, ``ocr`` and ``len``
        columns and skip all clean-up. Otherwise header rows repeated inside
        the data are dropped, and ``len`` (word count of ``ocr``) and ``id``
        (first nine-digit run in ``ocr_url``) are derived when the file does
        not already provide them.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in sorted file-name order. Each
        file keeps its own row index, so index labels repeat across files.

    Raises
    ------
    ValueError
        If ``path`` contains no ``.tsv`` files.
    """
    # Sorted so the row order of the result does not depend on the order in
    # which the filesystem happens to list the files.
    tsv_paths = sorted(glob.glob(path + '/*.tsv'))
    if not tsv_paths:
        raise ValueError('No .tsv files found in {!r}'.format(path))

    frames = []
    for tsv_path in tqdm(tsv_paths):
        if quick:
            frame = pd.read_csv(tsv_path, delimiter='\t',
                                usecols=["date", "page", "ocr", "len"])
        else:
            frame = pd.read_csv(tsv_path, delimiter='\t')
            frame['ocr'] = frame['ocr'].astype(str)
            # Header lines repeated inside the data. ``date`` is cast to str
            # and na=False is passed so a missing or non-text date yields a
            # plain False instead of breaking the boolean mask.
            repeated_header = (
                frame['date'].astype(str).str.contains('date', na=False)
                | frame['ocr'].str.contains('objecttype', na=False)
            )
            # .copy() so the column assignments below write to an independent
            # frame rather than to a slice of the one just read.
            frame = frame[~repeated_header].copy()
            if 'len' not in frame.columns:
                # Number of whitespace-separated tokens in the OCR text.
                frame['len'] = frame['ocr'].str.split().str.len()
            if 'id' not in frame.columns:
                frame['id'] = (
                    frame['ocr_url'].astype(str).str.extract(r'(\d{9})', expand=False)
                )
        frames.append(frame)
    return pd.concat(frames)

def _save_line_plot(series, heading, ylabel, out_file, yerr=None):
    """Plot ``series`` against its index on the current figure and save it to ``out_file``."""
    # Start from an empty figure so artists left behind by an earlier plot or
    # an earlier generate_stats() call are not drawn into this image.
    plt.clf()
    plt.plot(series)
    if yerr is not None:
        plt.errorbar(series.index, series, yerr=yerr, linestyle='')
    plt.title(heading)
    plt.ylabel(ylabel)
    plt.xlabel('Date')
    plt.savefig(out_file)


def generate_stats(df, title, freq='Y'):
    """Save length/count plots and a missing-dates report for one newspaper.

    Three files are written, each named after ``title`` and the notebook-level
    ``type_`` variable (which must be defined before this is called):

    * ``../figures/{title}_{type_}_length.png`` - mean ``len`` per ``freq``
      period, with bars reaching to the 5th and 95th percentile;
    * ``../figures/{title}_{type_}_count.png`` - number of ``ocr`` values per
      period;
    * ``../missing_data/{title}_{type_}_missing.csv`` - days (``dd-mm-YYYY``)
      that have no row with ``page == 1``, excluding Sundays and dates
      contained in the notebook-level ``nl_holidays`` calendar.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``date``, ``len``, ``ocr`` and ``page``. It is not
        modified.
    title : str
        Used in the output file names.
    freq : str, default 'Y'
        pandas offset alias for the grouping period of both plots.
        NOTE(review): recent pandas releases deprecate the 'Y' alias in
        favour of 'YE'; pass ``freq='YE'`` there. The y-label of the count
        plot says "per year" regardless of ``freq``.

    Returns
    -------
    None
    """
    # Work on a derived frame: the caller's ``df`` keeps its columns, dtypes
    # and index even if one of the steps below raises (e.g. a missing output
    # directory), so the call can simply be repeated.
    stats = df.assign(
        date=pd.to_datetime(df['date']),
        len=df['len'].astype(int),
    ).set_index('date')

    per_period = stats['len'].groupby(pd.Grouper(freq=freq))
    mean_x = per_period.mean()
    p05 = per_period.quantile(0.05)
    p95 = per_period.quantile(0.95)
    # errorbar() needs non-negative bar lengths; in a strongly skewed period
    # the mean can fall outside the 5-95% band, so clip those bars to zero.
    spread = [(mean_x - p05).clip(lower=0), (p95 - mean_x).clip(lower=0)]

    _save_line_plot(
        mean_x,
        'Mean Length {}'.format(stats['len'].mean()),
        'Total number of words',
        '../figures/{}_{}_length.png'.format(title, type_),
        yerr=spread,
    )

    article_count = stats['ocr'].groupby(pd.Grouper(freq=freq)).count()
    _save_line_plot(
        article_count,
        'Mean Count {}'.format(np.mean(article_count)),
        'Articles per year',
        '../figures/{}_{}_count.png'.format(title, type_),
    )

    # One bin per calendar day spanned by the front-page rows; a bin with a
    # count of zero is a day for which no front page is present.
    front_pages = stats.loc[stats['page'].astype(int) == 1, 'page']
    per_day = front_pages.groupby(pd.Grouper(freq='D')).count()
    gaps = per_day[per_day == 0].index
    gaps = gaps[gaps.day_name() != 'Sunday']
    missingdates = pd.Series(
        [day.strftime('%d-%m-%Y') for day in gaps if day not in nl_holidays]
    )
    missingdates.to_csv(
        '../missing_data/{}_{}_missing.csv'.format(title, type_), header=['dates']
    )
    
def split_years(df, out_dir=None, name=None):
    """Write one tab-separated file per calendar year found in ``df['date']``.

    Each file is named ``{out_dir}{name}_{year}.tsv`` and holds that year's
    rows ordered by date, with ``date`` as the first column. Years without
    any rows produce no file.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``date`` column that ``pd.to_datetime`` can parse. It is not
        modified.
    out_dir : str, optional
        Prefix for the output files, joined by plain string concatenation
        (so it should end with a path separator). Defaults to the
        notebook-level ``path``.
    name : str, optional
        File-name stem. Defaults to the notebook-level ``title``.

    NOTE(review): with the notebook defaults the files land in the directory
    that load_files() globs, so a later run reads them back in - confirm
    that this is intended.
    """
    if out_dir is None:
        out_dir = path
    if name is None:
        name = title

    dated = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
    # Stable sort: rows sharing a timestamp keep their original relative order.
    dated = dated.sort_index(kind='mergesort')
    # Grouping on the calendar year (rather than a frequency alias) yields
    # only years that actually have rows; rows without a date are skipped.
    for year, group in dated.groupby(dated.index.year):
        group.to_csv(out_dir + name + '_' + str(int(year)) + '.tsv', sep='\t')
    

# Main

In [None]:
# Newspaper and record type to process. Together they select the directory
# that is read by load_files() and that receives the per-year TSV files.
title, type_ = 'trouw', 'articles'
path = '../{}/{}/'.format(title, type_)

In [None]:
# Quick mode: read only the date/page/ocr/len columns, without row clean-up,
# then write the plots and the missing-dates report for this title.
df = load_files(path=path, quick=True)
generate_stats(df=df, title=title)


In [None]:
# Output split by years: one '<title>_<year>.tsv' per calendar year, written
# under `path`. Uses the helper from the Functions section instead of
# repeating its body here.
split_years(df)

In [None]:
# Give df a clean 0..n-1 index. drop=True discards the old index instead of
# inserting it as a column, so re-running this cell neither piles up
# 'index' / 'level_0' columns nor eventually fails on a name clash.
df = df.reset_index(drop=True)