# Prepare Corpus
This notebook prepares the newspaper data for analysis of metadata and content. 

In [4]:
import pandas as pd
import os
import re
import sys
import unidecode
from os import listdir
from os.path import isfile, join
from tqdm import tqdm

sys.path.append('../')

## Functions

In [5]:
def digit_perc(x):
    '''
    Calculate the proportion of digit characters in a text.

    The input is converted with ``str(x)`` first, so non-string values
    are measured on their string representation.

    Parameters
    ----------
    x : object
        Value whose string form is inspected.

    Returns
    -------
    float
        Number of digit characters divided by the total number of
        characters, rounded to 4 decimals. This is a fraction between
        0 and 1 (not a 0-100 percentage). Returns 0.0 for an empty text.
    '''
    text = str(x)
    if not text:
        # an empty text contains no digits; avoids a ZeroDivisionError
        return 0.0
    return round(sum(c.isdigit() for c in text) / len(text), 4)

# Preparing Metadata

In [19]:
# root folder of the cleaned newspaper data: it is searched for 'ads' subfolders
# below, and the combined 'full_metadata.csv' is written into it at the end
data_path = '../../../datasets/newspapers_clean/'

In [None]:
# Here we go through the data files, preprocess the OCR text and calculate some
# additional features. The result is one combined DataFrame called `meta_data`.

# collect every 'ads' folder found anywhere below data_path
results = []
subdirs = ['ads']
for root, dirs, files in os.walk(data_path):
    if all(subdir in dirs for subdir in subdirs):
        results.append(os.path.join(root, subdirs[0]))

# matches every character that is neither an ASCII letter nor whitespace
regex_pat = re.compile(r'[^a-zA-Z\s]', flags=re.IGNORECASE)
meta_data = []

# NOTE(review): only the first folder found is processed (results[0:1]) and the
# order returned by os.walk is not guaranteed - confirm this limit is intended.
for path in results[0:1]:
    print(path)
    onlyfiles = [f for f in listdir(path)
                 if isfile(join(path, f)) and f.endswith('.tsv')]
    # NOTE(review): derived from the folder path but currently unused, because the
    # 'newspaper_name' column below is hard-coded - confirm which one is intended.
    newspaper_name = path.split('/')[5]
    newspaper_years = []
    # tqdm shows the progress per file, so no extra print per file is needed
    for filename in tqdm(onlyfiles, desc='reading tsv files'):
        df = pd.read_csv(os.path.join(path, filename), index_col=None, delimiter='\t')

        # share of digits is measured on the raw OCR text, before any cleaning
        df['perc_digits'] = df['ocr'].apply(digit_perc)

        # transliterate to ASCII, then keep letters and whitespace only
        df['ocr'] = df['ocr'].astype(str).apply(unidecode.unidecode)
        # regex=True is required: pandas >= 2.0 defaults to regex=False and
        # rejects a compiled pattern in that mode
        df['ocr'] = df['ocr'].str.replace(regex_pat, '', regex=True)
        # keep only words of at least 2 characters, lower-cased
        df['ocr'] = df['ocr'].str.findall(r'\w{2,}').str.join(' ').str.lower()

        # 4th '/'-separated part of the url, without its first 12 and last 4 characters
        df['identifier'] = df['ocr_url'].apply(lambda url: url.split('/')[3][12:-4])

        # length of the cleaned text and its ratio to the 'size' column
        df['string_length'] = df['ocr'].str.len()
        df['character_proportion'] = df['string_length'] / df['size']

        # optional: store the cleaned text separately before dropping it
        #ocr = df[['identifier', 'ocr']]
        #ocr.to_csv(os.path.join(path, 'ocr_' + filename), sep='\t', index=None)
        df = df.drop(columns='ocr')
        df['newspaper_name'] = 'nrc'
        newspaper_years.append(df)

    # one frame per newspaper folder
    frame = pd.concat(newspaper_years, axis=0, ignore_index=True)
    meta_data.append(frame)

# combine all newspapers into a single frame
meta_data = pd.concat(meta_data, axis=0, ignore_index=True)



In [28]:
# keep only ads that are at least 100 px high and at least 100 px wide
min_side_px = 100
large_enough = meta_data['h'].ge(min_side_px) & meta_data['w'].ge(min_side_px)
meta_data = meta_data[large_enough]

# store the filtered metadata next to the source data
output_file = os.path.join(data_path, 'full_metadata.csv')
meta_data.to_csv(output_file)