In [2]:
import glob
import pandas as pd
from docopt import docopt
import re
import unidecode
import nltk
from random import shuffle
from nltk.corpus import stopwords
from collections import Counter

In [15]:
# Dutch stopword set built from the NLTK stopwords corpus. No live code in
# this part of the notebook reads it.
StopWords = set(stopwords.words('dutch'))
# Module-level token counter. pre_process() adds tokens to it and writes
# vocabulary.txt from it; it is only created here, so counts keep
# accumulating if pre_process() is run more than once in the same kernel.
c = Counter()

In [95]:
def article_to_sentences(text):
    '''
    Split one article into its sentences.

    A new Punkt sentence tokenizer is constructed with default arguments on
    every call and applied to ``text``; whatever the tokenizer returns is
    handed back unchanged.
    '''
    return nltk.punkt.PunktSentenceTokenizer().tokenize(text)


def clean_sentences(sentences):
    '''
    Tokenise sentences on whitespace and discard the empty ones.

    Every sentence string is split with ``str.split()``; sentences that
    yield no tokens (empty or whitespace-only) are dropped. The result is a
    list of token lists in the original order.

    Note: the punctuation stripping, lower-casing, minimum word length and
    stopword filtering this function used to perform are disabled, so tokens
    are returned exactly as they appear in the input.
    '''
    tokenised = (sentence.split() for sentence in sentences)
    return [tokens for tokens in tokenised if tokens]

def pre_process(title):
    '''
    Convert the article dumps of one newspaper into sentence files and a
    vocabulary file.

    Every ``../newspapers/<title>/articles/*.tsv`` file is read without a
    header, filtered (rows with empty OCR text, repeated header rows and rows
    whose OCR text contains a known error message are dropped), passed
    through ``unidecode`` and split into sentences with
    ``article_to_sentences``.

    Side effects:
      * writes ``<year>.txt`` in the working directory for each input file,
        where the year is taken from the first remaining row of that file;
      * adds every whitespace-separated token to the module-level counter
        ``c``;
      * writes ``vocabulary.txt`` with the ``<S>``, ``</S>`` and ``<UNK>``
        markers followed by the tokens of ``c`` from most to least frequent.

    Parameters
    ----------
    title : str
        Name of the newspaper folder under ``../newspapers``.
    '''
    path = '../newspapers/{}'.format(title)
    allFiles = glob.glob(path + "/articles/*.tsv")

    columns = ['date', 'page', 'size', 'min_x', 'min_y',
               'max_x', 'max_y', 'w', 'h', 'image_url', 'ocr_url', 'ocr']
    # OCR texts containing one of these fragments are error messages,
    # not article text.
    excludes = ['objecttype', 'file directory not found']

    for f in allFiles:
        df = pd.read_csv(f, delimiter='\t', header=None)
        df.columns = columns
        df = df.dropna(subset=['ocr'])  # remove lines with empty ocr field

        df = df[~df['date'].str.contains('date')]  # remove duplicate header rows
        # Copy after the last filter so the column assignments below work on
        # an independent frame instead of a slice of the original.
        df = df[~df['ocr'].astype(str).str.contains('|'.join(excludes))].copy()

        if df.empty:
            # Nothing usable left in this file; there is no year to name the
            # output after, so skip it.
            print('skipping {}: no usable articles'.format(f))
            continue

        df['date'] = pd.to_datetime(df['date'])

        # Positional lookup: the row labelled 1 may have been filtered out,
        # so a label-based ``[1]`` is not reliable here.
        year = df['date'].dt.year.iloc[0]
        print('making sentences: {}'.format(year))

        df['ocr'] = df['ocr'].astype(str).apply(unidecode.unidecode)
        docs = df['ocr'].apply(article_to_sentences)

        with open('{}.txt'.format(year), 'w') as output:
            for doc in docs:
                for sentence in doc:
                    # Count the tokens of the sentence being written.
                    c.update(sentence.split())
                    # Each sentence is preceded by a newline, so the file
                    # starts with an empty line (format kept as-is).
                    output.write("\n%s" % sentence)

    with open("vocabulary.txt", 'w') as vocab_file:
        vocab_file.write('<S>\n</S>\n<UNK>\n')
        for token, _count in c.most_common():
            vocab_file.write('{}\n'.format(token))

In [96]:
# Run the pipeline for the 'vk' newspaper folder: writes a <year>.txt file for
# each input TSV plus vocabulary.txt into the current working directory.
pre_process('vk')

making sentences: 1957
making sentences: 1969
making sentences: 1987
making sentences: 1977
making sentences: 1968
making sentences: 1990
making sentences: 1980
making sentences: 1953
making sentences: 1954
making sentences: 1940
making sentences: 1941
making sentences: 1961
making sentences: 1972
making sentences: 1959
making sentences: 1962
making sentences: 1964
making sentences: 1967
making sentences: 1947
making sentences: 1952
making sentences: 1982
making sentences: 1983
making sentences: 1994
making sentences: 1945
making sentences: 1966
making sentences: 1950
making sentences: 1973
making sentences: 1979
making sentences: 1946
making sentences: 1971
making sentences: 1985
making sentences: 1984
making sentences: 1978
making sentences: 1955
making sentences: 1956
making sentences: 1993
making sentences: 1974
making sentences: 1991
making sentences: 1976
making sentences: 1989
making sentences: 1963
making sentences: 1970


  if (yield from self.run_code(code, result)):


making sentences: 1986
making sentences: 1948
making sentences: 1951
making sentences: 1995
making sentences: 1975
making sentences: 1992
making sentences: 1988
making sentences: 1965
making sentences: 1960
making sentences: 1958
making sentences: 1981


In [4]:
# Load the cleaned 1971 Limburgs Dagblad articles. parse_dates=True would only
# try to parse the index, so the 'date' column is named explicitly to get
# real datetimes instead of strings.
df = pd.read_csv(
    '../../../datasets/newspapers_clean/limburgs_dagblad/articles/limburgs_dagblad_1971.tsv',
    delimiter='\t',
    parse_dates=['date'],
)

In [5]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,date,page,size,min_x,min_y,max_x,max_y,w,h,image_url,ocr_url,ocr,len,id
0,1971-01-02,1,118094,90,1667,623,2130,533,463,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,VANDAAG Sneeuw,2,10541279
1,1971-01-02,1,482078,77,2276,651,3208,574,932,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,Verzet tegen benoeming Simonis De veertien Rot...,42,10541279
2,1971-01-02,1,972957,79,3351,653,5418,574,2067,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,"SPORT Abe was kwaad Abc Lenstra, die gisteren ...",76,10541279
3,1971-01-02,1,2712934,793,2402,2839,4025,2046,1623,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,Dymsjits en Koeznetsov naar strafkamp Ook grat...,45,10541279
4,1971-01-02,1,583635,801,4120,1464,5095,663,975,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,Amerikaan in Amsterdam doodgeschoten AMSTERX)A...,83,10541279
5,1971-01-02,1,466433,788,5122,1447,5887,659,765,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,Nederlands schip vergaan; vijf bemanningsleden...,60,10541279
6,1971-01-02,1,435845,801,5908,1454,6584,653,676,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,Deens schip gezonken na aanvaring met Noordzee...,101,10541279
7,1971-01-02,1,1586576,1486,3638,2852,5122,1366,1484,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,VRIJDAG WEER PROCES TEGEN GROEP JODEN? Koeznet...,392,10541279
8,1971-01-02,1,1135424,1478,4774,2159,6587,681,1813,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,"Ambonnezen mishandelden 2 agenten WINTERSWIJK,...",277,10541279
9,1971-01-02,1,416066,2198,5157,2848,5907,650,750,http://imageviewer.kb.nl/ImagingService/imagin...,http://resolver.kb.nl/resolve?urn=ddd:01054127...,Jongeman en meisje door gasverstikking omgekom...,55,10541279
