# Cleaning the Te Ara corpus

In [1]:
import os
import re
import json
import itertools

import numpy as np
import pandas as pd
%matplotlib inline

from collections import Counter
from unicodedata import category
from utils import multicore_apply

from reo_toolkit import is_maori
from gensim.models import Phrases
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
def remove_punctuation(text):
    """Return ``text`` with every Unicode punctuation character dropped.

    A character is treated as punctuation when its Unicode general category
    (as reported by ``unicodedata.category``) starts with ``P``. All other
    characters, including whitespace and symbols, are kept in order.
    """
    kept = []
    for char in text:
        if not category(char).startswith('P'):
            kept.append(char)
    return ''.join(kept)
    
def normalize_text(text):
    """Collapse whitespace runs, lower-case the text and strip punctuation.

    Steps, in order:
    1. every run of two or more whitespace characters becomes one space;
    2. the text is lower-cased;
    3. punctuation is removed via ``remove_punctuation``.

    Because punctuation is removed last, a punctuation mark that sat between
    two spaces leaves two adjacent spaces in the result.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    text = re.sub(r"\s{2,}", " ", text)
    return remove_punctuation(text.lower())

## Loading the text and splitting it into sentences

In [21]:
# Read the whole Te Ara text file into memory as a single UTF-8 string.
with open('../data/teara-mi-content.txt', mode='r', encoding='utf-8') as corpus_file:
    te_ara = corpus_file.read()

In [31]:
# Keep each non-blank line of the raw text, stripped of surrounding
# whitespace, skipping lines that begin with '#' or 'http'
# (presumably headings and source URLs — confirm against the data file).
# The prefix test is applied to the unstripped line.
paragraphs = [
    line.strip()
    for line in te_ara.split('\n')
    if line.strip() and not line.startswith(('#', 'http'))
]

In [32]:
# One row per paragraph. Note that this rebinds `te_ara` from the raw text
# string to a DataFrame, so the cell that builds `paragraphs` cannot be
# re-run afterwards without reloading the file.
te_ara = pd.DataFrame(paragraphs, columns=['paragraph'])

In [33]:
%%time
# Tokenize every paragraph into sentences (multicore_apply is a project
# helper from utils; presumably it maps the function over the column in
# parallel — confirm there), then reshape to one row per sentence and drop
# rows that ended up without a sentence.
te_ara['sentence'] = multicore_apply(te_ara['paragraph'], sent_tokenize)
te_ara = te_ara.explode('sentence').dropna(subset=['sentence'])

te_ara.head()

100%|██████████| 36.3k/36.3k [00:00<00:00, 154kit/s]
36341it [00:00, 473680.58it/s]


CPU times: user 7.86 s, sys: 1.77 s, total: 9.64 s
Wall time: 8.88 s


Unnamed: 0,paragraph,sentence
0,Ko te kāinga te pokapū o ngā mahi kai a te Māo...,Ko te kāinga te pokapū o ngā mahi kai a te Māori.
0,Ko te kāinga te pokapū o ngā mahi kai a te Māo...,"Ko te maramataka ka tohu i te wā ki tēnā mahi,..."
0,Ko te kāinga te pokapū o ngā mahi kai a te Māo...,Ka tauhokohoko ngā iwi i ngā kai mai i ngā mār...
1,Ngā kaihōpara me te hunga tauhokohoko,Ngā kaihōpara me te hunga tauhokohoko
2,Nō te takiwā o ngā tau 1250 – 1300 AD ka tae n...,Nō te takiwā o ngā tau 1250 – 1300 AD ka tae n...


In [34]:
# Report how many sentence rows te_ara holds after the split.
# NOTE(review): the message says "newspaper archive" although te_ara is read
# from teara-mi-content.txt — confirm the wording.
print("There are {} sentences in the māori newspaper archive".format(te_ara.shape[0]))

There are 98802 sentences in the māori newspaper archive


Here are the first 5 sentences:

In [35]:
# Show the first five sentence rows, one per line.
print('\n'.join(te_ara['sentence'].iloc[:5]))

Ko te kāinga te pokapū o ngā mahi kai a te Māori.
Ko te maramataka ka tohu i te wā ki tēnā mahi, ki tēnā mahi.
Ka tauhokohoko ngā iwi i ngā kai mai i ngā māra, te hī ika, te mahi tuna, te tāwhiti manu, te kohikohi kai hoki.
Ngā kaihōpara me te hunga tauhokohoko
Nō te takiwā o ngā tau 1250 – 1300 AD ka tae ngā tīpuna o te Māori ki Aotearoa.


## Text cleaning

In [36]:
def extract_words(text):
    """Split ``text`` into lower-case words made of ``a``-``z`` and macron vowels.

    The text is lower-cased and tokenized with ``word_tokenize``; within each
    token every maximal run of the letters ``a``-``z``, ``ā``, ``ē``, ``ī``,
    ``ō``, ``ū`` becomes one word. All other characters (digits, punctuation,
    other letters) act as separators, and tokens with no such letter
    contribute nothing.

    Parameters
    ----------
    text : str
        A sentence or other piece of text.

    Returns
    -------
    list of str
        The extracted words, in order of appearance.
    """
    # Local import: the notebook's import cell only brings in `category`.
    from unicodedata import normalize

    # Compose "vowel + combining macron" pairs into the single code points
    # listed in the character class below; otherwise a decomposed "ā" would
    # be cut in two at the combining mark. Already-composed text is unchanged.
    text = normalize('NFC', text.lower())

    words = []
    for token in word_tokenize(text):
        # One findall replaces the original search / sub / sub / split chain:
        # it yields exactly the runs of allowed letters inside the token.
        words.extend(re.findall(r'[a-zāēīōū]+', token))
    return words

In [37]:
%%time
# Add a `words` column holding the word list extracted from each sentence.
te_ara = te_ara.assign(words=multicore_apply(te_ara['sentence'], extract_words))

100%|██████████| 98.8k/98.8k [00:00<00:00, 220kit/s]  
98802it [00:00, 481381.15it/s]


CPU times: user 21.8 s, sys: 3.65 s, total: 25.4 s
Wall time: 23 s


In [50]:
# Frequency table of every extracted word, most frequent first, saved as a
# two-column CSV (word, count) and displayed.
word_counts = (
    te_ara['words']
    .explode()
    .value_counts()
    .rename_axis('word')
    .reset_index(name='count')
)
word_counts.to_csv('../data/te_ara_word_counts.csv', index=False)
word_counts

Unnamed: 0,word,count
0,te,189061
1,i,116950
2,o,70319
3,ngā,56889
4,ki,56730
...,...,...
26293,evesham,1
26294,whakangākautanga,1
26295,kohukohungia,1
26296,hinehua,1


## Phrase modelling

We can use gensim's `Phrases` model to join words that frequently occur together into single phrase tokens.

In [39]:
def phrase_model(lines, min_count, threshold, phrase_length):
    """Repeatedly merge frequent adjacent tokens into phrases with gensim.

    Each pass trains a fresh ``Phrases`` model on the current token lists and
    applies it to them. Tokens merged in one pass can be merged again in the
    next, so phrases can grow with every pass.

    Parameters
    ----------
    lines : iterable of str
        Whitespace-separated documents (one string per document).
    min_count, threshold
        Passed unchanged to ``gensim.models.Phrases``.
    phrase_length : int
        Number of train-and-apply passes to run.

    Returns
    -------
    list of str
        One space-joined string per input document. When ``phrase_length``
        is zero or negative, ``lines`` is returned untouched.
    """
    if phrase_length <= 0:
        return lines

    # Tokenize once with split(), the same rule used when applying the model.
    # The original trained on split(" "), which turned empty lines into ['']
    # and so added an empty-string token to the model's vocabulary.
    token_lists = [doc.split() for doc in lines]

    for _ in range(phrase_length):
        bigram = Phrases(token_lists, min_count=min_count, threshold=threshold)
        token_lists = [bigram[tokens] for tokens in token_lists]

    return [' '.join(tokens) for tokens in token_lists]

In [41]:
%%time
# Join each word list back into a space-separated string and run five
# phrase-merging passes over the result.
te_ara['phrase'] = phrase_model(
    te_ara['words'].map(' '.join),
    min_count=30,
    threshold=10,
    phrase_length=5,
)

CPU times: user 52.4 s, sys: 155 ms, total: 52.5 s
Wall time: 52.5 s


In [42]:
# Inspect the phrase-merged sentences.
te_ara.loc[:, 'phrase']

0         ko te kāinga te pokapū o ngā mahi kai a te māori
0        ko te maramataka ka tohu i te wā ki tēnā mahi ...
0        ka tauhokohoko ngā iwi i ngā kai mai i ngā mār...
1                    ngā kaihōpara me te hunga tauhokohoko
2        nō te takiwā o ngā tau ad ka tae ngā tīpuna o ...
                               ...                        
36339    nō mua atu ka tuhi ia i te haurongo mō ngoi i ...
36339    i taua tau ka whakaputaina tā ranginui walker ...
36339    i te tau ka puta te pukapuka a joe pere mō tōn...
36340    he uaua ki ētahi kaikōrero reo pākehā te whaka...
36340    pāwhiritia ngā pouaka kia rongo ai koe i te wh...
Name: phrase, Length: 98802, dtype: object

In [43]:
%%time
# Count how often each token (single word or merged phrase) occurs.
phrase_counts = te_ara['phrase'].str.split().explode().value_counts()
phrase_counts

CPU times: user 883 ms, sys: 7.48 ms, total: 890 ms
Wall time: 890 ms


te              189061
i               116950
o                70319
ngā              56889
ki               56730
                 ...  
whaitiripapa         1
whakapirihi          1
paneikuini           1
puakōwhai            1
puenga               1
Name: phrase, Length: 26590, dtype: int64

In [44]:
# Number of distinct tokens occurring more than 30 times (30 is also the
# min_count passed to phrase_model above).
(phrase_counts > 30).sum()

2883

## Saving output to disk

In [None]:
# Persist the processed frame. The original cell wrote a `papers` variable
# that is never defined in this notebook (NameError on Restart & Run All);
# the frame built here is `te_ara`.
# NOTE(review): the file name mirrors ../data/te_ara_word_counts.csv —
# confirm the path expected by downstream consumers.
te_ara.to_csv('../data/te_ara.csv', index=False)