# Searching for Māori language text in the NZ Herald archives

I have collected about 450,000 NZ Herald articles. I want to search them for Māori words to see how the use of te reo Māori in the NZ Herald has changed over time.

In [1]:
import re
import jsonlines
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)

from utils import parallel_process
from multiprocessing import cpu_count
from nltk.tokenize import word_tokenize

from reo_toolkit import is_maori, vowels, consonants, ambiguous

In [2]:
# The file is read with jsonlines, i.e. one JSON record per line; collect every
# record into a list.
with jsonlines.open('../data/nzherald.json') as reader:
    nzherald = list(reader)

In [3]:
# Peek at the first raw record to see which fields each record carries.
nzherald[0]

{'url': 'https://www.nzherald.co.nz/nz/news/article.cfm?c_id=1&objectid=12329090',
 'byline': None,
 'date': '2 May, 2020 12:44pm',
 'headline': [' Six more Covid-19 cases; Chch man dies of coronavirus  '],
 'sponsor': [],
 'subheader': ['The latest Covid-19 victim will be remembered as '],
 'syndicator_name': ['NZ Herald'],
 'paragraphs': [' There are six new cases of Covid-19 today and the Ministry of Health reports that another person has died of coronavirus.',
  " George Hollings, a Rosewood Rest Home resident in his 80s, died in Christchurch's Burwood Hospital of the virus early this morning.",
  ' "George had a lot of friends who the family don\'t have contact details for and they\'d like for them to have the opportunity to grieve along with his family," the Ministry of Health says.',
  ' ',
  '"His family tell us that George will be remembered as a real Kiwi bloke, a rough diamond, who loved his deer stalking," it said.',
  ' Hollings\' family, in a statement to health officials

In [4]:
# Build a DataFrame from the list of loaded records (presumably one row per article).
herald_data = pd.DataFrame(nzherald)

In [5]:
# Convert the scraped date strings into pandas timestamps.
herald_data['date'] = pd.to_datetime(herald_data['date'])

In [6]:
# Collapse each article's list of paragraph strings into one newline-separated
# string. Run once per kernel: on an already-joined string this would iterate
# over single characters instead of paragraphs.
herald_data['paragraphs'] = herald_data['paragraphs'].map(
    lambda parts: '\n'.join(part.strip() for part in parts).strip()
)

In [7]:
# Join the byline fragments with newlines; falsy bylines (e.g. None or an empty
# list) become the empty string.
herald_data['byline'] = herald_data['byline'].map(
    lambda parts: '\n'.join(part.strip() for part in parts).strip() if parts else ''
)

In [8]:
# Join the headline fragments with single spaces, then squeeze any remaining run
# of two or more whitespace characters down to one space. The pattern is a raw
# string so that `\s` reaches the regex engine instead of being treated as an
# (invalid) Python string escape.
herald_data['headline'] = (herald_data.headline
    .apply(lambda x: re.sub(r'\s{2,}', ' ', ' '.join([s.strip() for s in x]).strip()))
)

In [9]:
# De-duplicate the subheader fragments while keeping their original order
# (dict.fromkeys preserves first-seen order, whereas a set would make the joined
# text vary between runs), then strip HTML tags. regex=True is passed explicitly
# because pandas >= 2.0 treats the pattern as a literal string by default.
herald_data['subheader'] = (
    herald_data.subheader
    .apply(lambda x: ' '.join(dict.fromkeys(x)))
    .str.replace(r"<[^>]+>", "", regex=True)
)

In [10]:
# Concatenate each row's sponsor fragments into a single string.
herald_data['sponsor'] = herald_data['sponsor'].map(''.join)

In [11]:
# Concatenate each row's syndicator-name fragments into a single string.
herald_data['syndicator_name'] = herald_data['syndicator_name'].map(''.join)

In [12]:
# Preview the first rows after the column clean-up.
herald_data.head()

Unnamed: 0,url,byline,date,headline,sponsor,subheader,syndicator_name,paragraphs
0,https://www.nzherald.co.nz/nz/news/article.cfm...,,2020-05-02 12:44:00,Six more Covid-19 cases; Chch man dies of coro...,,The latest Covid-19 victim will be remembered as,NZ Herald,There are six new cases of Covid-19 today and ...
1,https://www.nzherald.co.nz/business/news/artic...,By: Keith Bradsher,2020-04-29 09:48:00,Covid 19 coronavirus: China's factories are ba...,,New York Times: Manufacturing giant once again...,New York Times,An unemployed young college graduate has stopp...
2,https://www.nzherald.co.nz/business/news/artic...,By: The editorial board,2020-04-29 10:54:00,Covid 19 coronavirus: Antivirus fight takes a ...,,Financial Times: Furlough schemes will not be ...,Financial Times,The Black Death is often credited with transfo...
3,https://www.nzherald.co.nz/business/news/artic...,"By: David Fisher Senior writer, NZ Herald da...",2020-03-07 05:00:00,"Inside a multi-million-dollar, eight-year divo...",,Sarah Sparks split from Greg Olliver eight yea...,,"There always seemed to be plenty, even when he..."
4,https://www.nzherald.co.nz/business/news/artic...,By: Tim Harford,2020-04-29 11:33:00,Comment: Why we fail to prepare for disasters,,Financial Times: What makes us do nothing in t...,Financial Times,New Orleans's Times-Picayune noted the inadequ...


In [13]:
def contains_letters(text):
    """Return True if at least one character of ``text`` is alphabetic.

    An empty string yields False.
    """
    return any(char.isalpha() for char in text)

def contains_no_numbers(text):
    """Return True if no character of ``text`` is numeric (per ``str.isnumeric``).

    An empty string yields True.
    """
    return not any(char.isnumeric() for char in text)

def clean_token(text):
    """Strip every character that is not a letter or a hyphen from ``text``.

    Kept characters are ASCII letters, the macronised vowels used in written
    Māori (upper and lower case) and ``-``. The previous character class used
    the range ``A-z``, which also spans the punctuation between ``Z`` and ``a``
    (``[``, ``\\``, ``]``, ``^``, ``_`` and the backtick), and it removed
    macronised vowels, mangling words such as "whānau".

    Returns the cleaned string, which may be empty.
    """
    return re.sub(r'[^A-Za-zĀāĒēĪīŌōŪū\-]', '', text)

def get_māori_words(text):
    """Return the tokens of ``text`` that ``is_maori`` accepts.

    ``text`` is split with ``word_tokenize``. A token is kept, unchanged and in
    its original position (duplicates included), when it contains at least one
    letter, contains no numeric character, and ``is_maori`` returns a truthy
    value for it. Tokens are tested exactly as the tokenizer produced them; no
    cleaning is applied.

    Returns a list of strings, empty when nothing qualifies.
    """
    return [
        token
        for token in word_tokenize(text)
        # The cheap character scans run first so that the comparatively
        # expensive is_maori call is skipped for tokens that would be
        # rejected anyway; the set of accepted tokens is unchanged.
        if contains_letters(token)
        and contains_no_numbers(token)
        and is_maori(token)
    ]

In [14]:
import cProfile

In [15]:
# Profile is_maori on a fixed random sample of 10,000 articles. The sample is
# seeded (random_state=0, an arbitrary choice) so that successive profiling runs
# measure the same texts and their timings can be compared.
cProfile.run("[is_maori(text) for text in herald_data.paragraphs.sample(10000, random_state=0)]", sort = 'tottime')

         11844538 function calls (11662780 primitive calls) in 6.131 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
189395/7670    2.604    0.000    6.090    0.001 reo_toolkit.py:23(is_maori)
     7669    1.371    0.000    1.371    0.000 {method 'split' of '_sre.SRE_Pattern' objects}
4088347/4088343    0.278    0.000    0.278    0.000 {built-in method builtins.len}
  3684765    0.272    0.000    0.272    0.000 {method 'append' of 'list' objects}
   446376    0.269    0.000    0.269    0.000 {method 'search' of '_sre.SRE_Pattern' objects}
   180088    0.253    0.000    0.388    0.000 encoders.py:31(encode)
   372760    0.172    0.000    0.172    0.000 re.py:286(_compile)
   163441    0.146    0.000    0.321    0.000 __init__.py:1904(debug)
   720352    0.114    0.000    0.114    0.000 {method 'replace' of 'str' objects}
   181726    0.104    0.000    0.346    0.000 utils.py:12(is_camel_case)
   181726    0.079    0.000    0

In [16]:
%%time
# Extract the candidate Māori tokens of every article in parallel. One core is
# left free, but at least one worker is always requested: cpu_count() - 1 is 0
# on a single-core machine.
herald_data['māori_words'] = parallel_process(herald_data.paragraphs, get_māori_words, n_jobs = max(1, cpu_count() - 1))

100%|██████████| 448k/448k [02:26<00:00, 3.05kit/s]   
448084it [00:00, 557972.21it/s]


CPU times: user 1min 58s, sys: 17.4 s, total: 2min 16s
Wall time: 3min 34s


In [17]:
%%time
# Persist the processed frame without the index column.
# NOTE(review): `māori_words` holds Python lists, which CSV stores as their text
# representation — presumably a reader has to parse them back; confirm.
herald_data.to_csv("../data/herald_data.csv", index = False)

CPU times: user 29.8 s, sys: 837 ms, total: 30.6 s
Wall time: 32 s


In [18]:
# Preview the frame again, now including the `māori_words` column.
herald_data.head()

Unnamed: 0,url,byline,date,headline,sponsor,subheader,syndicator_name,paragraphs,māori_words
0,https://www.nzherald.co.nz/nz/news/article.cfm...,,2020-05-02 12:44:00,Six more Covid-19 cases; Chch man dies of coro...,,The latest Covid-19 victim will be remembered as,NZ Herald,There are six new cases of Covid-19 today and ...,"[a, a, to, to, a, Kiwi, a, a, to, to, to, a, h..."
1,https://www.nzherald.co.nz/business/news/artic...,By: Keith Bradsher,2020-04-29 09:48:00,Covid 19 coronavirus: China's factories are ba...,,New York Times: Manufacturing giant once again...,New York Times,An unemployed young college graduate has stopp...,"[A, a, no, to, a, a, to, to, a, a, to, to, a, ..."
2,https://www.nzherald.co.nz/business/news/artic...,By: The editorial board,2020-04-29 10:54:00,Covid 19 coronavirus: Antivirus fight takes a ...,,Financial Times: Furlough schemes will not be ...,Financial Times,The Black Death is often credited with transfo...,"[to, a, to, to, to, to, to, a, to, a, a, to, a..."
3,https://www.nzherald.co.nz/business/news/artic...,"By: David Fisher Senior writer, NZ Herald da...",2020-03-07 05:00:00,"Inside a multi-million-dollar, eight-year divo...",,Sarah Sparks split from Greg Olliver eight yea...,,"There always seemed to be plenty, even when he...","[to, he, to, a, a, I, I, to, a, he, to, to, He..."
4,https://www.nzherald.co.nz/business/news/artic...,By: Tim Harford,2020-04-29 11:33:00,Comment: Why we fail to prepare for disasters,,Financial Times: What makes us do nothing in t...,Financial Times,New Orleans's Times-Picayune noted the inadequ...,"[a, a, a, a, A, a, to, a, to, to, He, to, no, ..."


In [19]:
class NoMacronEncoder():
    """Replace macronised vowels with their plain counterparts.

    ``encode`` is lossy: once the macrons are removed the original spelling
    cannot be recovered, so ``decode`` returns its input unchanged. Only the
    precomposed characters listed in ``encoder_dict`` are replaced.
    """

    # Macronised vowel -> plain vowel. Both cases are listed so that
    # capitalised words such as "Ōtāhuhu" are handled as well.
    encoder_dict = {
        'ā': 'a',
        'ē': 'e',
        'ī': 'i',
        'ō': 'o',
        'ū': 'u',
        'Ā': 'A',
        'Ē': 'E',
        'Ī': 'I',
        'Ō': 'O',
        'Ū': 'U'
    }

    # Plain vowel -> macronised vowel. Not used by decode(); kept as a class
    # attribute for any external code that reads it.
    decoder_dict = {v: k for k, v in encoder_dict.items()}

    def encode(self, text):
        """Return ``text`` with every key of ``encoder_dict`` replaced by its value."""
        for macron, plain in self.encoder_dict.items():
            text = text.replace(macron, plain)
        return text

    def decode(self, text):
        """Return ``text`` unchanged; removed macrons cannot be restored."""
        return text

In [20]:
# Long-format table with one row per detected token, newest articles first.
# Articles with an empty token list are dropped before exploding.
words_by_date = (
    herald_data
    .sort_values('date', ascending=False)
    .loc[lambda frame: frame.māori_words.map(len) > 0, ['date', 'māori_words']]
    .explode('māori_words')
    .reset_index(drop=True)
)

In [21]:
# Macron-free spelling of every token. A single encoder instance is shared by
# all rows instead of constructing a new one for each token.
words_by_date['no_macrons'] = words_by_date.māori_words.map(NoMacronEncoder().encode)

In [22]:
# A token has macrons exactly when stripping them changes its spelling. A single
# encoder instance is shared by all rows instead of constructing one per token.
words_by_date['has_macrons'] = words_by_date.māori_words.map(NoMacronEncoder().encode) != words_by_date.māori_words

In [23]:
# Show only the tokens flagged as containing macrons.
words_by_date[words_by_date.has_macrons]

Unnamed: 0,date,māori_words,no_macrons,has_macrons
420,2020-05-02 19:04:00,Taupō,Taupo,True
1268,2020-05-02 13:42:00,whānau,whanau,True
1399,2020-05-02 12:52:00,Māori,Maori,True
1408,2020-05-02 12:52:00,Māori,Maori,True
1417,2020-05-02 12:52:00,Māori,Maori,True
...,...,...,...,...
9758350,2011-07-06 10:21:00,anā,ana,True
9758352,2011-07-06 10:21:00,Anā,Ana,True
9758358,2011-07-06 10:21:00,Māori,Maori,True
10150816,2011-03-05 14:49:00,Taupō-Rotorua,Taupo-Rotorua,True


In [24]:
# Number of distinct detected tokens multiplied by the number of articles —
# presumably the cell count of a dense article-by-token table (TODO confirm intent).
len({word for words in herald_data.māori_words for word in words}) * len(herald_data)

10577918988

In [None]:
# Frequency table of every detected token across all articles; show the 500
# most common ones.
(
    pd.Series([word for words in herald_data.māori_words.values for word in words])
    .value_counts()
    .head(500)
)