# Searching for Māori language text in the NZ Herald archives

I have collected about 450,000 NZ Herald articles. The goal is to search them for Māori words to see how the use of te reo Māori in the NZ Herald has changed over time.

In [14]:
import re
import jsonlines
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows', 500)

from utils import parallel_process
from multiprocessing import cpu_count
from nltk.tokenize import word_tokenize

from reo_toolkit import is_maori, vowels, consonants, ambiguous

In [15]:
# Load every record from the NZ Herald dump into a list. The file is read
# with jsonlines, which yields one decoded object per iteration.
nzherald = []
with jsonlines.open('../data/nzherald.json') as article_reader:
    nzherald.extend(article_reader)

In [16]:
# Peek at the first raw record to see which fields each article carries.
nzherald[0]

{'url': 'https://www.nzherald.co.nz/nz/news/article.cfm?c_id=1&objectid=12329090',
 'byline': None,
 'date': '2 May, 2020 12:44pm',
 'headline': [' Six more Covid-19 cases; Chch man dies of coronavirus  '],
 'sponsor': [],
 'subheader': ['The latest Covid-19 victim will be remembered as '],
 'syndicator_name': ['NZ Herald'],
 'paragraphs': [' There are six new cases of Covid-19 today and the Ministry of Health reports that another person has died of coronavirus.',
  " George Hollings, a Rosewood Rest Home resident in his 80s, died in Christchurch's Burwood Hospital of the virus early this morning.",
  ' "George had a lot of friends who the family don\'t have contact details for and they\'d like for them to have the opportunity to grieve along with his family," the Ministry of Health says.',
  ' ',
  '"His family tell us that George will be remembered as a real Kiwi bloke, a rough diamond, who loved his deer stalking," it said.',
  ' Hollings\' family, in a statement to health officials

In [17]:
# Build a DataFrame from the list of loaded records.
herald_data = pd.DataFrame(nzherald)

In [18]:
# Collapse each article's list of paragraph strings into a single text:
# trim every paragraph, join with newlines, then trim the result.
herald_data['paragraphs'] = herald_data['paragraphs'].apply(
    lambda parts: '\n'.join(part.strip() for part in parts).strip()
)

In [19]:
# Flatten the byline fragments into one newline-joined string.
# Falsy bylines (e.g. None or an empty list) become the empty string.
herald_data['byline'] = herald_data['byline'].apply(
    lambda parts: '\n'.join(part.strip() for part in parts).strip() if parts else ''
)

In [20]:
# Join the trimmed headline fragments with spaces, then squeeze any run of
# two or more whitespace characters down to a single space.
# The pattern is a raw string: in a plain literal '\s' is an invalid escape
# sequence, which newer Python versions warn about.
herald_data['headline'] = (herald_data.headline
    .apply(lambda x: re.sub(r'\s{2,}', ' ', ' '.join([s.strip() for s in x]).strip()))
)

In [21]:
# Drop duplicate subheader fragments and join the rest with spaces.
# dict.fromkeys keeps first-seen order, so the joined text is the same on
# every run; a set has no stable order for strings across interpreter runs.
herald_data['subheader'] = herald_data.subheader.apply(lambda x: ' '.join(dict.fromkeys(x)))

In [22]:
# Concatenate the sponsor fragments (no separator) into a single string.
herald_data['sponsor'] = herald_data['sponsor'].apply(''.join)

In [23]:
# Concatenate the syndicator-name fragments (no separator) into a single string.
herald_data['syndicator_name'] = herald_data['syndicator_name'].apply(''.join)

In [24]:
# Sanity-check the flattened columns on the first few rows.
herald_data.head()

Unnamed: 0,url,byline,date,headline,sponsor,subheader,syndicator_name,paragraphs
0,https://www.nzherald.co.nz/nz/news/article.cfm...,,"2 May, 2020 12:44pm",Six more Covid-19 cases; Chch man dies of coro...,,The latest Covid-19 victim will be remembered as,NZ Herald,There are six new cases of Covid-19 today and ...
1,https://www.nzherald.co.nz/business/news/artic...,By: Keith Bradsher,"29 Apr, 2020 9:48am",Covid 19 coronavirus: China's factories are ba...,,New York Times: Manufacturing giant once again...,New York Times,An unemployed young college graduate has stopp...
2,https://www.nzherald.co.nz/business/news/artic...,By: The editorial board,"29 Apr, 2020 10:54am",Covid 19 coronavirus: Antivirus fight takes a ...,,Financial Times: Furlough schemes will not be ...,Financial Times,The Black Death is often credited with transfo...
3,https://www.nzherald.co.nz/business/news/artic...,"By: David Fisher Senior writer, NZ Herald da...","7 Mar, 2020 5:00am","Inside a multi-million-dollar, eight-year divo...",,Sarah Sparks split from Greg Olliver eight yea...,,"There always seemed to be plenty, even when he..."
4,https://www.nzherald.co.nz/business/news/artic...,By: Tim Harford,"29 Apr, 2020 11:33am",Comment: Why we fail to prepare for disasters,,Financial Times: What makes us do nothing in t...,Financial Times,New Orleans's Times-Picayune noted the inadequ...


In [42]:
def contains_letters(text):
    """Return True if at least one character of ``text`` is alphabetic.

    An empty string yields False.
    """
    return any(symbol.isalpha() for symbol in text)

def contains_no_numbers(text):
    """Return True if no character of ``text`` is numeric.

    "Numeric" follows ``str.isnumeric``. An empty string yields True.
    """
    return not any(symbol.isnumeric() for symbol in text)

def clean_token(text):
    """Return ``text`` with everything except letters and hyphens removed.

    Kept characters are ASCII letters, the precomposed macronised vowels
    (Ā Ē Ī Ō Ū and their lowercase forms) and ``-``. Everything else
    (digits, punctuation, whitespace, other symbols) is deleted.

    Parameters
    ----------
    text : str
        A single token.

    Returns
    -------
    str
        The cleaned token; may be empty.
    """
    # 'A-Za-z' rather than 'A-z': the range A-z also spans [ \ ] ^ _ and `,
    # which would otherwise be kept. Macronised vowels are listed so that a
    # word such as "Māori" is not reduced to "Mori".
    return re.sub(r'[^A-Za-zĀĒĪŌŪāēīōū\-]', '', text)

def get_māori_words(text):
    """Return the tokens of ``text`` that pass every Māori-word filter.

    ``text`` is split with ``word_tokenize``. A token is kept, in its
    original (uncleaned) form and original order, when all of these hold:

    * ``is_maori(token)`` is truthy,
    * the token contains at least one letter,
    * the token contains no numeric character,
    * the lower-cased ``clean_token`` form is not in ``ambiguous``.
    """
    def passes_filters(word):
        stripped = clean_token(word)
        if not is_maori(word):
            return False
        if not contains_letters(word):
            return False
        if not contains_no_numbers(word):
            return False
        return stripped.lower() not in ambiguous

    return [word for word in word_tokenize(text) if passes_filters(word)]

In [36]:
# Run get_māori_words over every article body via the project's
# parallel_process helper and store the results in a new column.
# One core is left free, but at least one worker is always requested:
# on a single-core machine cpu_count() - 1 would be 0.
herald_data['māori_words'] = parallel_process(herald_data.paragraphs, get_māori_words, n_jobs = max(1, cpu_count() - 1))

100%|██████████| 448k/448k [03:31<00:00, 2.12kit/s]   
448084it [00:00, 559280.31it/s]


CPU times: user 1min 59s, sys: 18 s, total: 2min 17s
Wall time: 4min 33s


In [48]:
# Write the processed frame to disk without the index column.
# NOTE(review): CSV stores non-scalar cells as their string repr. Presumably
# māori_words holds lists and must be parsed back after reading; confirm.
herald_data.to_csv("../data/herald_data.csv", index = False)