# Cleaning the JSON data

The scraped newspaper pages are messy HTML wrapped in JSON, so the first step is to clean them up.

In [1]:
import os
import re
import json
from bs4 import BeautifulSoup

import hashlib
from nltk import sent_tokenize, word_tokenize

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load one scraped example page to prototype the cleaning on. The context
# manager closes the file handle as soon as the JSON has been parsed.
with open("../ulukau/example.json", "r", encoding = "utf-8") as example_file:
    json_dict = json.load(example_file)

In [3]:
def clean_text(texts):
    '''
    Extract the paragraph text from an iterable of HTML fragments.

    Each item of ``texts`` is parsed with BeautifulSoup (lxml parser) and the
    text of every ``<p>`` element is collected, with each newline inside a
    paragraph replaced by a space.

    Returns a single string with the paragraphs joined by newline characters.
    Fragments that contain no ``<p>`` element contribute nothing.
    '''
    paragraphs = []
    for fragment in texts:
        soup = BeautifulSoup(fragment, 'lxml')
        # find_all is the current method name; findAll is a deprecated alias.
        paragraphs.extend(p.text.replace("\n", " ") for p in soup.find_all('p'))
    return '\n'.join(paragraphs)

In [4]:
# Sanity-check the cleaner on the example page's body text.
example_body = clean_text(json_dict['text'])
print(example_body)

HAWAII HOLOMUA.
"Ua mau ke Ea o ka Aina i ka Pono."
BUKE  III. HELU 144  HONOLULU, POAKAHI, IANUARI 23, 1893.  HELUNA NUI 444
 
PAPA KUHIKUHI WA HOLO
—:o ka:—
HUI ALAHAO A ME AINA O OAHU.
MANAWA HOLO.
MAI A MAHOPE AKU O OCT. 1, 1892.
NA KAA AHI.
  A. M.  A. M.  P. M.  P. M.
HAALELEIA HONOLULU  6:15*  9:45  1:45  4:35□
HIKI I HONOULIULI  7:20*  9:57  1:57  5:35□
HAALELE HONOULIULI  7:36*  10:43  3:43  5:42
HIKI I HONOLULU  8:35*  11:55  4:55  6:50□
NO MANANA WALE NO.
HAALELEIA HONOLULU      5:48
HIKI I MANANA      5:48
HAALELEIA MANANA  6:55*
HIKI I HONOLULU  7:30
□No na POAONO wale no.
||Koe na la SABATI.
*Koe na POAONO.  tfd
 
KAUKA
Yong Kam Pung
(APANA)
Helu 81 Alanui Maunakea.
KAUKA LOEA O KA AINA PUA
Ua hiki ke hoola ia kela a me keia ano ma'i, mai ko na kane, wahine a me ko na keiki liilii. O na ma'i ha—no a me na ma'i e pili ana i ka maka, pau pu ia i ke ola. O na ma'i koko inoino a me ka hooulu hou ana i na wahi poino o ka puuwai a me ke kino, e hoola ia no me ka maalahi. Pela m

In [5]:
# Sample title block as scraped: an <h3> holding the issue title, the issue
# date, and the "previous issue" / "next issue" navigation links. Used below to
# prototype pulling the plain-text title out of the markup.
header = "<h3>KA HAE HAWAII. Buke 5, Ano Hou.--Helu 38, Aoao 155. Dekemaba 19, 1860.<br>19 Kekemapa 1860<br><a href=\"/gsdl2.5/cgi-bin/nupepa?a=d&amp;cl=CL1.7.5&amp;d=HASHaf2fb0ad2e93b8473408c5.4&amp;e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw-Zz-1---20-about---0003-1-0000utfZz-8-00\"> <span style=\"font-size:14px;\">&lt;&lt; previous issue</span> </a><br><div style=\"padding-left:61px;\"><a href=\"/gsdl2.5/cgi-bin/nupepa?a=d&amp;cl=CL1.7.5&amp;d=HASH018f094fdf8e2b2a76b3986f.1&amp;e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw-Zz-1---20-about---0003-1-0000utfZz-8-00\"> <span style=\"font-size:14px;\">next issue &gt;&gt;</span> </a></div></h3>"

In [6]:
# Name the parser explicitly (the same one clean_text uses) so the parse does
# not depend on which parsers happen to be installed and bs4 does not warn
# about having to guess one.
soup = BeautifulSoup(header, 'lxml')

In [7]:
# Print the visible text of every <h3> element in the parsed header.
# find_all is the current method name; findAll is a deprecated alias.
for heading in soup.find_all("h3"):
    print(heading.text)

KA HAE HAWAII. Buke 5, Ano Hou.--Helu 38, Aoao 155. Dekemaba 19, 1860.19 Kekemapa 1860 << previous issue  next issue >> 


## Reading the json data

We've downloaded a bunch of the `json` data. Time to take a look at it:

In [8]:
# Load the full scrape. The context manager closes the file handle once the
# JSON has been parsed instead of leaving it open for the kernel's lifetime.
with open("../ulukau/ulukau.json", "r", encoding = "utf-8") as ulukau_file:
    ulukau_json = json.load(ulukau_file)

In [9]:
# Tabulate the parsed JSON (column-oriented, the from_dict default).
ulukau_data = pd.DataFrame(ulukau_json)

In [10]:
# Preview the first five scraped pages.
ulukau_data.head(n=5)

Unnamed: 0,text,title,url
0,ULUKAU: HAWAIIAN ELECTRONIC LIBRARY Hoʻol...,"Buke 65, Helu 499 Kekemapa 1926 << previous is...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
1,ULUKAU: HAWAIIAN ELECTRONIC LIBRARY Hoʻol...,"Buke 65, Helu 4618 Nowemapa 1926 << previous i...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
2,ULUKAU: HAWAIIAN ELECTRONIC LIBRARY Hoʻol...,"Buke 65, Helu 482 Kekemapa 1926 << previous is...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
3,ULUKAU: HAWAIIAN ELECTRONIC LIBRARY Hoʻol...,"Buke 65, Helu 4725 Nowemapa 1926 << previous i...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
4,ULUKAU: HAWAIIAN ELECTRONIC LIBRARY Hoʻol...,"Buke 65, Helu 4511 Nowemapa 1926 << previous i...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...


In [11]:
def md5(text):
    '''
    Return the hexadecimal MD5 digest of ``text`` encoded as UTF-8.
    '''
    digest = hashlib.md5()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()

In [12]:
# Hash each title so a specific page can be looked up by a short, stable key.
ulukau_data['title_hash'] = ulukau_data['title'].map(md5)

In [13]:
# Print the body text of every page whose title hashes to the given digest.
matches_title = ulukau_data['title_hash'] == "7bad8fd73c9d2511f07c1ab08c5097ed"
for page_text in ulukau_data.loc[matches_title, 'text'].values:
    print(page_text)
    print("==========================")

KA NONANONA. "E ka mea hiamoe, e hele oe i ka nonanona, e nana i kona aoao a e hoonaauao iho." SOLOMONA. Buke 3. HONOLULU, OAHU, IULAI 4, 1843. Pepa 2 a me ke 3.
 
                He nui na haole i lawe i ka Nonanona, aole hoi ike maopopo lakou i ka olelo Hawaii, a nolaila ua paiia ka olelo a ke aliinui o Amerikahuipuia ma na olelo elua. O ka pono ole o ka hoohalike ana kekahi; aole maopopo loa keia olelo ma ka olelo Hawaii. Ma ka unuhi ana, aole au i nana nui i na hua olelo, a malaila i hoohalike ai; ma ke ano nui wale no au i hoohalike ai. Na ke aliinui o Amerikahuipuia e hoike aku i keia olelo imua o ka ahaolelo o ka poe i kohoia. Aole paha mea makemake ole e heluhelu i keia olelo.  
To the House of Representatives of the United States:                  I communicate herewith to Congress copies of a correspondence, which has recently taken place between certain agents of the Government of the Hawaiian, or Sandwich Islands, and the Secretary of State.                 The condition of

In [14]:
# Report how many pages the scrape produced.
page_count = len(ulukau_data)
print("{} pages downloaded".format(page_count))

1377 pages downloaded


In [15]:
# Approximate word count: number of whitespace runs plus one. The pattern is a
# raw string so the backslash reaches the regex engine instead of forming an
# invalid Python string escape (a SyntaxWarning on recent Python versions).
ulukau_data['numwords'] = ulukau_data['text'].str.count(r"\s+") + 1

In [16]:
# Show the five pages with the fewest words, each preceded by its URL.
shortest_pages = ulukau_data.sort_values('numwords').head()
for _, page in shortest_pages.iterrows():
    print("Url:\n" + page['url'], end = "\n\n")
    print(page['text'])
    print("\n=======================\n")

Url:
http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw-Zz-1---20-about---0003-1-0000utfZz-8-00&a=d&cl=CL1.22.1&d=HASHaabe4d4d413416dfc132cc&gg=text

   ULUKAU: HAWAIIAN ELECTRONIC LIBRARY   Hoʻolaupaʻi       He ʻOhina Nūpepa ʻŌlelo Hawaiʻi   
ULUKAU: HAWAIIAN ELECTRONIC LIBRARY
Hoʻolaupaʻi
    He ʻOhina Nūpepa ʻŌlelo Hawaiʻi
ULUKAU: HAWAIIAN ELECTRONIC LIBRARY
Hoʻolaupaʻi
    He ʻOhina Nūpepa ʻŌlelo Hawaiʻi
  English Text  Kōkua   Kekahi...   
English Text
Kōkua
Kekahi...
EnglishÂ Text

Kekahi...
 Buke 1, Helu 514 Kepakemapa 1841 << previous issue  next issue >>  Buke 1, Helu 5   S01  S02  S03  S04   [ hoʻokaʻawale kikokikona ][ mai hōʻike i ke kahiāuli ]


Buke 1, Helu 5

  S01  S02  S03  S04  


S01


S02


S03


S04


Buke 1, Helu 5

  S01  S02  S03  S04  


S01


S02


S03


S04


S01


S02


S03


S04
 
  << previous issue  
  next issue >>  


Url:
http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw

# Cleaning the text

In [17]:
# Raw titles, before any cleaning.
ulukau_data['title'].values

array(['Buke 65, Helu 499 Kekemapa 1926 << previous issue  next issue >> ',
       'Buke 65, Helu 4618 Nowemapa 1926 << previous issue  next issue >> ',
       'Buke 65, Helu 482 Kekemapa 1926 << previous issue  next issue >> ',
       ...,
       'KA HOKU O KA PAKIPIKA. Buke I, Helu 3, Aoao 1. Okatoba 10, 1861.10 ʻOkakopa 1861 << previous issue  next issue >> ',
       'KA HOKU O KA PAKIPIKA. Buke I, Helu 1, Aoao 1. Sepetemaba 26, 1861.26 Kepakemapa 1861 next issue >> ',
       'KA HOKU O KA PAKIPIKA. Buke I, Helu 2, Aoao 1. Okatoba 3, 1861.3 ʻOkakopa 1861 << previous issue  next issue >> '],
      dtype=object)

In [18]:
# Regular-expression patterns that are replaced by a single space.
space_replace = ["—+", "Â"]
# Literal strings that are deleted outright.
to_delete = ['\xa0', "<<", ">>", "next issue", "previous issue", "EnglishText", "ʻaoʻao aʻe",
             "ULUKAU: HAWAIIAN ELECTRONIC LIBRARY", "}", "{", ">", "<", "English Text"]

def clean_column(col):
    '''
    Strip site navigation text and stray markup from a string column.

    ``col`` is a pandas Series of strings; a new Series is returned and the
    input is not modified. The substitutions run in this order:

    1. remove ``<< ... >>``, ``[ ... ]`` and ``( ... )`` spans,
    2. delete every literal listed in ``to_delete``,
    3. replace every ``space_replace`` pattern with a space,
    4. collapse runs of hyphens to one, turn ".-" into ". ", turn runs of
       three or more dots into a space, and rewrite whitespace-before-colon
       as ": ".

    Every call passes ``regex=True`` explicitly: pandas 2.0 changed the
    default of ``Series.str.replace`` to ``regex=False``, under which these
    patterns would be searched for as literal text and never match.
    '''
    # to_delete holds literals, so escape them before building the alternation;
    # space_replace holds real patterns ("—+") and is joined as-is.
    delete_pattern = "|".join(re.escape(literal) for literal in to_delete)
    space_pattern = "|".join(space_replace)

    # NOTE(review): "[^>]+" is greedy, so the bracket and parenthesis patterns
    # remove everything from the first opener to the last closer that has no
    # ">" in between - confirm that this much removal is intended.
    col = (col.str.replace(r"(<<[^>]+>>)", "", regex=True)
              .str.replace(r"(\[[^>]+\])", "", regex=True)
              .str.replace(r"(\([^>]+\))", "", regex=True)
              .str.replace(delete_pattern, "", regex=True)
              .str.replace(space_pattern, " ", regex=True)
              .str.replace(r'\-{2,}', "-", regex=True)
              .str.replace(r'\.\-', ". ", regex=True)
              .str.replace(r'\.{3,}', " ", regex=True)
              .str.replace(r'\s+:', ': ', regex=True))
    return col

ulukau_data['title'] = clean_column(ulukau_data.title)
ulukau_data['text'] = clean_column(ulukau_data.text)

# Collapse runs of whitespace to a single space, then trim leading and
# trailing whitespace. regex=True is explicit because pandas 2.0 made
# Series.str.replace literal by default, which would leave "\s{2,}" unmatched.
for column in ('text', 'title'):
    ulukau_data[column] = (ulukau_data[column]
                           .str.replace(r"\s{2,}", " ", regex=True)
                           .str.strip())

In [19]:
# Spot-check ten randomly chosen cleaned titles.
ulukau_data['title'].sample(n=10).values

array(['Buke 64, Helu 3710 Kepakemapa 1925',
       'KA HAE HAWAII. Buke 3, Ano Hou. Helu 51, Aoao 201. Maraki 23, 1859.23 Malaki 1859',
       'KA HAE HAWAII. Buke 4, Ano Hou. Helu 52, Aoao 205. Maraki 28, 1860.28 Malaki 1860',
       'KA NONANONA. Buke 1, Pepa 1, Aoao 1. Iulai 6, 1841.6 Iulai 1841',
       'KA HOKU O KA PAKIPIKA. Buke I, Helu 11, Aoao 1. Dekemaba 5, 1861.5 Kekemapa 1861',
       'Buke 2, Helu 3315 ʻAukake 1863',
       'KE KUMU HAWAII. Buke 2, Pepa 15, Aoao 57. Iulai 20, 1836.20 Iulai 1836',
       'KA HAE HAWAII. Buke I, Helu 36, Aoao 141. Novemaba 5, 1856.5 Nowemapa 1856',
       'KA NONANONA. Buke 3, Pepa 5, Aoao 17. Iulai 25, 1843.25 Iulai 1843',
       'KE ALAULA. Buke IV, Helu 11, Aoao 41. Feberuari 1870.1 Pepeluali 1870'],
      dtype=object)

In [20]:
# Spot-check the cleaned body text of five randomly chosen pages.
for _, page in ulukau_data.sample(n=5).iterrows():
    print("Url:\n" + page['url'], "\n")
    print(page['text'])
    print("\n================\n")

Url:
http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw-Zz-1---20-about---0003-1-0000utfZz-8-00&a=d&cl=CL1.4.1&d=HASH01c3fa1f9a42c6ca82837459&gg=text 

He ʻOhina Nūpepa ʻŌlelo Hawaiʻi Hoʻolaupaʻi
He ʻOhina Nūpepa ʻŌlelo Hawaiʻi EnglishText Kōkua Kekahi EnglishText
Kōkua


Url:
http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw-Zz-1---20-about---0003-1-0000utfZz-8-00&a=d&cl=CL1.32.3&d=HASH01e397a7cf6327f39d1e6fbe&gg=text 

KE ALAULA. BUKE II. HONOLULU, IANUARI, 1868. HELU 10. KE KILAPE. 38 38 38 He holoholona hanohano loa keia e ku nei me kana keiki uuku ma kona aoao. O ke kamelopadi ka inoa kahiko i kapaia, aka i keia wa, ke kapaia nei he kilape. Ma na aina mehana loa o waena o Aferika kona wahi i noho paa ai. Ua hopuia ekolu o keia mau holoholona kupaianaha a hoounaia ma kekahi moku i Parisa, ma Farani, i ka makahiki 1853. No ke anu o ka hooilo mua a lakou i noho ai malaila, make hookahi o lakou. Mahope 

In [21]:
alphabet = list('abcdefghijklmnopqrstuvwxyz')
numbers = list('0123456789')
vowels = list('aeiouāēīōū')
consonants = list("hklmnpw'")

def pairwise(iterable):
    '''
    Yield consecutive overlapping pairs: s -> (s0, s1), (s1, s2), ...

    Nothing is yielded when the iterable has fewer than two items.
    '''
    it = iter(iterable)
    previous = next(it, None)

    for current in it:
        yield (previous, current)
        previous = current

def is_hawaiian(text, verbose = False):
    '''
    Returns True if the text provided matches Hawaiian orthography.

    The text is lowercased and every character other than a-z, the macron
    vowels and whitespace is dropped first, so digits, punctuation, hyphens
    and apostrophes never take part in the check (e.g. "pepe-" is judged as
    "pepe"). What remains passes when:

    * it is not empty,
    * every character is a listed consonant, a listed vowel or a space,
    * every consonant is immediately followed by a vowel, and
    * the last character is a vowel.

    With ``verbose=True`` the reason for a rejection is printed.
    '''
    # Raw string: "\s" must reach the regex engine rather than being read as
    # an (invalid) Python string escape.
    text = re.sub(r"[^a-zāēīōū\s]", "", text.lower())

    if len(text) == 0:
        # String is empty
        if verbose: print("String is empty after cleaning")
        return False

    # Build the lookup sets once per call instead of concatenating the lists
    # for every character inspected.
    vowel_set = set(vowels)
    consonant_set = set(consonants)
    allowed = vowel_set | consonant_set | {" "}

    if len(text) == 1:
        if text in vowel_set:
            return True
        if verbose: print("Single character word {} is not a vowel".format(text))
        return False

    for current_ch, next_ch in pairwise(text):
        if current_ch not in allowed:
            # Character not in hawaiian character set
            if verbose: print("Character '{}' not in hawaiian character set".format(current_ch))
            return False
        if current_ch in consonant_set and next_ch not in vowel_set:
            # A consonant must be followed by a vowel
            if verbose: print("The consonant '{}' is followed by '{}' instead of a vowel".format(current_ch, next_ch))
            return False

    # The loop only inspects the first element of each pair, so the final
    # character is checked here: it has to be a vowel.
    if next_ch not in vowel_set:
        if verbose: print("The last character '{}' is a consonant".format(next_ch))
        return False
    return True

In [22]:
from collections import Counter
from functools import reduce

In [23]:
# word_counts: tokens that pass is_hawaiian.
# drop_words: other tokens that contain at least one a-z letter.
# Tokens with no a-z letter (numbers, punctuation) are not counted at all.
word_counts = Counter()
drop_words = Counter()
total_pages = len(ulukau_data)
for i, text in enumerate(ulukau_data.text.values):
    print("\rProcessing {} out of {}".format(i + 1, total_pages), end = "")
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            word = word.lower()
            # A Counter returns 0 for a missing key, so "+= 1" cannot raise
            # KeyError and needs no try/except fallback.
            if is_hawaiian(word):
                word_counts[word] += 1
            elif any(ch in alphabet for ch in word):
                drop_words[word] += 1

Processing 1377 out of 1377

In [24]:
# Turn the counter into a Series ordered from most to least frequent.
word_counts = pd.Series(word_counts)
word_counts = word_counts.sort_values(ascending = False)

In [25]:
# Same conversion for the rejected tokens.
drop_words = pd.Series(drop_words)
drop_words = drop_words.sort_values(ascending = False)

In [26]:
# Read the dictionary word list, one entry per line.
with open("../data/dictionary_list.txt", "r", encoding = 'utf-8') as f:
    dict_words = f.read().strip().split("\n")

# Drop a single leading hyphen from each entry. The pattern is a raw string so
# the backslash is not read as an (invalid) Python string escape. This runs
# after the file is closed because it no longer needs the handle.
dict_words = [re.sub(r"^\-", "", word) for word in dict_words]

In [27]:
# How many dictionary entries does is_hawaiian reject?
sum(1 for word in dict_words if not is_hawaiian(word))

600

In [28]:
# How many dictionary entries does is_hawaiian accept?
sum(1 for word in dict_words if is_hawaiian(word))

19823

In [29]:
# Two-column frame: the word and how often it occurs, most frequent first.
word_data = word_counts.rename_axis('word').reset_index(name='word_count')
word_data.head()

Unnamed: 0,word,word_count
0,ka,291076
1,i,230892
2,o,224662
3,a,144110
4,e,127294


In [30]:
# Flag the words that appear in the dictionary list. Series.isin uses a hashed
# lookup, whereas testing "x in dict_words" per row scans the whole list for
# every word.
word_data['in_dictionary'] = word_data['word'].isin(dict_words)

In [31]:
# Persist the word table without the positional index column.
word_data.to_csv(path_or_buf="../data/word_data.csv", index=False)

In [32]:
# Number of distinct words that are not in the dictionary list.
(~word_data['in_dictionary']).sum()

16487

In [33]:
# Number of distinct words that are in the dictionary list.
word_data['in_dictionary'].sum()

2643

In [35]:
# Words seen fewer than 30 times.
word_data.loc[word_data['word_count'] < 30]

Unnamed: 0,word,word_count,in_dictionary
2813,lunamaka-,29,False
2814,waikane,29,False
2815,pelehu,29,False
2816,pepe-,29,False
2817,hohono,29,True
2818,pulo,29,True
2819,a-,29,False
2820,maheleheleia,29,False
2821,kamalei,29,True
2822,kekaki,29,False


In [36]:
# Words seen more than 30 times.
# NOTE(review): the rare-word view above uses "< 30", so a word seen exactly
# 30 times falls in neither group - confirm whether ">=" was intended here.
is_frequent = word_data['word_count'] > 30
ok_words = set(word_data.loc[is_frequent, 'word'])

In [37]:
# Write the corpus as plain text: one line per sentence, lowercased tokens
# separated by single spaces.
with open("../data/niupepa.txt", "w", encoding = "utf-8") as f:
    total_pages = len(ulukau_data)
    for i, text in enumerate(ulukau_data.text.values):
        print("\rProcessing {} out of {}".format(i + 1, total_pages), end = "")
        for sent in sent_tokenize(text):
            for word in word_tokenize(sent):
                word = word.lower()
                if is_hawaiian(word):
                    f.write(word + " ")
                # A set lookup on a string cannot raise, so the old bare
                # "except: pass" only hid unrelated errors and is gone. The
                # "not in numbers" test was redundant: alphabet holds no digits.
                # NOTE(review): ok_words is derived from word_counts, which only
                # holds tokens that passed is_hawaiian, so this branch looks
                # unable to write anything - confirm what it was meant to keep.
                elif any(ch in alphabet for ch in word) and word in ok_words:
                    f.write(word + " ")
            # End the current sentence's line.
            f.write("\n")

Processing 1377 out of 1377