# Cleaning the JSON data

Got a bunch of messy html wrapped in json, gotta clean it up.

In [1]:
import os
import re
import json
from bs4 import BeautifulSoup

import hashlib
from nltk import sent_tokenize, word_tokenize

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Reading the json data

We've downloaded a bunch of the `json` data. Time to take a look at it:

In [3]:
ulukau_json = json.load(open("../ulukau/nupepa.json", "r", encoding = "utf-8"))

In [4]:
ulukau_data = pd.DataFrame.from_dict(ulukau_json)

In [5]:
ulukau_data.head()

Unnamed: 0,text,title,url
0,\n \nTHE NATIONAL HERALD\n\nKa Ahailono a ka ...,"Ka Ahailono a ka Lahui. Vol. 1, No. 28, Pg. 1....",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
1,"HAWAII HOLOMUA.\n""Ua mau ke Ea o ka Aina i ka ...","HAWAII HOLOMUA. Buke 3, Helu 159, Aoao 1. Febe...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
2,"HAWAII HOLOMUA.\n""Ua mau ke Ea o ka Aina i ka ...","HAWAII HOLOMUA. Buke 3, Helu 158, Aoao 1. Febe...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
3,"HAWAII HOLOMUA.\n""Ua mau ke Ea o ka Aina i ka ...","HAWAII HOLOMUA. Buke 3, Helu 157, Aoao 1. Febe...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...
4,"HAWAII HOLOMUA.\n""Ua mau ke Ea o ka Aina i ka ...","HAWAII HOLOMUA. Buke 3, Helu 156, Aoao 1. Febe...",http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0...


In [6]:
def md5(text):
    return hashlib.md5(text.encode("utf-8")).hexdigest()

In [7]:
ulukau_data['title_hash'] = ulukau_data.title.apply(md5)

In [8]:
for x in ulukau_data.loc[ulukau_data['title_hash'] == "7bad8fd73c9d2511f07c1ab08c5097ed", 'text'].values:
    print(x)
    print("==========================")

KA NONANONA. "E ka mea hiamoe, e hele oe i ka nonanona, e nana i kona aoao a e hoonaauao iho." SOLOMONA. Buke 3. HONOLULU, OAHU, IULAI 4, 1843. Pepa 2 a me ke 3.
 
                He nui na haole i lawe i ka Nonanona, aole hoi ike maopopo lakou i ka olelo Hawaii, a nolaila ua paiia ka olelo a ke aliinui o Amerikahuipuia ma na olelo elua. O ka pono ole o ka hoohalike ana kekahi; aole maopopo loa keia olelo ma ka olelo Hawaii. Ma ka unuhi ana, aole au i nana nui i na hua olelo, a malaila i hoohalike ai; ma ke ano nui wale no au i hoohalike ai. Na ke aliinui o Amerikahuipuia e hoike aku i keia olelo imua o ka ahaolelo o ka poe i kohoia. Aole paha mea makemake ole e heluhelu i keia olelo.  
To the House of Representatives of the United States:                  I communicate herewith to Congress copies of a correspondence, which has recently taken place between certain agents of the Government of the Hawaiian, or Sandwich Islands, and the Secretary of State.                 The condition of

In [9]:
print("{} pages downloaded".format(len(ulukau_data)))

1377 pages downloaded


In [10]:
ulukau_data['numwords'] = ulukau_data['text'].str.count("\s+") + 1

In [11]:
for i, x in ulukau_data.sort_values('numwords').head().iterrows():
    print("Url:\n" + x.url, end = "\n\n")
    print(x.text)
    print("\n=======================\n")

Url:
http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw-Zz-1---20-about---0003-1-0000utfZz-8-00&a=d&cl=CL1.22.1&d=HASHaabe4d4d413416dfc132cc&gg=text

   ULUKAU: HAWAIIAN ELECTRONIC LIBRARY   Hoʻolaupaʻi       He ʻOhina Nūpepa ʻŌlelo Hawaiʻi   
ULUKAU: HAWAIIAN ELECTRONIC LIBRARY
Hoʻolaupaʻi
    He ʻOhina Nūpepa ʻŌlelo Hawaiʻi
ULUKAU: HAWAIIAN ELECTRONIC LIBRARY
Hoʻolaupaʻi
    He ʻOhina Nūpepa ʻŌlelo Hawaiʻi
  English Text  Kōkua   Kekahi...   
English Text
Kōkua
Kekahi...
EnglishÂ Text

Kekahi...
 Buke 1, Helu 514 Kepakemapa 1841 << previous issue  next issue >>  Buke 1, Helu 5   S01  S02  S03  S04   [ hoʻokaʻawale kikokikona ][ mai hōʻike i ke kahiāuli ]


Buke 1, Helu 5

  S01  S02  S03  S04  


S01


S02


S03


S04


Buke 1, Helu 5

  S01  S02  S03  S04  


S01


S02


S03


S04


S01


S02


S03


S04
 
  << previous issue  
  next issue >>  


Url:
http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw

# Cleaning the text

In [12]:
ulukau_data.title.values

array(['Ka Ahailono a ka Lahui. Vol. 1, No. 28, Pg. 1. February 11, 1890. 11 Pepeluali 1890 << previous issue ',
       'HAWAII HOLOMUA. Buke 3, Helu 159, Aoao 1. Febeluari 14, 1893.14 Pepeluali 1893 << previous issue ',
       'HAWAII HOLOMUA. Buke 3, Helu 158, Aoao 1. Febeluari 10, 1893.10 Pepeluali 1893 << previous issue  next issue >> ',
       ...,
       'KA HOKU O KA PAKIPIKA. Buke I, Helu 3, Aoao 1. Okatoba 10, 1861.10 ʻOkakopa 1861 << previous issue  next issue >> ',
       'KA HOKU O KA PAKIPIKA. Buke I, Helu 2, Aoao 1. Okatoba 3, 1861.3 ʻOkakopa 1861 << previous issue  next issue >> ',
       'KA HOKU O KA PAKIPIKA. Buke I, Helu 1, Aoao 1. Sepetemaba 26, 1861.26 Kepakemapa 1861 next issue >> '],
      dtype=object)

In [13]:
# Delete the following
space_replace = ["—+", "Â"]
to_delete = ['\xa0', "<<", ">>", "next issue", "previous issue", "EnglishText", "ʻaoʻao aʻe",
             "ULUKAU: HAWAIIAN ELECTRONIC LIBRARY", "}", "{", ">", "<", "English Text"]

def clean_column(col):
    col = (col.str.replace("(<<[^>]+>>)", "")
              .str.replace("(\[[^>]+\])", "")
              .str.replace("(\([^>]+\))", "")
              .str.replace("|".join(to_delete), "")
              .str.replace("|".join(space_replace), " ")
              .str.replace('\-{2,}', "-")
              .str.replace('\.\-', ". ")
              .str.replace('\.{3,}', " ")
              .str.replace('\s+:', ': '))
    return col

ulukau_data['title'] = clean_column(ulukau_data.title)
ulukau_data['text'] = clean_column(ulukau_data.text)

# Remove multiple whitespace
ulukau_data['text'] = ulukau_data.text.str.replace("\s{2,}", " ")
ulukau_data['title'] = ulukau_data.title.str.replace("\s{2,}", " ")

# Remove trailing whitespace
ulukau_data['text'] = ulukau_data.text.str.strip()
ulukau_data['title'] = ulukau_data.title.str.strip()

In [14]:
ulukau_data.title.sample(10).values

array(['Buke 1, Helu 629 Mei 1865',
       'KE KUMU HAWAII. Buke 4, Pepa 25, Aoao 97. Mei 8, 1839.8 Mei 1839',
       'Buke 64, Helu 3130 Iulai 1925',
       'KE ALAULA. Buke VI, Helu 12, Aoao 45. Maraki 1872.1 Malaki 1872',
       'KA HAE HAWAII. Buke I, Helu 25, Aoao 97. Aukake 20, 1856.20 ʻAukake 1856',
       'KE ALAULA. Buke I, Helu 3, Aoao 9. Iune 1866.1 Iune 1866',
       'KA HAE HAWAII. Buke 3, Ano Hou. Helu 28, Aoao 109. Okatoba 13, 1858.13 ʻOkakopa 1858',
       'KA HAE HAWAII. Buke I, Helu 35, Aoao 137. Okatoba 29, 1856.29 ʻOkakopa 1856',
       'KA LAHUI HAWAII. Buke 3, Helu 6, Aoao 1. Feberuari 8, 1877.8 Pepeluali 1877',
       'KE ALAULA. Buke VI, Helu 10, Aoao 37. Ianuari 1872.1 Ianuali 1872'],
      dtype=object)

In [15]:
for i, x in ulukau_data.sample(5).iterrows():
    print("Url:\n" + x.url, "\n")
    print(x.text)
    print("\n================\n")

Url:
http://nupepa.org/gsdl2.5/cgi-bin/nupepa?e=d-0nupepa--00-0-0--010---4-----text---0-1l--1haw-Zz-1---20-about---0003-1-0000utfZz-8-00&a=d&cl=CL1.37.1&d=HASHea8c9b04e230f46e543a44&gg=text 

KE KIAI HOOPUKAIA I KA POAKAHI A ME KA POAHA BUKE 1. HELU 5. HONOLULU, T. H. SEPATEMABA 18, 1902 5 KENETA NO KE KOPE KE KIAI HOOPUKAIA MA NA POAKAHI AME POAHA E hoouna mai i na Kauoha Pepa, na Uku Pepa, na Mea Hou, na Hoolaha a me na mea e ae a pau, ma ka inoa o ka Pepa. Pahu Leta KA AUHAU PEPA
Ekolu Mahina $1.00 Eono Mahina 2.00 Hookahi Makahiki 4.00 FRED. W. BECKLEY, LUNAHOOPONOPONO ME LUNA HOOPUKA WM. J. COELHO, HEPO-LUNAHOOPONOPONO HONOLULU, T. H., SEPT. 18, 1902 E KE KIAI, HEAHA KO KA PO? O ka manu hookahi i paa i ka lima ua oi aku ia mamua o na manu ekolu e lele ana; a mai kuko oe i na manu ekolu a hookuu i ka manu hookahi paa, o loaa ole auanei na manu ekolu a o ko hookuu e ana i ka manu i paa, nele loa oe. He nui no na opio i loaa ka hana mamuli o kekahi mau kumu liilii wale no, ua huhu, a

In [16]:
alphabet = list('abcdefghijklmnopqrstuvwxyz')
numbers = list('0123456789')
vowels = list('aeiouāēīōū')
consonants = list("hklmnpw'")

def pairwise(iterable):
    it = iter(iterable)
    a = next(it, None)

    for b in it:
        yield (a, b)
        a = b

def is_hawaiian(text, verbose = False):
    '''
    Returns True is the text provided matches Hawaiian orthography
    '''
    # Lowercase text
    text = text.lower()
    # Remove non alphabet characters
    text = re.sub("[^a-zāēīōū\s]", "", text)
    if len(text) == 0:
        # String is empty
        if verbose: print("String is empty after cleaning")
        return False
    if len(text) == 1:
        if text in vowels:
            return True
        else:
            if verbose: print("Single character word {} is not a vowel".format(text))
            return False
    for current_ch, next_ch in pairwise(text):
        if current_ch not in consonants + vowels + [" "]:
            # Character not in hawaiian character set
            if verbose: print("Character '{}' not in hawaiian character set".format(current_ch))
            return False
        if current_ch in consonants:
            # The current character is a consonant, so the next must be a vowel
            if next_ch in vowels:
                # The next character is a vowel, so this is ok
                continue
            else:
                # The next character is not a vowel, this is not ok
                if verbose: print("The consonant '{}' is followed by '{}' instead of a vowel".format(current_ch, next_ch))
                return False
    if not next_ch in vowels:
        # Last character in word is a consonant
        if verbose: print("The last character '{}' is a consonant".format(next_ch))
        return False
    return True

In [17]:
from collections import Counter
from functools import reduce

In [18]:
word_counts = Counter()
drop_words = Counter()
for i, text in enumerate(ulukau_data.text.values):
    print("\rProcessing {} out of {}".format(i + 1, len(ulukau_data)), end = "")
    for sent in sent_tokenize(text):
        for word in word_tokenize(sent):
            word = word.lower()
            if is_hawaiian(word):
                try:
                    word_counts[word] += 1
                except KeyError:
                    word_counts[word] = 1
                continue
            if any(ch in alphabet for ch in word):
                try:
                    drop_words[word] += 1
                except KeyError:
                    drop_words[word] = 1

Processing 1377 out of 1377

In [19]:
word_counts = pd.Series(word_counts).sort_values(ascending = False)

In [20]:
drop_words = pd.Series(drop_words).sort_values(ascending = False)

In [22]:
with open("../data/dictionary_list.txt", "r", encoding = 'utf-8') as f:
    dict_words = f.read().strip().split("\n")
    dict_words = [re.sub("^\-", "", word) for word in dict_words]

In [23]:
len([word for word in dict_words if not is_hawaiian(word)])

600

In [24]:
len([word for word in dict_words if is_hawaiian(word)])

19823

In [25]:
word_data = pd.DataFrame(word_counts).reset_index()
word_data.columns = ['word', 'word_count']
word_data.head()

Unnamed: 0,word,word_count
0,ka,291076
1,i,230892
2,o,224662
3,a,144110
4,e,127294


In [26]:
word_data['in_dictionary'] = word_data.word.apply(lambda x: x in dict_words)

In [27]:
word_data.to_csv("../data/word_data.csv", index = False)

In [28]:
sum(~word_data.in_dictionary)

16487

In [29]:
sum(word_data.in_dictionary)

2643

In [30]:
word_data[word_data.word_count < 30]

Unnamed: 0,word,word_count,in_dictionary
2813,paukiki,29,False
2814,lailai,29,False
2815,alima,29,False
2816,kanikele,29,True
2817,emanuela,29,False
2818,kawaihau,29,False
2819,pulo,29,True
2820,kulikone,29,False
2821,hoolaaia,29,False
2822,ioaa,29,False


In [31]:
ok_words = set(word_data.loc[word_data.word_count > 30, 'word'].values)

In [32]:
with open("../data/nupepa.txt", "w", encoding = "utf-8") as f:
    for i, text in enumerate(ulukau_data.text.values):
        print("\rProcessing {} out of {}".format(i + 1, len(ulukau_data)), end = "")
        for sent in sent_tokenize(text):
            for word in word_tokenize(sent):
                word = word.lower()
                if is_hawaiian(word):
                    f.write(word + " ")
                elif any(ch in alphabet and not ch in numbers for ch in word):
                    try:
                        if word in ok_words:
                            f.write(word + " ")
                    except:
                        pass
            f.write("\n")

Processing 1377 out of 1377