In [217]:
import re
from collections import Counter 

import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')

In [218]:
data_dir = "data/"
df = pd.read_csv(data_dir + "processed_train.csv")

In [219]:
### for testing purpose
# df = df[:2000]

In [220]:
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,2,setu siung 119 rt 5 1 13880 cipayung,/siung
2,3,"toko dita , kertosono",toko dita/
3,4,jl. orde baru,/jl. orde baru
4,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede
...,...,...,...
265884,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya
265885,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
265886,299996,"raya cila kko , cilandak timur kel.",/raya cila kko
265887,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [221]:
def getPOI(s) :
    return s.split("/")[0]

def getStreet(s) :
    return s.split("/")[1]

def split(s) :
    s = remove_multiple_whitespace(s)
    return s.split(" ")

### Remove whitespace

In [222]:
# remove multiple whitespace
# cover tabs, newlines, etc
def remove_multiple_whitespace(text) :
    return re.sub(' +', ' ', text.replace('\n', ' ').replace('\t', ' ')).strip()

In [223]:
remove_multiple_whitespace("asfua asfasbiuf \n \t  asfsa adfgas \t d ")

'asfua asfasbiuf asfsa adfgas d'

In [224]:
df["raw_address"] = df["raw_address"].apply(remove_multiple_whitespace)
df["POI/street"] = df["POI/street"].apply(remove_multiple_whitespace)

In [225]:
df["POI"] = df["POI/street"].apply(getPOI)
df["Street"] = df["POI/street"].apply(getStreet) 

df["SplitRaw"] = df["raw_address"].apply(split)
df["SplitPOI"] = df["POI"].apply(split)
df["SplitStreet"] = df["Street"].apply(split)

df

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitRaw,SplitPOI,SplitStreet
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,"[jl, kapuk, timur, delta, sili, iii, lippo, ci...",[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]"
1,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,"[setu, siung, 119, rt, 5, 1, 13880, cipayung]",[],[siung]
2,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita, ,, kertosono]","[toko, dita]",[]
3,4,jl. orde baru,/jl. orde baru,,jl. orde baru,"[jl., orde, baru]",[],"[jl., orde, baru]"
4,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[raya, samb, gede, ,, 299, toko, bb, kids]","[toko, bb, kids]","[raya, samb, gede]"
...,...,...,...,...,...,...,...,...
265884,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,"[karawaci, baru, kakap, raya, 156, rt, 1, rw, ...",[],"[kakap, raya]"
265885,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,"[jend, ahmad, yani, 331, kertasari, ciamis]",[],"[jend, ahmad, yani]"
265886,299996,"raya cila kko , cilandak timur kel.",/raya cila kko,,raya cila kko,"[raya, cila, kko, ,, cilandak, timur, kel.]",[],"[raya, cila, kko]"
265887,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,,"[jalan, cipadu, jaya, taman, asri, gang, bijak...","[taman, asri]",[]


In [226]:
def is_transformed(words, sentence): 
    if " ".join(words) in sentence :
        return False
    return True

In [227]:
words = ["taman", "meruya"]
sentence = "taman mer , 13 electr laun system , 2 meruya utara"
f(words, sentence)

True

In [228]:
df['TransformedPOI'] = df.apply(lambda x: is_transformed(x.SplitPOI, x.raw_address), axis=1)
df['TransformedStreet'] = df.apply(lambda x: is_transformed(x.SplitStreet, x.raw_address), axis=1)

In [229]:
df

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitRaw,SplitPOI,SplitStreet,TransformedPOI,TransformedStreet
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,"[jl, kapuk, timur, delta, sili, iii, lippo, ci...",[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]",False,False
1,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,"[setu, siung, 119, rt, 5, 1, 13880, cipayung]",[],[siung],False,False
2,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita, ,, kertosono]","[toko, dita]",[],False,False
3,4,jl. orde baru,/jl. orde baru,,jl. orde baru,"[jl., orde, baru]",[],"[jl., orde, baru]",False,False
4,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[raya, samb, gede, ,, 299, toko, bb, kids]","[toko, bb, kids]","[raya, samb, gede]",False,False
...,...,...,...,...,...,...,...,...,...,...
265884,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,"[karawaci, baru, kakap, raya, 156, rt, 1, rw, ...",[],"[kakap, raya]",False,False
265885,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,"[jend, ahmad, yani, 331, kertasari, ciamis]",[],"[jend, ahmad, yani]",False,False
265886,299996,"raya cila kko , cilandak timur kel.",/raya cila kko,,raya cila kko,"[raya, cila, kko, ,, cilandak, timur, kel.]",[],"[raya, cila, kko]",False,False
265887,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,,"[jalan, cipadu, jaya, taman, asri, gang, bijak...","[taman, asri]",[],False,False


**Note:**
there is a corrector for misspell word 

words = ['ds.', 'sukowiyono', 'dsn.', 'karangsono', 'rt', '01', 'rw', '03', 'utara', 'jembatan', 'simomasuk', 'gang', 'jembatan']

POI = ['jembatan', 'simo']

In [230]:
odds = []
def repair_words(words, repaireds, is_transformed) :
    if not is_transformed :
        return [words, []]
    res = words.copy()
    ori = []
    i = 0
    while i < len(words) :
        curr = words[i]
        if curr == repaireds[0][:len(curr)] :
            j = 1
            while j < len(repaireds) :
                if i + j >= len(words) :
                    odds.append({"words": words, "repaireds": repaireds})
                    print(i)
                    print(j)
                    print(words)
                    print(repaireds)
                    return [[],[]]
                curr = words[i+j]
                if curr == repaireds[j][:len(curr)] :
                    j += 1
                else :
                    break
            if j == len(repaireds) :
                ori = list(words[i:i+j])
                res[i:i+j] = repaireds
                return [res, ori]
        i += 1
    return [res, ori]

In [231]:
def get_index_zero(arr) :
    return arr[0]

def get_index_one(arr) :
    if len(arr) <= 1 :
        print(arr)
    return arr[1]

df["temp"] = df.apply(lambda x : repair_words(x.SplitRaw, x.SplitPOI, x.TransformedPOI), axis=1)
df["cleaned_address"] = df["temp"].apply(get_index_zero)
df["OriPOI"] = df["temp"].apply(get_index_one)
df["temp"] = df.apply(lambda x : repair_words(x.cleaned_address, x.SplitStreet, x.TransformedStreet), axis=1)
df["cleaned_address"] = df["temp"].apply(get_index_zero)
df["OriStreet"] = df["temp"].apply(get_index_one)

In [232]:
df[df["TransformedPOI"]].head()

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitRaw,SplitPOI,SplitStreet,TransformedPOI,TransformedStreet,temp,cleaned_address,OriPOI,OriStreet
9,10,"cikahuripan sd neg boj 02 klap boj , no 5 16877",sd negeri bojong 02/klap boj,sd negeri bojong 02,klap boj,"[cikahuripan, sd, neg, boj, 02, klap, boj, ,, ...","[sd, negeri, bojong, 02]","[klap, boj]",True,False,"[[cikahuripan, sd, negeri, bojong, 02, klap, b...","[cikahuripan, sd, negeri, bojong, 02, klap, bo...","[sd, neg, boj, 02]",[]
10,11,"yaya atohar ,",yayasan atohariyah/,yayasan atohariyah,,"[yaya, atohar, ,]","[yayasan, atohariyah]",[],True,False,"[[yayasan, atohariyah, ,], []]","[yayasan, atohariyah, ,]","[yaya, atohar]",[]
18,20,"toko bang ajs ,",toko bangunan ajs/,toko bangunan ajs,,"[toko, bang, ajs, ,]","[toko, bangunan, ajs]",[],True,False,"[[toko, bangunan, ajs, ,], []]","[toko, bangunan, ajs, ,]","[toko, bang, ajs]",[]
36,40,mar tabl metro iringmulyo metro timur,markaz tabligh metro/,markaz tabligh metro,,"[mar, tabl, metro, iringmulyo, metro, timur]","[markaz, tabligh, metro]",[],True,False,"[[markaz, tabligh, metro, iringmulyo, metro, t...","[markaz, tabligh, metro, iringmulyo, metro, ti...","[mar, tabl, metro]",[]
38,44,sd neg 12 anggrek,sd negeri 12 anggrek/,sd negeri 12 anggrek,,"[sd, neg, 12, anggrek]","[sd, negeri, 12, anggrek]",[],True,False,"[[sd, negeri, 12, anggrek], []]","[sd, negeri, 12, anggrek]","[sd, neg, 12, anggrek]",[]


### Save Cleaned Address

In [233]:
temp = df[["id", "cleaned_address", "POI/street"]].copy()
temp["cleaned_address"] = temp["cleaned_address"].apply(lambda x : remove_multiple_whitespace(" ".join(x)))
temp

Unnamed: 0,id,cleaned_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,2,setu siung 119 rt 5 1 13880 cipayung,/siung
2,3,"toko dita , kertosono",toko dita/
3,4,jl. orde baru,/jl. orde baru
4,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede
...,...,...,...
265884,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya
265885,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
265886,299996,"raya cila kko , cilandak timur kel.",/raya cila kko
265887,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [234]:
temp.rename(columns={"cleaned_address": "raw_address"}).to_csv("data/cleaned_train.csv", index=False)

# Create a Probabilistic Model for Word Transformation

In [235]:
df[["TransformedPOI", "SplitPOI", "OriPOI", "TransformedStreet", "SplitStreet", "OriStreet"]]

Unnamed: 0,TransformedPOI,SplitPOI,OriPOI,TransformedStreet,SplitStreet,OriStreet
0,False,[],[],False,"[jl, kapuk, timur, delta, sili, iii, lippo, cika]",[]
1,False,[],[],False,[siung],[]
2,False,"[toko, dita]",[],False,[],[]
3,False,[],[],False,"[jl., orde, baru]",[]
4,False,"[toko, bb, kids]",[],False,"[raya, samb, gede]",[]
...,...,...,...,...,...,...
265884,False,[],[],False,"[kakap, raya]",[]
265885,False,[],[],False,"[jend, ahmad, yani]",[]
265886,False,[],[],False,"[raya, cila, kko]",[]
265887,False,"[taman, asri]",[],False,[],[]


### List of Transformation

In [236]:
trans = df[df.apply(lambda x: not (x.TransformedPOI == False and x.TransformedStreet == False), axis=1)]
trans = trans[["TransformedPOI", "SplitPOI", "OriPOI", "TransformedStreet", "SplitStreet", "OriStreet"]]
trans.head()

Unnamed: 0,TransformedPOI,SplitPOI,OriPOI,TransformedStreet,SplitStreet,OriStreet
9,True,"[sd, negeri, bojong, 02]","[sd, neg, boj, 02]",False,"[klap, boj]",[]
10,True,"[yayasan, atohariyah]","[yaya, atohar]",False,[],[]
18,True,"[toko, bangunan, ajs]","[toko, bang, ajs]",False,[],[]
36,True,"[markaz, tabligh, metro]","[mar, tabl, metro]",False,[],[]
38,True,"[sd, negeri, 12, anggrek]","[sd, neg, 12, anggrek]",False,[],[]


In [237]:
def sort_dict(d) :
    return {k: v for k, v in sorted(d.items(), key=lambda item: item[1], reverse=True)}

In [238]:
# WORD TRANSFORMATION NEED TO CONSIDER PREVIOUS AND NEXT WORD

In [239]:
trans.head()

Unnamed: 0,TransformedPOI,SplitPOI,OriPOI,TransformedStreet,SplitStreet,OriStreet
9,True,"[sd, negeri, bojong, 02]","[sd, neg, boj, 02]",False,"[klap, boj]",[]
10,True,"[yayasan, atohariyah]","[yaya, atohar]",False,[],[]
18,True,"[toko, bangunan, ajs]","[toko, bang, ajs]",False,[],[]
36,True,"[markaz, tabligh, metro]","[mar, tabl, metro]",False,[],[]
38,True,"[sd, negeri, 12, anggrek]","[sd, neg, 12, anggrek]",False,[],[]


### 1 Word Transformation

In [291]:
unitransforms = {}

def process_unitransform(unitransforms, is_transformeds, repaireds, originals) :
    for is_transformed, repair, ori in zip(is_transformeds, repaireds, originals) :
        if is_transformed :
            for r, o in zip(repair, ori) :
                if r != o :
                    if o not in unitransforms :
                        unitransforms[o] = {}
                        unitransforms[o][r] = 1
                    else :
                        if r in unitransforms[o] :
                            unitransforms[o][r] += 1
                        else :
                            unitransforms[o][r] = 1

process_unitransform(unitransforms, trans["TransformedPOI"], trans["SplitPOI"], trans["OriPOI"])
process_unitransform(unitransforms, trans["TransformedStreet"], trans["SplitStreet"], trans["OriStreet"])

for k, v in unitransforms.items() :
    unitransforms[k] = sort_dict(v)

In [292]:
# unitransforms 

In [293]:
unitransforms["cak"]

{'cakung': 15, 'cakruk': 1, "cake's": 1, 'cakery': 1, 'cakrad': 1, 'cakrab': 1}

### 2 Word Transformation

In [294]:
bitransforms = {}

def process_bitransform(bitransforms, is_transformeds, repaireds, originals) :
    for is_transformed, repair, ori in zip(is_transformeds, repaireds, originals) :
        if is_transformed :
            if len(repair) == len(ori) and len(repair) > 1:
                i = 0
                while i < len(repair)-1 :
                    r = repair[i] + " " + repair[i+1]
                    o = ori[i] + " " + ori[i+1]
                    if r != o :
                        if o not in bitransforms :
                            bitransforms[o] = {}
                            bitransforms[o][r] = 1
                        else :
                            if r in bitransforms[o] :
                                bitransforms[o][r] += 1
                            else :
                                bitransforms[o][r] = 1
                    i += 1

process_bitransform(bitransforms, trans["TransformedPOI"], trans["SplitPOI"], trans["OriPOI"])
process_bitransform(bitransforms, trans["TransformedStreet"], trans["SplitStreet"], trans["OriStreet"])

for k, v in bitransforms.items() :
    bitransforms[k] = sort_dict(v)

In [295]:
bitransforms["taman mer"]

{'taman meruya': 2}

In [245]:
# bitransforms 

### 3 Word Transformation

In [296]:
tritransforms = {}
def process_tritransform(bitransforms, is_transformeds, repaireds, originals) :
    for is_transformed, repair, ori in zip(is_transformeds, repaireds, originals) :
        if is_transformed :
            if len(repair) == len(ori) and len(repair) > 2:
                i = 0
                while i < len(repair)-2 :
                    r = repair[i] + " " + repair[i+1] + " " + repair[i+2]
                    o = ori[i] + " " + ori[i+1] + " " + ori[i+2]
                    if r != o :
                        if o not in tritransforms :
                            tritransforms[o] = {}
                            tritransforms[o][r] = 1
                        else :
                            if r in tritransforms[o] :
                                tritransforms[o][r] += 1
                            else :
                                tritransforms[o][r] = 1
                    i += 1

process_tritransform(tritransforms, trans["TransformedPOI"], trans["SplitPOI"], trans["OriPOI"])
process_tritransform(tritransforms, trans["TransformedStreet"], trans["SplitStreet"], trans["OriStreet"])                
                
for k, v in tritransforms.items() :
    tritransforms[k] = sort_dict(v)

In [247]:
# tritransforms 

### 4 Word Transformation

In [297]:
qtransforms = {}
def process_qtransform(qtransforms, is_transformeds, repaireds, originals) :
    for is_transformed, repair, ori in zip(is_transformeds, repaireds, originals) :
        if is_transformed :
            if len(repair) == len(ori) and len(repair) > 3:
                i = 0
                while i < len(repair)-3 :
                    r = repair[i] + " " + repair[i+1] + " " + repair[i+2] + " " + repair[i+3]
                    o = ori[i] + " " + ori[i+1] + " " + ori[i+2] + " " + ori[i+3]
                    if r != o :
                        if o not in qtransforms :
                            qtransforms[o] = {}
                            qtransforms[o][r] = 1
                        else :
                            if r in qtransforms[o] :
                                qtransforms[o][r] += 1
                            else :
                                qtransforms[o][r] = 1
                    i += 1

process_qtransform(qtransforms, trans["TransformedPOI"], trans["SplitPOI"], trans["OriPOI"])
process_qtransform(qtransforms, trans["TransformedStreet"], trans["SplitStreet"], trans["OriStreet"])                


for k, v in qtransforms.items() :
    qtransforms[k] = sort_dict(v)

In [298]:
# qtransforms 

# Combine n-grams

### Create Unigram

In [299]:
df = pd.read_csv('data/processed_train.csv')
df.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,2,setu siung 119 rt 5 1 13880 cipayung,/siung
2,3,"toko dita , kertosono",toko dita/
3,4,jl. orde baru,/jl. orde baru
4,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede


In [300]:
tokens = []
for s in df["raw_address"] :
    words = s.split(" ")
    for word in words :
        tokens.append(word)
unigrams = Counter(tokens) 

In [301]:
unigrams.most_common()[:10]

[(',', 180846),
 ('no', 47575),
 ('rt', 39233),
 ('raya', 31761),
 ('1', 21386),
 ('2', 19532),
 ('rw', 18853),
 ('3', 16272),
 ('4', 13234),
 ('barat', 13224)]

### Create Bigram

In [302]:
tokens = []
for s in df["raw_address"] :
    words = s.split(" ")
    n = len(words)
    if n > 1 :
        i = 0
        while i < n - 1 : 
            text = words[i] + " " + words[i+1]
            tokens.append(text)
            i += 1 
bigrams = Counter(tokens) 

In [303]:
bigrams.most_common()[:10]

[(', no', 18757),
 (', raya', 5498),
 ('rt 1', 5238),
 ('raya ,', 4590),
 ('rt 2', 4485),
 ('rt 3', 4038),
 ('rt 4', 3538),
 ('rt 5', 3052),
 ('rt 6', 2462),
 ('2 ,', 2239)]

### Create Trigram

In [304]:
tokens = []
for s in df["raw_address"] :
    words = s.split(" ")
    n = len(words)
    if n > 2 :
        i = 0
        while i < n - 2 : 
            text = words[i] + " " + words[i+1] + " " + words[i+2]
            tokens.append(text)
            i += 1 
trigrams = Counter(tokens) 

In [305]:
trigrams.most_common()[:10]

[('rt 1 rw', 1120),
 ('raya , no', 1020),
 ('ahmad yani ,', 965),
 ('rt 2 rw', 940),
 ('rt 3 rw', 872),
 ('rt 4 rw', 737),
 ('jend ahmad yani', 727),
 ('jend sudi ,', 701),
 ('rt 5 rw', 655),
 (', no 1', 600)]

# Transformation

In [306]:
def repair_qgram(text) :
    words = text.split(" ")
    n = len(words)
    if n > 3 :
        for i in range(n-3) :
            token = words[i] + " " + words[i+1] + " " + words[i+2] + " " + words[i+3]
            if token in qtransforms :
                text = text.replace(token, list(qtransforms[token])[0])
    return text

repair_qgram(text)

'b2 no 58'

In [307]:
trirepair = {}
for k, v in tritransforms.items() :
    words = list(v.keys())
    counts = list(v.values())
    first_word, first_count = words[0], counts[0]
    uncertain = True
    if len(v) > 1 :
        second_word, second_count = words[1], counts[1]
        if first_count > second_count * 2 :
            uncertain = False
    else :
        uncertain = False

    if not uncertain :
        if 1.5 * first_count > trigrams[k] :
            trirepair[k] = first_word

In [308]:
def repair_trigram(text) :
    words = text.split(" ")
    n = len(words)
    if n > 2 :
        for i in range(n-2) :
            token = words[i] + " " + words[i+1] + " " + words[i+2]
            if token in trirepair :
                text = text.replace(token, trirepair[token])
    return text

text = "cikahuripan sd neg boj 02 klap boj , no 5 16877"
text = "raya samb gede , 299 toko bb kids"
repair_trigram(text)

'raya samb gede , 299 toko bb kids'

In [309]:
birepair = {}
for k, v in bitransforms.items() :
    words = list(v.keys())
    counts = list(v.values())
    first_word, first_count = words[0], counts[0]
    uncertain = True
    if len(v) > 1 :
        second_word, second_count = words[1], counts[1]
        if first_count > second_count * 3 :
            uncertain = False
    else :
        uncertain = False

    if not uncertain :
        if 1.5 * first_count > bigrams[k] :
            birepair[k] = first_word

In [310]:
def repair_bigram(text) :
    words = text.split(" ")
    n = len(words)
    if n > 1 :
        for i in range(n-1) :
            token = words[i] + " " + words[i+1]
            if token in birepair :
                text = text.replace(token, birepair[token])
    return text

text = "cikahuripan sd neg boj 02 klap boj , no 5 16877"
# text = "delta sili"
repair_bigram(text)

'cikahuripan sd negeri bojong 02 klap boj , no 5 16877'

In [311]:
unirepair = {}
for k, v in unitransforms.items() :
    words = list(v.keys())
    counts = list(v.values())
    first_word, first_count = words[0], counts[0]

    uncertain = True
    if len(v) > 1 :
        second_word, second_count = words[1], counts[1]
        if first_count > second_count * 10 :
            uncertain = False
    else :
        uncertain = False

    if not uncertain :
        if 1.1 * first_count > unigrams[k]:
            unirepair[k] = first_word

In [312]:
def repair_unigram(text) :
    words = text.split(" ")
    for i in range(len(words)) :
        if words[i] in unirepair :
            words[i] = unirepair[words[i]]
    return " ".join(words)

text = "cikahuripan sd neg boj 02 klap boj , no 5 16877"
repair_bigram(text)

'cikahuripan sd negeri bojong 02 klap boj , no 5 16877'

In [313]:
# unirepair

## Debugging for Repair

In [314]:
def repair_debug(text) : 
    text = repair_qgram(text)
    print("4: ", text)
    text = repair_trigram(text)
    print("3: ", text)
    text = repair_bigram(text)
    print("2: ", text)
#     text = repair_unigram(text)
#     print("1: ", text)
    return text

In [315]:
text = "jl kapuk timur delta sili iii lippo cika 11 a cicau cikarang pusat" 
# text = "toko sae , tulakan-tegalombo"
# text = "cak 11 nagasari karawang barat"
# text = "raya samb gede , 299 toko bb kids"
text = "beng las listrik rah , raya sorea , cangkuang"
text = "simpang tiga kah nasu no 112 28284 bukit raya"
repair_debug(text)

4:  simpang tiga kah nasu no 112 28284 bukit raya
3:  simpang tiga kah nasu no 112 28284 bukit raya
2:  simpang tiga kah nasu no 112 28284 bukit raya


'simpang tiga kah nasu no 112 28284 bukit raya'

In [316]:
bitext = "kah nasu"
# bitext = ""
print(bitransforms[bitext])
print(bigrams[bitext])

{'kaharu nasu': 5}
20


In [317]:
unitext = "cika"
print(unitransforms[unitext])
print(unigrams[unitext])

{'cikarang': 26, 'cikande': 6, 'cikampek': 5, 'cikajang': 4, 'cikalong': 4, 'cikawao': 2, 'cikakak': 1, 'cikanteh': 1, 'cikalang': 1, 'cikadut': 1}
381


In [318]:
text = "raya samb gede"
print(tritransforms[text])
print(trigrams[text])

{'raya sambong gede': 2}
13


# Check Number of Repair with Cleaned Data

In [319]:
cdf = pd.read_csv('data/cleaned_train.csv')
cdf.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,2,setu siung 119 rt 5 1 13880 cipayung,/siung
2,3,"toko dita , kertosono",toko dita/
3,4,jl. orde baru,/jl. orde baru
4,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede


In [320]:
def repair(text) : 
    text = repair_qgram(text)
    text = repair_trigram(text)
    text = repair_bigram(text)
    text = repair_unigram(text)
    return text

### Before Repair

In [321]:
check = df["raw_address"] == cdf["raw_address"]
print("Similarity with Cleaned Data: {:.2f}%".format(100 * sum(check)/len(cdf)))

Similarity with Cleaned Data: 77.84%


### After Repair

In [322]:
df["repaired_address"] = df["raw_address"].apply(repair)
df["cleaned_address"] = cdf["raw_address"]
check = df["repaired_address"] == df["cleaned_address"]
print("Similarity with Cleaned Data: {:.2f}%".format(100 * sum(check)/len(cdf)))

Similarity with Cleaned Data: 92.58%


## Apply Transformation To Test Data

In [323]:
dft = pd.read_csv(data_dir + "processed_test.csv")

In [324]:
dft["cleaned_address"] = dft["raw_address"].apply(repair)

In [325]:
dft

Unnamed: 0,id,raw_address,cleaned_address
0,0,s. par 53 sidanegara 4 cilacap tengah,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per , baloi indah kel. lubuk baja","angg per , baloi indah kel. lubuk baja"
2,2,"asma laun , mand imog ,","asma laundry , mand imog ,"
3,3,"ud agung rej , raya nga sri wedari karanganyar","ud agung rejeki , raya nga sri wedari karanganyar"
4,4,"cut mutia , 35 baiturrahman","cut mutia , 35 baiturrahman"
...,...,...,...
49995,49995,toko mbak farid semboro semboro,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids , vete 3 cari , 16720 ciawi","vienna - tk. ridho kids , vete 3 cari , 16720 ..."
49997,49997,"mart dan roti bakar malabar , nasio ,","martabak dan roti bakar malabar , nasio ,"
49998,49998,graha indah pamulang jl. mujair raya bambu apu...,graha indah pamulang jl. mujair raya bambu apu...


In [326]:
dft[dft["raw_address"] != dft["cleaned_address"]]

Unnamed: 0,id,raw_address,cleaned_address
2,2,"asma laun , mand imog ,","asma laundry , mand imog ,"
3,3,"ud agung rej , raya nga sri wedari karanganyar","ud agung rejeki , raya nga sri wedari karanganyar"
13,13,war bakso lata kota waingapu,warung bakso lata kota waingapu
16,16,"dimas laun demak , jl tanjung ii , katonsari","dimas laundry demak , jl tanjung ii , katonsari"
30,30,"lemb bimbi belajar gane opera ahmad yani 19 ,",lembaga bimbingan belajar ganesha operation ah...
...,...,...,...
49979,49979,toko mona kayu jati tembilahan hulu,toko monalisa kayu jati tembilahan hulu
49987,49987,toko suha kap marz,toko suhanda kap marz
49994,49994,m. tau kandai dua woja,m. taufiq kandai dua woja
49996,49996,"vie - tk. ridho kids , vete 3 cari , 16720 ciawi","vienna - tk. ridho kids , vete 3 cari , 16720 ..."


In [327]:
dft.drop(columns=["raw_address"]).rename(columns={"cleaned_address" : "raw_address"}).to_csv("data/cleaned_test.csv")