In [96]:
import re
from collections import Counter 

import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')

In [97]:
# Directory that holds the input CSVs; it is also used for the output files below.
data_dir = "data/"
train_path = data_dir + "train.csv"
df = pd.read_csv(train_path)
# df = df.drop(columns=["id"])

In [98]:
# df = df[:200]
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


### Remove whitespace

In [99]:
# Collapse every run of whitespace into one space and trim both ends.
# Covers spaces, tabs, newlines, carriage returns, form feeds, etc.
def remove_multiple_whitespace(text) :
    """Return `text` with each whitespace run replaced by a single space.

    Any character matched by the regex whitespace class counts as
    whitespace, so carriage returns, form feeds and vertical tabs are
    normalised the same way as spaces, tabs and newlines. Leading and
    trailing whitespace is removed.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        The normalised string; empty if `text` was empty or all whitespace.
    """
    return re.sub(r'\s+', ' ', text).strip()

In [100]:
remove_multiple_whitespace("asfua asfasbiuf \n \t  asfsa adfgas \t d ")

'asfua asfasbiuf asfsa adfgas d'

In [101]:
# Normalise whitespace in both the raw address and its label.
for column in ("raw_address", "POI/street") :
    df[column] = df[column].apply(remove_multiple_whitespace)

### Only use rows that have at least a POI or a street

In [102]:
# Keep only rows whose label is not the bare separator "/", i.e. rows that
# name a POI, a street, or both.
# .copy() makes the result an independent frame: later cells add columns to
# it, and assigning into a filtered slice raises SettingWithCopyWarning
# (currently hidden by the global warnings filter).
df = df[df["POI/street"] != "/"].copy()
# df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede
...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [103]:
def getPOI(s) :
    """Return the text before the first "/" of a "POI/street" label.

    If `s` contains no "/", the whole string is returned.
    """
    poi, _, _ = s.partition("/")
    return poi

def getStreet(s) :
    """Return the text between the first and second "/" of a "POI/street" label.

    For a label with a single "/" this is everything after it.
    Raises IndexError when `s` contains no "/".
    """
    # Splitting at most twice is enough: only the second piece is needed.
    pieces = s.split("/", 2)
    return pieces[1]

def split(s) :
    """Tokenise `s` on single spaces after normalising its whitespace.

    Note that an empty (or all-whitespace) string yields [''] rather than
    an empty list, because str.split(" ") on "" returns [''].
    """
    normalized = remove_multiple_whitespace(s)
    tokens = normalized.split(" ")
    return tokens

In [104]:
df["POI"] = df["POI/street"].apply(getPOI)
df["Street"] = df["POI/street"].apply(getStreet) 

In [105]:
# Empty POI
len(df[df['POI'] == '']) 

146516

In [106]:
# Empty Street
len(df[df['Street'] == '']) 

38150

## Number of Dot in POI and Street

In [107]:
x = df["POI"].apply(lambda x : "." in x)
sum(x)

6464

In [108]:
x = df["Street"].apply(lambda x : "." in x)
sum(x)

29080

In [109]:
# Tokenise the POI, the street and the raw address into lists of words.
for target, source in (
    ("SplitPOI", "POI"),
    ("SplitStreet", "Street"),
    ("SplitRaw", "raw_address"),
) :
    df[target] = df[source].apply(split)

In [110]:
dot_tokens = []

In [111]:
# Collect every POI/street token that contains a dot.
# The list is rebuilt from scratch (instead of appended to) so that
# re-running this cell does not add the same tokens twice and inflate
# the counts computed below.
dot_tokens = [
    token
    for tokens in df["SplitPOI"].tolist() + df["SplitStreet"].tolist()
    for token in tokens
    if "." in token
]

In [112]:
count = Counter(dot_tokens)

In [113]:
count.most_common()[:20]

[('gg.', 10709),
 ('jl.', 9731),
 ('h.', 2013),
 ('dr.', 959),
 ('pt.', 723),
 ('kh.', 650),
 ('a.', 602),
 ('r.', 588),
 ('s.', 575),
 ('kel.', 547),
 ('jln.', 507),
 ('m.', 492),
 ('ud.', 456),
 ('ir.', 448),
 ('p.', 384),
 ('jend.', 291),
 ('cv.', 256),
 ('prof.', 221),
 ('k.', 221),
 ('kp.', 205)]

In [114]:
unique_dot_tokens = set(dot_tokens)

In [115]:
## Replace the dots in a token with spaces, unless that exact token is one
## of the dotted tokens that occur in the POI/street labels.
def remove_dot(text, keep_tokens=None) :
    """Strip dots from every space-separated token that is not whitelisted.

    Parameters
    ----------
    text : str
        Input string; it is split on single spaces.
    keep_tokens : collection of str, optional
        Tokens that must be left untouched. When omitted, the module-level
        `unique_dot_tokens` set is used, so existing one-argument calls
        behave as before.

    Returns
    -------
    str
        The tokens re-joined with single spaces. In a non-whitelisted token
        every "." becomes a space and the token is then trimmed, so a dot
        inside a token ("a.b") turns into a space ("a b").
    """
    if keep_tokens is None :
        # Looked up at call time so the set built earlier in the notebook is used.
        keep_tokens = unique_dot_tokens
    cleaned = []
    for word in text.split(" ") :
        if "." in word and word not in keep_tokens :
            word = word.replace(".", " ").strip()
        cleaned.append(word)
    return " ".join(cleaned)

In [116]:
words = ['batu', 'flower', 'garden.', 'ds.', 'dresel.', 'oro-oro', 'ombo.', 'batu.', 'kota', 'batu']
text = " ".join(words)
remove_dot(text)

'batu flower garden ds. dresel oro-oro ombo batu kota batu'

In [117]:
# Dots are stripped from the raw address only; the label column is left as is.
df["raw_address"] = df["raw_address"].apply(remove_dot)

# Re-derive the helper columns, in the same order as before:
# first the POI/street halves, then the token lists.
for target, source, transform in (
    ("POI", "POI/street", getPOI),
    ("Street", "POI/street", getStreet),
    ("SplitRaw", "raw_address", split),
    ("SplitPOI", "POI", split),
    ("SplitStreet", "Street", split),
) :
    df[target] = df[source].apply(transform)

df.head()

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita, kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]"
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede,, 299, toko, bb, kids]"


## Number of Comma in POI and Street

In [118]:
x = df["POI"].apply(lambda x : "," in x)
sum(x)

918

In [119]:
x = df["Street"].apply(lambda x : "," in x)
sum(x)

1176

In [120]:
comma_tokens = []

In [121]:
# Collect every POI/street token that contains a comma.
# The list is rebuilt from scratch (instead of appended to) so that
# re-running this cell does not add the same tokens twice and inflate
# the counts computed below.
comma_tokens = [
    token
    for tokens in df["SplitPOI"].tolist() + df["SplitStreet"].tolist()
    for token in tokens
    if "," in token
]

In [122]:
count = Counter(comma_tokens)

In [123]:
count.most_common()[:15]

[('ii,', 107),
 ('raya,', 87),
 ('i,', 66),
 ('iii,', 56),
 ('2,', 46),
 ('baru,', 40),
 ('iv,', 36),
 ('1,', 35),
 ('v,', 35),
 ('jaya,', 30),
 ('4,', 27),
 ('sh.,', 26),
 ('jl,', 26),
 ('3,', 23),
 ('lor,', 22)]

In [124]:
# Commas glued to label tokens are rare and irregular (see the counts above),
# so rather than deleting them each comma is padded with spaces: after the
# whitespace is collapsed it becomes a standalone "," token.
def remove_comma(text) :
    """Return `text` with every comma separated from its neighbours by one space.

    Despite the name, commas are kept; only the spacing around them changes.
    """
    padded = text.replace(",", " , ")
    collapsed = remove_multiple_whitespace(padded)
    return collapsed.strip()

In [125]:
# Isolate commas in both the raw address and the label.
for column in ("raw_address", "POI/street") :
    df[column] = df[column].apply(remove_comma)

# Re-derive the POI/street halves from the updated label.
for target, extract in (("POI", getPOI), ("Street", getStreet)) :
    df[target] = df["POI/street"].apply(extract)

# Re-tokenise everything.
for target, source in (
    ("SplitRaw", "raw_address"),
    ("SplitPOI", "POI"),
    ("SplitStreet", "Street"),
) :
    df[target] = df[source].apply(split)

df.head()

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita, ,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]"
5,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede, ,, 299, toko, bb, kids]"


## Number of ( in POI and Street

In [126]:
x = df["POI"].apply(lambda x : "(" in x)
sum(x)

548

In [127]:
x = df["Street"].apply(lambda x : "(" in x)
sum(x)

9

In [128]:
lb_tokens = []

In [129]:
# Collect every POI/street token that contains an opening bracket.
# The list is rebuilt from scratch (instead of appended to) so that
# re-running this cell does not add the same tokens twice and inflate
# the counts computed below.
lb_tokens = [
    token
    for tokens in df["SplitPOI"].tolist() + df["SplitStreet"].tolist()
    for token in tokens
    if "(" in token
]

In [130]:
count = Counter(lb_tokens)

In [131]:
count.most_common()[:15]

[('(', 30),
 ('(ra)', 24),
 ('(sdn)', 20),
 ('(mt)', 18),
 ('(persero)', 14),
 ('(mis)', 13),
 ('(tpa', 12),
 ('(tpq', 11),
 ('(kua)', 9),
 ('(tk)', 8),
 ('(alfamart)', 7),
 ('(tpq)', 7),
 ('(smk)', 7),
 ('(depan', 6),
 ('(lkp)', 6)]

In [132]:
# Remove "(" or ")" from a token that contains only one of the two;
# a token that contains both (e.g. "(persero)") is left untouched.
def remove_bracket(text) :
    """Drop unbalanced brackets token by token.

    `text` is split on single spaces. A token containing "(" but no ")"
    loses its "(" characters, and vice versa. Tokens that end up empty
    (for example a standalone "(") are discarded so that the result never
    contains consecutive spaces; previously such tokens left a double
    space behind in the returned string.

    Parameters
    ----------
    text : str
        Space-separated text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, trimmed at both ends.
    """
    kept = []
    for word in text.split(" ") :
        if "(" in word and ")" not in word :
            word = word.replace("(", "")
        elif ")" in word and "(" not in word :
            word = word.replace(")", "")
        # Skip tokens that were nothing but a lone bracket (or already empty).
        if word :
            kept.append(word)
    return " ".join(kept).strip()

In [133]:
# Strip unbalanced brackets from both the raw address and the label.
for column in ("raw_address", "POI/street") :
    df[column] = df[column].apply(remove_bracket)

# Re-derive the POI/street halves from the updated label.
for target, extract in (("POI", getPOI), ("Street", getStreet)) :
    df[target] = df["POI/street"].apply(extract)

# Re-tokenise everything.
for target, source in (
    ("SplitRaw", "raw_address"),
    ("SplitPOI", "POI"),
    ("SplitStreet", "Street"),
) :
    df[target] = df[source].apply(split)

df.head()

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita, ,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]"
5,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede, ,, 299, toko, bb, kids]"


In [134]:
df[["raw_address", "POI/street"]].to_csv(data_dir + "processed_train.csv", index=False)

In [135]:
def f(words, sentence): 
    """List the entries of `words` that do not occur in `sentence`.

    Occurrence is a plain substring test on `sentence`, not a token match.

    Returns
    -------
    list of str or bool
        The missing words in their original order, or False when every
        word was found (including when `words` is empty).
    """
    missing = [word for word in words if sentence.find(word) < 0]
    return missing if missing else False

In [136]:
# For each row, record the label tokens that cannot be found in the raw
# address as substrings (False when all of them are present).
df['TransformedPOI'] = df.apply(
    lambda row: f(row["SplitPOI"], row["raw_address"]), axis=1
)
df['TransformedStreet'] = df.apply(
    lambda row: f(row["SplitStreet"], row["raw_address"]), axis=1
)

In [137]:
# dp = df[df["TransformedPOI"] != False]
# dp.head()

In [138]:
# ds = df[df["TransformedStreet"] != False]
# ds.head()

words = ['ds.', 'sukowiyono', 'dsn.', 'karangsono', 'rt', '01', 'rw', '03', 'utara', 'jembatan', 'simomasuk', 'gang', 'jembatan']

POI = ['jembatan', 'simo']

The cell below implements a corrector for misspelled words.

In [149]:
# Cases where a match started but the address ran out of tokens before the
# whole label was matched; inspected in the cells below.
odds = []

def repair_words(words, repaireds) :
    """Replace the first run of truncated tokens in `words` with `repaireds`.

    A run matches when, starting at some position, each token of `words` is
    a prefix of (or equal to) the corresponding token of `repaireds`. The
    first run found is replaced with the full `repaireds` tokens.

    The input list is never modified; a new list is always returned.
    Previously the replacement was done in place, which silently rewrote
    the caller's list (e.g. the `SplitRaw` column) as well.

    Parameters
    ----------
    words : list of str
        Tokens of the raw address.
    repaireds : list of str
        Tokens of the label (POI or street). An empty list means there is
        nothing to repair.

    Returns
    -------
    list of str
        A copy of `words`, with the matched run replaced when one is found.

    Side effects
    ------------
    When a run starts matching but `words` ends before `repaireds` is
    exhausted, the pair is appended to the module-level `odds` list and the
    tokens are returned unchanged.
    """
    # Work on a copy so the caller's list is left untouched.
    words = list(words)
    if not repaireds :
        return words

    i = 0
    while i < len(words) :
        # The first label token must start with the current address token.
        if repaireds[0].startswith(words[i]) :
            j = 1
            while j < len(repaireds) :
                if i + j >= len(words) :
                    # Ran off the end of the address mid-match. No later start
                    # position can fit the label either, so give up.
                    odds.append({"words": words, "repaireds": repaireds})
                    return words
                if repaireds[j].startswith(words[i + j]) :
                    j += 1
                else :
                    break
            if j == len(repaireds) :
                # Every label token matched: swap in the full tokens.
                words[i:i + j] = repaireds
                return words
        i += 1
    return words

In [150]:
# First pass: expand truncated tokens using the POI tokens.
df["clean_address"] = df.apply(
    lambda row : repair_words(row["SplitRaw"], row["SplitPOI"]), axis=1
)
# Second pass: run on the output of the first pass, this time with the street tokens.
df["clean_address"] = df.apply(
    lambda row : repair_words(row["clean_address"], row["SplitStreet"]), axis=1
)

In [151]:
### Number of odd case
len(odds)

4

In [152]:
# Show each recorded odd case as the address tokens followed by the label tokens.
for case in odds :
    joined_words = " ".join(case["words"])
    joined_repaireds = " ".join(case["repaireds"])
    print("Words\t\t: ", joined_words)
    print("Repaireds\t: ", joined_repaireds)
    print()

Words		:  pelab tadett ,
Repaireds	:  pelabuhan tadette , , ,

Words		:  cirebon waterland ade irma suryani. jl. yos sudarso no 1 , lemahwungkuk , cirebon
Repaireds	:  cirebon waterland ade irma suryani

Words		:  pamulang barat pamu squa lt. dasar siliw 3-6 15417 pamulang
Repaireds	:  pamulang square , lt. dasar

Words		:  asemr gg. i asem rowo asemrowo
Repaireds	:  asemrowo , gg. i



In [153]:
df

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw,TransformedPOI,TransformedStreet,clean_address
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci...",False,False,"[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]",False,False,"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita, ,, kertosono]",False,False,"[toko, dita, ,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]",False,False,"[jl., orde, baru]"
5,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede, ,, 299, toko, bb, kids]",False,False,"[raya, samb, gede, ,, 299, toko, bb, kids]"
...,...,...,...,...,...,...,...,...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,[],"[kakap, raya]","[karawaci, baru, kakap, raya, 156, rt, 1, rw, ...",False,False,"[karawaci, baru, kakap, raya, 156, rt, 1, rw, ..."
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,[],"[jend, ahmad, yani]","[jend, ahmad, yani, 331, kertasari, ciamis]",False,False,"[jend, ahmad, yani, 331, kertasari, ciamis]"
299996,299996,"raya cila kko , cilandak timur kel.",/raya cila kko,,raya cila kko,[],"[raya, cila, kko]","[raya, cila, kko, ,, cilandak, timur, kel.]",False,False,"[raya, cila, kko, ,, cilandak, timur, kel.]"
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,,"[taman, asri]",[],"[jalan, cipadu, jaya, taman, asri, gang, bijak...",False,False,"[jalan, cipadu, jaya, taman, asri, gang, bijak..."


In [154]:
def array_to_string(arr) :
    """Join the strings in `arr` into one string separated by single spaces."""
    separator = " "
    return separator.join(arr)

In [155]:
df["clean_address"] = df["clean_address"].apply(array_to_string)

In [156]:
df["clean_address"] = df["clean_address"].replace("", np.nan)
# df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw,TransformedPOI,TransformedStreet,clean_address
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci...",False,False,jl kapuk timur delta sili iii lippo cika 11 a ...
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]",False,False,setu siung 119 rt 5 1 13880 cipayung
3,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita, ,, kertosono]",False,False,"toko dita , kertosono"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]",False,False,jl. orde baru
5,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede, ,, 299, toko, bb, kids]",False,False,"raya samb gede , 299 toko bb kids"
...,...,...,...,...,...,...,...,...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,[],"[kakap, raya]","[karawaci, baru, kakap, raya, 156, rt, 1, rw, ...",False,False,karawaci baru kakap raya 156 rt 1 rw 3 karawaci
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,[],"[jend, ahmad, yani]","[jend, ahmad, yani, 331, kertasari, ciamis]",False,False,jend ahmad yani 331 kertasari ciamis
299996,299996,"raya cila kko , cilandak timur kel.",/raya cila kko,,raya cila kko,[],"[raya, cila, kko]","[raya, cila, kko, ,, cilandak, timur, kel.]",False,False,"raya cila kko , cilandak timur kel."
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,,"[taman, asri]",[],"[jalan, cipadu, jaya, taman, asri, gang, bijak...",False,False,jalan cipadu jaya taman asri gang bijaksana 3 ...


In [157]:
# Keep only the repaired address and its label, exposing the repaired
# address under the original "raw_address" column name.
# rename() without inplace returns a new frame, so no in-place change is
# made on a column slice of `df` (which triggers SettingWithCopyWarning).
cleaned_df = df[["clean_address", "POI/street"]].rename(
    columns={"clean_address": "raw_address"}
)
cleaned_df

Unnamed: 0,raw_address,POI/street
0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,"toko dita , kertosono",toko dita/
4,jl. orde baru,/jl. orde baru
5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede
...,...,...
299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya
299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,"raya cila kko , cilandak timur kel.",/raya cila kko
299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [158]:
cleaned_df.to_csv(data_dir + "cleaned_train.csv", index=False)

## Create the whole text processing pipeline

In [53]:
dft = pd.read_csv(data_dir + "test.csv")

In [54]:
# Apply the same cleaning steps, in the same order, to the test addresses.
# NOTE: remove_dot reads `unique_dot_tokens`, which is built from the
# training labels, so the training cells above must have been run first.
for step in (remove_multiple_whitespace, remove_dot, remove_comma, remove_bracket) :
    dft["raw_address"] = dft["raw_address"].apply(step)

In [55]:
dft

Unnamed: 0,id,raw_address
0,0,s. par 53 sidanegara 4 cilacap tengah
1,1,"angg per , baloi indah kel. lubuk baja"
2,2,"asma laun , mand imog ,"
3,3,"ud agung rej , raya nga sri wedari karanganyar"
4,4,"cut mutia , 35 baiturrahman"
...,...,...
49995,49995,toko mbak farid semboro semboro
49996,49996,"vie - tk. ridho kids , vete 3 cari , 16720 ciawi"
49997,49997,"mart dan roti bakar malabar , nasio ,"
49998,49998,graha indah pamulang jl. mujair raya bambu apu...


In [56]:
dft.to_csv(data_dir + "processed_test.csv", index=False)