In [1]:
import re
from collections import Counter 

import pandas as pd
import numpy as np


import warnings
warnings.filterwarnings('ignore')

In [2]:
data_dir = "data/"
df = pd.read_csv(data_dir + "train.csv")
dft = pd.read_csv(data_dir + "test.csv")
# df = df.drop(columns=["id"])

In [3]:
# df = df[:200]
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
...,...,...,...
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299997,299997,tanjung gusta jl. yaya 2 no 17,/
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


### Remove whitespace

In [4]:
# remove multiple whitespace
# cover tabs, newlines, etc
def remove_multiple_whitespace(text) :
    return re.sub(' +', ' ', text.replace('\n', ' ').replace('\t', ' ')).strip()

In [5]:
remove_multiple_whitespace("asfua asfasbiuf \n \t  asfsa adfgas \t d ")

'asfua asfasbiuf asfsa adfgas d'

In [6]:
df["raw_address"] = df["raw_address"].apply(remove_multiple_whitespace)
df["POI/street"] = df["POI/street"].apply(remove_multiple_whitespace)

### Only use data with having at least POI or street

In [7]:
df = df[df["POI/street"] != "/"]
# df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede
...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [8]:
def getPOI(s) :
    return s.split("/")[0]

def getStreet(s) :
    return s.split("/")[1]

def split(s) :
    s = remove_multiple_whitespace(s)
    return s.split(" ")

In [9]:
df["POI"] = df["POI/street"].apply(getPOI)
df["Street"] = df["POI/street"].apply(getStreet) 

In [10]:
# Empty POI
len(df[df['POI'] == '']) 

146516

In [11]:
# Empty Street
len(df[df['Street'] == '']) 

38150

## Number of Dot in POI and Street

In [12]:
x = df["POI"].apply(lambda x : "." in x)
sum(x)

6464

In [13]:
x = df["Street"].apply(lambda x : "." in x)
sum(x)

29080

In [14]:
df["SplitPOI"] = df["POI"].apply(split)
df["SplitStreet"] = df["Street"].apply(split)
df["SplitRaw"] = df["raw_address"].apply(split)

In [15]:
dot_tokens = []

In [16]:
for tokens in df["SplitPOI"].tolist() + df["SplitStreet"].tolist() :
    for token in tokens :
        if "." in token :
            dot_tokens.append(token)

In [17]:
count = Counter(dot_tokens)

In [18]:
count.most_common()[:20]

[('gg.', 10709),
 ('jl.', 9731),
 ('h.', 2013),
 ('dr.', 959),
 ('pt.', 723),
 ('kh.', 650),
 ('a.', 602),
 ('r.', 588),
 ('s.', 575),
 ('kel.', 547),
 ('jln.', 507),
 ('m.', 492),
 ('ud.', 456),
 ('ir.', 448),
 ('p.', 384),
 ('jend.', 291),
 ('cv.', 256),
 ('prof.', 221),
 ('k.', 221),
 ('kp.', 205)]

In [19]:
unique_dot_tokens = set(dot_tokens)

In [20]:
## remove dot if the token doesn't has 
def remove_dot(text) :
    words = text.split(" ")
    for i in range(len(words)) :
        if "." in words[i] and words[i] not in unique_dot_tokens :
            words[i] = words[i].replace(".", " ").strip()
    return " ".join(words)

In [21]:
words = ['batu', 'flower', 'garden.', 'ds.', 'dresel.', 'oro-oro', 'ombo.', 'batu.', 'kota', 'batu']
text = " ".join(words)
remove_dot(text)

'batu flower garden ds. dresel oro-oro ombo batu kota batu'

In [22]:
df["raw_address"] = df["raw_address"].apply(remove_dot)

df["POI"] = df["POI/street"].apply(getPOI)
df["Street"] = df["POI/street"].apply(getStreet) 

df["SplitRaw"] = df["raw_address"].apply(split)
df["SplitPOI"] = df["POI"].apply(split)
df["SplitStreet"] = df["Street"].apply(split)
df.head()

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita, kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]"
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede,, 299, toko, bb, kids]"


## Number of Comma in POI and Street

In [23]:
x = df["POI"].apply(lambda x : "," in x)
sum(x)

918

In [24]:
x = df["Street"].apply(lambda x : "," in x)
sum(x)

1176

The number is not significant compared to 300k, thus we can drop it to make the training data cleaner

In [25]:
comma_tokens = []

In [26]:
for tokens in df["SplitPOI"].tolist() + df["SplitStreet"].tolist() :
    for token in tokens :
        if "," in token :
            comma_tokens.append(token)

In [27]:
count = Counter(comma_tokens)

In [28]:
count.most_common()[:15]

[('ii,', 107),
 ('raya,', 87),
 ('i,', 66),
 ('iii,', 56),
 ('2,', 46),
 ('baru,', 40),
 ('iv,', 36),
 ('1,', 35),
 ('v,', 35),
 ('jaya,', 30),
 ('4,', 27),
 ('sh.,', 26),
 ('jl,', 26),
 ('3,', 23),
 ('lor,', 22)]

drop train data if the POI or street contain comma ","

In [29]:
def drop_column_contain_comma(df, column_name) :
    return df[df[column_name].apply(lambda x : "," not in x)]
    
df = drop_column_contain_comma(df, "POI")
df = drop_column_contain_comma(df, "Street")
df

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita, kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]"
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede,, 299, toko, bb, kids]"
...,...,...,...,...,...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,[],"[kakap, raya]","[karawaci, baru, kakap, raya, 156, rt, 1, rw, ..."
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,[],"[jend, ahmad, yani]","[jend, ahmad, yani, 331, kertasari, ciamis]"
299996,299996,"raya cila kko, cilandak timur kel.",/raya cila kko,,raya cila kko,[],"[raya, cila, kko]","[raya, cila, kko,, cilandak, timur, kel.]"
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,,"[taman, asri]",[],"[jalan, cipadu, jaya, taman, asri, gang, bijak..."


In [30]:
# since the number is small, and the occurance is odd
# we can restructure space in raw_address
def restrusture_comma(text) :
    return remove_multiple_whitespace(text.replace(",", " , ")).strip()

In [31]:
df["raw_address"] = df["raw_address"].apply(restrusture_comma)
df["POI/street"] = df["POI/street"].apply(restrusture_comma)

df["POI"] = df["POI/street"].apply(getPOI)
df["Street"] = df["POI/street"].apply(getStreet) 

df["SplitRaw"] = df["raw_address"].apply(split)
df["SplitPOI"] = df["POI"].apply(split)
df["SplitStreet"] = df["Street"].apply(split)

df.head()

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita, ,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]"
5,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede, ,, 299, toko, bb, kids]"


## Number of ( in POI and Street

In [32]:
x = df["POI"].apply(lambda x : "(" in x)
sum(x)

519

In [33]:
df[df["POI"].apply(lambda x : "(" in x)]

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
898,898,"duy , kary novo hotel batam (kop novo) , lubuk...",karyawan novotel hotel batam (kopkar novo)/duy,karyawan novotel hotel batam (kopkar novo),duy,"[karyawan, novotel, hotel, batam, (kopkar, novo)]",[duy],"[duy, ,, kary, novo, hotel, batam, (kop, novo)..."
967,967,eka sari lor transport. pt - agen andongsari -...,eka sari lorena transport. pt - agen andongsar...,eka sari lorena transport. pt - agen andongsar...,dharmaw,"[eka, sari, lorena, transport., pt, -, agen, a...",[dharmaw],"[eka, sari, lor, transport., pt, -, agen, ando..."
4382,4382,"bank rakyat indonesia (persero) tbk. pt , haur...",bank rakyat indonesia (persero) tbk. pt/,bank rakyat indonesia (persero) tbk. pt,,"[bank, rakyat, indonesia, (persero), tbk., pt]",[],"[bank, rakyat, indonesia, (persero), tbk., pt,..."
4547,4547,"askes. pt (persero) - sema , sul agung gajah m...",askes. pt (persero) - semarang/sul agung,askes. pt (persero) - semarang,sul agung,"[askes., pt, (persero), -, semarang]","[sul, agung]","[askes., pt, (persero), -, sema, ,, sul, agung..."
7850,7850,"dr raji no 107 batik artha (mbak muji) , sri w...",batik artha (mbak muji)/dr raji,batik artha (mbak muji),dr raji,"[batik, artha, (mbak, muji)]","[dr, raji]","[dr, raji, no, 107, batik, artha, (mbak, muji)..."
...,...,...,...,...,...,...,...,...
296842,296842,"kebun sayur organik (kso) dusun wuni , pasuruh...",kebun sayur organik (kso)/,kebun sayur organik (kso),,"[kebun, sayur, organik, (kso)]",[],"[kebun, sayur, organik, (kso), dusun, wuni, ,,..."
297334,297334,"pao telo (bak telo) , borob mojolangu lowokwaru",pao telo (bakpao telo)/borob,pao telo (bakpao telo),borob,"[pao, telo, (bakpao, telo)]",[borob],"[pao, telo, (bak, telo), ,, borob, mojolangu, ..."
297514,297514,raudhatul athfa (ra) nasiatul mubta 1,raudhatul athfa (ra) nasiatul mubtadiin 1/,raudhatul athfa (ra) nasiatul mubtadiin 1,,"[raudhatul, athfa, (ra), nasiatul, mubtadiin, 1]",[],"[raudhatul, athfa, (ra), nasiatul, mubta, 1]"
298010,298010,"lya & nurul (lyn) , m h tham boule , tanah abang",lya & nurul (lyn)/m h tham boule,lya & nurul (lyn),m h tham boule,"[lya, &, nurul, (lyn)]","[m, h, tham, boule]","[lya, &, nurul, (lyn), ,, m, h, tham, boule, ,..."


In [34]:
x = df["Street"].apply(lambda x : "(" in x)
sum(x)

9

In [35]:
df[df["Street"].apply(lambda x : "(" in x)]

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
6230,6230,"indo alam (da pemb , tegalsawah karawang timur",/indo alam (da pemb,,indo alam (da pemb,[],"[indo, alam, (da, pemb]","[indo, alam, (da, pemb, ,, tegalsawah, karawan..."
34918,34918,(seb jemb sun pelus kedungmalang sumbang,/(seb jemb sun pelus,,(seb jemb sun pelus,[],"[(seb, jemb, sun, pelus]","[(seb, jemb, sun, pelus, kedungmalang, sumbang]"
41168,41168,a.r.h (kom beji beji,/a.r.h (kom,,a.r.h (kom,[],"[a.r.h, (kom]","[a.r.h, (kom, beji, beji]"
53618,53618,hidup baru 3 (haji ahm gandaria utara kebayora...,/hidup baru 3 (haji ahm,,hidup baru 3 (haji ahm,[],"[hidup, baru, 3, (haji, ahm]","[hidup, baru, 3, (haji, ahm, gandaria, utara, ..."
176537,176537,( setia jaya 7 jelambar baru rt 1 6 grogol pet...,/( setia jaya,,( setia jaya,[],"[(, setia, jaya]","[(, setia, jaya, 7, jelambar, baru, rt, 1, 6, ..."
213027,213027,"rusunawa pesakih (rptra) , jl. daan mogot(desa...",rusunawa pesakih/jl. daan mogot(desa semanan),rusunawa pesakih,jl. daan mogot(desa semanan),"[rusunawa, pesakih]","[jl., daan, mogot(desa, semanan)]","[rusunawa, pesakih, (rptra), ,, jl., daan, mog..."
225190,225190,(de sd haur gg. ii lebakgede coblong,/(de sd haur gg. ii,,(de sd haur gg. ii,[],"[(de, sd, haur, gg., ii]","[(de, sd, haur, gg., ii, lebakgede, coblong]"
230539,230539,(seb spbu motor kelur sumbe sumber rejo balikp...,/(seb spbu motor kelur sumbe,,(seb spbu motor kelur sumbe,[],"[(seb, spbu, motor, kelur, sumbe]","[(seb, spbu, motor, kelur, sumbe, sumber, rejo..."
273268,273268,(j-2) 16 dadap kosambi,/(j-2),,(j-2),[],[(j-2)],"[(j-2), 16, dadap, kosambi]"


drop if the POI/street contain left bracket ( without right bracket ) or vice versa

In [36]:
df = df[df["POI/street"].apply(lambda x : not( ( "(" in x and ")" not in x ) or ( ")" in x and "(" not in x ) ) )]

In [37]:
df[df["POI"].apply(lambda x : "(" in x)]

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
898,898,"duy , kary novo hotel batam (kop novo) , lubuk...",karyawan novotel hotel batam (kopkar novo)/duy,karyawan novotel hotel batam (kopkar novo),duy,"[karyawan, novotel, hotel, batam, (kopkar, novo)]",[duy],"[duy, ,, kary, novo, hotel, batam, (kop, novo)..."
967,967,eka sari lor transport. pt - agen andongsari -...,eka sari lorena transport. pt - agen andongsar...,eka sari lorena transport. pt - agen andongsar...,dharmaw,"[eka, sari, lorena, transport., pt, -, agen, a...",[dharmaw],"[eka, sari, lor, transport., pt, -, agen, ando..."
4382,4382,"bank rakyat indonesia (persero) tbk. pt , haur...",bank rakyat indonesia (persero) tbk. pt/,bank rakyat indonesia (persero) tbk. pt,,"[bank, rakyat, indonesia, (persero), tbk., pt]",[],"[bank, rakyat, indonesia, (persero), tbk., pt,..."
4547,4547,"askes. pt (persero) - sema , sul agung gajah m...",askes. pt (persero) - semarang/sul agung,askes. pt (persero) - semarang,sul agung,"[askes., pt, (persero), -, semarang]","[sul, agung]","[askes., pt, (persero), -, sema, ,, sul, agung..."
7850,7850,"dr raji no 107 batik artha (mbak muji) , sri w...",batik artha (mbak muji)/dr raji,batik artha (mbak muji),dr raji,"[batik, artha, (mbak, muji)]","[dr, raji]","[dr, raji, no, 107, batik, artha, (mbak, muji)..."
...,...,...,...,...,...,...,...,...
296842,296842,"kebun sayur organik (kso) dusun wuni , pasuruh...",kebun sayur organik (kso)/,kebun sayur organik (kso),,"[kebun, sayur, organik, (kso)]",[],"[kebun, sayur, organik, (kso), dusun, wuni, ,,..."
297334,297334,"pao telo (bak telo) , borob mojolangu lowokwaru",pao telo (bakpao telo)/borob,pao telo (bakpao telo),borob,"[pao, telo, (bakpao, telo)]",[borob],"[pao, telo, (bak, telo), ,, borob, mojolangu, ..."
297514,297514,raudhatul athfa (ra) nasiatul mubta 1,raudhatul athfa (ra) nasiatul mubtadiin 1/,raudhatul athfa (ra) nasiatul mubtadiin 1,,"[raudhatul, athfa, (ra), nasiatul, mubtadiin, 1]",[],"[raudhatul, athfa, (ra), nasiatul, mubta, 1]"
298010,298010,"lya & nurul (lyn) , m h tham boule , tanah abang",lya & nurul (lyn)/m h tham boule,lya & nurul (lyn),m h tham boule,"[lya, &, nurul, (lyn)]","[m, h, tham, boule]","[lya, &, nurul, (lyn), ,, m, h, tham, boule, ,..."


In [38]:
df[df["Street"].apply(lambda x : "(" in x)]

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
213027,213027,"rusunawa pesakih (rptra) , jl. daan mogot(desa...",rusunawa pesakih/jl. daan mogot(desa semanan),rusunawa pesakih,jl. daan mogot(desa semanan),"[rusunawa, pesakih]","[jl., daan, mogot(desa, semanan)]","[rusunawa, pesakih, (rptra), ,, jl., daan, mog..."
273268,273268,(j-2) 16 dadap kosambi,/(j-2),,(j-2),[],[(j-2)],"[(j-2), 16, dadap, kosambi]"


In [39]:
# check the frequency on test data
x = dft["raw_address"].apply(lambda x : "(" in x)
sum(x)

1133

In [40]:
dft[dft["raw_address"].apply(lambda x : "(" in x)]

Unnamed: 0,id,raw_address
113,113,sd negeri 3 seunuddon 24393 seunuddon (seunudon)
143,143,"sisun kel., 22736 angkola barat (padang sidemp..."
161,161,marga 13 buahbatu (margacinta)
224,224,jln karya baru no 41 b rt 12 rw 03 kel srengse...
242,242,( konter data cell ) komplek puri citra blok a...
...,...,...
49706,49706,"buana asri iv, 15 pinang pinang (penang)"
49854,49854,pinang buana asri 12 15144 pinang (penang)
49915,49915,"kp. tanah ungkuk, rt.002 003, ds. sarimukti (p..."
49926,49926,dk. wijaya rt 1 rw 1 (selepan lor lapangan kop...


In [41]:
lb_tokens = []

In [42]:
for tokens in df["SplitPOI"].tolist() + df["SplitStreet"].tolist() :
    for token in tokens :
        if "(" in token :
            lb_tokens.append(token)

In [43]:
count = Counter(lb_tokens)

In [44]:
count.most_common()[:15]

[('(', 26),
 ('(ra)', 24),
 ('(sdn)', 20),
 ('(mt)', 18),
 ('(mis)', 13),
 ('(persero)', 12),
 ('(tpa', 12),
 ('(tpq', 11),
 ('(kua)', 9),
 ('(tk)', 8),
 ('(alfamart)', 7),
 ('(tpq)', 7),
 ('(smk)', 7),
 ('(lkp)', 6),
 ('(mission', 4)]

In [45]:
# remove "(" and ")" if it's not in one word
def remove_bracket(text) :
    words = text.split(" ")
    for i in range(len(words)) :
        if "(" in words[i] and  ")" not in words[i] :
            words[i] = words[i].replace("(", "")
        elif ")" in words[i] and  "(" not in words[i] :
            words[i] = words[i].replace(")", "")
    return " ".join(words).strip()

In [46]:
df["raw_address"] = df["raw_address"].apply(remove_bracket)
df["POI/street"] = df["POI/street"].apply(remove_bracket)

df["POI"] = df["POI/street"].apply(getPOI)
df["Street"] = df["POI/street"].apply(getStreet) 

df["SplitRaw"] = df["raw_address"].apply(split)
df["SplitPOI"] = df["POI"].apply(split)
df["SplitStreet"] = df["Street"].apply(split)

df

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci..."
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]"
3,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita, ,, kertosono]"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]"
5,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede, ,, 299, toko, bb, kids]"
...,...,...,...,...,...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,[],"[kakap, raya]","[karawaci, baru, kakap, raya, 156, rt, 1, rw, ..."
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,[],"[jend, ahmad, yani]","[jend, ahmad, yani, 331, kertasari, ciamis]"
299996,299996,"raya cila kko , cilandak timur kel.",/raya cila kko,,raya cila kko,[],"[raya, cila, kko]","[raya, cila, kko, ,, cilandak, timur, kel.]"
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,,"[taman, asri]",[],"[jalan, cipadu, jaya, taman, asri, gang, bijak..."


In [47]:
def f(words, sentence): 
    res = []
    for word in words :
        if sentence.find(word) == -1 :
            res.append(word)
    if len(res) == 0 :
        return False
    return res

In [48]:
df['TransformedPOI'] = df.apply(lambda x: f(x.SplitPOI, x.raw_address), axis=1)
df['TransformedStreet'] = df.apply(lambda x: f(x.SplitStreet, x.raw_address), axis=1)

In [49]:
# dp = df[df["TransformedPOI"] != False]
# dp.head()

In [50]:
# ds = df[df["TransformedStreet"] != False]
# ds.head()

words = ['ds.', 'sukowiyono', 'dsn.', 'karangsono', 'rt', '01', 'rw', '03', 'utara', 'jembatan', 'simomasuk', 'gang', 'jembatan']

POI = ['jembatan', 'simo']

there is a corrector for misspell word

In [51]:
odds = []

def repair_words(words, repaireds) :
    i = 0
    while i < len(words) :
        curr = words[i]
        if curr == repaireds[0][:len(curr)] :
            j = 1
            while j < len(repaireds) :
                if i + j >= len(words) :
                    odds.append({"words": words, "repaireds": repaireds})
#                     print(i)
#                     print(j)
#                     print(words)
#                     print(repaireds)
                    return []
#                     return words
                curr = words[i+j]
                if curr == repaireds[j][:len(curr)] :
                    j += 1
                else :
                    break
            if j == len(repaireds) :
                words[i:i+j] = repaireds
                return words
        i += 1
    return words

In [52]:
df["clean_address"] = df.apply(lambda x : repair_words(x.SplitRaw, x.SplitPOI), axis=1)
df["clean_address"] = df.apply(lambda x : repair_words(x.clean_address, x.SplitStreet), axis=1)

In [53]:
### Number of odd case
len(odds)

1

In [54]:
for odd in odds :
    print("Words\t\t: ", " ".join(odd["words"]))
    print("Repaireds\t: ", " ".join(odd["repaireds"]))
#     print(odd["repaireds"])
    print()

Words		:  cirebon waterland ade irma suryani. jl. yos sudarso no 1 , lemahwungkuk , cirebon
Repaireds	:  cirebon waterland ade irma suryani



In [55]:
df = df[df["clean_address"].apply(lambda x : len(x) > 0)]

In [56]:
def array_to_string(arr) :
    return " ".join(arr)

In [57]:
df["clean_address"] = df["clean_address"].apply(array_to_string)

In [58]:
# df["clean_address"] = df["clean_address"].replace("", np.nan)
# df = df.dropna().reset_index(drop=True)
df

Unnamed: 0,id,raw_address,POI/street,POI,Street,SplitPOI,SplitStreet,SplitRaw,TransformedPOI,TransformedStreet,clean_address
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci...",False,False,jl kapuk timur delta sili iii lippo cika 11 a ...
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]",False,False,setu siung 119 rt 5 1 13880 cipayung
3,3,"toko dita , kertosono",toko dita/,toko dita,,"[toko, dita]",[],"[toko, dita, ,, kertosono]",False,False,"toko dita , kertosono"
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru,[],"[jl., orde, baru]","[jl., orde, baru]",False,False,jl. orde baru
5,5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede,"[toko, bb, kids]","[raya, samb, gede]","[raya, samb, gede, ,, 299, toko, bb, kids]",False,False,"raya samb gede , 299 toko bb kids"
...,...,...,...,...,...,...,...,...,...,...,...
299994,299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya,,kakap raya,[],"[kakap, raya]","[karawaci, baru, kakap, raya, 156, rt, 1, rw, ...",False,False,karawaci baru kakap raya 156 rt 1 rw 3 karawaci
299995,299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani,,jend ahmad yani,[],"[jend, ahmad, yani]","[jend, ahmad, yani, 331, kertasari, ciamis]",False,False,jend ahmad yani 331 kertasari ciamis
299996,299996,"raya cila kko , cilandak timur kel.",/raya cila kko,,raya cila kko,[],"[raya, cila, kko]","[raya, cila, kko, ,, cilandak, timur, kel.]",False,False,"raya cila kko , cilandak timur kel."
299998,299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/,taman asri,,"[taman, asri]",[],"[jalan, cipadu, jaya, taman, asri, gang, bijak...",False,False,jalan cipadu jaya taman asri gang bijaksana 3 ...


In [59]:
df[["raw_address", "POI/street"]].to_csv(data_dir + "processed_train.csv", index=False)

In [60]:
cleaned_df = df[["clean_address", "POI/street"]]
cleaned_df.rename(columns={"clean_address": "raw_address"}, inplace=True)
cleaned_df

Unnamed: 0,raw_address,POI/street
0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,"toko dita , kertosono",toko dita/
4,jl. orde baru,/jl. orde baru
5,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede
...,...,...
299994,karawaci baru kakap raya 156 rt 1 rw 3 karawaci,/kakap raya
299995,jend ahmad yani 331 kertasari ciamis,/jend ahmad yani
299996,"raya cila kko , cilandak timur kel.",/raya cila kko
299998,jalan cipadu jaya taman asri gang bijaksana 3 ...,taman asri/


In [61]:
cleaned_df.to_csv(data_dir + "cleaned_train.csv", index=False)

## Create the whole text processing pipeline

In [62]:
dft = pd.read_csv(data_dir + "test.csv")

In [63]:
dft["raw_address"] = dft["raw_address"].apply(remove_multiple_whitespace)
dft["raw_address"] = dft["raw_address"].apply(remove_dot)
dft["raw_address"] = dft["raw_address"].apply(remove_comma)
dft["raw_address"] = dft["raw_address"].apply(remove_bracket)

NameError: name 'remove_comma' is not defined

In [None]:
dft

In [None]:
dft.to_csv(data_dir + "processed_test.csv", index=False)