In [1]:
import pandas as pd
import numpy as np
from time import time
from collections import Counter
import collections

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk

In [2]:
df_train = pd.read_csv('data/processed_train.csv')
df_train.head()

Unnamed: 0,raw_address,POI/street
0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,setu siung 119 rt 5 1 13880 cipayung,/siung
2,"toko dita , kertosono",toko dita/
3,jl. orde baru,/jl. orde baru
4,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede


## Token Counter

In [3]:
tokens = []
for s in df_train["raw_address"] :
    words = s.split(" ")
    for word in words :
        tokens.append(word)

len(tokens)

2010583

In [4]:
count = Counter(tokens) 

In [5]:
count.most_common()[:20]

[(',', 180844),
 ('no', 47574),
 ('rt', 39233),
 ('raya', 31761),
 ('1', 21385),
 ('2', 19532),
 ('rw', 18853),
 ('3', 16272),
 ('4', 13234),
 ('barat', 13224),
 ('timur', 13057),
 ('5', 12170),
 ('utara', 12110),
 ('kel.', 11115),
 ('jaya', 11020),
 ('6', 10509),
 ('selatan', 10103),
 ('gg.', 9842),
 ('jl.', 9706),
 ('7', 9514)]

In [6]:
count["mas"]

5094

In [7]:
count["bang"]

399

In [8]:
count["percet"]

86

In [9]:
def getInterest(_list, arr_index):
    return _list[arr_index]

df_train['POI'] = df_train['POI/street'].str.split('/').apply(getInterest, args=(0,))
df_train['street'] = df_train['POI/street'].str.split('/').apply(getInterest, args=(1,))
df_train.head(20)

Unnamed: 0,raw_address,POI/street,POI,street
0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
2,"toko dita , kertosono",toko dita/,toko dita,
3,jl. orde baru,/jl. orde baru,,jl. orde baru
4,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede,toko bb kids,raya samb gede
5,"kem mel raya , no 4 bojong rawalumbu rt 1 36 r...",/kem mel raya,,kem mel raya
6,tela keuramat kuta alam,/tela,,tela
7,gg. i wates magersari,/gg. i,,gg. i
8,bunga ncole ix 2,/bunga ncole ix,,bunga ncole ix
9,"cikahuripan sd neg boj 02 klap boj , no 5 16877",sd negeri bojong 02/klap boj,sd negeri bojong 02,klap boj


In [10]:
def getListOfWords(sentence):
    sentence = sentence.replace('\n', ' ').replace('\t', ' ')
    sentence = sentence.replace('/', ' ').replace('.', ' ').replace(',', ' ')
    words = sentence.split(' ')
    while '' in words:
        words.remove('')
        
    return [word for word in words if len(words) > 0]

In [11]:
def getDifference(row):
    raw = getListOfWords(row['raw_address'])
    poi_street = getListOfWords(row['POI/street'])
    temp = set(raw)
    output = [x for x in poi_street if x not in temp]
    if len(output) > 0:
        return output
    else:
        return np.nan

In [12]:
start = time()

df_train['delta_words'] = df_train.apply(getDifference, axis=1)

print("Executed in {} seconds.".format(time() - start))

Executed in 11.115409135818481 seconds.


In [13]:
df_need_repair = df_train.dropna()
df_need_repair

Unnamed: 0,raw_address,POI/street,POI,street,delta_words
9,"cikahuripan sd neg boj 02 klap boj , no 5 16877",sd negeri bojong 02/klap boj,sd negeri bojong 02,klap boj,"[negeri, bojong]"
10,"yaya atohar ,",yayasan atohariyah/,yayasan atohariyah,,"[yayasan, atohariyah]"
18,"toko bang ajs ,",toko bangunan ajs/,toko bangunan ajs,,[bangunan]
36,mar tabl metro iringmulyo metro timur,markaz tabligh metro/,markaz tabligh metro,,"[markaz, tabligh]"
38,sd neg 12 anggrek,sd negeri 12 anggrek/,sd negeri 12 anggrek,,[negeri]
...,...,...,...,...,...
265866,moha toha no 167,/mohammad toha,,mohammad toha,[mohammad]
265867,"islamic training cen pare , brawi , pelem pare",islamic training center pare/brawi,islamic training center pare,brawi,[center]
265873,"la banda minima , cile raya , pesanggrahan",la banda minimarket/cile raya,la banda minimarket,cile raya,[minimarket]
265875,"ginzi cafe , siliw ,",ginzi cafe/siliwangi,ginzi cafe,siliwangi,[siliwangi]


# N-gram analysis

## find the pair

In [14]:
def getDistance(str1, str2):
    return nltk.edit_distance(str1, str2)

getDistance('negeri', 'neg')

3

In [15]:
for ngram in nltk.ngrams(getListOfWords("cikahuripan sd neg boj 02 klap boj, no 5 16877"), 2):
    print (' '.join(ngram))

cikahuripan sd
sd neg
neg boj
boj 02
02 klap
klap boj
boj no
no 5
5 16877


In [16]:
x = {1: 2, 3: 4, 4: 3, 2: 1, 0: 0}
sorted_x = sorted(x.items(), key=lambda kv: kv[1])
sorted_dict = collections.OrderedDict(sorted_x)
sorted_x[1][0]

2

In [17]:
def getPair(row):
    raw = row['raw_address']
    repaired = row['delta_words']
    _dict1 = {}
    _dict2 = {}
    for repaired_word in repaired:
        for raw_word in getListOfWords(raw):
            _dict2[raw_word] = getDistance(repaired_word, raw_word)
        # get minimum distance word
        sorted_dict2 = sorted(_dict2.items(), key=lambda kv: kv[1])
        _dict1[repaired_word] = sorted_dict2[0][0]
        # inverse dict from raw to repaired
        inv_map = {v: k for k, v in _dict1.items()}
    return inv_map

In [18]:
df_need_repair['Pair'] = df_need_repair.apply(getPair, axis=1)
df_need_repair

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,raw_address,POI/street,POI,street,delta_words,Pair
9,"cikahuripan sd neg boj 02 klap boj , no 5 16877",sd negeri bojong 02/klap boj,sd negeri bojong 02,klap boj,"[negeri, bojong]","{'neg': 'negeri', 'boj': 'bojong'}"
10,"yaya atohar ,",yayasan atohariyah/,yayasan atohariyah,,"[yayasan, atohariyah]","{'yaya': 'yayasan', 'atohar': 'atohariyah'}"
18,"toko bang ajs ,",toko bangunan ajs/,toko bangunan ajs,,[bangunan],{'bang': 'bangunan'}
36,mar tabl metro iringmulyo metro timur,markaz tabligh metro/,markaz tabligh metro,,"[markaz, tabligh]","{'mar': 'markaz', 'tabl': 'tabligh'}"
38,sd neg 12 anggrek,sd negeri 12 anggrek/,sd negeri 12 anggrek,,[negeri],{'neg': 'negeri'}
...,...,...,...,...,...,...
265866,moha toha no 167,/mohammad toha,,mohammad toha,[mohammad],{'moha': 'mohammad'}
265867,"islamic training cen pare , brawi , pelem pare",islamic training center pare/brawi,islamic training center pare,brawi,[center],{'cen': 'center'}
265873,"la banda minima , cile raya , pesanggrahan",la banda minimarket/cile raya,la banda minimarket,cile raya,[minimarket],{'minima': 'minimarket'}
265875,"ginzi cafe , siliw ,",ginzi cafe/siliwangi,ginzi cafe,siliwangi,[siliwangi],{'siliw': 'siliwangi'}


In [19]:
corpus = []
for index, row in df_need_repair.iterrows():
    for key, value in row['Pair'].items():
        temp = [key,value]
        corpus.append(temp)
corpus = Counter(tuple(item) for item in corpus)
corpus = dict(sorted(corpus.items(), key=lambda item: item[1], reverse=True))

In [20]:
x = {}
x['a'] = [{'abc': 190}]
x['a'].append({'abd': 100})

In [21]:
if 'b' in x:
    x['b'].append({'bcd': 300})
else:
    x['b'] = [{'bcd': 300}]
x

{'a': [{'abc': 190}, {'abd': 100}], 'b': [{'bcd': 300}]}

In [22]:
dict_1gram = {}
for key, value in corpus.items():
    if key[0] in dict_1gram:
        dict_1gram[key[0]].append({key[1]: value})
    else:
        dict_1gram[key[0]] = [{key[1]: value}]
# dict_1gram

In [23]:
small_dict = {}

In [24]:
for k, v in dict_1gram.items() :
#     print(k)
#     print(v[0])
#     print(list(v[0].items()))
    first_word, first_count = list(v[0].items())[0]
    uncertain = True
    if len(v) > 1 :
        second_word, second_count = list(v[1].items())[0]
        if first_count < second_count * 5 :
            uncertain = False
    else :
        uncertain = False
        
    
    if not uncertain :
        if count[k] < 2 * first_count :
            small_dict[k] = first_word
    
    #     if count[k] < 2 * v[0].
        

In [25]:
small_dict

{'indon': 'indonesia',
 'apo': 'apotek',
 'pendid': 'pendidikan',
 'cah': 'cahaya',
 'yaya': 'yayasan',
 'kelon': 'kelontong',
 'kabup': 'kabupaten',
 'percet': 'percetakan',
 'nota': 'notaris',
 'seder': 'sederhana',
 'ist': 'istana',
 'muhamma': 'muhammadiyah',
 'sch': 'school',
 'lese': 'lesehan',
 'pegad': 'pegadaian',
 'barber': 'barbershop',
 'seaf': 'seafood',
 'ibtida': 'ibtidaiyah',
 'dae': 'daerah',
 'kecam': 'kecamatan',
 'saud': 'saudara',
 'pemak': 'pemakaman',
 'interna': 'international',
 'wedd': 'wedding',
 'asur': 'asuransi',
 'berm': 'bermain',
 'al-hid': 'al-hidayah',
 'off': 'office',
 'cemer': 'cemerlang',
 'bukal': 'bukalapak',
 'electr': 'electronic',
 'kato': 'katolik',
 'onl': 'online',
 'kitc': 'kitchen',
 'masyar': 'masyarakat',
 'text': 'textile',
 'tsanaw': 'tsanawiyah',
 'kuli': 'kuliner',
 'asu': 'asuhan',
 'stat': 'station',
 'pengin': 'penginapan',
 'univer': 'universitas',
 'educa': 'education',
 'paka': 'pakaian',
 'advert': 'advertising',
 'organ': '

## Checking

In [40]:
df = pd.read_csv('data/processed_train.csv')
df.head()

Unnamed: 0,raw_address,POI/street
0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,setu siung 119 rt 5 1 13880 cipayung,/siung
2,"toko dita , kertosono",toko dita/
3,jl. orde baru,/jl. orde baru
4,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede


In [41]:
def repair(text) :
    words = text.split(" ")
    for i in range(len(words)) :
        if words[i] in small_dict :
            words[i] = small_dict[words[i]]
    return " ".join(words)

In [47]:
df["raw_address"] = df["raw_address"].apply(repair)

In [48]:
cdf = pd.read_csv('data/cleaned_train.csv')
cdf.head()

Unnamed: 0,raw_address,POI/street
0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,setu siung 119 rt 5 1 13880 cipayung,/siung
2,"toko dita , kertosono",toko dita/
3,jl. orde baru,/jl. orde baru
4,"raya samb gede , 299 toko bb kids",toko bb kids/raya samb gede


In [49]:
len(df)

265888

In [50]:
len(cdf)

265888

In [51]:
check = df["raw_address"] == cdf["raw_address"]
sum(check)/len(check)

0.7944510470574077

In [33]:
tdf = pd.read_csv('data/processed_test.csv')
tdf["raw_address"] = tdf["raw_address"].apply(repair)

In [98]:
tdf.to_csv("data/cleaned_test.csv", index=False)