# Fix Typo

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import Levenshtein as metrics
import string
import re

In [2]:
df = pd.read_csv('train.csv')
df.head(10)

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede
6,6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra...",/kem mel raya
7,7,tela keuramat kuta alam,/tela
8,8,gg. i wates magersari,/gg. i
9,9,bunga ncole ix 2,/bunga ncole ix


In [3]:
df.loc[199].raw_address, df.loc[199]["POI/street"]

('beng spesia ac mobil ac rio, ling sala,',
 'bengkel spesialist ac mobil ac rio/ling sala')

In [4]:
punctuation = string.punctuation.replace('/', '')


def remove_punctuation(value):
    return ''.join(value.translate(str.maketrans('', '', punctuation)))


def fix_typo(raw, label):
    temp_memory = label.replace('/', ' ').split(' ')
    raw_list = raw.strip().split(' ')
    
    for raw_index, word in enumerate(raw_list):
        if word == '':
            continue
            
        have_punctuation = False
        punc = ""
        last_char = word[len(word) - 1]
        
        if last_char in punctuation:
            punc = last_char
            have_punctuation = True
                
        if have_punctuation:
            word = remove_punctuation(word)
            
        if not word in temp_memory:
            highest_score = -1
            the_most_similar = word
            for label in temp_memory:
                # cek accuracy
                score = metrics.jaro(word, label)
                if score > highest_score:
                    highest_score = score
                    the_most_similar = label

            if highest_score > 0.75:
                if have_punctuation:
                    if not the_most_similar[len(the_most_similar) - 1] in punctuation:
                        the_most_similar = the_most_similar + punc
                    
                raw_list[raw_index] = the_most_similar
        
    return ' '.join(raw_list)

In [5]:
fix_typo("cv. hin oto kenc, sido ii, q 29", "cv. hingdi oto kencana")

'cv. hingdi oto kencana, sido ii, q 29'

In [6]:
df.loc[199].raw_address, fix_typo(df.loc[199].raw_address, df.loc[199]["POI/street"])

('beng spesia ac mobil ac rio, ling sala,',
 'bengkel spesialist ac mobil ac rio, ling sala,')

In [7]:
df.loc[(df["raw_address"] == "cv. hin oto kenc, sido ii, q 29")]

Unnamed: 0,id,raw_address,POI/street
110,110,"cv. hin oto kenc, sido ii, q 29",cv. hingdi oto kencana/sido ii


In [8]:
train_np = df.values
preprocessed = []

for row in tqdm(train_np):
    raw = row[1]
    poi, street = row[2].split('/')
    
    if poi != '' and not poi in raw:
        raw = fix_typo(raw, poi)
                
    if street != '' and not street in raw:
        raw = fix_typo(raw, street)
    
    preprocessed.append({"id": row[0], "raw_address": raw, 'POI/street': row[2]})

100%|███████████████████████████████████████████| 300000/300000 [00:00<00:00, 332261.49it/s]


In [9]:
prep_df = pd.DataFrame(preprocessed)
prep_df.head(10)

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede
6,6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra...",/kem mel raya
7,7,tela keuramat kuta alam,/tela
8,8,gg. i wates magersari,/gg. i
9,9,bunga ncole ix 2,/bunga ncole ix


In [10]:
df["prep_raw_address"] = prep_df["raw_address"]
df.head(10)

Unnamed: 0,id,raw_address,POI/street,prep_raw_address
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,jl kapuk timur delta sili iii lippo cika 11 a ...
1,1,"aye, jati sampurna",/,"aye, jati sampurna"
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,setu siung 119 rt 5 1 13880 cipayung
3,3,"toko dita, kertosono",toko dita/,"toko dita, kertosono"
4,4,jl. orde baru,/jl. orde baru,jl. orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede,"raya samb gede, 299 toko bb kids"
6,6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra...",/kem mel raya,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra..."
7,7,tela keuramat kuta alam,/tela,tela keuramat kuta alam
8,8,gg. i wates magersari,/gg. i,gg. i wates magersari
9,9,bunga ncole ix 2,/bunga ncole ix,bunga ncole ix 2


In [11]:
df.iloc[110]

id                                                    110
raw_address               cv. hin oto kenc, sido ii, q 29
POI/street                 cv. hingdi oto kencana/sido ii
prep_raw_address    cv. hingdi oto kencana, sido ii, q 29
Name: 110, dtype: object

In [12]:
counter = 0
np_array = df.values
for row in np_array:
    raw1 = row[1]
    raw2 = row[3]
    if raw1 != raw2:
        print("Label: ", row[2])
        print("Original:", raw1)
        print("Fixed:", raw2, "\n")
        counter += 1
        
        if counter >= 20:
            break

Label:  sd negeri bojong 02/klap boj
Original: cikahuripan sd neg boj 02 klap boj, no 5 16877
Fixed: cikahuripan sd negeri bojong 02 klap bojong, no 5 16877 

Label:  yayasan atohariyah/
Original: yaya atohar,
Fixed: yayasan atohariyah, 

Label:  toko bangunan ajs/
Original: toko bang ajs,
Fixed: toko bangunan ajs, 

Label:  markaz tabligh metro/
Original: mar tabl metro iringmulyo metro timur
Fixed: markaz tabligh metro iringmulyo metro timur 

Label:  sd negeri 12 anggrek/
Original: sd neg 12 anggrek
Fixed: sd negeri 12 anggrek 

Label:  rumah makan pelangi/raya jomb
Original: rumah makan pela, raya jomb,
Fixed: rumah makan pelangi, raya jomb, 

Label:  /cakrad
Original: cak 11 nagasari karawang barat
Fixed: cakrad 11 nagasari karawang barat 

Label:  rnd printing/gang pinak
Original: rnd prin, gang pinak, sukarame
Fixed: rnd printing, gang pinak, sukarame 

Label:  pp minhajutthollab/kh abdul manan
Original: pp minhajutt, kh abdul manan, sumberberas muncar
Fixed: pp minhajutthollab,

In [13]:
df["raw_address"] = df["prep_raw_address"]
df = df.drop(columns=["prep_raw_address"])

In [14]:
df.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [15]:
df.to_csv('train_fix_typo.csv', index=False)

In [17]:
df.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [50]:
def replace_coma(value):
    return value.replace(',', ' , ')


def add_space_to_coma(value):
    s = re.sub('([,])', r' \1 ', value)
    s = re.sub('\s{2,}', ' ', value)
    return s


def add_space_to_dot(value):
    return re.sub(r'(?<=[.])(?=[^\s])', r' ', value)


df["raw_address"] = df["raw_address"].apply(add_space_to_coma)
# df["raw_address"] = df["raw_address"].apply(add_space_to_dot)

In [51]:
df.head(10)

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru
5,5,"raya samb gede, 299 toko bb kids",toko bb kids/raya samb gede
6,6,"kem mel raya, no 4 bojong rawalumbu rt 1 36 ra...",/kem mel raya
7,7,tela keuramat kuta alam,/tela
8,8,gg. i wates magersari,/gg. i
9,9,bunga ncole ix 2,/bunga ncole ix


In [40]:
df.to_csv('preprocessed_train_7.csv', index=False)

In [48]:
s = "aye, jati sampurna"
s = re.sub('([,])', r' \1 ', s)
s = re.sub('\s{2,}', ' ', s)
s

'aye , jati sampurna'