In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
import re

In [102]:
# Load the training data (about 300,000 rows). Later cells read the
# 'id', 'raw_address' and 'POI/street' columns from this frame.
train_df = pd.read_csv('train.csv')

In [103]:
from string import punctuation
import re

def clean(s):
    """Normalize the spacing around punctuation in an address string.

    Steps, applied in order:
      1. insert a space before "(" when it sits between two word characters;
      2. insert a space after a run of ")", ",", ".", ":" or ";" that sits
         between two word characters;
      3. rewrite "<word>.(<word>" as "<word>. (<word>";
      4. collapse every whitespace run to a single space and trim both ends.

    Args:
        s: the raw string to normalize.

    Returns:
        The normalized string.
    """
    # Raw strings keep "\g<1>" from being parsed as an (invalid) string escape.
    # The trailing word character is checked with a lookahead rather than
    # consumed, so adjacent matches that share a character (e.g. "a,b,c")
    # are all spaced instead of only every other one.
    res = re.sub(r'(\w)\((?=\w)', r'\g<1> (', s)
    res = re.sub(r'(\w)([),.:;]+)(?=\w)', r'\g<1>\g<2> ', res)
    res = re.sub(r'(\w)\.\((?=\w)', r'\g<1>. (', res)
    res = re.sub(r'\s+', ' ', res)
    return res.strip()

def stripclean(arr):
    """Return a new list with every string trimmed of surrounding whitespace
    and then of surrounding punctuation characters."""
    cleaned = []
    for token in arr:
        trimmed = token.strip()
        cleaned.append(trimmed.strip(punctuation))
    return cleaned

def dummy(x):
    """Return a shallow copy of `x` as a new list, so the copy can be
    modified without touching the original sequence."""
    return list(x)

In [106]:
# Trim surrounding whitespace from the raw address before tokenizing it.
train_df['raw_address'] = train_df['raw_address'].apply(lambda addr: addr.strip())

# 'POI/street' holds "<poi>/<street>": part 0 becomes the POI word list and
# part 1 the street word list, each cleaned, split and stripped of punctuation.
poi_street_parts = train_df['POI/street'].str.split('/')
for target_col, part_idx in (('POI', 0), ('STR', 1)):
    train_df[target_col] = (
        poi_street_parts.str[part_idx]
        .apply(clean)
        .str.split()
        .apply(stripclean)
    )

# Token views of the raw address:
#   tokens        - whitespace-split cleaned address
#   strip_tokens  - tokens with surrounding punctuation removed
#   full_tokens   - independent copy of tokens (rewritten later by the labeling loop)
train_df['tokens'] = train_df['raw_address'].apply(clean).str.split()
train_df['strip_tokens'] = train_df['tokens'].apply(stripclean)
train_df['full_tokens'] = train_df['tokens'].apply(dummy)

# One 'O' label per token, and a fresh [-1, -1] placeholder per row for the
# start/end position of the POI and street spans.
train_df['labels'] = train_df['tokens'].apply(lambda toks: ['O' for _ in toks])
for pos_col in ('pos_poi', 'pos_str'):
    train_df[pos_col] = train_df['tokens'].apply(lambda _toks: [-1, -1])

In [107]:
from tqdm import tqdm

# Abbreviated token -> {full word: number of times it expanded to that word}.
wordlist_raw = {}
# Ids of rows whose non-empty POI / street could not be located in the tokens.
POI_ERR_IDX = []
STR_ERR_IDX = []
# Ids of rows where at least one token was an abbreviation of its target word.
SHORTEN_IDX = []
# Ids of rows where a span was written over tokens that were already labeled.
OVERLAP_IDX = set()


def _token_matches(tok, word):
    """Return True when stripped token `tok` is `word` or a prefix of it.

    An empty token only matches an empty word: `word.startswith('')` is always
    True, and a later `str.replace('', word)` would splice `word` between every
    character of the full token.
    """
    if tok == '':
        return word == ''
    return word.startswith(tok)


def _tag_entity(start, entity, tag, strip_tokens, full_tokens, labels, pos, wordlist):
    """Try to align `entity` with the tokens starting at `start` and label them.

    On a match, `labels`, `full_tokens`, `pos` and `wordlist` are updated in
    place: the span is labeled S-/B-/I-/E-<tag>, abbreviated tokens are
    expanded in `full_tokens` and their label gets a '-SHORT' suffix.

    Args:
        start: index of the first candidate token.
        entity: list of target words (POI or street).
        tag: label suffix, 'POI' or 'STR'.
        strip_tokens: punctuation-stripped tokens of the address (read only).
        full_tokens: token list that receives the expanded words.
        labels: per-token label list.
        pos: two-element list receiving [first index, last index] of the span.
        wordlist: abbreviation counter, see `wordlist_raw`.

    Returns:
        (matched, overlapped, shortened) booleans.
    """
    size = len(entity)
    window = strip_tokens[start:start + size]
    if size == 0 or len(window) < size:
        return False, False, False
    if not all(_token_matches(tok, word) for tok, word in zip(window, entity)):
        return False, False, False

    pos[0] = start
    pos[1] = start + size - 1
    overlapped = False
    shortened = False
    for offset, (tok, word) in enumerate(zip(window, entity)):
        k = start + offset
        if labels[k] != 'O':
            overlapped = True
        if size == 1:
            prefix = 'S'
        elif offset == 0:
            prefix = 'B'
        elif offset == size - 1:
            prefix = 'E'
        else:
            prefix = 'I'
        label = prefix + '-' + tag
        if tok != word:
            # The token abbreviates the word: expand it and record the pair.
            full_tokens[k] = full_tokens[k].replace(tok, word)
            label += '-SHORT'
            shortened = True
            counts = wordlist.setdefault(tok, {})
            counts[word] = counts.get(word, 0) + 1
        labels[k] = label
    return True, overlapped, shortened


# Iterate the needed columns in lockstep instead of building a row Series per
# index. The list objects yielded here are the ones stored in train_df, so the
# in-place updates made by _tag_entity are visible in the frame.
_ROW_COLUMNS = ['id', 'POI', 'STR', 'strip_tokens', 'full_tokens', 'labels', 'pos_poi', 'pos_str']
_rows = zip(*(train_df[col] for col in _ROW_COLUMNS))

for row_id, poi, street, strip_tokens, full_tokens, labels, pos_poi, pos_str in tqdm(_rows, total=len(train_df)):
    found_poi, found_str, shorten = False, False, False
    for i, tok in enumerate(strip_tokens):
        if tok == '':
            continue
        # POI is tried before STR at every position; a later span that lands on
        # already-labeled tokens overwrites them and flags the row as overlapping.
        for entity, tag, pos in ((poi, 'POI', pos_poi), (street, 'STR', pos_str)):
            matched, overlapped, shortened = _tag_entity(
                i, entity, tag, strip_tokens, full_tokens, labels, pos, wordlist_raw
            )
            if not matched:
                continue
            if tag == 'POI':
                found_poi = True
            else:
                found_str = True
            if overlapped:
                OVERLAP_IDX.add(row_id)
            if shortened:
                shorten = True

    if len(poi) > 0 and not found_poi:
        POI_ERR_IDX.append(row_id)
    if len(street) > 0 and not found_str:
        STR_ERR_IDX.append(row_id)
    if shorten:
        SHORTEN_IDX.append(row_id)

100%|██████████| 300000/300000 [02:29<00:00, 2010.69it/s]


In [108]:
# Ids to discard: rows whose POI or street could not be located in the tokens,
# plus rows where two labeled spans overlapped.
ERR_IDX = set(POI_ERR_IDX) | set(STR_ERR_IDX) | set(OVERLAP_IDX)
# A bare `len(...)` in the middle of a cell is never displayed, so print it.
print(len(ERR_IDX), 'ids dropped (unmatched POI/street or overlapping labels)')
train_df = train_df[~train_df['id'].isin(ERR_IDX)]


In [109]:
def cleanshort(arr):
    """Return a new list of labels with every '-SHORT' marker removed."""
    plain_labels = []
    for label in arr:
        plain_labels.append(label.replace('-SHORT', ''))
    return plain_labels

# Rows in which at least one abbreviated token was expanded: swap them for a
# version whose tokens are the expanded `full_tokens` and whose labels no
# longer carry the '-SHORT' marker.
new_train_df = train_df[train_df['id'].isin(SHORTEN_IDX)].copy(deep=True)
new_train_df['tokens'] = new_train_df['full_tokens'].apply(dummy)
new_train_df['labels'] = new_train_df['labels'].apply(cleanshort)

train_df = train_df[~train_df['id'].isin(SHORTEN_IDX)]
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
# reset_index() keeps the previous index as an 'index' column, as before.
train_df = pd.concat([train_df, new_train_df]).reset_index()

In [117]:
from itertools import chain 
import numpy as np 
# Display the labeled frame as the cell output (pandas abbreviates long frames).
train_df


Unnamed: 0,index,id,raw_address,POI/street,POI,STR,tokens,strip_tokens,full_tokens,labels,pos_poi,pos_str
0,0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,[],"[jl, kapuk, timur, delta, sili, iii, lippo, cika]","[jl, kapuk, timur, delta, sili, iii, lippo, ci...","[jl, kapuk, timur, delta, sili, iii, lippo, ci...","[jl, kapuk, timur, delta, sili, iii, lippo, ci...","[B-STR, I-STR, I-STR, I-STR, I-STR, I-STR, I-S...","[-1, -1]","[0, 7]"
1,1,1,"aye, jati sampurna",/,[],[],"[aye,, jati, sampurna]","[aye, jati, sampurna]","[aye,, jati, sampurna]","[O, O, O]","[-1, -1]","[-1, -1]"
2,2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,[],[siung],"[setu, siung, 119, rt, 5, 1, 13880, cipayung]","[setu, siung, 119, rt, 5, 1, 13880, cipayung]","[setu, siung, 119, rt, 5, 1, 13880, cipayung]","[O, S-STR, O, O, O, O, O, O]","[-1, -1]","[1, 1]"
3,3,3,"toko dita, kertosono",toko dita/,"[toko, dita]",[],"[toko, dita,, kertosono]","[toko, dita, kertosono]","[toko, dita,, kertosono]","[B-POI, E-POI, O]","[0, 1]","[-1, -1]"
4,4,4,jl. orde baru,/jl. orde baru,[],"[jl, orde, baru]","[jl., orde, baru]","[jl, orde, baru]","[jl., orde, baru]","[B-STR, I-STR, E-STR]","[-1, -1]","[0, 2]"
...,...,...,...,...,...,...,...,...,...,...,...,...
298784,299973,299973,moha toha no 167,/mohammad toha,[],"[mohammad, toha]","[mohammad, toha, no, 167]","[moha, toha, no, 167]","[mohammad, toha, no, 167]","[B-STR, E-STR, O, O]","[-1, -1]","[0, 1]"
298785,299974,299974,"islamic training cen pare, brawi, pelem pare",islamic training center pare/brawi,"[islamic, training, center, pare]",[brawi],"[islamic, training, center, pare,, brawi,, pel...","[islamic, training, cen, pare, brawi, pelem, p...","[islamic, training, center, pare,, brawi,, pel...","[B-POI, I-POI, I-POI, E-POI, S-STR, O, O]","[0, 3]","[4, 4]"
298786,299983,299983,"la banda minima, cile raya, pesanggrahan",la banda minimarket/cile raya,"[la, banda, minimarket]","[cile, raya]","[la, banda, minimarket,, cile, raya,, pesanggr...","[la, banda, minima, cile, raya, pesanggrahan]","[la, banda, minimarket,, cile, raya,, pesanggr...","[B-POI, I-POI, E-POI, B-STR, E-STR, O]","[0, 2]","[3, 4]"
298787,299986,299986,"ginzi cafe, siliw,",ginzi cafe/siliwangi,"[ginzi, cafe]",[siliwangi],"[ginzi, cafe,, siliwangi,]","[ginzi, cafe, siliw]","[ginzi, cafe,, siliwangi,]","[B-POI, E-POI, S-STR]","[0, 1]","[2, 2]"


In [118]:
# Remove the source and helper columns in place; they are not part of the
# saved output.
intermediate_columns = [
    'id',
    'raw_address',
    'POI/street',
    'POI',
    'STR',
    'strip_tokens',
    'full_tokens',
    'pos_poi',
    'pos_str',
]
train_df.drop(columns=intermediate_columns, inplace=True)

In [123]:
# Write the remaining columns to CSV; index=False omits the DataFrame index.
# NOTE(review): the list-valued columns are presumably stored as their string
# form and must be parsed back into lists when this file is loaded; confirm
# against the consumer of this file.
train_df.to_csv("train_df_pretokenization.csv", index = False)