In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
import string
import re

In [2]:
# Load the training table from train.csv in the working directory and preview
# the first rows; the preview shows raw_address and a combined "POI/street" column.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,raw_address,POI/street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung
3,3,"toko dita, kertosono",toko dita/
4,4,jl. orde baru,/jl. orde baru


In [3]:
def replace_coma(value):
    """Pad every ``', '`` in ``value`` so the comma is surrounded by single spaces.

    A comma directly attached to the preceding text and followed by a space
    (``'a, b'``) becomes ``'a , b'``.  Commas that are already preceded by a
    space are left untouched, which makes the function idempotent: applying it
    twice (for example by re-running the notebook cell) yields the same string
    as applying it once.

    Args:
        value: the address string to normalise.

    Returns:
        The string with ``', '`` rewritten as ``' , '``.
    """
    # The negative look-behind skips commas that already have a space in front
    # of them; a plain str.replace would turn ' , ' into '  , ' on a second pass.
    return re.sub(r'(?<! ), ', ' , ', value)

# Rewrite every raw address through replace_coma (', ' becomes ' , ') and
# store the result back in the same column.
df["raw_address"] = df["raw_address"].apply(replace_coma)

In [4]:
# Spot-check row 199: show the raw address next to its "POI/street" label.
df.loc[199].raw_address, df.loc[199]["POI/street"]

('beng spesia ac mobil ac rio , ling sala,',
 'bengkel spesialist ac mobil ac rio/ling sala')

In [15]:
def alignment_valid(raw, start_index, end_index):
    """Tell whether a span of ``raw`` is bounded by spaces (or string edges).

    The character at ``end_index + 1`` is inspected on the right-hand side and
    the character at ``start_index - 1`` on the left-hand side.  A side is only
    inspected when that neighbour position lies inside the string
    (``end_index < len(raw) - 1`` / ``start_index > 0``).

    Args:
        raw: the full text.
        start_index: index of the first character of the span.
        end_index: index whose right-hand neighbour (``end_index + 1``) is checked.

    Returns:
        ``False`` if an inspected neighbour is not a single space, else ``True``.
    """
    last_position = len(raw) - 1

    # Right-hand side is examined first, exactly one character past end_index.
    right_blocked = end_index < last_position and raw[end_index + 1] != ' '
    if right_blocked:
        return False

    left_blocked = start_index > 0 and raw[start_index - 1] != ' '
    return not left_blocked


def fix_alignment(raw, value):
    """Insert spaces so the first occurrence of ``value`` in ``raw`` stands alone.

    If the character immediately before the occurrence is not a space, a space
    is inserted in front of it; likewise for the character immediately after.
    Edges of the string count as already aligned.

    Args:
        raw: the full text.
        value: the substring that should be delimited by spaces.

    Returns:
        ``raw`` with up to two spaces inserted.  ``raw`` is returned unchanged
        when ``value`` is empty or does not occur in ``raw``.
    """
    if not value:
        return raw

    start_index = raw.find(value)
    if start_index == -1:
        # Nothing to align; without this guard the -1 from find() would be
        # used as a real position and a space inserted at an arbitrary place.
        return raw

    # end_index is exclusive: raw[end_index] is the first character AFTER value.
    end_index = start_index + len(value)

    # Handle the trailing edge first so that inserting there does not shift
    # start_index, which is still needed for the leading edge.
    if end_index < len(raw) and raw[end_index] != " ":
        raw = raw[:end_index] + " " + raw[end_index:]

    if start_index > 0 and raw[start_index - 1] != " ":
        raw = raw[:start_index] + " " + raw[start_index:]

    return raw


def words_overlap(slice1, slice2):
    """Report whether two ``'start:end'`` slice strings overlap.

    Each argument is a string of two integers separated by ``':'``.  The slice
    with the smaller start is treated as the left one, and the slices overlap
    when the right slice starts strictly before the left slice ends (so slices
    that merely touch, e.g. ``'0:5'`` and ``'5:8'``, do not overlap).

    Args:
        slice1: first slice, formatted as ``'start:end'``.
        slice2: second slice, formatted as ``'start:end'``.

    Returns:
        ``True`` if the slices overlap, ``False`` otherwise.

    Raises:
        ValueError: if a slice does not split into exactly two integer parts.
    """
    raw_lo_a, raw_hi_a = slice1.split(':')
    raw_lo_b, raw_hi_b = slice2.split(':')
    lo_a, hi_a, lo_b, hi_b = map(int, (raw_lo_a, raw_hi_a, raw_lo_b, raw_hi_b))

    if lo_a >= lo_b:
        # slice2 starts first (or both start together): overlap when slice1
        # begins before slice2 has ended.
        return lo_a < hi_b

    # slice1 starts first: overlap when slice2 begins before slice1 has ended.
    return lo_b < hi_a


In [28]:
def _find_label_spans(label, text):
    """Return ``(start, end)`` spans where ``label`` occurs in ``text`` between word boundaries.

    ``label`` is passed through ``re.escape`` so punctuation that is common in
    addresses (``.``, ``(``, ``+`` ...) is matched literally instead of being
    interpreted as regular-expression syntax (which could match the wrong text
    or raise ``re.error`` on unbalanced brackets).

    NOTE: ``\\b`` only matches at a transition between a word character and a
    non-word character, so a label that itself starts or ends with punctuation
    is only found when a word character is adjacent on that side.
    """
    pattern = "\\b({})\\b".format(re.escape(label))
    return [(match.start(0), match.end(0)) for match in re.finditer(pattern, text)]


train_np = df.values
train_data = []

for row in tqdm(train_np):
    # Positional access: presumably row[1] is raw_address and row[2] is the
    # "POI/street" label -- verify against df.columns if the CSV layout changes.
    raw_address = row[1]
    poi, street = row[2].split('/')

    temp_entities = []

    if poi != '' and poi in raw_address:
        # Only the first word-bounded occurrence of the POI is labelled.
        poi_spans = _find_label_spans(poi, raw_address)
        if poi_spans:
            poi_start_index, poi_end_index = poi_spans[0]
            temp_entities.append((poi_start_index, poi_end_index, 'POI'))

    if street != '' and street in raw_address:
        # Take the first street occurrence that does not overlap the POI span,
        # so the two entities never cover the same characters.
        for street_start_index, street_end_index in _find_label_spans(street, raw_address):
            if temp_entities:
                slice1 = "{}:{}".format(temp_entities[0][0], temp_entities[0][1])
                slice2 = "{}:{}".format(street_start_index, street_end_index)

                if words_overlap(slice1, slice2):
                    continue

            temp_entities.append((street_start_index, street_end_index, 'STREET'))
            break

    # Rows for which neither label could be located are left out of the training set.
    if temp_entities:
        train_data.append((raw_address, {"entities": temp_entities}))

100%|████████████████████████████████████████████| 300000/300000 [00:13<00:00, 22815.46it/s]


In [25]:
# Number of addresses for which at least one entity span was found.
len(train_data)

229414

In [21]:
# Look up one specific address by exact match on raw_address; in the row shown
# the POI label is itself a substring of the street label.
df[(df["raw_address"] == 'jl. duta harapan indah , kapuk muara , kec. penjaringan , kota jkt utara , blok jj no. 69')]

Unnamed: 0,id,raw_address,POI/street
53150,53150,"jl. duta harapan indah , kapuk muara , kec. pe...",duta harapan indah/jl. duta harapan indah


In [22]:
# Sanity check of the \b word-boundary pattern: "maha" must not match inside
# "mahakam", only the standalone word that follows the comma.
[(m.start(0), m.end(0)) for m in re.finditer("\\b(maha)\\b", "knalpot putra mahakam, maha 11 65111 klojen")]

[(23, 27)]

## Check overlap

In [26]:
# Count (and print) every training example whose first two entity spans overlap.
counter = 0
for data in tqdm(train_data):
    raw, entities = data[0], data[1]['entities']

    # Examples with fewer than two entities cannot overlap.
    if len(entities) <= 1:
        continue

    # Encode the first two spans as 'start:end' strings for words_overlap.
    slice1 = "{}:{}".format(*entities[0][:2])
    slice2 = "{}:{}".format(*entities[1][:2])
    if not words_overlap(slice1, slice2):
        continue

    print(entities)
    print(slice1, slice2)
    counter += 1
counter

100%|██████████████████████████████████████████| 229414/229414 [00:00<00:00, 1622596.30it/s]


0

## Save train data

In [27]:
# Serialise the list of (raw_address, {"entities": [...]}) tuples with pickle.
# NOTE: pickle files can execute code on load; only unpickle this file from a trusted source.
with open("train_data_all-2.data", "wb+") as fp:
    pickle.dump(train_data, fp)