In [30]:
import nltk
import string
import pandas as pd
from tqdm import tqdm
from scipy import stats
from os.path import join
from nltk.util import ngrams

from nltk.tokenize import word_tokenize
tqdm.pandas()
# nltk.download('all')



In [31]:
train_csv = 'train.csv'
test_csv = 'test.csv'
root = './'
num_words = 15
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [32]:
train_df.apply(lambda row: len(row['raw_address'].split()), axis=1).describe()

count    300000.000000
mean          6.842183
std           2.827218
min           1.000000
25%           5.000000
50%           6.000000
75%           9.000000
max          32.000000
dtype: float64

In [33]:
test_df.apply(lambda row: len(row['raw_address'].split()), axis=1).describe()

count    50000.000000
mean         6.832440
std          2.818035
min          1.000000
25%          5.000000
50%          6.000000
75%          9.000000
max         25.000000
dtype: float64

In [34]:
train_df['POI'] = train_df.apply(lambda row: row['POI/street'].split('/')[0], axis=1)
train_df['street'] = train_df.apply(lambda row: row['POI/street'].split('/')[-1], axis=1)
train_df.head()

Unnamed: 0,id,raw_address,POI/street,POI,street
0,0,jl kapuk timur delta sili iii lippo cika 11 a ...,/jl kapuk timur delta sili iii lippo cika,,jl kapuk timur delta sili iii lippo cika
1,1,"aye, jati sampurna",/,,
2,2,setu siung 119 rt 5 1 13880 cipayung,/siung,,siung
3,3,"toko dita, kertosono",toko dita/,toko dita,
4,4,jl. orde baru,/jl. orde baru,,jl. orde baru


In [35]:
train_df.apply(lambda row: len(row['POI'].split()), axis=1).describe()

count    300000.000000
mean          1.143783
std           1.579493
min           0.000000
25%           0.000000
50%           0.000000
75%           2.000000
max          20.000000
dtype: float64

In [36]:
train_df.apply(lambda row: len(row['street'].split()), axis=1).describe()

count    300000.000000
mean          1.716237
std           1.270470
min           0.000000
25%           1.000000
50%           2.000000
75%           3.000000
max          15.000000
dtype: float64

In [37]:
street_corpus = dict()
poi_corpus = dict()

In [38]:
for row in train_df.itertuples(index=False):
    keys = ['_'.join(word_tokenize(row.POI)), '_'.join(word_tokenize(row.street))]
    
    if keys[0] in poi_corpus.keys():
        poi_corpus[keys[0]] += 1
    else:
        poi_corpus[keys[0]] = 1

    if keys[1] in street_corpus.keys():
        street_corpus[keys[1]] += 1
    else:
        street_corpus[keys[1]] = 1

In [39]:
stats.describe(list(street_corpus.values()))
stats.describe(list(poi_corpus.values()))

DescribeResult(nobs=93412, minmax=(1, 178509), mean=3.21157881214405, variance=341133.5479799631, skewness=305.6149366133018, kurtosis=93401.32360912373)

In [40]:
def get_ngrams(text, n):
    n_grams = ngrams(word_tokenize(text), n)
    return ['_'.join(grams) for grams in n_grams]

def all_ngrams(text, num_words = 8):
    result = {}
    for i in range(1, num_words + 1):
        result[i] = get_ngrams(text, i)
    return result


In [41]:
def get_result(text: str, num_words = 8, thrs_poi = 0, thrs_street = 0):
    result = {
        'poi': None,
        'street': None,
    }
    data = all_ngrams(text,num_words)

    for k in range(num_words, 0, -1):
        v = {
            'poi': None,
            'street': None,
        }
        data[k].sort(key=len, reverse=True) # sorts by descending length
        for item in data[k]:
            if item in poi_corpus.keys():
                if poi_corpus[item] > thrs_poi:
                    v['poi'] = detokenize(item)
            if item in street_corpus:
                if street_corpus[item] > thrs_street:
                    v['street'] = detokenize(item)
                    
            for k in result.keys():
                result[k] = result[k] if result[k] else v[k]
        if (result['poi'] and result['street']):
            break
        
    for k in result.keys():
        result[k] = result[k] if result[k] else ''
    return '/'.join(['', result['street']]) # hard code 
    # return '/'.join([result['poi'], result['street']])
import string
def detokenize(seq :str, split_token = '_'):
    tokens = seq.split(split_token)
    res = "".join([" "+i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]).strip()
    return res 
get_result('jl.c suj 1, jogoyudan lumajang', num_words=num_words)

'/jl.c suj 1'

In [42]:

def validate(df : pd.DataFrame, metric = "accuracy"):
    df = df.copy()
    total = len(df)
    corrects = [0, 0, 0, 0]
    preds = []
    gt = []
    
    for row in tqdm(df.itertuples(index=False)):
        poi, street = row.preds.split('/')
        if (poi == row.POI) and (street == row.street):
            corrects[0] += 1
        if (poi == row.POI):
            corrects[1] += 1
        if (street == row.street):
            corrects[2] += 1
        else:
            preds.append(street)
            gt.append(row.street)

        if ('' == row.street):
            corrects[3] += 1
    false_cases = pd.DataFrame({'gt':gt, 'preds':preds})
    return false_cases, [c * 1.0 / total for c in corrects]

# train_df['preds'] = train_df.progress_apply(lambda row: get_result(row['raw_address']), axis=1)
sample_df = train_df.head(10000).copy()
sample_df['preds'] = sample_df.progress_apply(lambda row: get_result(row['raw_address'],  num_words=num_words), axis=1)
false_df, acc = validate(sample_df)
print(acc) 
#[0.3721466666666667, 0.59503, 0.5920733333333333, 0.23381]


100%|██████████| 10000/10000 [00:11<00:00, 858.05it/s]
10000it [00:00, 965428.47it/s][0.3786, 0.5966, 0.5981, 0.2395]



In [43]:
false_df.to_csv('false.csv',index=False)
false_df.head()

Unnamed: 0,gt,preds
0,,sampurna
1,,yaya
2,,kamp utan jaya
3,,kabupaten
4,,padang


In [44]:
test_df['preds'] = test_df.progress_apply(lambda row: get_result(row['raw_address'], num_words=num_words), axis=1)

100%|██████████| 50000/50000 [00:58<00:00, 860.62it/s]


In [45]:
submission = pd.DataFrame({
    'id':test_df['id'],
    'POI/street':test_df['preds']
    })

submission.to_csv('submission.csv',index=False)

In [None]:
# from spacy.lang.en import English
# from spacy.matcher import PhraseMatcher

# nlp = English()
# matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
# patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
# matcher.add("Names", patterns)

# doc = nlp("angela merkel and us president barack Obama")
# for match_id, start, end in matcher(doc):
#     print("Matched based on lowercase token text:", doc[start:end])