In [1]:
import pandas as pd
import os
import numpy as np
from string_search import *

data_dir = r'C:\Users\ozano\Desktop\senet'
data_path = os.path.join(data_dir, 'results_me.csv')

In [2]:
df = pd.read_csv(data_path, sep = ';')
cols_to_use = ['AD', 'ADRES']

df = df[cols_to_use]
df.shape

(1000, 2)

## Preprocess

In [3]:
for col in df.columns:
    df[col] = df[col].apply(lambda x: preprocess(x))
    df[col] = '#' + df[col] + '#'
df.head(10)

Unnamed: 0,AD,ADRES
0,#mehmet kandemir#,#palazoglu sok n2 sisli#
1,#mehmet halil elceoglu#,#demirtas mah deveoglu sok tikde is mrk no 3...
2,#meryem yildiz#,#yesilkent mah 1807 sk no 233 esenyurt#
3,#metsan sosyal hizmetler limited sirketi#,#orhantepe mah tokdemir sokak no 101 kartal#
4,#mehmet keser#,#baris mah akdeniz cadde beyaz center avm 12...
5,#med egitimdersane yayincilik basimm pazarlama...,#hayat sokak no 30 sariyer#
6,#mehmet ali gedik#,#baskopru mah d100 karayolu cad no 208 serdi...
7,#mehmet nurullah altas#,#bagcilar mah alinak sitesi otopark karsisi ...
8,#mese kimya danismanlik makina san ve tic ltd...,#sepetlipinar mah demokratlar cd no 16 basisk...
9,#mesut tufekcioglu#,#yildiztepe mah 829 sokak no 1924 altindag#


## Get N-Grams

In [4]:
df_ngram = pd.DataFrame()
for col in df.columns:
    df_ngram[col] = df[col].apply(lambda x: get_n_grams(x))

In [5]:
df_ngram.head()

Unnamed: 0,AD,ADRES
0,"[#me, meh, ehm, hme, met, et , ka, kan, and, ...","[#pa, pal, ala, laz, azo, zog, ogl, glu, lu , ..."
1,"[#me, meh, ehm, hme, met, et , ha, hal, ali, ...","[#de, dem, emi, mir, irt, rta, tas, as , ma, ..."
2,"[#me, mer, ery, rye, yem, em , yi, yil, ild, ...","[#ye, yes, esi, sil, ilk, lke, ken, ent, nt , ..."
3,"[#me, met, ets, tsa, san, an , so, sos, osy, ...","[#or, orh, rha, han, ant, nte, tep, epe, pe , ..."
4,"[#me, meh, ehm, hme, met, et , ke, kes, ese, ...","[#ba, bar, ari, ris, is , ma, mah, ah , ak, ..."


## Create Index

In [6]:
import math

def get_n_gram_length(x):
    return max(1, len(x))

n_index_tokens = np.array([4, 7, 10, 15, 20, 30, 50, 70])
labels = ['AD', 'ADRES']

length_data = {}
index_data = {label: [] for label in labels}

for label in labels:
    length_data[label] = df_ngram[label].apply(get_n_gram_length).values

for n_index_token in n_index_tokens:
    for label in labels:
        index_data[label].append(create_ngram_index(df_ngram[label].values, n_index_tokens = n_index_token))

## Search

In [76]:
#%%timeit
# 1 search 10us, full search 400us
from time import time
start_time = time()

input_string = 'palazoglu sok n 2 sisli'
search_person = False

search_label = 'ADRES'#'AD'#'ADRES'

# mehmet kocamanoglu erkan calik
# mehmet caliskan ahmet doger
# metin aydinhusamettin aydin
# mehmet ali emirbayer
# mehmet caliskan

# med egitimdersane yayincilik basimm pazarlama ve d
# mer su urunleri hayvancilik nakliye  pazarlama ithalat ihrac
# mer ihracat

# yildiz mahdogus 1ara skhibrhm eren apk3 d13
# bagcilar mah  alinak sitesi otopark karsisi  no 8 kat 4 baglar
# istoc 3 ada no 56mahmutbey bagcilar istanbul
# istasyon mh19sk 20a etimesgut ankara

input_string = '#' + input_string + '#'
input_n_grams = get_n_grams(input_string)
input_ngram_count = len(input_n_grams)

index_size = len(n_index_tokens) - 1
if not search_person:
    for i, n_index_token in enumerate(n_index_tokens[:-1]):
        if input_ngram_count <= n_index_token:
            index_size = i
            break

print(f'Input n_gram count: {input_ngram_count}')
print(f'Searching with index size: {n_index_tokens[index_size]}')

matches = get_matches(input_n_grams, index_data[search_label][index_size], length_data[search_label])
match_bins = bin_matches(matches, get_sorted = False)
match_values = get_match_values(match_bins, df[search_label].values)

while not search_person and len(match_values[1.0]) == 0 and len(match_values[0.8]) == 0 and index_size < len(n_index_tokens) - 1:

    index_size = index_size + 1
    print(f'Searching with index size: {n_index_tokens[index_size]}')
    matches = get_matches(input_n_grams, index_data[search_label][index_size], length_data[search_label])
    match_bins = bin_matches(matches, get_sorted = False)
    match_values = get_match_values(match_bins, df[search_label].values)

print('Time: {:.3f} ms'.format(time() - start_time))

Input n_gram count: 13
Searching with index size: 15
Time: 0.001 ms


## Get values

In [77]:
match_values[1.0]

[]

In [78]:
match_values[0.8]

[(137,
  '#mer su urunleri hayvancilik nakliye  pazarlama ithalat ihrac#',
  0.9230769230769231)]

In [79]:
match_values[0.6]

[]

In [80]:
match_values[0.4][:20] # Shows first n

[]

In [82]:
match_values[0.0][:20] # Shows first n

[(0, '#mehmet kandemir#', 0.07692307692307693),
 (1, '#mehmet halil elceoglu#', 0.07692307692307693),
 (2, '#meryem yildiz#', 0.15384615384615385),
 (3, '#metsan sosyal hizmetler limited sirketi#', 0.07692307692307693),
 (4, '#mehmet keser#', 0.07692307692307693),
 (5,
  '#med egitimdersane yayincilik basimm pazarlama ve d#',
  0.07692307692307693),
 (6, '#mehmet ali gedik#', 0.07692307692307693),
 (7, '#mehmet nurullah altas#', 0.15384615384615385),
 (8,
  '#mese kimya danismanlik makina san  ve tic ltd sti#',
  0.07692307692307693),
 (9, '#mesut tufekcioglu#', 0.07692307692307693),
 (10, '#mehmet zeki dogan#', 0.07692307692307693),
 (11, '#mehmet nuri alim#', 0.07692307692307693),
 (12, '#mehmet duran#', 0.07692307692307693),
 (13, '#mehmet serkan konur#', 0.07692307692307693),
 (14, '#mesut idil#', 0.07692307692307693),
 (15,
  '#medar dent dis protez laboratuari san ve tic ltd s#',
  0.07692307692307693),
 (16, '#metin buyuksoy#', 0.07692307692307693),
 (17, '#mehmet yaylaci#', 0.0