In [401]:
import numpy as np
import pandas as pd
import usaddress as ad
from vincenty import vincenty
from jellyfish import levenshtein_distance, jaro_winkler

In [402]:
df_locu_train = pd.read_json('locu_train.json').replace([None], [''])
df_foursquare_train = pd.read_json('foursquare_train.json').replace([None], [''])
df_locu_test = pd.read_json('locu_test.json').replace([None], [''])
df_foursquare_test = pd.read_json('foursquare_test.json').replace([None], [''])
df_matches = pd.read_csv('matches_train.csv')

In [202]:
list_feature = ['id','latitude', 'longitude', 'name', 'phone', 'postal_code', 'street_address', 'website']

In [203]:
locu_train = df_locu_train[list_feature]
four_train = df_foursquare_train[list_feature]

In [396]:
four_train.shape

(600, 9)

### Str cleaning

In [204]:
list_remove = ['!',',','(',')','?','.', '\'', '/','\"','-',' ']
def str_cleaning(i):
    i = str(i)
    i = i.lower()
    for j in list_remove:
        if j in i:
            i = i.translate({ord(x): '' for x in list_remove})
    return i

def phone_cleaning(i):
    if i != '':
        first_half, second_half = i.split(' ')
        first = first_half.split('(')[1].split(')')[0]
        second, third = second_half.split('-')
        phone_number = first + second + third
    else:
        phone_number = ''
    return phone_number

def website_cleaning(string):
    for i in ['.com', '.net', '.org']:
        string = string.split(i)[0]
    L = ['http://', 'www.']
    for i in L:
        string = string.replace(i, "")
    return string

In [205]:
def cleaning_locu(df):
    df['name'] = df['name'].apply(str_cleaning)
    df['website'] = df['website'].apply(website_cleaning).apply(str_cleaning)
    df['street_address_norm'] = df['street_address'].apply(str_cleaning)
    return df
def cleaning_four(df):
    df['name'] = df['name'].apply(str_cleaning)
    df['phone'] = df['phone'].apply(phone_cleaning)
    df['website'] = df['website'].apply(website_cleaning).apply(str_cleaning)
    df['street_address_norm'] = df['street_address'].apply(str_cleaning)
    return df

In [206]:
locu_train = cleaning_locu(locu_train)
four_train = cleaning_four(four_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

### Address

In [207]:
for i in df_locu.columns:
    print(i, 100 - df_locu[i].isnull().sum()/len(df_locu[i])*100)

AddressNumber 91.1371237458
StreetNamePreDirectional 100.0
StreetName 100.0
StreetNamePostType 100.0
id 100.0


In [208]:
def get_address(df):
    list_add = ['AddressNumber', 'StreetNamePreDirectional', 'StreetName', 'StreetNamePostType']
    L = []
    for index, row in df.iterrows():
        try:
            dic = ad.tag(row['street_address'])[0]
        except ad.RepeatedLabelError as e:
            continue    
        dic['id'] = row['id']
        L.append(dic)
    df_result = pd.DataFrame(L)
    
    df_result = df_result[list_add + ['id']]
    df_result['StreetName'] = df_result['StreetName'].apply(str_cleaning).replace("nan","")
    df_result['AddressNumber'] = df_result['AddressNumber'].replace("NaN","")
    df_result['StreetNamePreDirectional'] = df_result['StreetNamePreDirectional'].apply(str_cleaning).replace(['e', 'w', 'n', 's', 'sw', 'se', 'nw', 'ne'], 
                                                                                                          ['east', 'west', 'north', 'south', 'southwest', 'southeast', 'northwest', 'northeast'])
    df_result['StreetNamePostType'] = df_result['StreetNamePostType'].apply(str_cleaning).replace(['st', 'ave', 'plz', 'pl', 'sq', 'blvd', 'pkwy', 'ln', 'riv'], 
                                                                                              ['street', 'avenue', 'plaza', 'place', 'square', 'boulevard', 'parkways', 'lane', 'river'])
    return df_result

In [209]:
df_locu = get_address(locu_train)
df_four = get_address(four_train)
locu = pd.merge(df_locu, locu_train, on='id', how='outer')
four = pd.merge(df_four, four_train, on='id', how='outer')

### All possible matches

In [210]:
df1 = locu
df1['key'] = 0
df2 = four
df2['key'] = 0
all_in = pd.merge(df1, df2, on='key')
all_in = all_in.fillna('')

In [211]:
L_matches = []
for index, row in df_matches.iterrows():
    L_matches.append([row['locu_id'], row['foursquare_id']])

In [212]:
l_match = []
for index, row in all_in.iterrows():
    if [row['id_x'],row['id_y']] in L_matches:
        l_match.append(1)
    else:
        l_match.append(0)

In [213]:
all_in['is_match'] = l_match

In [214]:
def compute_distances(field_x, field_y, row):
    if (row[field_x] != '') | (row[field_y] != ''):
        leven_distance = levenshtein_distance(row[field_x], row[field_y])
        jw_distance = jaro_winkler(row[field_x], row[field_y])
    else:
        leven_distance = np.NaN
        jw_distance = np.NaN
    return leven_distance, jw_distance

def check_equality(field_x, field_y, row):
    if (row[field_x] == row[field_y]) & (row[field_x] != ''):
        value = 1
    elif (row[field_x] == '') | (row[field_y] == ''):
        value = np.NaN
    else:
        value = 0
    return value

In [215]:
def feature_creation(df):
    
    dist = []
    leven_phone = []
    jw_phone = []
    leven_name = []
    jw_name = []
    leven_street_name = []
    jw_street_name = []
    leven_address = []
    jw_address = []
    same_postal_code = []
    same_street_number = []
    same_address = []
    same_website = []
    same_phone = []
    same_name = []
    same_street_name = []
    

    for index, row in df.iterrows():
    
        if (row['latitude_x'] != '') & (row['longitude_x'] != '') & (row['latitude_y'] != '') & (row['longitude_y'] != ''): 
            distance = vincenty((row['latitude_x'],row['longitude_x']), (row['latitude_y'],row['longitude_y'])) 
        else:
            distance = np.NaN
        dist.append(distance)
    
        leven_distance, jw_distance = compute_distances('phone_x', 'phone_y', row)
        leven_phone.append(leven_distance)
        jw_phone.append(jw_distance)
        
        leven_distance, jw_distance = compute_distances('name_x', 'name_y', row)
        leven_name.append(leven_distance)
        jw_name.append(jw_distance)
    
        leven_distance, jw_distance = compute_distances('StreetName_x', 'StreetName_y', row)
        leven_street_name.append(leven_distance)
        jw_street_name.append(jw_distance)
        
        leven_distance, jw_distance = compute_distances('street_address_norm_x', 'street_address_norm_y', row)
        leven_address.append(leven_distance)
        jw_address.append(jw_distance)
    
        value = check_equality('postal_code_x', 'postal_code_y', row)
        same_postal_code.append(value)
    
        value = check_equality('AddressNumber_x', 'AddressNumber_y', row)
        same_street_number.append(value)
    
        value = check_equality('street_address_norm_x', 'street_address_norm_y', row)
        same_address.append(value)
        
        value = check_equality('website_x', 'website_y', row)
        same_website.append(value)
        
        value = check_equality('phone_x', 'phone_y', row)
        same_phone.append(value)
        
        value = check_equality('name_x', 'name_y', row)
        same_name.append(value)
        
        value = check_equality('StreetName_x', 'StreetName_y', row)
        same_street_name.append(value)
    
    df['dist'] = dist
    df['leven_phone'] = leven_phone
    df['jw_phone'] = jw_phone
    df['leven_name'] = leven_name
    df['jw_name'] = jw_name
    df['leven_street_name'] = leven_street_name
    df['jw_street_name'] = jw_street_name
    df['leven_address'] = leven_address
    df['jw_address'] = jw_address
    df['same_postal_code'] = same_postal_code
    df['same_street_number'] = same_street_number
    df['same_address'] = same_address
    df['same_website'] = same_website
    df['same_phone'] = same_phone
    df['same_name'] = same_name
    df['same_street_name'] = same_street_name
    
    return df

In [216]:
%%time
all_in = feature_creation(all_in)

CPU times: user 7min 30s, sys: 3.29 s, total: 7min 33s
Wall time: 7min 46s


In [142]:
all_in[all_in['is_match']==1]['dist'].mean()

0.19522644166666667

In [217]:
all_in_filter = all_in[(all_in['dist'] <= 1) 
       | (all_in['same_website'] == 1)
       | (all_in['same_street_name'] == 1)
       | (all_in['same_postal_code'] == 1)
       | (all_in['same_street_number'] == 1)
       | (all_in['same_address'] == 1)
       | (all_in['same_phone'] == 1)
       | (all_in['same_name'] == 1)]

In [218]:
features = ['dist', 
            'leven_phone', 'jw_phone',
            'leven_name', 'jw_name', 
            'leven_street_name', 'jw_street_name', 
            'leven_address', 'jw_address',
            'same_postal_code','same_street_number', 
            'same_address', 'same_website',
            'same_phone', 'same_name', 'same_street_name']

In [80]:
all_in[features].dtypes

dist                  float64
leven_phone           float64
jw_phone              float64
leven_name              int64
jw_name               float64
leven_street_name     float64
jw_street_name        float64
leven_address         float64
jw_address            float64
same_postal_code      float64
same_street_number    float64
same_address          float64
same_website          float64
dtype: object

In [262]:
%%time
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import Imputer

X = all_in_filter[features]
Y = all_in_filter['is_match']

impt_train = Imputer()
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y)
X_train_impute = impt_train.fit_transform(X_train)
X_test_impute = impt_train.transform(X_test)
sm = SMOTE(random_state=12, ratio = 0.04)
X_train_res, Y_train_res = sm.fit_sample(X_train_impute, Y_train)

CPU times: user 45.6 ms, sys: 4.99 ms, total: 50.6 ms
Wall time: 47.5 ms




In [263]:
Y_train.sum()/len(Y_train)

0.01029615918556224

In [264]:
Y_train_res.sum()/len(Y_train_res)

0.038439923569742608

In [272]:
%%time
from xgboost import XGBClassifier
xgb_regular = XGBClassifier()
xgb_imputed = XGBClassifier()
xgb_smoted = XGBClassifier()
xgb_regular.fit(X_train, Y_train)
xgb_imputed.fit(X_train_impute, Y_train)
xgb_smoted.fit(X_train_res, Y_train_res)
print('Score with X_test \n')
print(xgb_regular.score(X_test, Y_test))
print(xgb_imputed.score(X_test.as_matrix(), Y_test))
print(xgb_smoted.score(X_test.as_matrix(), Y_test))
print('')
print('Score with X_test_impute \n')
print(xgb_imputed.score(X_test_impute, Y_test))
print(xgb_smoted.score(X_test_impute, Y_test))

Score with X_test 

0.9996529786
0.9995373048
0.998033545402

Score with X_test_impute 

0.9996529786
0.9996529786
CPU times: user 3.84 s, sys: 14.5 ms, total: 3.85 s
Wall time: 3.85 s


In [273]:
from sklearn.metrics import confusion_matrix, classification_report

In [274]:
M_regular = confusion_matrix(Y_test, xgb_regular.predict(X_test))
print(M_)
print(classification_report(Y_test, xgb_regular.predict(X_test)))
M_imputed = confusion_matrix(Y_test, xgb_imputed.predict(X_test_impute))
print(M_imputed)
print(classification_report(Y_test, xgb_imputed.predict(X_test_impute)))
M_smoted = confusion_matrix(Y_test, xgb_smoted.predict(X_test_impute))
print(M_smoted)
print(classification_report(Y_test, xgb_smoted.predict(X_test_impute)))

[[8556    0]
 [   3   86]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      8556
          1       1.00      0.97      0.98        89

avg / total       1.00      1.00      1.00      8645

[[8556    0]
 [   3   86]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      8556
          1       1.00      0.97      0.98        89

avg / total       1.00      1.00      1.00      8645

[[8556    0]
 [   3   86]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00      8556
          1       1.00      0.97      0.98        89

avg / total       1.00      1.00      1.00      8645



### Test set

In [275]:
locu_test = df_locu_test[list_feature]
four_test = df_foursquare_test[list_feature]
locu_test = cleaning_locu(locu_test)
four_test = cleaning_four(four_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [276]:
df_locu_test = get_address(locu_test)
df_four_test = get_address(four_test)
locutest = pd.merge(df_locu_test, locu_test, on='id', how='outer')
fourtest = pd.merge(df_four_test, four_test, on='id', how='outer')

In [277]:
df1 = locutest
df1['key'] = 0
df2 = fourtest
df2['key'] = 0
all_in_test = pd.merge(df1, df2, on='key')
all_in_test = all_in_test.fillna('')

In [278]:
%%time
all_in_test = feature_creation(all_in_test)

CPU times: user 3min 6s, sys: 786 ms, total: 3min 7s
Wall time: 3min 9s


In [279]:
features = ['dist', 
            'leven_phone', 'jw_phone',
            'leven_name', 'jw_name', 
            'leven_street_name', 'jw_street_name', 
            'leven_address', 'jw_address',
            'same_postal_code','same_street_number', 
            'same_address', 'same_website',
            'same_phone', 'same_name', 'same_street_name']

In [324]:
all_in_test_filter = all_in_test[(all_in_test['dist'] <= 1) 
       | (all_in_test['same_website'] == 1)
       | (all_in_test['same_street_name'] == 1)
       | (all_in_test['same_postal_code'] == 1)
       | (all_in_test['same_street_number'] == 1)
       | (all_in_test['same_address'] == 1)
       | (all_in_test['same_phone'] == 1)
       | (all_in_test['same_name'] == 1)].reset_index(drop=True)

In [325]:
all_in_test_filter

Unnamed: 0,AddressNumber_x,StreetNamePreDirectional_x,StreetName_x,StreetNamePostType_x,id_x,latitude_x,longitude_x,name_x,phone_x,postal_code_x,...,jw_street_name,leven_address,jw_address,same_postal_code,same_street_number,same_address,same_website,same_phone,same_name,same_street_name
0,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,0.000000,9.0,0.000000,,,,,,0,
1,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,0.916667,8.0,0.614815,0.0,0.0,0.0,0.0,0.0,0,0.0
2,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,1.000000,2.0,0.884259,0.0,0.0,0.0,0.0,0.0,0,1.0
3,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,0.722222,9.0,0.544444,,0.0,0.0,,0.0,0,0.0
4,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,1.000000,3.0,0.805556,0.0,0.0,0.0,0.0,0.0,0,1.0
5,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,1.000000,7.0,0.555556,,0.0,0.0,,0.0,0,1.0
6,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,0.722222,6.0,0.611111,0.0,,0.0,,,0,0.0
7,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,0.722222,7.0,0.629630,1.0,0.0,0.0,,,0,0.0
8,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,0.000000,9.0,0.407407,1.0,0.0,0.0,,,0,0.0
9,570,,9th,avenue,b48da849c54f904013e2,40.758,-73.9927,pandarestaurant,2126950836,10036,...,0.777778,3.0,0.777778,0.0,0.0,0.0,,,0,0.0


In [281]:
X_train = all_in_filter[features]
Y_train = all_in_filter['is_match']
X_test = all_in_test_filter[features]

In [329]:
impt = Imputer()
X_train_impute = impt.fit_transform(X_train)
X_test_impute = impt.transform(X_test)
sm = SMOTE(random_state=12, ratio = 0.03)
X_train_res, Y_train_res = sm.fit_sample(X_train_impute, Y_train)



In [330]:
Y_train_res.sum()/len(Y_train_res)

0.029108860328538599

In [331]:
%%time
from xgboost import XGBClassifier
xgb_regular = XGBClassifier(max_depth=10, n_estimators=300)
xgb_imputed = XGBClassifier(max_depth=10, n_estimators=300)
xgb_smoted = XGBClassifier(max_depth=10, n_estimators=300)
xgb_regular.fit(X_train, Y_train)
xgb_imputed.fit(X_train_impute, Y_train)
xgb_smoted.fit(X_train_res, Y_train_res)
Y_pred = xgb_regular.predict(X_test)
Y_pred_imputed = xgb_imputed.predict(X_test_impute)
Y_pred_smoted = xgb_smoted.predict(X_test_impute)

CPU times: user 20.1 s, sys: 97.5 ms, total: 20.2 s
Wall time: 20.5 s


In [332]:
print(Y_pred.sum())
print(Y_pred_imputed.sum())
print(Y_pred_smoted.sum())

231
231
232


In [318]:
df_matches_test = all_in_test_filter[Y_pred_smoted == 1][['id_x', 'id_y']].reset_index(drop=True)

In [319]:
df_matches_test.rename(columns={'id_x': 'locu_id', 'id_y': 'foursquare_id'}, inplace=True)

In [320]:
df_matches_test.to_csv('matches_test_smote_boost.csv', index=False)

In [389]:
L = []
dico = xgb_smoted.booster().get_score(importance_type='weight')
for key, value in dico.items():
    L.append([int(key.split('f')[1]), value])
print(L)
from operator import itemgetter
L_ = sorted(L, key=itemgetter(1), reverse=True)
print(L_)
L_ = [x[0] for x in L_]
print(L_)

[[4, 350], [13, 83], [12, 5], [0, 343], [15, 53], [1, 51], [11, 62], [7, 199], [3, 174], [9, 41], [10, 58], [8, 15], [2, 4], [6, 38], [5, 71]]
[[4, 350], [0, 343], [7, 199], [3, 174], [13, 83], [5, 71], [11, 62], [10, 58], [15, 53], [1, 51], [9, 41], [6, 38], [8, 15], [12, 5], [2, 4]]
[4, 0, 7, 3, 13, 5, 11, 10, 15, 1, 9, 6, 8, 12, 2]


In [392]:
L_

[4, 0, 7, 3, 13, 5, 11, 10, 15, 1, 9, 6, 8, 12, 2]

In [390]:
list(X_train.columns[L_])

['jw_name',
 'dist',
 'leven_address',
 'leven_name',
 'same_phone',
 'leven_street_name',
 'same_address',
 'same_street_number',
 'same_street_name',
 'leven_phone',
 'same_postal_code',
 'jw_street_name',
 'jw_address',
 'same_website',
 'jw_phone']

In [394]:
X_train['same_phone'].sum()

229.0

In [398]:
four_test.shape

(400, 9)

In [399]:
400*400

160000

In [400]:
df_locu_test

Unnamed: 0,AddressNumber,StreetNamePreDirectional,StreetName,StreetNamePostType,id
0,570,,9th,avenue,b48da849c54f904013e2
1,321,east,houston,street,95ad783fd1c65bb8fdbf
2,329,,bowery,,5060d123ccad77923b20
3,2253,,3rd,avenue,9dd6f6b177096efd5da4
4,316,,bowery,,4773c30d2df4368c0d09
5,538,,6th,avenue,206c363a5907bfa98ec0
6,1st,,,avenue,cb95d1e0730222cc3209
7,356,west,58th,street,ecdc736a7d663a46e01d
8,600,,eighth,avenue,25ca87e725b930488ed6
9,,,,,4f9710321455164d1cb4
