In [None]:
import pandas as pd
import numpy as np
from sklearn.externals import joblib
import re
import matplotlib.pyplot as plt

%matplotlib notebook

In [None]:
def combine_events(df):
    """Build candidate pairs of events from different sources that start on the same day.

    The frame is self-joined on the calendar date (year, month, day) of
    ``start_time``; pairs whose two sides share the same ``src`` are discarded
    and each unordered pair is kept once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime-like ``start_time`` column plus ``src`` and
        ``id``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per unordered cross-source pair. Non-key columns of the two
        events carry the ``_x`` / ``_y`` suffixes; ``year``, ``month`` and
        ``day`` hold the shared date and ``id`` is an order-independent key of
        the pair. The index is reset to 0..n-1.
    """
    # Derive the join keys on a copy so the caller's frame does not silently
    # gain 'month' / 'year' / 'day' columns.
    start = df['start_time']
    keyed = df.assign(month=start.dt.month, year=start.dt.year, day=start.dt.day)

    df_merged = keyed.merge(keyed, how='inner', on=['year', 'month', 'day'])
    # .copy(): the boolean filter returns a slice; copying makes the column
    # assignment below unambiguous (no SettingWithCopyWarning).
    df_merged = df_merged[df_merged['src_x'] != df_merged['src_y']].copy()

    # The self-join yields both (a, b) and (b, a). Concatenating the two
    # "src + id" labels in sorted order gives both rows the same key, so
    # dropping duplicates keeps the first orientation encountered.
    l = df_merged['src_x'] + df_merged['id_x'].astype(str)
    r = df_merged['src_y'] + df_merged['id_y'].astype(str)
    df_merged['id'] = np.where(l < r, l + r, r + l)

    df_merged = df_merged.drop_duplicates(subset='id')

    return df_merged.reset_index(drop=True)

In [None]:
def generate_labels(df_merged, random_state=0, balanced=True):
    """Label event pairs as duplicates (1) or not (0) using exact-match heuristics.

    A pair is marked positive when any of these holds:

    * ``place_names`` - same non-empty place name (case-insensitive) and
      identical start time,
    * ``names`` - same event name (case-insensitive),
    * ``facebook_id`` - same Facebook id on both sides.

    The rule that fired is stored in the ``condition`` column; when several
    rules match, the last one in the list above wins.

    Note: the ``target`` and ``condition`` columns are written into the passed
    frame itself (existing behaviour, kept for backward compatibility).

    Parameters
    ----------
    df_merged : pandas.DataFrame
        Pair frame with ``place_name_*``, ``start_time_*``, ``name_*`` and
        ``facebook_id_*`` columns for the ``_x`` and ``_y`` sides.
    random_state : int, default 0
        Seed used when sampling negative pairs.
    balanced : bool, default True
        If true, return all positives plus a random sample of negatives of the
        same size (or every negative when fewer are available).

    Returns
    -------
    pandas.DataFrame
        Labeled pairs with a fresh 0..n-1 index.
    """
    df_merged['target'] = 0

    same_place = (
        (df_merged['place_name_x'].str.lower() == df_merged['place_name_y'].str.lower())
        & (df_merged['place_name_x'] != '')
        & (df_merged['start_time_x'] == df_merged['start_time_y'])
    )
    same_name = df_merged['name_x'].str.lower() == df_merged['name_y'].str.lower()
    same_facebook_id = df_merged['facebook_id_x'] == df_merged['facebook_id_y']

    # Applied in this order on purpose: a later rule overwrites the
    # 'condition' tag written by an earlier one.
    rules = (
        ('place_names', same_place),
        ('names', same_name),
        ('facebook_id', same_facebook_id),
    )
    for condition, mask in rules:
        df_merged.loc[mask, 'target'] = 1
        df_merged.loc[mask, 'condition'] = condition

    if balanced:
        positives = df_merged[df_merged['target'] == 1]
        negatives = df_merged[df_merged['target'] == 0]
        # Cap the sample size: asking for more rows than exist makes
        # DataFrame.sample raise.
        n = min(len(positives), len(negatives))
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        df_merged = pd.concat(
            [positives, negatives.sample(n=n, random_state=random_state)],
            sort=False,
        )

    return df_merged.reset_index(drop=True)

In [None]:
def _trigrams(text):
    """Return every overlapping 3-character window of ``text`` (empty if shorter than 3)."""
    return [text[i:i + 3] for i in range(len(text) - 2)]


def _initials(text):
    """Return the first character of each whitespace-separated token of ``text``."""
    return [token[0] for token in text.split()]


def _overlap(items_x, items_y, smoothing=0):
    """Dice-style overlap: 2 * |distinct shared items| / (smoothing + len(x) + len(y)).

    Returns 0.0 when the denominator is zero (both inputs empty and no
    smoothing) instead of raising ZeroDivisionError.
    """
    denominator = smoothing + len(items_x) + len(items_y)
    if denominator == 0:
        return 0.0
    return 2 * len(set(items_x).intersection(items_y)) / denominator


def generate_features(name_x, name_y, description_x, description_y, start_time_x, start_time_y, place_name_x, place_name_y, street_x, street_y):
    """Compute similarity features for one pair of events.

    Parameters
    ----------
    name_x, name_y : str
        Event names.
    description_x, description_y : str
        Event descriptions (used case-sensitively).
    start_time_x, start_time_y : datetime-like
        Start times; their difference must expose ``total_seconds()``.
    place_name_x, place_name_y : str
        Venue names.
    street_x, street_y : str
        Street addresses.

    Returns
    -------
    dict
        Feature name -> value. Key order is fixed because it becomes the
        column order of the model's feature matrix. Overlap features whose
        inputs are both empty (e.g. two empty place names, or two names
        shorter than three characters for the trigram feature) are 0.0.
    """
    from nltk.metrics import edit_distance

    X = {}

    name_x = name_x.lower()
    name_y = name_y.lower()
    place_name_x = place_name_x.lower()
    place_name_y = place_name_y.lower()
    street_x = street_x.lower()
    street_y = street_y.lower()

    # Absolute start-time difference in hours.
    X['time_diff'] = abs((start_time_x - start_time_y).total_seconds() / 3600)

    # Two consecutive words that each start with an uppercase letter.
    # Descriptions are deliberately not lower-cased: the pattern relies on case.
    collocation_pattern = r'([A-Z]+\w*\.? [A-Z]+\w+)'
    collocations_x = re.findall(collocation_pattern, description_x)
    collocations_y = re.findall(collocation_pattern, description_y)
    # Numerator counts collocations of x that occur anywhere in description_y
    # (substring test, repeats included); the +1 keeps the ratio defined when
    # neither description contains a collocation.
    shared_collocations = sum(1 for collocation in collocations_x if collocation in description_y)
    X['coll_sim'] = 2 * shared_collocations / (1 + len(collocations_x) + len(collocations_y))

    X['name_equality'] = name_x == name_y
    X['name_intersect'] = name_x in name_y or name_y in name_x
    X['name_levensthein'] = edit_distance(name_x, name_y)
    X['name_common_words'] = _overlap(name_x.split(), name_y.split())
    X['name_trigrams'] = _overlap(_trigrams(name_x), _trigrams(name_y))
    X['name_first_letters'] = _overlap(_initials(name_x), _initials(name_y))

    X['place_name_trigrams'] = _overlap(_trigrams(place_name_x), _trigrams(place_name_y))
    X['place_name_first_letters'] = _overlap(_initials(place_name_x), _initials(place_name_y))

    # The street feature keeps its original +1 in the denominator.
    X['street_trigrams'] = _overlap(_trigrams(street_x), _trigrams(street_y), smoothing=1)
    return X

##### Deserializacja danych

In [None]:
# Load the per-source event frames and their concatenation from disk.
# joblib.load unpickles its input, which can execute arbitrary code: only load
# files produced by this project.
# NOTE(review): in the cells visible here only `fb` and `concatenation` are
# referenced later; `ss`, `cjg`, `sk` and `um` appear unused - confirm.
fb = joblib.load('../pickles/fb.pkl')
ss = joblib.load('../pickles/ss.pkl')
cjg = joblib.load('../pickles/cjg.pkl')
sk = joblib.load('../pickles/sk.pkl')
um = joblib.load('../pickles/um.pkl')
concatenation = joblib.load('../pickles/concatenation.pkl')

##### Kombinacja eventów

In [None]:
# Candidate pairs: events from different sources that start on the same calendar day.
combination = combine_events(concatenation)

##### Utworzenie zbioru uczącego

In [None]:
# Heuristically labeled pairs used as the training set (defaults: balanced=True,
# random_state=0). Note that generate_labels also writes the 'target' and
# 'condition' columns into `combination` itself.
train_pairs = generate_labels(combination)

In [None]:
# Inspect which columns the pair frame carries at this point.
combination.columns

##### Trenowanie modelu

In [None]:
# Feature matrix for the labeled pairs: one dict of features per row,
# expanded into one column per feature.
X = train_pairs.apply(
    lambda row: generate_features(
        name_x=row.name_x,
        name_y=row.name_y,
        description_x=row.description_x,
        description_y=row.description_y,
        start_time_x=row.start_time_x,
        start_time_y=row.start_time_y,
        place_name_x=row.place_name_x,
        place_name_y=row.place_name_y,
        street_x=row.street_x,
        street_y=row.street_y,
    ),
    axis=1,
)
X = pd.DataFrame(X.tolist())
y = train_pairs['target']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labeled pairs for evaluation; stratify=y keeps the class
# ratio equal in both parts and random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest (100 trees, depth <= 5) with a fixed seed.
clf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
clf.fit(X_train, y_train)

# Mean accuracy on the training part and on the held-out part.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Train: %f, test: %f' % (train_accuracy, test_accuracy))

##### Metryki modelu

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out pairs.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Feature importances of the fitted forest, rounded to 3 decimals and
# listed from most to least important.
importances = (
    pd.DataFrame({
        'feature': X.columns,
        'importance': np.round(clf.feature_importances_, 3),
    })
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances

In [None]:
# Close the current figure first so the bar chart is drawn on a fresh one.
plt.close()
importances.plot.bar()
# Keep the rotated feature labels inside the figure area.
plt.tight_layout()

##### Ponowne wytrenowanie modelu na pełnym zbiorze i serializacja

In [None]:
# Refit the same classifier on all labeled pairs (train + test parts) before saving it.
clf.fit(X,y)

In [None]:
# Persist the refitted model.
joblib.dump(clf, '../models/events_pairs.pkl')

##### Wygenerowanie cech dla wszystkich kombinacji

In [None]:
# Feature matrix for every candidate pair (not only the labeled sample).
# This rebinds X, which until now held the training features.
X = combination.apply(
    lambda row: generate_features(
        name_x=row.name_x,
        name_y=row.name_y,
        description_x=row.description_x,
        description_y=row.description_y,
        start_time_x=row.start_time_x,
        start_time_y=row.start_time_y,
        place_name_x=row.place_name_x,
        place_name_y=row.place_name_y,
        street_x=row.street_x,
        street_y=row.street_y,
    ),
    axis=1,
)
X = pd.DataFrame(X.tolist())

##### Serializacja cech

In [None]:
# Cache the full feature matrix on disk.
joblib.dump(X, '../pickles/X.pkl')
# To reuse previously saved features instead of regenerating them, run:
# X = joblib.load('../pickles/X.pkl')

##### Predykcja

In [None]:
# Hard label and probability of the second class (column 1 of predict_proba)
# for every candidate pair; both are stored as new columns on `combination`.
combination['pred'] = clf.predict(X)
combination['pred_prob'] = clf.predict_proba(X)[:,1]

In [None]:
# Distribution of the predicted probabilities over all candidate pairs.
plt.close()
plt.hist(combination['pred_prob'])

In [None]:
# Keep only pairs predicted as duplicates.
pairs = combination[combination['pred'] == 1]
# Within each (src_x, src_y) combination keep, for every x-event, only its
# highest-probability match ...
pairs = pairs.sort_values('pred_prob', ascending=False).groupby(['src_x', 'src_y', 'id_x']).head(1)
# ... and then, for every y-event, only its highest-probability remaining match.
pairs = pairs.sort_values('pred_prob', ascending=False).groupby(['src_x', 'src_y', 'id_y']).head(1)
# Number of retained pairs per source of the x-side event.
pairs['src_x'].value_counts()

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; it recomputes
# `pairs` from `combination` with the same steps - confirm whether it can be removed.
pairs = combination[combination['pred'] == 1]
pairs = pairs.sort_values('pred_prob', ascending=False).groupby(['src_x', 'src_y', 'id_x']).head(1)
pairs = pairs.sort_values('pred_prob', ascending=False).groupby(['src_x', 'src_y', 'id_y']).head(1)
pairs['src_x'].value_counts()

##### Zapisanie par do pliku oraz serializacja

In [None]:
# Columns written for every retained pair: the x-side event, the y-side event,
# then the predicted probability.
pair_columns = [
    'id_x', 'facebook_id_x', 'place_name_x', 'street_x', 'start_time_x',
    'name_x', 'description_x', 'src_x',
    'id_y', 'facebook_id_y', 'place_name_y', 'street_y', 'start_time_y',
    'name_y', 'description_y', 'src_y',
    'pred_prob',
]
pairs_sorted = pairs[pair_columns].sort_values(by='pred_prob', ascending=False)
pairs_sorted.to_csv('../output/pairs.csv', sep=',', float_format='%.3f', index=False)

In [None]:
# Persist the retained pairs and the full scored candidate frame.
joblib.dump(pairs, '../pickles/pairs.pkl')
joblib.dump(combination, '../pickles/combination.pkl')

In [None]:
# Ids and sources of the retained pairs, all cast to str.
pairs_id = pairs[['id_x','id_y','src_x','src_y']].astype(str)
# Start the events table from the 'fb' column of the Facebook frame.
# NOTE(review): the first merge below joins on 'fb' against str ids from
# pairs_id, so fb['fb'] presumably has to hold strings as well - confirm.
df_tmp = fb[['fb']]
# Cast ids to str in place so they can be joined with the str ids above.
concatenation['id'] = concatenation['id'].astype(str)

from itertools import combinations

sources = ['fb','ss','cjg','sk','um']
# For every source pair (in the order of `sources`) take the matched id pairs,
# rename id_x / id_y to the two source names and outer-merge them into the table.
# No `on=` is given, so pandas joins on whichever columns both frames share at
# that point (one column for pairs involving a not-yet-seen source, two later on).
for pair in combinations(sources,2):
    df_tmp = df_tmp.merge(pairs_id[(pairs_id['src_x'] == pair[0]) & (pairs_id['src_y'] == pair[1])].rename(columns = {'id_x': pair[0], 'id_y': pair[1]})[[pair[0], pair[1]]], how = 'outer')
    
# Attach each source's event details by matching that source's id column.
# The outer join also adds events of the source that took part in no pair.
# Columns clashing with ones already present get the '_<src>' suffix; the
# first source merged ('fb') therefore keeps the unsuffixed names.
for src in sources:
    t = concatenation[concatenation['src'] == src][['id', 'facebook_id','place_name', 'street', 'start_time', 'name', 'description','lat','lng']]
    df_tmp = df_tmp.merge(t, how='outer', left_on=src, right_on='id', suffixes=['','_' + src])
    
# Consolidated ("_std") location fields: start from the unsuffixed value and
# fill the gaps from the remaining sources, in the order given by `sources`.
for field in ['place_name', 'street', 'lat', 'lng']:
    consolidated = df_tmp[field]
    for src in sources[1:]:
        consolidated = consolidated.fillna(df_tmp[field + '_' + src])
    df_tmp[field + '_std'] = consolidated

In [None]:
# Column layout of the exported events table, grouped by meaning.
export_columns = [
    # per-source event ids
    'fb', 'ss', 'cjg', 'sk', 'um',
    'facebook_id', 'id_cjg', 'id_sk', 'id_ss', 'id_um',
    # details as reported by each source
    'name', 'name_ss', 'name_cjg', 'name_sk', 'name_um',
    'place_name', 'place_name_ss', 'place_name_cjg', 'place_name_sk', 'place_name_um',
    'street', 'street_ss', 'street_cjg', 'street_sk', 'street_um',
    'start_time', 'start_time_ss', 'start_time_cjg', 'start_time_sk', 'start_time_um',
    'description', 'description_ss', 'description_cjg', 'description_sk', 'description_um',
    # consolidated location fields
    'place_name_std', 'street_std', 'lat_std', 'lng_std',
]
df_tmp[export_columns].to_csv('../output/events_database.csv')

joblib.dump(df_tmp, '../pickles/events_database.pkl')