In [1]:
import pandas as pd
import numpy as np
import datetime
import time
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
from random import choices, sample
from gensim.models import Word2Vec
import gc
pd.set_option('display.float_format',lambda x : '%.2f' % x)
from math import log10

In [2]:
# Load the pickled result object; it is sliced by position and passed to
# pd.concat below, so it is presumably a list of DataFrames (one per day,
# judging by the file name -- TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
with open('./NN_result/lt_df_NN_rank_9days.pkl', 'rb') as file:
    lt_df_NN_rank = pickle.load(file)

In [3]:
# Training window: entries 1..4 of the loaded list stacked into one frame.
# Entry 0 is not used here; entries from index 5 onward are concatenated
# separately as the test-day table later in the notebook.
full_table = pd.concat(lt_df_NN_rank[1:1+4], ignore_index=True)

In [4]:
# Signed gap between the two rankings (rank minus NN_rank): positive when
# NN_rank is the smaller number. Computed column-wise instead of with a
# row-by-row apply(axis=1); the resulting values are the same.
full_table['dist'] = full_table['rank'] - full_table['NN_rank']

In [5]:
full_table

Unnamed: 0,target,next,rank,NN_rank,dist
0,2063176,1793668,53.00,40,13.00
1,1036421,215370,26.00,50,-24.00
2,1036421,215370,26.00,50,-24.00
3,4364467,632080,19.00,2,17.00
4,632080,4272037,20.00,61,-41.00
...,...,...,...,...,...
952774,3087929,3292913,88.00,52,36.00
952775,4082039,3801509,48.00,92,-44.00
952776,932794,488456,80.00,20,60.00
952777,1354952,1846009,39.00,1,38.00


In [6]:
full_table.describe()

Unnamed: 0,rank,NN_rank,dist
count,952779.0,952779.0,952779.0
mean,30.48,32.24,-1.77
std,28.11,27.69,33.35
min,1.0,1.0,-98.0
25%,6.0,8.0,-21.0
50%,21.0,24.0,-1.0
75%,50.0,51.0,16.0
max,99.0,99.0,98.0


In [7]:
# Group the training rows by target once (the groupby object is reused by
# later cells), then show how many rows each target has.
table_grby = full_table.groupby(['target'])
table_grby.size().to_frame()

Unnamed: 0_level_0,0
target,Unnamed: 1_level_1
100000,1
1000061,1
100021,3
1000294,1
1000300,22
...,...
999686,1
999712,77
999752,1
999828,425


In [8]:
# Attach each target's row count to every training row. The count column is
# still named 0 at this point; it is renamed in the next cell.
df_table = full_table.merge(table_grby.size().to_frame(), how='left', on=['target'])

In [9]:
# Give the per-target row count (column 0 from the merge) a readable name.
df_table = df_table.rename(columns={0: 'occur'})

In [10]:
# df_table.drop_duplicates(inplace=True, ignore_index=True)

In [11]:
df_table

Unnamed: 0,target,next,rank,NN_rank,dist,occur
0,2063176,1793668,53.00,40,13.00,35
1,1036421,215370,26.00,50,-24.00,3
2,1036421,215370,26.00,50,-24.00,3
3,4364467,632080,19.00,2,17.00,342
4,632080,4272037,20.00,61,-41.00,648
...,...,...,...,...,...,...
952774,3087929,3292913,88.00,52,36.00,1
952775,4082039,3801509,48.00,92,-44.00,65
952776,932794,488456,80.00,20,60.00,41
952777,1354952,1846009,39.00,1,38.00,36


In [12]:
df_table.describe()

Unnamed: 0,rank,NN_rank,dist,occur
count,952779.0,952779.0,952779.0,952779.0
mean,30.48,32.24,-1.77,182.2
std,28.11,27.69,33.35,266.91
min,1.0,1.0,-98.0,1.0
25%,6.0,8.0,-21.0,23.0
50%,21.0,24.0,-1.0,82.0
75%,50.0,51.0,16.0,230.0
max,99.0,99.0,98.0,2275.0


In [13]:
# Average the numeric columns within each occurrence count, then summarise the
# distribution of those averages. numeric_only=True keeps the non-numeric
# columns out of the mean: pandas >= 2.0 no longer drops them silently and
# raises a TypeError instead.
df_table.groupby('occur').mean(numeric_only=True).describe()

Unnamed: 0,rank,NN_rank,dist
count,547.0,547.0,547.0
mean,26.18,25.36,0.83
std,7.81,7.91,8.06
min,3.13,4.48,-42.72
25%,21.34,19.36,-4.19
50%,26.86,25.23,-0.34
75%,31.11,30.28,5.31
max,55.28,49.19,32.9


---

In [14]:
# Reuse the per-target groupby built over full_table; add_features reads
# `target_grby` as a global.
target_grby = table_grby # df_table.groupby(['target'])

In [15]:
# One row per distinct target, in order of first appearance in df_table,
# together with the number of training rows for that target ('occur').
target_array = df_table['target'].unique()
training_table = (
    pd.DataFrame({'target': target_array})
    .merge(table_grby.size().to_frame(), how='left', on=['target'])
    .rename(columns={0: 'occur'})
)

In [16]:
from multiprocessing import  Pool
def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into contiguous row chunks whose sizes
        differ by at most one row.
    func : callable
        Called once per chunk and must return something ``pd.concat`` accepts.
        It has to be picklable, because it is sent to worker processes.
    n_cores : int, default 8
        Upper bound on the number of chunks / worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.
    """
    n_rows = len(df)
    # Never create more chunks than rows (and always at least one), so that
    # func is not handed empty frames when the input is small.
    n_chunks = max(1, min(int(n_cores), n_rows))

    # Same chunk sizes np.array_split would produce: the first `extra` chunks
    # get one additional row.
    base, extra = divmod(n_rows, n_chunks)
    chunks = []
    start = 0
    for i in range(n_chunks):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    # The context manager shuts the pool down even if func raises in a worker.
    with Pool(n_chunks) as pool:
        parts = pool.map(func, chunks)
    return pd.concat(parts)

In [17]:
def add_features(training_table):
    """Add per-target mean features and the switch label to ``training_table``.

    Reads the module-level ``target_grby`` (rows grouped by target) and, for
    every row of ``training_table``, looks up that target's group means.

    Columns added (in this order):
      * ``rank``    - mean of ``rank`` over the target's rows
      * ``NN_rank`` - mean of ``NN_rank`` over the target's rows
      * ``label``   - True where mean ``rank`` is greater than mean ``NN_rank``
      * ``dist``    - mean of ``dist`` over the target's rows

    The frame is modified in place and also returned.
    """
    # One grouped aggregation instead of a get_group() lookup per row and
    # per feature; the resulting values are the same.
    per_target_mean = target_grby[['rank', 'NN_rank', 'dist']].mean()
    targets = training_table['target']

    training_table['rank'] = targets.map(per_target_mean['rank'])
    training_table['NN_rank'] = targets.map(per_target_mean['NN_rank'])
    training_table['label'] = training_table['rank'] > training_table['NN_rank']
    training_table['dist'] = targets.map(per_target_mean['dist'])
    return training_table

In [18]:
training_table = parallelize_dataframe(training_table, add_features)

In [19]:
training_table

Unnamed: 0,target,occur,rank,NN_rank,label,dist
0,2063176,35,37.60,34.97,True,2.63
1,1036421,3,43.67,60.67,False,-17.00
2,4364467,342,15.94,16.77,False,-0.84
3,632080,648,12.84,15.30,False,-2.46
4,4272037,99,17.86,26.56,False,-8.70
...,...,...,...,...,...,...
67298,518237,1,44.00,87.00,False,-43.00
67299,3580767,1,69.00,23.00,True,46.00
67300,3938629,1,56.00,93.00,False,-37.00
67301,4464701,1,3.00,87.00,False,-84.00


In [20]:
training_table['label'].value_counts()

False    38318
True     28985
Name: label, dtype: int64

In [22]:
def trans(df, min_dist=25):
    """Return a stricter label for targets that occurred only once.

    Parameters
    ----------
    df : mapping / pandas.Series
        A single row providing ``label``, ``occur`` and ``dist``.
    min_dist : number, default 25
        Minimum ``dist`` a single-occurrence target needs in order to keep a
        True label.

    Returns
    -------
    False when the row is labelled True, has ``occur == 1`` and its ``dist``
    is below ``min_dist``; otherwise the row's original ``label``.
    """
    if df['label'] and df['occur'] == 1 and df['dist'] < min_dist:
        return False
    return df['label']

In [23]:
training_table['label_x'] = training_table.apply(trans , axis=1)

In [24]:
training_table

Unnamed: 0,target,occur,rank,NN_rank,label,dist,label_x
0,102ca9ccb86811e9b0f5acde48001122,45,14.91,10.18,True,4.73,True
1,e360cdcab86611e987efacde48001122,134,9.13,6.78,True,2.34,True
2,baaadc26b86711e9b9cbacde48001122,48,16.96,33.44,False,-16.48,False
3,e353a26cb86611e992e5acde48001122,38,24.16,28.97,False,-4.82,False
4,e336d7e2b86611e9a7f7acde48001122,9,28.33,20.67,True,7.67,True
...,...,...,...,...,...,...,...
252458,09bc7246b86a11e99a9dacde48001122,1,1.00,1.00,False,0.00,False
252459,f8297b4ab86a11e9a645acde48001122,1,1.00,1.00,False,0.00,False
252460,6c589d58b86711e9a4f2acde48001122,1,6.00,4.00,True,2.00,False
252461,6c60ab54b86711e987faacde48001122,1,1.00,1.00,False,0.00,False


In [26]:
training_table['label_x'].value_counts()

False    165252
True      87211
Name: label_x, dtype: int64

---

In [22]:
# Test window: every entry of the loaded list from index 5 onward, i.e. the
# entries that follow the four used for the training table.
df_table_testday = pd.concat(lt_df_NN_rank[1+4:], ignore_index=True)

In [23]:
# Drop exact duplicate rows from the test-day table and renumber the index.
df_table_testday = df_table_testday.drop_duplicates(ignore_index=True)

In [24]:
df_table_testday.describe()

Unnamed: 0,rank,NN_rank
count,308215.0,308215.0
mean,39.75,42.04
std,29.02,28.67
min,1.0,1.0
25%,14.0,17.0
50%,35.0,38.0
75%,63.0,65.0
max,99.0,99.0


In [25]:
# Keep only the per-target lookup columns; the training-time averages are
# dropped so they do not clash with the test rows' own rank / NN_rank.
switch_table = training_table.drop(['rank', 'NN_rank', 'dist'], axis=1)

In [26]:
# Left join: every test row is kept; targets absent from switch_table get NaN
# in the joined columns.
df_table_testday_switch = df_table_testday.merge(switch_table, how='left', on=['target'])

In [27]:
from multiprocessing import  Pool
def parallelize_dataframe(df, func, n_cores=8):
    """Apply ``func`` to row-wise chunks of ``df`` in worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into contiguous row chunks whose sizes
        differ by at most one row.
    func : callable
        Called once per chunk and must return something ``pd.concat`` accepts.
        It has to be picklable, because it is sent to worker processes.
    n_cores : int, default 8
        Upper bound on the number of chunks / worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.
    """
    n_rows = len(df)
    # Never create more chunks than rows (and always at least one), so that
    # func is not handed empty frames when the input is small.
    n_chunks = max(1, min(int(n_cores), n_rows))

    # Same chunk sizes np.array_split would produce: the first `extra` chunks
    # get one additional row.
    base, extra = divmod(n_rows, n_chunks)
    chunks = []
    start = 0
    for i in range(n_chunks):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(df.iloc[start:stop])
        start = stop

    # The context manager shuts the pool down even if func raises in a worker.
    with Pool(n_chunks) as pool:
        parts = pool.map(func, chunks)
    return pd.concat(parts)

In [28]:
def add_features(df_table_testday_switch):
    """Add the ground truth and the switched-rank columns to the test rows.

    Note: this re-uses the name ``add_features`` that an earlier cell defines
    for the training table; after this cell runs, the name refers to this
    test-day version.

    Expects the columns ``rank``, ``NN_rank`` and ``label`` (True / False, or
    missing for targets without a label). Columns added, in this order:

      * ``Ground_Truth``      - True where ``rank`` is greater than ``NN_rank``
      * ``switch``            - ``NN_rank`` where the label is True, else
                                ``rank``; a missing label also selects
                                ``NN_rank`` (NaN is truthy in a plain ``if``
                                test), so this equals ``switch_fillna_NN``
      * ``switch_fillna_NN``  - like ``switch``, missing label -> ``NN_rank``
      * ``switch_fillna_knn`` - like ``switch``, missing label -> ``rank``

    The frame is modified in place and also returned.
    """
    rank = df_table_testday_switch['rank']
    nn_rank = df_table_testday_switch['NN_rank']
    label = df_table_testday_switch['label']

    # isna() copes with an object-dtype label column; eq(True) is False for
    # both False and missing labels.
    label_missing = label.isna()
    label_true = label.eq(True)
    true_or_missing = label_true | label_missing

    df_table_testday_switch['Ground_Truth'] = rank > nn_rank
    df_table_testday_switch['switch'] = nn_rank.where(true_or_missing, rank)
    df_table_testday_switch['switch_fillna_NN'] = nn_rank.where(true_or_missing, rank)
    df_table_testday_switch['switch_fillna_knn'] = nn_rank.where(label_true, rank)
#     df_table_testday_switch['switch_x'] = df_table_testday_switch.apply(lambda df: df['NN_rank'] if df['label_x'] else df['rank'], axis=1)
    return df_table_testday_switch

In [29]:
df_table_testday_switch = parallelize_dataframe(df_table_testday_switch, add_features)

In [30]:
df_table_testday_switch

Unnamed: 0,target,next,rank,NN_rank,occur,label,Ground_Truth,switch,switch_fillna_NN,switch_fillna_knn
0,4449648,1580138,77.00,81,5.00,False,False,77.00,77.00,77.00
1,1674582,3574186,54.00,58,38.00,False,False,54.00,54.00,54.00
2,3715112,1712849,2.00,23,1026.00,True,False,23.00,23.00,23.00
3,1712849,3715112,1.00,1,40.00,False,False,1.00,1.00,1.00
4,3715112,4546029,14.00,15,1026.00,True,False,15.00,15.00,15.00
...,...,...,...,...,...,...,...,...,...,...
308210,3244585,46310,52.00,88,6.00,False,False,52.00,52.00,52.00
308211,3586172,3904916,73.00,6,1.00,False,True,73.00,73.00,73.00
308212,1322280,1535093,68.00,81,,,False,81.00,81.00,68.00
308213,3670405,845698,72.00,17,,,True,17.00,17.00,72.00


In [31]:
df_table_testday_switch[df_table_testday_switch['label'].isnull()]

Unnamed: 0,target,next,rank,NN_rank,occur,label,Ground_Truth,switch,switch_fillna_NN,switch_fillna_knn
198,4237354,1897033,14.00,38,,,False,38.00,38.00,14.00
203,3511789,1967801,77.00,56,,,True,56.00,56.00,77.00
224,3952252,4823090,15.00,27,,,False,27.00,27.00,15.00
236,229861,2029890,4.00,4,,,False,4.00,4.00,4.00
239,2585272,1226575,57.00,95,,,False,95.00,95.00,57.00
...,...,...,...,...,...,...,...,...,...,...
308196,5083668,1474281,27.00,34,,,False,34.00,34.00,27.00
308197,2264987,3794487,95.00,9,,,True,9.00,9.00,95.00
308198,1346966,5053663,75.00,89,,,False,89.00,89.00,75.00
308212,1322280,1535093,68.00,81,,,False,81.00,81.00,68.00


In [32]:
df_table_testday_switch.describe()

Unnamed: 0,rank,NN_rank,occur,switch,switch_fillna_NN,switch_fillna_knn
count,308215.0,308215.0,282685.0,308215.0,308215.0,308215.0
mean,39.75,42.04,80.89,37.51,37.51,37.25
std,29.02,28.67,145.79,28.14,28.14,28.01
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,14.0,17.0,8.0,13.0,13.0,13.0
50%,35.0,38.0,29.0,32.0,32.0,31.0
75%,63.0,65.0,90.0,59.0,59.0,59.0
max,99.0,99.0,2275.0,99.0,99.0,99.0


In [33]:
from sklearn.metrics import classification_report,confusion_matrix
# Score the per-target label against the per-row ground truth (rank > NN_rank).
# Rows whose target has no label (NaN after the left merge) are counted as
# True here, i.e. as "use NN_rank", matching the switch_fillna_NN column.
# NOTE(review): confusion_matrix is imported but not used in this cell.
print(classification_report(df_table_testday_switch['Ground_Truth'].tolist(),df_table_testday_switch['label'].fillna(True).tolist()))

              precision    recall  f1-score   support

       False       0.64      0.54      0.59    169519
        True       0.53      0.63      0.57    138696

    accuracy                           0.58    308215
   macro avg       0.58      0.58      0.58    308215
weighted avg       0.59      0.58      0.58    308215

