# WNS Analytics Wizard

Importing libraries

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

Creating useful functions

In [2]:
def evaluate(actual, predicted, predicted_probability):
    """Print a binary-classification report for one set of predictions.

    The report contains, in order: the confusion matrix, F1 / precision /
    recall with class 1 as the positive class, the same three scores with
    class 0 as the positive class, and the AUROC.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels. Expected to be 0/1, because the class-0
        section is computed on ``1 - label``.
    predicted : array-like
        Predicted labels, same encoding as ``actual``.
    predicted_probability : array indexable as ``[:, 1]``
        Per-class scores; column 1 is used as the positive-class score
        for the AUROC.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    def _print_scores(heading, y_true, y_hat):
        # One per-class section: heading, the three scores, then a blank line.
        print(heading)
        print("f1 Score :", f1_score(y_true, y_hat))
        print("Precision Score :", precision_score(y_true, y_hat))
        print("Recall Score :", recall_score(y_true, y_hat))
        print("")

    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")

    _print_scores("For Class 1", actual, predicted)

    # Flipping 0 <-> 1 makes class 0 the positive class for the same scorers.
    flipped_actual = 1 - np.array(actual)
    flipped_predicted = 1 - np.array(predicted)
    _print_scores("For Class 0", flipped_actual, flipped_predicted)

    print("AUROC :", roc_auc_score(actual, predicted_probability[:, 1]))

In [3]:
def generate_submission(impression_ids, is_clicks, method_name, output_dir='data/test'):
    """Write a submission CSV with the columns ``impression_id,is_click``.

    Parameters
    ----------
    impression_ids : array-like
        Identifiers used as the row index (written as ``impression_id``).
    is_clicks : array-like
        One value per impression, written as the ``is_click`` column.
    method_name : str
        File stem; the CSV is written to ``<output_dir>/<method_name>.csv``.
    output_dir : str, optional
        Destination directory. Defaults to ``'data/test'``, the location
        this function has always written to. The directory is created if
        it does not exist yet.

    Returns
    -------
    None
    """
    import os  # local import so this cell does not depend on the import cell

    submission_frame = pd.DataFrame(is_clicks, index=impression_ids)
    submission_frame.columns = ['is_click']
    submission_frame.index.name = 'impression_id'

    # A missing output directory would otherwise make to_csv fail only after
    # the model has already been trained and scored.
    os.makedirs(output_dir, exist_ok=True)
    submission_frame.to_csv(os.path.join(output_dir, method_name + '.csv'))

In [4]:
def flow(X, y, training_func, X_future, y_future_ids):
    """Train on a split of ``(X, y)``, report hold-out metrics, write a submission.

    Steps:
      1. Split ``X`` / ``y`` with ``train_test_split`` (``random_state=2019``).
      2. Fit a model via ``training_func(X_train, y_train)``.
      3. Print the hold-out report through ``evaluate``.
      4. Score ``X_future`` and pass the class-1 probabilities to
         ``generate_submission``, using ``training_func.__name__`` as the
         method name.

    Parameters
    ----------
    X, y : features and labels to split.
    training_func : callable ``(X_train, y_train) -> model``; the returned
        model must provide ``predict`` and ``predict_proba``.
    X_future : features of the rows to score for the submission.
    y_future_ids : identifiers for the rows of ``X_future``.

    Returns
    -------
    tuple
        ``(X_test, y_test, y_pred)`` for the hold-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)

    fitted = training_func(X_train, y_train)

    # Hard labels feed the confusion matrix / F1; probabilities feed the AUROC.
    y_pred = fitted.predict(X_test)
    holdout_probabilities = fitted.predict_proba(X_test)
    evaluate(y_test, y_pred, holdout_probabilities)

    # The submission carries the class-1 probability, not the hard label.
    future_click_probability = fitted.predict_proba(X_future)[:, 1]
    generate_submission(y_future_ids, future_click_probability, training_func.__name__)

    return X_test, y_test, y_pred
    

In [131]:
# Model inputs, in the column order used to build the feature matrices.
feature_cols = [
    'user_id', 'app_code', 'os_version', 'is_4G', 'has_seen_before',
    'overall_click_ratio',
    'total_items', 'total_unique_items', 'total_sessions',
]
# Positional indices into feature_cols handed to LightGBM as categorical
# features: the first five columns.
category_cols = list(range(5))

In [132]:
# Load the feature-engineered training set, indexed by impression_id, with
# impression_time parsed as datetimes.
training_df = pd.read_csv(
    'data/train/train_with_feature.csv',
    index_col='impression_id',
    parse_dates=['impression_time'],
)
# Plain arrays for the model: the selected feature columns and the click label.
X = training_df.loc[:, feature_cols].values
y = training_df.loc[:, 'is_click'].values
X[:1]

array([[8.7862e+04, 4.2200e+02, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00]])

In [133]:
# Load the rows to score for the submission, indexed by impression_id.
testing_df = pd.read_csv(
    'data/test/test_with_feature.csv',
    index_col='impression_id',
)
# Same feature columns as the training matrix; the index supplies the ids.
X_future = testing_df.loc[:, feature_cols].values
y_future_ids = testing_df.index.values
X_future.shape

(90675, 9)

## Trying LightGBM

In [137]:
# def tunned_light_gbm_feature(X_train, y_train):
#     lgb = LGBMClassifier(eval_metric = 'auc', random_state=2019)
#     params = {
#         'num_leaves': [5, 10, 20],
#         'max_depth': [8, 10, 12],
#         'min_data_in_leaf': [28, 30, 33]
#     }
#     lgb_grid = GridSearchCV(lgb, params, cv=10, scoring='roc_auc')
#     lgb_grid.fit(X_train, y_train, categorical_feature=category_cols)
#     print('best_estimator', lgb_grid.best_estimator_)
#     print('best_score', lgb_grid.best_score_)
#     return lgb_grid.best_estimator_

def tunned_light_gbm_feature(X_train, y_train):
    """Fit a LightGBM classifier with fixed hyper-parameters.

    ``num_leaves``, ``max_depth`` and ``min_data_in_leaf`` each take a value
    from the grid in the commented-out search above (presumably its best
    estimator; confirm before relying on that).

    Parameters
    ----------
    X_train : feature matrix whose columns follow ``feature_cols``.
    y_train : labels aligned with ``X_train``.

    Returns
    -------
    LGBMClassifier
        The fitted estimator.
    """
    lgb = LGBMClassifier(random_state=2019, num_leaves=10, max_depth=10,
                         min_data_in_leaf=30)
    # In LightGBM's scikit-learn API, `categorical_feature` and `eval_metric`
    # are arguments of fit(), not of the constructor. Passed to the
    # constructor they only travel through **kwargs, which is not reliably
    # honoured and emits warnings that this notebook silences globally.
    lgb.fit(X_train, y_train, eval_metric='auc', categorical_feature=category_cols)
    return lgb

# Train, print the hold-out report, and write the submission file named after
# the training function.
X_test, y_test, y_pred = flow(
    X=X,
    y=y,
    training_func=tunned_light_gbm_feature,
    X_future=X_future,
    y_future_ids=y_future_ids,
)

Confusion Matrix
       0   1
0  56605  14
1   2766  18

For Class 1
f1 Score : 0.012784090909090908
Precision Score : 0.5625
Recall Score : 0.00646551724137931

For Class 0
f1 Score : 0.9760324165876368
Precision Score : 0.9534115982550403
Recall Score : 0.9997527331814409

AUROC : 0.7345588926425535


In [41]:
# One row per hold-out sample for error analysis.
# Naming: 'expected' holds the true label (y_test) and 'actual' holds the
# model's prediction (y_pred).
validate_test_df = pd.DataFrame(X_test, columns=feature_cols).assign(
    expected=y_test,
    actual=y_pred,
)
validate_test_df.head()

Unnamed: 0,app_code,os_version,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_device,freq_category_1,freq_category_2,freq_category_3,user_id,expected,actual
0,283.0,2.0,0.0,0.0,83968.0,313.0,17.0,13.0,12.0,7455.0,0.0,11.0,76.0,147.0,75131.0,0,0
1,285.0,0.0,0.0,0.0,1280.0,716.0,4.0,2.0,2.0,4803.0,0.0,11.0,51.0,290.0,47401.0,0,0
2,213.0,1.0,0.0,0.0,124800.0,64.0,87.0,64.0,14.0,2786.0,0.0,9.0,3.0,233.0,66292.0,0,0
3,129.0,2.0,0.0,0.0,38886.0,480.0,24.0,15.0,2.0,2690.0,0.0,8.0,41.0,36.0,81649.0,0,0
4,213.0,2.0,0.0,0.0,12092.0,218.0,29.0,15.0,10.0,5276.0,0.0,17.0,26.0,233.0,77831.0,0,0


In [None]:
# Chronological view of the training impressions.
sorted_training = training_df.sort_values(by='impression_time')
sorted_training.head()

In [17]:
# Hold-out rows where the 'expected' and 'actual' columns disagree.
mismatch = validate_test_df.loc[lambda frame: frame['expected'] != frame['actual']]
mismatch.head()

Unnamed: 0,app_code,os_version,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_device,freq_category_1,freq_category_2,freq_category_3,expected,actual
18,213.0,0.0,0.0,0.0,82560.0,160.0,59.0,35.0,12.0,5526.0,0.0,9.0,3.0,1.0,1,0
72,296.0,1.0,0.0,0.0,93568.0,160.0,134.0,90.0,20.0,7621.0,0.0,4.0,57.0,292.0,1,0
81,508.0,1.0,0.0,0.0,182144.0,870.0,8.0,8.0,7.0,10175.0,0.0,1.0,42.0,15.0,1,0
88,242.0,1.0,0.0,0.0,20416.0,1264.0,14.0,11.0,9.0,332.0,0.0,17.0,39.0,252.0,1,0
136,207.0,1.0,0.0,0.0,25600.0,2377.0,2.0,2.0,2.0,5193.0,0.0,1.0,7.0,279.0,1,0


In [18]:
# Hold-out rows where the 'expected' and 'actual' columns agree.
not_mismatch = validate_test_df.loc[lambda frame: frame['expected'] == frame['actual']]
not_mismatch.head()

Unnamed: 0,app_code,os_version,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_device,freq_category_1,freq_category_2,freq_category_3,expected,actual
0,283.0,2.0,0.0,0.0,83968.0,313.0,17.0,13.0,12.0,7455.0,0.0,11.0,76.0,147.0,0,0
1,285.0,0.0,0.0,0.0,1280.0,716.0,4.0,2.0,2.0,4803.0,0.0,11.0,51.0,290.0,0,0
2,213.0,1.0,0.0,0.0,124800.0,64.0,87.0,64.0,14.0,2786.0,0.0,9.0,3.0,233.0,0,0
3,129.0,2.0,0.0,0.0,38886.0,480.0,24.0,15.0,2.0,2690.0,0.0,8.0,41.0,36.0,0,0
4,213.0,2.0,0.0,0.0,12092.0,218.0,29.0,15.0,10.0,5276.0,0.0,17.0,26.0,233.0,0,0


In [20]:
# How often each app_code appears among the mismatched rows; show the ones
# that appear fewer than 5 times.
mismatch_app_code_count = mismatch['app_code'].value_counts()
mismatch_app_code_count.loc[mismatch_app_code_count.lt(5)]

491.0    4
369.0    4
383.0    4
428.0    4
68.0     4
        ..
97.0     1
106.0    1
62.0     1
468.0    1
215.0    1
Name: app_code, Length: 101, dtype: int64

In [21]:
# Same frequency view over the whole training set: app_codes that appear
# fewer than 5 times.
app_code_count = training_df['app_code'].value_counts()
app_code_count.loc[app_code_count.lt(5)]

380    4
131    4
454    4
312    4
224    4
      ..
70     1
401    1
147    1
66     1
233    1
Name: app_code, Length: 171, dtype: int64

In [42]:
# In validate_test_df, 'actual' is the model's prediction and 'expected' is the
# true label, so prediction == 1 with truth == 0 selects FALSE POSITIVES
# (predicted a click that did not happen), not false negatives.
false_positive = validate_test_df[(validate_test_df.actual == 1) & (validate_test_df.expected == 0)]
# Backward-compatible alias: other cells refer to this frame by its original,
# misleading name.
false_negative = false_positive
false_positive

Unnamed: 0,app_code,os_version,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_device,freq_category_1,freq_category_2,freq_category_3,user_id,expected,actual
19483,473.0,2.0,0.888889,0.888889,182144.0,230.0,105.0,61.0,39.0,1344.0,0.0,13.0,58.0,102.0,3364.0,0,1
33029,296.0,2.0,1.0,1.0,10803.0,633.0,30.0,16.0,12.0,4237.0,0.0,8.0,21.0,138.0,3501.0,0,1
36444,318.0,1.0,0.0,0.0,124890.0,160.0,240.0,114.0,78.0,9325.0,0.0,9.0,8.0,62.0,87805.0,0,1
44034,190.0,1.0,0.25,0.25,133120.0,128.0,294.0,172.0,69.0,2537.0,0.0,17.0,39.0,279.0,84397.0,0,1
52343,244.0,0.0,0.666667,0.666667,80640.0,54.0,42.0,23.0,16.0,908.0,0.0,1.0,28.0,283.0,84559.0,0,1
54138,207.0,0.0,0.0,0.0,226437.0,64.0,278.0,173.0,68.0,5622.0,0.0,9.0,39.0,62.0,5859.0,0,1
55684,244.0,0.0,0.692308,0.692308,80640.0,54.0,42.0,23.0,16.0,908.0,0.0,1.0,28.0,283.0,84559.0,0,1
58520,207.0,1.0,0.0,0.0,182144.0,50.0,403.0,269.0,93.0,4924.0,0.0,9.0,74.0,292.0,90953.0,0,1
59252,244.0,0.0,0.684211,0.684211,80640.0,54.0,42.0,23.0,16.0,908.0,0.0,1.0,28.0,283.0,84559.0,0,1


In [43]:
# Rows of this subset that belong to user 84559.
false_negative.loc[false_negative['user_id'] == 84559]

Unnamed: 0,app_code,os_version,app_click_ratio,overall_click_ratio,max_price,min_price,total_items,total_unique_items,total_sessions,freq_product_type,freq_device,freq_category_1,freq_category_2,freq_category_3,user_id,expected,actual
52343,244.0,0.0,0.666667,0.666667,80640.0,54.0,42.0,23.0,16.0,908.0,0.0,1.0,28.0,283.0,84559.0,0,1
55684,244.0,0.0,0.692308,0.692308,80640.0,54.0,42.0,23.0,16.0,908.0,0.0,1.0,28.0,283.0,84559.0,0,1
59252,244.0,0.0,0.684211,0.684211,80640.0,54.0,42.0,23.0,16.0,908.0,0.0,1.0,28.0,283.0,84559.0,0,1


In [57]:
# Time-ordered training rows of user 84559, restricted to the columns needed
# to follow how app_click_ratio evolves with each impression.
sorted_training.loc[
    sorted_training['user_id'] == 84559,
    ['impression_time', 'user_id', 'app_code', 'is_click', 'app_click_ratio'],
]

Unnamed: 0_level_0,impression_time,user_id,app_code,is_click,app_click_ratio
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
41d6c2482cdf34113f998e9df192e148,2018-11-15 21:29:00,84559,244,1,0.0
d82d678e9583c1f5f283ec56fbf1abb7,2018-11-15 21:48:00,84559,244,0,1.0
c1234a24b3825382a0e1ac3e1d925b13,2018-11-16 00:54:00,84559,244,0,0.5
f2b6806d6ed60d2d87b0dd5ae62e6f20,2018-11-16 01:57:00,84559,244,1,0.333333
e0a9b8565cb497d114bdfbd7c5049760,2018-11-18 02:02:00,84559,244,0,0.5
c01fde2578043635c831a96524f9b679,2018-11-18 02:05:00,84559,244,1,0.4
5c4f80aa52df224e6df555eab12c4fc7,2018-11-19 00:34:00,84559,244,1,0.5
599708672e8b3790d67dbf4379f75355,2018-11-19 00:40:00,84559,244,1,0.571429
3e42b75ec468ace08c11557d385c3250,2018-11-20 23:49:00,84559,244,1,0.625
47ebca2644fd2a35105cb3ab82a1d297,2018-11-21 00:25:00,84559,244,0,0.666667


In [None]:
# NOTE(review): `time_series_user_data` is not defined in any cell shown in this
# notebook; this cell depends on state created elsewhere and has no execution
# count. Confirm where it comes from before a Restart & Run All.
#
# Select the rows keyed by (52737, 32) (presumably (user_id, app_code); verify
# against the index), label-slice up to '2018-11-30 11:52:00', then drop the
# last row of that slice (presumably the impression at that timestamp itself,
# so only earlier impressions remain; TODO confirm).
previous_record = time_series_user_data.loc[(52737, 32), :][:'2018-11-30 11:52:00'][:-1]
# Number of remaining rows, and the sum of their is_click values.
total_ads = previous_record.shape[0]
total_clicks = previous_record.is_click.sum()
# Click ratio over those rows; 0.0 when there are none, avoiding a division by zero.
(total_clicks / total_ads) if total_ads != 0 else 0.0