# WNS Analytics Wizard

Importing libraries

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

Creating useful functions

In [2]:
def evaluate(actual, predicted, predicted_probability):
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")
    print("For Class 1")
    print("f1 Score :", f1_score(actual, predicted))
    print("Precision Score :",precision_score(actual, predicted))
    print("Recall Score :",recall_score(actual, predicted))
    print("")
    print("For Class 0")
    print("f1 Score :", f1_score(1-np.array(actual), 1-np.array(predicted)))
    print("Precision Score :",precision_score(1-np.array(actual), 1-np.array(predicted)))
    print("Recall Score :",recall_score(1-np.array(actual), 1-np.array(predicted)))
    print("")
    print("AUROC :",  roc_auc_score(actual, predicted_probability[:,1]))

In [3]:
def generate_submission(impression_ids, is_clicks, method_name):
    submission_frame = pd.DataFrame(is_clicks, index=impression_ids)
    submission_frame.columns = ['is_click']
    submission_frame.index.name = 'impression_id'
    submission_frame.to_csv('data/test/'+method_name+'.csv')

In [4]:
def flow(X, y, training_func, X_future, y_future_ids):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)
    
    model = training_func(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)
    evaluate(y_test, y_pred, y_pred_prob)
    
    y_future_pred = model.predict_proba(X_future)[:,1]
    generate_submission(y_future_ids, y_future_pred, training_func.__name__)
    
    return X_test, y_test, y_pred
    

In [5]:
feature_cols = ["user_id", "app_code", "os_version", "is_4G", "freq_cat3", "unique_cat3", "unique_device", 
                "total_items", "total_sessions", "last_active_session", "mean_price", "std_price", 
                "app_last_ad_seen", "last_ad_seen", "app_click_ratio", "overall_click_ratio"]
category_cols = [0,1,2,3,4]

In [6]:
# Reading Training Data
training_df = pd.read_csv('data/train/train_feature.csv', index_col='impression_id', 
                          parse_dates=['impression_time'])
X = training_df[feature_cols].values
y = training_df['is_click'].values
X[:1]

array([[8.78620e+04, 4.22000e+02, 2.00000e+00, 0.00000e+00, 1.10000e+01,
        1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 2.26734e+06,
        2.35000e+03,         nan, 3.88800e+06, 3.88800e+06, 0.00000e+00,
        0.00000e+00]])

In [7]:
testing_df = pd.read_csv('data/test/test_feature.csv', index_col='impression_id')
X_future = testing_df[feature_cols].values
y_future_ids = testing_df.index.values
X_future.shape

(90675, 16)

## Trying LightGBM

In [22]:
# def tunned_light_gbm_date_range(X_train, y_train):
#     lgb = LGBMClassifier(eval_metric = 'auc', random_state=2019)
#     params = {
#         'num_leaves': [8, 10, 12],
#         'max_depth': [10],
#         'min_data_in_leaf': [30, 33, 35]
#     }
#     lgb_grid = GridSearchCV(lgb, params, cv=10, scoring='roc_auc')
#     lgb_grid.fit(X_train, y_train, categorical_feature=category_cols)
#     print('best_estimator', lgb_grid.best_estimator_)
#     print('best_score', lgb_grid.best_score_)
#     return lgb_grid

def tunned_light_gbm_date_range(X_train, y_train):
    lgb = LGBMClassifier(eval_metric = 'auc', random_state=2019, num_leaves=10, max_depth=10, 
                         min_data_in_leaf=33, categorical_feature=category_cols)
    lgb.fit(X_train, y_train)
    return lgb

X_test, y_test, y_pred = flow(X, y, tunned_light_gbm_date_range, X_future, y_future_ids)

Confusion Matrix
       0   1
0  56608  11
1   2769  15

For Class 1
f1 Score : 0.010676156583629894
Precision Score : 0.5769230769230769
Recall Score : 0.005387931034482759

For Class 0
f1 Score : 0.976033656332977
Precision Score : 0.9533657813631541
Recall Score : 0.999805718928275

AUROC : 0.7176955125843179


In [9]:
validate_test_df = pd.DataFrame(X_test)
validate_test_df.columns = feature_cols
validate_test_df['expected'] = y_test
validate_test_df['actual'] = y_pred
validate_test_df.head()

Unnamed: 0,user_id,app_code,os_version,is_4G,freq_cat3,unique_cat3,unique_device,total_items,total_sessions,last_active_session,mean_price,std_price,app_last_ad_seen,last_ad_seen,app_click_ratio,overall_click_ratio,expected,actual
0,75131.0,283.0,0.0,0.0,11.0,6.0,1.0,13.0,8.0,641340.0,12795.923077,13901.76956,119220.0,119220.0,0.0,0.0,0,0
1,47401.0,285.0,2.0,1.0,14.0,1.0,1.0,1.0,1.0,1198920.0,1280.0,,5910780.0,5910780.0,0.0,0.0,0,0
2,66292.0,213.0,1.0,0.0,9.0,4.0,1.0,10.0,2.0,722820.0,7409.9,6614.242553,5641260.0,5641260.0,0.0,0.0,0,0
3,81649.0,129.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,77831.0,213.0,0.0,0.0,4.0,2.0,1.0,3.0,3.0,711300.0,2634.666667,706.099379,4844820.0,4844820.0,0.0,0.0,0,0


In [10]:
sorted_training = training_df.sort_values('impression_time')
sorted_training.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click,total_items,unique_items,total_sessions,last_active_session,...,unique_cat2,freq_cat2,unique_cat3,freq_cat3,unique_device,freq_device,app_last_ad_seen,last_ad_seen,app_click_ratio,overall_click_ratio
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,2,0,0,1,1,1,2267340,...,1,11,1,11,1,0,3888000,3888000,0.0,0.0
c81e728d9d4c2f636f067f89cc14862c,2018-11-15 00:00:00,89464,129,1,0,0,109,75,23,1483920,...,14,17,14,17,1,0,3888000,3888000,0.0,0.0
eccbc87e4b5ce2fe28308fd9f2a7baf3,2018-11-15 00:00:00,58442,127,0,0,0,27,12,14,623040,...,8,10,8,10,1,0,3888000,3888000,0.0,0.0
a87ff679a2f3e71d9181a67b7542122c,2018-11-15 00:00:00,4238,371,0,0,0,91,41,25,793620,...,11,8,11,8,1,0,3888000,3888000,0.0,0.0
45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,0,1,1,7,3,5,1796280,...,2,4,2,4,1,0,3888060,3888060,0.0,0.0


In [11]:
mismatch = validate_test_df[validate_test_df.expected != validate_test_df.actual]
mismatch.head()

Unnamed: 0,user_id,app_code,os_version,is_4G,freq_cat3,unique_cat3,unique_device,total_items,total_sessions,last_active_session,mean_price,std_price,app_last_ad_seen,last_ad_seen,app_click_ratio,overall_click_ratio,expected,actual
18,14459.0,213.0,2.0,1.0,9.0,9.0,1.0,22.0,6.0,1269600.0,13988.045455,18143.978947,4906260.0,4906260.0,0.0,0.0,1,0
72,42807.0,296.0,1.0,0.0,4.0,12.0,1.0,81.0,9.0,774060.0,7927.925926,15984.2921,5527440.0,5527440.0,0.0,0.0,1,0
81,29506.0,508.0,1.0,1.0,1.0,5.0,1.0,5.0,4.0,1058640.0,13844.4,18974.466232,483720.0,483720.0,0.0,0.0,1,0
88,49576.0,242.0,1.0,0.0,17.0,5.0,1.0,11.0,6.0,939540.0,4912.454545,5455.692282,4045080.0,4045080.0,0.0,0.0,1,0
136,73588.0,207.0,1.0,1.0,17.0,1.0,1.0,1.0,1.0,1712400.0,2377.0,,4110900.0,4110900.0,0.0,0.0,1,0


In [12]:
not_mismatch = validate_test_df[validate_test_df.expected == validate_test_df.actual]
not_mismatch.head()

Unnamed: 0,user_id,app_code,os_version,is_4G,freq_cat3,unique_cat3,unique_device,total_items,total_sessions,last_active_session,mean_price,std_price,app_last_ad_seen,last_ad_seen,app_click_ratio,overall_click_ratio,expected,actual
0,75131.0,283.0,0.0,0.0,11.0,6.0,1.0,13.0,8.0,641340.0,12795.923077,13901.76956,119220.0,119220.0,0.0,0.0,0,0
1,47401.0,285.0,2.0,1.0,14.0,1.0,1.0,1.0,1.0,1198920.0,1280.0,,5910780.0,5910780.0,0.0,0.0,0,0
2,66292.0,213.0,1.0,0.0,9.0,4.0,1.0,10.0,2.0,722820.0,7409.9,6614.242553,5641260.0,5641260.0,0.0,0.0,0,0
3,81649.0,129.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,77831.0,213.0,0.0,0.0,4.0,2.0,1.0,3.0,3.0,711300.0,2634.666667,706.099379,4844820.0,4844820.0,0.0,0.0,0,0


In [13]:
mismatch_app_code_count = mismatch.app_code.value_counts()
mismatch_app_code_count[mismatch_app_code_count < 5]

383.0    4
428.0    4
68.0     4
491.0    4
302.0    4
369.0    4
411.0    3
260.0    3
481.0    3
395.0    3
240.0    3
470.0    3
366.0    3
289.0    3
512.0    3
144.0    3
488.0    3
246.0    3
438.0    3
351.0    3
0.0      3
14.0     3
391.0    3
336.0    2
513.0    2
2.0      2
176.0    2
19.0     2
48.0     2
8.0      2
        ..
505.0    1
95.0     1
434.0    1
77.0     1
194.0    1
445.0    1
203.0    1
499.0    1
389.0    1
315.0    1
449.0    1
451.0    1
381.0    1
399.0    1
405.0    1
521.0    1
334.0    1
257.0    1
193.0    1
327.0    1
182.0    1
173.0    1
154.0    1
93.0     1
408.0    1
97.0     1
106.0    1
62.0     1
468.0    1
215.0    1
Name: app_code, Length: 101, dtype: int64

In [14]:
app_code_count = training_df.app_code.value_counts()
app_code_count[app_code_count < 5]

380    4
131    4
454    4
312    4
224    4
343    4
307    4
330    4
350    4
517    4
172    4
451    4
286    3
374    3
510    3
143    3
400    3
252    3
520    3
164    3
36     3
132    3
160    3
271    3
435    3
165    3
306    3
104    3
100    3
226    3
      ..
437    1
284    1
52     1
105    1
429    1
65     1
149    1
21     1
332    1
267    1
418    1
140    1
109    1
13     1
269    1
162    1
301    1
205    1
459    1
404    1
417    1
73     1
72     1
71     1
376    1
70     1
401    1
147    1
66     1
233    1
Name: app_code, Length: 171, dtype: int64

In [15]:
false_negative = validate_test_df[(validate_test_df.actual==1) & (validate_test_df.expected==0)]
false_negative

Unnamed: 0,user_id,app_code,os_version,is_4G,freq_cat3,unique_cat3,unique_device,total_items,total_sessions,last_active_session,mean_price,std_price,app_last_ad_seen,last_ad_seen,app_click_ratio,overall_click_ratio,expected,actual
6118,69062.0,37.0,2.0,0.0,13.0,9.0,1.0,20.0,13.0,756600.0,11083.0,15743.738538,264120.0,264120.0,0.666667,0.666667,0,1
19483,3364.0,473.0,0.0,1.0,13.0,14.0,1.0,62.0,25.0,737880.0,9023.5,23702.474726,837720.0,837720.0,0.875,0.875,0,1
31528,10315.0,217.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,1966440.0,14861.0,14191.633098,1576020.0,1576020.0,0.6,0.6,0,1
34531,52737.0,32.0,0.0,0.0,8.0,6.0,1.0,12.0,7.0,709920.0,17467.083333,29907.951933,132720.0,130380.0,0.0,0.0,0,1
34650,91986.0,423.0,2.0,0.0,11.0,6.0,1.0,13.0,10.0,617580.0,6990.538462,14716.39317,121320.0,121320.0,1.0,1.0,0,1
52343,84559.0,244.0,2.0,0.0,1.0,5.0,1.0,16.0,6.0,1773720.0,3474.3125,2029.455977,85440.0,85440.0,0.636364,0.636364,0,1
55684,84559.0,244.0,2.0,0.0,1.0,5.0,1.0,17.0,7.0,1425360.0,3525.941176,1976.508837,84840.0,84840.0,0.636364,0.636364,0,1
58520,90953.0,207.0,1.0,0.0,9.0,15.0,1.0,247.0,43.0,683880.0,9580.012146,18335.974455,4009620.0,4009620.0,0.0,0.0,0,1
59252,84559.0,244.0,2.0,0.0,1.0,5.0,1.0,16.0,6.0,1688280.0,3474.3125,2029.455977,88200.0,88200.0,0.636364,0.636364,0,1


In [16]:
false_negative[false_negative.user_id==84559]

Unnamed: 0,user_id,app_code,os_version,is_4G,freq_cat3,unique_cat3,unique_device,total_items,total_sessions,last_active_session,mean_price,std_price,app_last_ad_seen,last_ad_seen,app_click_ratio,overall_click_ratio,expected,actual
52343,84559.0,244.0,2.0,0.0,1.0,5.0,1.0,16.0,6.0,1773720.0,3474.3125,2029.455977,85440.0,85440.0,0.636364,0.636364,0,1
55684,84559.0,244.0,2.0,0.0,1.0,5.0,1.0,17.0,7.0,1425360.0,3525.941176,1976.508837,84840.0,84840.0,0.636364,0.636364,0,1
59252,84559.0,244.0,2.0,0.0,1.0,5.0,1.0,16.0,6.0,1688280.0,3474.3125,2029.455977,88200.0,88200.0,0.636364,0.636364,0,1


In [17]:
sorted_training[sorted_training.user_id==84559][['impression_time','user_id','app_code','is_click','app_click_ratio']]

Unnamed: 0_level_0,impression_time,user_id,app_code,is_click,app_click_ratio
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
41d6c2482cdf34113f998e9df192e148,2018-11-15 21:29:00,84559,244,1,0.0
d82d678e9583c1f5f283ec56fbf1abb7,2018-11-15 21:48:00,84559,244,0,0.0
c1234a24b3825382a0e1ac3e1d925b13,2018-11-16 00:54:00,84559,244,0,0.0
f2b6806d6ed60d2d87b0dd5ae62e6f20,2018-11-16 01:57:00,84559,244,1,0.0
e0a9b8565cb497d114bdfbd7c5049760,2018-11-18 02:02:00,84559,244,0,0.0
c01fde2578043635c831a96524f9b679,2018-11-18 02:05:00,84559,244,1,0.0
5c4f80aa52df224e6df555eab12c4fc7,2018-11-19 00:34:00,84559,244,1,0.0
599708672e8b3790d67dbf4379f75355,2018-11-19 00:40:00,84559,244,1,0.0
3e42b75ec468ace08c11557d385c3250,2018-11-20 23:49:00,84559,244,1,0.0
47ebca2644fd2a35105cb3ab82a1d297,2018-11-21 00:25:00,84559,244,0,0.0


In [18]:
previous_record = time_series_user_data.loc[(52737, 32), :][:'2018-11-30 11:52:00'][:-1]
total_ads = previous_record.shape[0]
total_clicks = previous_record.is_click.sum()
(total_clicks / total_ads) if total_ads != 0 else 0.0

NameError: name 'time_series_user_data' is not defined