# WNS Analytics Wizard

## Importing libraries

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

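## Creating useful functions

Helpers for evaluating a model, writing a submission file, and running the full train / evaluate / submit flow.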

In [2]:
def evaluate(actual, predicted, predicted_probability):
    """Print a classification report for a binary (0/1) problem.

    Prints the confusion matrix, then F1 / precision / recall with class 1 as
    the positive class, then the same three metrics with class 0 as the
    positive class (obtained by flipping the labels with ``1 - x``), and
    finally the area under the ROC curve.

    Parameters
    ----------
    actual : array-like
        True labels.
    predicted : array-like
        Hard label predictions.
    predicted_probability : 2-D array
        Per-class scores; column 1 is used as the score for the AUROC.
    """
    scorers = (
        ("f1 Score :", f1_score),
        ("Precision Score :", precision_score),
        ("Recall Score :", recall_score),
    )

    def _print_scores(truth, guess):
        # One line per metric, followed by the blank separator line.
        for label, scorer in scorers:
            print(label, scorer(truth, guess))
        print("")

    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")

    print("For Class 1")
    _print_scores(actual, predicted)

    print("For Class 0")
    # Flipping 0 <-> 1 makes class 0 the "positive" class for the scorers.
    _print_scores(1 - np.array(actual), 1 - np.array(predicted))

    print("AUROC :", roc_auc_score(actual, predicted_probability[:, 1]))

In [3]:
def generate_submission(impression_ids, is_clicks, method_name):
    """Write predictions to ``data/report/<method_name>.csv``.

    The CSV has the impression ids as an index column named
    ``impression_id`` and a single value column named ``is_click``.
    The ``data/report/`` folder is created if it does not exist yet.

    Parameters
    ----------
    impression_ids : array-like
        Index labels, one per prediction.
    is_clicks : array-like
        One-dimensional predictions aligned with ``impression_ids``.
    method_name : str
        File name stem of the CSV that is written.
    """
    import os  # local import so this cell does not depend on the import cell

    output_dir = 'data/report/'
    # Without this, a missing report folder makes to_csv fail only after
    # the (potentially long) training step has already completed.
    os.makedirs(output_dir, exist_ok=True)

    submission_frame = pd.DataFrame(is_clicks, index=impression_ids)
    submission_frame.columns = ['is_click']
    submission_frame.index.name = 'impression_id'
    submission_frame.to_csv(output_dir + method_name + '.csv')

In [4]:
def flow(X, y, training_func, X_future, y_future_ids):
    """Train a model, report hold-out metrics and write a submission file.

    Steps:
      1. Split ``X`` / ``y`` into train and test parts (seeded split).
      2. Fit a model with ``training_func(X_train, y_train)``.
      3. Print the evaluation report for the test part.
      4. Score ``X_future`` and save the class-1 probabilities under the
         name of ``training_func``.

    Parameters
    ----------
    X, y : array-like
        Features and labels used for training and evaluation.
    training_func : callable
        Takes ``(X_train, y_train)`` and returns a fitted object exposing
        ``predict`` and ``predict_proba``.
    X_future : array-like
        Features of the rows to score for the submission.
    y_future_ids : array-like
        Identifiers written alongside the submission scores.

    Returns
    -------
    tuple
        ``(X_test, y_test, y_pred)`` for the hold-out part.
    """
    # random_state is fixed so the hold-out split is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)

    fitted_model = training_func(X_train, y_train)

    y_pred = fitted_model.predict(X_test)
    holdout_probabilities = fitted_model.predict_proba(X_test)
    evaluate(y_test, y_pred, holdout_probabilities)

    # Only the class-1 column goes into the submission.
    future_click_probability = fitted_model.predict_proba(X_future)[:, 1]
    generate_submission(y_future_ids, future_click_probability, training_func.__name__)

    return X_test, y_test, y_pred
    

In [5]:
# Columns selected from the feature files, in the order the models see them.
feature_cols = [
    "user_id",
    "app_code",
    "os_version",
    "is_4G",
    "freq_cat3",
    "unique_cat3",
    "unique_device",
    "unique_items",
    "total_sessions",
    "last_active_session",
    "std_price",
    "app_last_ad_seen",
    "last_ad_seen",
    "user_click_ratio",
    "app_click_ratio",
]
# Positions (not names) inside feature_cols handed to LightGBM as
# categorical_feature: the first five columns, user_id through freq_cat3.
category_cols = list(range(5))

In [7]:
# Read the training features and, in the same expression, drop the records
# where the user has no history: a row is kept when at least one of
# user_ads, app_ads or unique_items is positive. Loading and filtering in a
# single assignment keeps this cell safe to re-run.
training_df = pd.read_csv(
    'data/train/train_feature.csv',
    index_col='impression_id',
    parse_dates=['impression_time'],
).loc[
    lambda frame: (frame.user_ads > 0)
    | (frame.app_ads > 0)
    | (frame.unique_items > 0)
]

# Model inputs as plain arrays: selected feature columns and the click label.
X = training_df[feature_cols].values
y = training_df['is_click'].values
X[:1]

array([[8.78620e+04, 4.22000e+02, 2.00000e+00, 0.00000e+00, 1.10000e+01,
        1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 2.26734e+06,
                nan, 3.88800e+06, 3.88800e+06, 0.00000e+00, 0.00000e+00]])

In [8]:
# Read the features of the rows to be scored for the submission.
testing_df = pd.read_csv(
    'data/test/test_feature.csv',
    index_col='impression_id',
    parse_dates=['impression_time'],
)

# Same feature columns as for training; the index (impression_id) is kept
# separately so it can be written next to the predictions.
X_future = testing_df.loc[:, feature_cols].values
y_future_ids = testing_df.index.values
X_future.shape

(90675, 15)

## Trying LightGBM

In [16]:
def tunned_light_gbm_no_history(X_train, y_train):
    """Grid-search a LightGBM classifier and return the fitted search object.

    Searches a 3 x 3 x 3 grid over ``num_leaves``, ``max_depth`` and
    ``min_data_in_leaf`` with 10-fold cross-validation scored by ROC AUC,
    using 9 parallel jobs. The best estimator and its cross-validated score
    are printed.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search_space = {
        'num_leaves': [20, 22, 24],
        'max_depth': [4, 6, 8],
        'min_data_in_leaf': [35, 37, 39],
    }
    # NOTE(review): eval_metric is documented as a fit() argument of
    # LGBMClassifier; given to the constructor it may have no effect - confirm.
    base_model = LGBMClassifier(eval_metric='auc', random_state=2019)
    search = GridSearchCV(base_model, search_space, cv=10, scoring='roc_auc', n_jobs=9)

    # category_cols holds the positions of the categorical feature columns;
    # it is passed through the search's fit() as an extra keyword argument.
    search.fit(X_train, y_train, categorical_feature=category_cols)

    print('best_estimator', search.best_estimator_)
    print('best_score', search.best_score_)
    return search

def simple_light_gbm_no_history(X_train, y_train):
    """Fit one LightGBM classifier with fixed hyper-parameters.

    Uses ``num_leaves=14``, ``max_depth=6``, ``min_data_in_leaf=33`` and a
    fixed random seed. The categorical feature positions in ``category_cols``
    are declared through the ``categorical_feature`` argument of ``fit``
    rather than as an extra constructor keyword.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.

    Returns
    -------
    LGBMClassifier
        The fitted classifier.
    """
    # NOTE(review): eval_metric is documented as a fit() argument of
    # LGBMClassifier; given to the constructor it may have no effect - confirm.
    lgb = LGBMClassifier(eval_metric='auc', random_state=2019, num_leaves=14, max_depth=6,
                         min_data_in_leaf=33)
    # categorical_feature is a documented fit() parameter; passing it here
    # avoids relying on undocumented constructor keyword pass-through.
    lgb.fit(X_train, y_train, categorical_feature=category_cols)
    return lgb

# Run the whole pipeline with the grid-searched LightGBM trainer: prints the
# hold-out report and writes the submission CSV named after the trainer.
X_test, y_test, y_pred = flow(
    X=X,
    y=y,
    training_func=tunned_light_gbm_no_history,
    X_future=X_future,
    y_future_ids=y_future_ids,
)

best_estimator LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               eval_metric='auc', importance_type='split', learning_rate=0.1,
               max_depth=6, min_child_samples=20, min_child_weight=0.001,
               min_data_in_leaf=37, min_split_gain=0.0, n_estimators=100,
               n_jobs=-1, num_leaves=22, objective=None, random_state=2019,
               reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)
best_score 0.7226082001431574
Confusion Matrix
       0   1
0  54717  29
1   2627  22

For Class 1
f1 Score : 0.0162962962962963
Precision Score : 0.43137254901960786
Recall Score : 0.008305020762551907

For Class 0
f1 Score : 0.9763047551075029
Precision Score : 0.9541887555803571
Recall Score : 0.9994702809337669

AUROC : 0.7212597359435167
