# WNS Analytics Wizard

Importing libraries

In [1]:
import numpy as np
import pandas as pd
import lightgbm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier

import warnings
warnings.filterwarnings('ignore')

Helper functions: metric reporting, submission export, and the end-to-end train / evaluate / predict flow

In [2]:
def evaluate(actual, predicted, predicted_probability):
    """Print a binary-classification report for a set of predictions.

    The report contains the confusion matrix, F1 / precision / recall for
    class 1 and for class 0, and the AUROC.

    Parameters
    ----------
    actual : array-like
        True labels. The class-0 section flips them with ``1 - label``,
        so they are assumed to be encoded as 0/1.
    predicted : array-like
        Predicted labels, same encoding as ``actual``.
    predicted_probability : 2-D array
        Per-class scores; column index 1 is passed to ``roc_auc_score``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    def _print_scores(heading, truth, guess):
        # One report section; each metric scores the label value 1 as positive.
        print(heading)
        print("f1 Score :", f1_score(truth, guess))
        print("Precision Score :", precision_score(truth, guess))
        print("Recall Score :", recall_score(truth, guess))
        print("")

    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")

    _print_scores("For Class 1", actual, predicted)
    # Flipping the 0/1 labels makes class 0 the "positive" class for the
    # same default-configured metric functions.
    _print_scores("For Class 0", 1 - np.array(actual), 1 - np.array(predicted))

    print("AUROC :", roc_auc_score(actual, predicted_probability[:, 1]))

In [3]:
def generate_submission(impression_ids, is_clicks, method_name):
    """Write a submission CSV to ``data/report/<method_name>.csv``.

    The file has an index column named ``impression_id`` and a single
    data column named ``is_click``.

    Parameters
    ----------
    impression_ids : array-like
        Index values for the output rows.
    is_clicks : array-like
        Values for the ``is_click`` column, one per entry in
        ``impression_ids``.
    method_name : str
        Stem of the output file name.

    Returns
    -------
    None
    """
    # Local import keeps this helper self-contained; the notebook's import
    # cell does not bring in ``os``.
    import os

    output_path = 'data/report/' + method_name + '.csv'
    # ``to_csv`` does not create missing parent directories, so make sure the
    # report folder exists instead of failing after the model was trained.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    submission_frame = pd.DataFrame(is_clicks, index=impression_ids)
    submission_frame.columns = ['is_click']
    submission_frame.index.name = 'impression_id'
    submission_frame.to_csv(output_path)

In [4]:
def flow(X, y, training_func, X_future, y_future_ids):
    """Train on a split of (X, y), report hold-out metrics, write a submission.

    Parameters
    ----------
    X, y : array-like
        Labelled features and targets; split with ``train_test_split``
        using ``random_state=2019``.
    training_func : callable
        Called as ``training_func(X_train, y_train)``; must return a fitted
        model exposing ``predict`` and ``predict_proba``. Its ``__name__``
        is used as the submission's method name.
    X_future : array-like
        Unlabelled features to score for the submission.
    y_future_ids : array-like
        Identifiers written alongside the scores for ``X_future``.

    Returns
    -------
    tuple
        ``(X_test, y_test, y_pred)`` for the hold-out part of the split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2019)
    fitted_model = training_func(X_train, y_train)

    # Hold-out report: hard labels plus the per-class scores needed for AUROC.
    y_pred = fitted_model.predict(X_test)
    holdout_scores = fitted_model.predict_proba(X_test)
    evaluate(y_test, y_pred, holdout_scores)

    # The submission carries the column-1 score, not a hard label.
    future_scores = fitted_model.predict_proba(X_future)[:, 1]
    generate_submission(y_future_ids, future_scores, training_func.__name__)

    return X_test, y_test, y_pred
    

In [5]:
# Columns selected from the feature files, in the order they appear in the
# X / X_future matrices.
feature_cols = [
    "user_id",
    "app_code",
    "os_version",
    "is_4G",
    "freq_cat3",
    "unique_cat3",
    "unique_device",
    "unique_items",
    "total_sessions",
    "last_active_session",
    "std_price",
    "app_last_ad_seen",
    "last_ad_seen",
    "user_click_ratio",
    "app_click_ratio",
]
# Positions within feature_cols of the columns given to LightGBM as
# `categorical_feature`; resolved by name so the indices are not magic numbers.
category_cols = [
    feature_cols.index(col_name)
    for col_name in ("user_id", "app_code", "os_version", "is_4G", "freq_cat3")
]

In [6]:
# Load the training feature file, indexed by impression_id, with
# impression_time parsed as a datetime; missing values are replaced by 0.
training_df = pd.read_csv(
    'data/train/train_feature.csv',
    index_col='impression_id',
    parse_dates=['impression_time'],
).fillna(0)

# Keep only rows with some recorded history: at least one of user_ads,
# app_ads or unique_items must be positive.
training_df = training_df.loc[
    lambda frame: (frame.user_ads > 0)
    | (frame.app_ads > 0)
    | (frame.unique_items > 0)
]

# Calendar parts of the impression timestamp.
training_df = training_df.assign(
    hour=lambda frame: frame.impression_time.dt.hour,
    weekday=lambda frame: frame.impression_time.dt.weekday,
)

# Feature matrix and click labels as plain numpy arrays.
X = training_df[feature_cols].values
y = training_df['is_click'].values
X[:1]

array([[8.78620e+04, 4.22000e+02, 2.00000e+00, 0.00000e+00, 1.10000e+01,
        1.00000e+00, 1.00000e+00, 1.00000e+00, 1.00000e+00, 2.26734e+06,
        0.00000e+00, 3.88800e+06, 3.88800e+06, 0.00000e+00, 0.00000e+00]])

In [7]:
# Load the test feature file the same way as the training file (indexed by
# impression_id, impression_time parsed, missing values replaced by 0) and
# add the same calendar columns. No rows are filtered out here.
testing_df = (
    pd.read_csv(
        'data/test/test_feature.csv',
        index_col='impression_id',
        parse_dates=['impression_time'],
    )
    .fillna(0)
    .assign(
        hour=lambda frame: frame.impression_time.dt.hour,
        weekday=lambda frame: frame.impression_time.dt.weekday,
    )
)

# Feature matrix to score and the impression ids used as submission index.
X_future = testing_df[feature_cols].values
y_future_ids = testing_df.index.values
X_future.shape

(90675, 15)

## Trying LightGBM

In [8]:
def ensemble_light_gbm(X_train, y_train):
    """Fit and return a bagging ensemble of LightGBM classifiers.

    Ten ``LGBMClassifier`` base learners are wrapped in a
    ``BaggingClassifier`` and fitted on the given data. The categorical
    column positions come from the module-level ``category_cols``.

    Parameters
    ----------
    X_train : array-like
        Training features.
    y_train : array-like
        Training labels.

    Returns
    -------
    BaggingClassifier
        The fitted ensemble.
    """
    base_params = dict(
        random_state=1024,
        num_leaves=22,
        max_depth=6,
        min_data_in_leaf=37,
        learning_rate=0.1,
        colsample_bytree=0.8,
        categorical_feature=category_cols,
    )
    base_learner = LGBMClassifier(**base_params)

    bagged_model = BaggingClassifier(base_learner, n_estimators=10, n_jobs=10, random_state=41)
    bagged_model.fit(X_train, y_train)
    return bagged_model

# Run the full flow for the bagged LightGBM model: prints the hold-out report
# and writes a submission file named after the training function.
X_test, y_test, y_pred = flow(X, y, ensemble_light_gbm, X_future, y_future_ids)

Confusion Matrix
       0   1
0  54728  18
1   2622  27

For Class 1
f1 Score : 0.0200445434298441
Precision Score : 0.6
Recall Score : 0.010192525481313703

For Class 0
f1 Score : 0.976448758207251
Precision Score : 0.9542807323452485
Recall Score : 0.9996712088554415

AUROC : 0.72194924438924
