# WNS Analytics Wizard

Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

SOLVER = 'lbfgs'

Creating useful functions

In [2]:
def evaluate(actual, predicted, predicted_probability):
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")
    print("For Class 1")
    print("f1 Score :", f1_score(actual, predicted))
    print("Precision Score :",precision_score(actual, predicted))
    print("Recall Score :",recall_score(actual, predicted))
    print("")
    print("For Class 0")
    print("f1 Score :", f1_score(1-np.array(actual), 1-np.array(predicted)))
    print("Precision Score :",precision_score(1-np.array(actual), 1-np.array(predicted)))
    print("Recall Score :",recall_score(1-np.array(actual), 1-np.array(predicted)))
    print("")
    print("AUROC :",  roc_auc_score(actual, predicted_probability[:,1]))

In [3]:
def plot_roc(y_test, y_pred):
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    _ = plt.title('ROC')
    _ = plt.plot(fpr, tpr, 'b')
    _ = plt.plot([0, 1], [0, 1],'r--')
    _ = plt.xlim([0, 1])
    _ = plt.ylim([0, 1])
    _ = plt.ylabel('True Positive Rate')
    _ = plt.xlabel('False Positive Rate')
    plt.show()

In [4]:
def flow(X, y, training_func):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
    model = training_func(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)
    evaluate(y_test, y_pred, y_pred_prob)
    plot_roc(y_test, y_pred)

Understanding Dataset

In [5]:
# Read Items
item_df = pd.read_csv('data/train/item_data.csv', index_col='item_id')
item_df.head()

Unnamed: 0_level_0,item_price,category_1,category_2,category_3,product_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26880,4602,11,35,20,3040
54939,3513,12,57,85,6822
40383,825,17,8,279,1619
8777,2355,13,58,189,5264
113705,1267,17,39,151,10239


In [6]:
# Reading view logs
view_log_df = pd.read_csv('data/train/view_log.csv', parse_dates=['server_time'])
view_log_df.device_type = view_log_df.device_type.astype('category')
view_log_df.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id
0,2018-10-15 08:58:00,android,112333,4557,32970
1,2018-10-15 08:58:00,android,503590,74788,7640
2,2018-10-15 08:58:00,android,573960,23628,128855
3,2018-10-15 08:58:00,android,121691,2430,12774
4,2018-10-15 08:58:00,android,218564,19227,28296


In [7]:
# Reading Ad impressions
training_df = pd.read_csv('data/train/train.csv', index_col='impression_id', parse_dates=['impression_time'])
training_df['os_version'] = training_df.os_version.astype('category')
training_df.head()

Unnamed: 0_level_0,impression_time,user_id,app_code,os_version,is_4G,is_click
impression_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
c4ca4238a0b923820dcc509a6f75849b,2018-11-15 00:00:00,87862,422,old,0,0
45c48cce2e2d7fbdea1afc51c7c6ad26,2018-11-15 00:01:00,63410,467,latest,1,1
70efdf2ec9b086079795c442636b55fb,2018-11-15 00:02:00,71748,259,intermediate,1,0
8e296a067a37563370ded05f5a3bf3ec,2018-11-15 00:02:00,69209,244,latest,1,0
182be0c5cdcd5072bb1864cdee4d3d6e,2018-11-15 00:02:00,62873,473,latest,0,0


## Trying Out-of-box Logistic Regression

In [8]:
def out_of_box_logistic(X_train, y_train):
    return LogisticRegression(solver=SOLVER).fit(X_train, y_train)

In [9]:
compatible_frame = pd.get_dummies(training_df)
X = compatible_frame[['user_id','app_code','os_version_latest','os_version_old','is_4G']].values
y = ad_impression_df['is_click'].values

NameError: name 'ad_impression_df' is not defined

In [None]:
flow(X, y, out_of_box_logistic)