# WNS Analytics Wizard

Importing libraries

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

Creating useful functions

In [2]:
def evaluate(actual, predicted, predicted_probability):
    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")
    print("For Class 1")
    print("f1 Score :", f1_score(actual, predicted))
    print("Precision Score :",precision_score(actual, predicted))
    print("Recall Score :",recall_score(actual, predicted))
    print("")
    print("For Class 0")
    print("f1 Score :", f1_score(1-np.array(actual), 1-np.array(predicted)))
    print("Precision Score :",precision_score(1-np.array(actual), 1-np.array(predicted)))
    print("Recall Score :",recall_score(1-np.array(actual), 1-np.array(predicted)))
    print("")
    print("AUROC :",  roc_auc_score(actual, predicted_probability[:,1]))

In [3]:
def plot_roc(y_test, y_pred):
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    _ = plt.title('ROC')
    _ = plt.plot(fpr, tpr, 'b', alpha=0.2)
#     _ = plt.plot([0, 1], [0, 1],'r--', alpha=0.2)
    _ = plt.xlim([0, 1])
    _ = plt.ylim([0, 1])
    _ = plt.ylabel('True Positive Rate')
    _ = plt.xlabel('False Positive Rate')
    plt.show()

In [4]:
def generate_submission(impression_ids, is_clicks, method_name):
    submission_frame = pd.DataFrame(is_clicks, index=impression_ids)
    submission_frame.columns = ['is_click']
    submission_frame.index.name = 'impression_id'
    print(submission_frame.head())
    submission_frame.to_csv('data/test/'+method_name+'.csv')

In [5]:
def flow(X, y, training_func, X_future, y_future_ids):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)
    
    model = training_func(X_train, y_train)
    
    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)
    evaluate(y_test, y_pred, y_pred_prob)
    plot_roc(y_test, y_pred)
    
    y_future_pred = model.predict(X_future)
    generate_submission(y_future_ids, y_future_pred, training_func.__name__)
    

## Data Wrangling User details

In [26]:
# Read Items
item_df = pd.read_csv('data/train/item_data.csv', index_col='item_id')
item_df.head()

Unnamed: 0_level_0,item_price,category_1,category_2,category_3,product_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26880,4602,11,35,20,3040
54939,3513,12,57,85,6822
40383,825,17,8,279,1619
8777,2355,13,58,189,5264
113705,1267,17,39,151,10239


In [36]:
# Reading view logs
view_log_df = pd.read_csv('data/train/view_log.csv', parse_dates=['server_time'])
view_log_df.device_type = view_log_df.device_type.astype('category')
view_log_df.head()

(3118622, 5)

In [39]:
# Merging items and view_logs
user_details = pd.merge(view_log_df, item_df, how='inner', on='item_id')
user_details.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id,item_price,category_1,category_2,category_3,product_type
0,2018-10-15 08:58:00,android,112333,4557,32970,54685,16,56,253,3184
1,2018-10-15 09:36:00,android,783457,88320,32970,54685,16,56,253,3184
2,2018-10-15 10:59:00,android,6902,1711,32970,54685,16,56,253,3184
3,2018-10-15 11:31:00,android,61138,58906,32970,54685,16,56,253,3184
4,2018-10-15 12:03:00,android,441653,64221,32970,54685,16,56,253,3184


In [40]:
user_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3116840 entries, 0 to 3116839
Data columns (total 10 columns):
server_time     datetime64[ns]
device_type     category
session_id      int64
user_id         int64
item_id         int64
item_price      int64
category_1      int64
category_2      int64
category_3      int64
product_type    int64
dtypes: category(1), datetime64[ns](1), int64(8)
memory usage: 240.8 MB


In [38]:
pd.merge(view_log_df, item_df, how='inner', on='item_id').shape

(3116840, 10)

In [10]:
user_habbit = view_log_df[['user_id', 'item_id']]
user_habbit = user_habbit.drop_duplicates()
user_habbit = user_habbit.groupby('user_id').count()
user_habbit.columns = ['items']
user_habbit.head()

Unnamed: 0_level_0,items
user_id,Unnamed: 1_level_1
0,18
1,8
2,130
3,3
4,2


In [11]:
def get_level(count):
    if count <= 4:
        return 0
    elif count <= 11:
        return 1
    elif count <= 27:
        return 2
    else:
        return 3
interested_items['level'] = interested_items['items'].apply(get_level)
interested_items.head()

NameError: name 'interested_items' is not defined

## Feature Engineering

In [None]:
os_version_map = {'old': 0, 'latest': 1, 'intermediate': 2}
def extract_features(dataframe):
    dataframe['user_habbit'] = interested_items.loc[dataframe.user_id, 'items'].values
    dataframe['os_version_int'] = dataframe['os_version'].replace(os_version_map)
    return dataframe[['user_habbit','app_code','os_version_int','is_4G']].values

In [None]:
# Reading Training Data
training_df = pd.read_csv('data/train/train.csv', index_col='impression_id', parse_dates=['impression_time'])
X = extract_features(training_df)
y = training_df['is_click'].values
training_df.head()

In [None]:
# Reading Testing Data
testing_df = pd.read_csv('data/train/train.csv', index_col='impression_id', parse_dates=['impression_time'])
X_future = extract_features(testing_df)
y_future_ids = testing_df.index.values
testing_df.head()

## Trying LightGBM

In [None]:
def tunned_light_gbm(X_train, y_train):
    lgb = LGBMClassifier(eval_metric = 'auc',random_state=2019)
    params = {
        'num_leaves': [35],
        'max_depth': [5,10,15]
    }
    lgb_grid = GridSearchCV(lgb, params, cv=5, scoring='roc_auc')
    lgb_grid.fit(X_train, y_train)
    print('best_estimator', lgb_grid.best_estimator_)
    print('best_score', lgb_grid.best_score_)
    return lgb_grid.best_estimator_

flow(X, y, tunned_light_gbm, X_future, y_future_ids)