# WNS Analytics Wizard

Importing libraries

In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
import matplotlib.pyplot as plt

Creating useful functions

In [2]:
def evaluate(actual, predicted, predicted_probability):
    """Print a classification report for a two-class problem.

    Output, in order: the confusion matrix, F1 / precision / recall computed
    on the labels as given ("Class 1"), the same three metrics computed on the
    flipped labels ``1 - label`` ("Class 0"), and the AUROC.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels. The class-0 section flips them with ``1 - label``,
        so this assumes a 0/1 encoding.
    predicted : array-like
        Hard predictions, same encoding as ``actual``.
    predicted_probability : 2-D array
        Per-class scores; column 1 is used as the score for the AUROC.
    """
    def _print_scores(heading, y_true, y_hat):
        # One section of the report: heading, three metrics, blank line.
        print(heading)
        print("f1 Score :", f1_score(y_true, y_hat))
        print("Precision Score :", precision_score(y_true, y_hat))
        print("Recall Score :", recall_score(y_true, y_hat))
        print("")

    print("Confusion Matrix")
    print(pd.DataFrame(confusion_matrix(actual, predicted)))
    print("")

    _print_scores("For Class 1", actual, predicted)
    # Flipping both vectors makes class 0 the "positive" label.
    _print_scores("For Class 0", 1 - np.array(actual), 1 - np.array(predicted))

    print("AUROC :", roc_auc_score(actual, predicted_probability[:, 1]))

In [3]:
def plot_roc(y_test, y_pred):
    """Plot the ROC curve for ``y_test`` against ``y_pred`` and show it.

    Both arguments are forwarded unchanged to ``roc_curve``; the resulting
    false/true positive rates are drawn on the current pyplot figure with both
    axes clipped to [0, 1].
    """
    false_pos_rate, true_pos_rate, _cutoffs = roc_curve(y_test, y_pred)

    plt.title('ROC')
    plt.plot(false_pos_rate, true_pos_rate, 'b', alpha=0.2)
    # Chance-level diagonal, currently disabled:
    # plt.plot([0, 1], [0, 1], 'r--', alpha=0.2)
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [4]:
def generate_submission(impression_ids, is_clicks, method_name):
    """Write predictions to ``data/test/<method_name>.csv``.

    Builds a one-column frame (``is_click``) indexed by ``impression_id``,
    prints its first rows, and saves it as CSV.

    Parameters
    ----------
    impression_ids : array-like used as the frame's index.
    is_clicks : array-like of predictions, one per impression id.
    method_name : str used as the output file's base name.
    """
    submission = pd.DataFrame(is_clicks, index=impression_ids)
    submission.columns = ['is_click']
    submission = submission.rename_axis('impression_id')
    print(submission.head())

    out_path = ''.join(['data/test/', method_name, '.csv'])
    submission.to_csv(out_path)

In [5]:
def flow(X, y, training_func, X_future, y_future_ids):
    """Train, evaluate on a hold-out split, and write a submission file.

    Steps:
      1. Split ``X`` / ``y`` with ``train_test_split`` (fixed ``random_state=5``).
      2. Fit a model via ``training_func(X_train, y_train)``.
      3. Print metrics with ``evaluate`` and draw the ROC curve with ``plot_roc``.
      4. Predict on ``X_future`` and save the result with ``generate_submission``,
         using ``training_func.__name__`` as the file's base name.

    Parameters
    ----------
    X, y : training features and labels.
    training_func : callable ``(X_train, y_train) -> fitted model``; the model
        must provide ``predict`` and ``predict_proba``.
    X_future : features of the rows to predict for the submission.
    y_future_ids : identifiers for the rows of ``X_future``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5)

    model = training_func(X_train, y_train)

    y_pred = model.predict(X_test)
    y_pred_prob = model.predict_proba(X_test)
    evaluate(y_test, y_pred, y_pred_prob)
    # The ROC curve must be built from continuous scores: with hard 0/1
    # predictions it collapses to a single operating point. Use the same
    # class-1 probability column that `evaluate` uses for the AUROC.
    plot_roc(y_test, y_pred_prob[:, 1])

    y_future_pred = model.predict(X_future)
    generate_submission(y_future_ids, y_future_pred, training_func.__name__)
    

## Data Wrangling: User Details

In [6]:
# Load the item catalogue, indexed by item_id.
item_df = pd.read_csv(
    'data/train/item_data.csv',
    index_col='item_id',
)
item_df.head()

Unnamed: 0_level_0,item_price,category_1,category_2,category_3,product_type
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26880,4602,11,35,20,3040
54939,3513,12,57,85,6822
40383,825,17,8,279,1619
8777,2355,13,58,189,5264
113705,1267,17,39,151,10239


In [7]:
# Load the view logs, parsing server_time as datetimes.
user_log_df = pd.read_csv(
    'data/train/view_log.csv',
    parse_dates=['server_time'],
)
# Store device_type as a pandas categorical.
user_log_df['device_type'] = user_log_df['device_type'].astype('category')
user_log_df.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id
0,2018-10-15 08:58:00,android,112333,4557,32970
1,2018-10-15 08:58:00,android,503590,74788,7640
2,2018-10-15 08:58:00,android,573960,23628,128855
3,2018-10-15 08:58:00,android,121691,2430,12774
4,2018-10-15 08:58:00,android,218564,19227,28296


In [8]:
# Attach item attributes to each view-log row (inner join on item_id).
expanded_user_logs = user_log_df.merge(item_df, how='inner', on='item_id')
expanded_user_logs.head()

Unnamed: 0,server_time,device_type,session_id,user_id,item_id,item_price,category_1,category_2,category_3,product_type
0,2018-10-15 08:58:00,android,112333,4557,32970,54685,16,56,253,3184
1,2018-10-15 09:36:00,android,783457,88320,32970,54685,16,56,253,3184
2,2018-10-15 10:59:00,android,6902,1711,32970,54685,16,56,253,3184
3,2018-10-15 11:31:00,android,61138,58906,32970,54685,16,56,253,3184
4,2018-10-15 12:03:00,android,441653,64221,32970,54685,16,56,253,3184


In [9]:
def most_frequent(x):
    """Return the most common non-null value of a Series.

    ``value_counts`` orders values by descending frequency and drops NaN, so
    the first index label is the mode. When the Series has no non-null values
    (empty, or all NaN) there is nothing to pick, and NaN is returned instead
    of raising ``IndexError``.
    """
    counts = x.value_counts()
    if counts.empty:
        return np.nan
    return counts.index[0]

In [12]:
# Per-user aggregation spec for the merged view logs.
# Aggregations are named by string (not np.max / np.min): this matches the
# other entries and avoids pandas' FutureWarning about NumPy callables in agg().
aggregator = {
    'item_price': ['max', 'min'],
    'item_id': ['count', 'nunique'],
    'session_id': 'nunique',
    # Optional "most frequent value" features, currently disabled:
    # 'product_type': most_frequent,
    # 'category_1': most_frequent,
    # 'category_2': most_frequent,
    # 'category_3': most_frequent,
    # 'device_type': most_frequent,
}

In [13]:
# Flat column names, listed in the order the aggregation spec emits them.
cols = [
    'max_price',
    'min_price',
    'total_items',
    'total_unique_items',
    'total_sessions',
    # disabled: 'freq_product_type', 'freq_category_1', 'freq_category_2',
    #           'freq_category_3', 'freq_device'
]
user_details = (
    expanded_user_logs
    .groupby('user_id')
    .agg(aggregator)
)
# Replace the (column, aggregation) MultiIndex with the flat names above.
user_details.columns = cols
user_details.head()

Unnamed: 0_level_0,max_price,min_price,total_items,total_unique_items,total_sessions
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,92160,332,42,18,11
1,12595,383,8,8,3
2,281536,128,165,130,37
3,16640,537,8,3,1
4,58252,1977,2,2,1


In [14]:
# Columns of user_details to treat as categorical before one-hot encoding.
# The list is empty here, so the loop below does not run.
user_cat_cols = []
for col in user_cat_cols:
    # Casts the column in place on user_details.
    user_details[col] = user_details[col].astype('category')
# One-hot encode user_details; the recorded output shows the same five
# int64 columns, i.e. nothing was expanded.
expanded_user_details = pd.get_dummies(user_details)
expanded_user_details.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 89152 entries, 0 to 92586
Data columns (total 5 columns):
max_price             89152 non-null int64
min_price             89152 non-null int64
total_items           89152 non-null int64
total_unique_items    89152 non-null int64
total_sessions        89152 non-null int64
dtypes: int64(5)
memory usage: 4.1 MB


## Feature Engineering

In [15]:
# Impression columns that are one-hot encoded.
cat_cols = ['app_code','os_version']
def extract_features(dataframe, user_features=None):
    """Build the model feature matrix for a frame of impressions.

    Keeps ``user_id``, ``app_code``, ``os_version`` and ``is_4G``, one-hot
    encodes the columns listed in ``cat_cols``, left-joins the per-user
    aggregates on ``user_id`` and returns the result as a NumPy array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Impressions; must contain the four columns above. It is not modified.
    user_features : pandas.DataFrame, optional
        Per-user features keyed by ``user_id``. Defaults to the notebook-level
        ``user_details`` frame.

    Returns
    -------
    numpy.ndarray
        One row per input row. Users missing from ``user_features`` get NaN in
        the joined columns because of the left join.

    NOTE(review): the one-hot columns come from the values present in
    ``dataframe`` itself, so two frames with different ``app_code`` /
    ``os_version`` values yield different column layouts. Confirm train and
    test matrices line up before fitting / predicting.
    """
    if user_features is None:
        user_features = user_details

    selected = dataframe[['user_id','app_code','os_version','is_4G']]
    # Name the columns to encode explicitly instead of casting them to
    # 'category' through .loc on a slice: that raised SettingWithCopyWarning
    # and, on newer pandas, can keep the original dtype so integer codes would
    # not be encoded at all. uint8 pins the indicator dtype across versions.
    encoded = pd.get_dummies(selected, columns=cat_cols, dtype=np.uint8)

    return pd.merge(encoded, user_features, how='left', on='user_id').values

In [16]:
# Load the labelled impressions and build the training inputs.
training_df = pd.read_csv(
    'data/train/train.csv',
    index_col='impression_id',
    parse_dates=['impression_time'],
)
X = extract_features(training_df)    # feature matrix
y = training_df['is_click'].values   # click labels
X[:1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


array([[8.7862e+04, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.00

In [17]:
# Load the unlabelled impressions to score for the submission.
testing_df = pd.read_csv(
    'data/test/test.csv',
    index_col='impression_id',
    parse_dates=['impression_time'],
)
X_future = extract_features(testing_df)   # feature matrix
y_future_ids = testing_df.index.values    # impression ids
X_future[:1]

array([[4.4754e+04, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.00

## Trying LightGBM

In [18]:
def tunned_light_gbm(X_train, y_train):
    """Grid-search a LightGBM classifier and return the best estimator.

    Runs a 10-fold ``GridSearchCV`` scored by ROC AUC over ``num_leaves``,
    ``max_depth`` and ``min_data_in_leaf`` (3 values each, 27 combinations),
    prints the best estimator and its cross-validated score, and returns the
    best estimator refit by the search.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to the search.
    """
    # `eval_metric` is an argument of LGBMClassifier.fit(), not a constructor
    # parameter; passed to the constructor it is only forwarded to the booster
    # as an unrecognised key. `metric` is the booster parameter for this.
    lgb = LGBMClassifier(metric='auc', random_state=2019)
    params = {
        'num_leaves': [30,35,40],
        'max_depth': [5,10,15],
        'min_data_in_leaf': [50,100,150]
    }
    lgb_grid = GridSearchCV(lgb, params, cv=10, scoring='roc_auc')
    lgb_grid.fit(X_train, y_train)
    print('best_estimator', lgb_grid.best_estimator_)
    print('best_score', lgb_grid.best_score_)
    return lgb_grid.best_estimator_

# Run the full pipeline with the grid-searched LightGBM model.
flow(
    X=X,
    y=y,
    training_func=tunned_light_gbm,
    X_future=X_future,
    y_future_ids=y_future_ids,
)

MemoryError: 