# Avazu CTR Prediction

## Imports

In [53]:
'''Specifying the necessary imports'''
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import sklearn
import matplotlib.dates as mdates
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score, auc

In [54]:
'''Ignore Warning Messages'''
import warnings
warnings.filterwarnings('ignore')

In [55]:
'''Set style to GGplot for asthetics'''
matplotlib.style.use('ggplot')


In [56]:
'''Helper function for date column which will be passed in when we import dataset'''
parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')

In [57]:
''' Sample from Training Data down to 1 million records'''

n = 40428966  #total number of records in the clickstream data 
sample_size = 1000000
skip_values = sorted(random.sample(range(1,n), n-sample_size)) 

#Tracking the indices of rows to be skipped at random in the next stage i.e the LOADING stage 


In [58]:
'''
Memory optimization-by changing the int32 to int64-massiely reduced the size of the dataset for faster proccessing
'''
data_types = {
    'id': np.str,
    'click': np.bool_,
    'hour': np.str,
    'C1': np.uint16,
    'banner_pos': np.uint16,
    'site_id': np.object,
    'site_domain': np.object,
    'site_category': np.object,
    'app_id': np.object,
    'app_domain': np.object,
    'app_category': np.object,
    'device_id': np.object,
    'device_ip': np.object,
    'device_model': np.object,
    'device_type': np.uint16,
    'device_conn_type': np.uint16,
    'C14': np.uint16,
    'C15': np.uint16,
    'C16': np.uint16,
    'C17': np.uint16,
    'C18': np.uint16,
    'C19': np.uint16,
    'C20': np.uint16,
    'C21': np.uint16
}

train_data = pd.read_csv('train.csv', parse_dates = ['hour'],
                        date_parser = parse_date, skiprows = skip_values , 
                        dtype = data_types )




In [59]:
'''Create a separate data frame where clicks = 1'''

train_data_clicks = train_data[train_data['click']==1]

In [60]:
## Feature Engineering

In [61]:
''' Click through rate for entire dataset'''
rows = train_data.shape[0]

click_through_rate = train_data['click'].value_counts()/rows 

In [62]:
'''Dataset ranges from 2014-10-21 to 2014-10-30-about 9 days'''
train_data.hour.describe() 


count                 1000001
unique                    240
top       2014-10-22 09:00:00
freq                    11241
first     2014-10-21 00:00:00
last      2014-10-30 23:00:00
Name: hour, dtype: object

In [63]:
'''Impressions V/S Clicks'''

df_impressions = train_data.groupby('hour').agg({'click':'sum'})

df_click = train_data[train_data['click']==1]

temp_click = df_click.groupby('hour').agg({'click' : 'sum'})



In [64]:
''' Create 3 new features based on the results from the cell above
     1. hour_in_day-The hour of the event
     2. weekday -- What day was the ad clicked
     3. Day_name -- Take the specific day from hour
'''

train_data['hour_in_day'] = train_data['hour'].apply(lambda val : val.hour)
train_data_clicks['hour_in_day'] = train_data_clicks['hour'].apply(lambda val : val.hour)

train_data['weekday'] = train_data['hour'].apply(lambda val: val.dayofweek)
train_data_clicks['weekday'] = train_data_clicks['hour'].apply(lambda val: val.dayofweek)

train_data['day_name'] = train_data['hour'].apply(lambda x: x.strftime('%A'))
train_data_clicks['day_name'] = train_data_clicks['hour'].apply(lambda x: x.strftime('%A'))



In [65]:
train_data.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'hour_in_day',
       'weekday', 'day_name'],
      dtype='object')

In [66]:
'''Create a new dataframe from hour'''

hour_df = pd.DataFrame() 

hour_df['hr'] = train_data_clicks[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['hour_in_day']


In [67]:
'''Cicks vs hour'''

hour_df['pos_clicks'] = train_data_clicks[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['click']
            

In [68]:
'''Clicks and impressions vs hours'''

hour_df['impressions_total'] = train_data[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['click']
            


In [69]:
'''
Create click through rate column-multiply by 100 for percentages
'''

hour_df['click_through_rate'] = 100*hour_df['pos_clicks']/hour_df['impressions_total']

In [70]:
device_temp = train_data[['device_type','click']].groupby(['device_type','click'])

In [71]:
app_features = ['app_id', 'app_domain', 'app_category']

In [72]:
'''Create area feature-subtraction gives best results'''
train_data['area'] = train_data['C15']-train_data['C16']

In [73]:
'''Create others variable to capture the variety of site category'''
train_data['site_category'] = train_data['site_category'].apply(lambda x: 'others' if x not in ['50e219e0','f028772b','28905ebd','3e814130'] else x)

In [74]:
'''Create others for app category as well'''

train_data['app_category'] = train_data['app_category'].apply(lambda x: 'others' if x not in ['07d7df22', '0f2161f8'] else x)

## Build models

### Data Preperation

In [75]:
model_features = ['weekday', 'hour_in_day',
                  'banner_pos', 'site_category',
                  'device_conn_type', 'app_category',
                  'device_type', 'area']

In [76]:
model_target = 'click'

In [77]:
'''Use a sample for processing-total entries is 30,000'''
train_model = train_data[model_features+[model_target]].sample(frac=0.1,random_state=42)

In [78]:
'''One hot encode features for modeling'''
def one_hot_features(data_frame, feature_set):
    new_data_frame = pd.get_dummies(data_frame,
                                     columns = feature_set,
                                    sparse = True)

    return new_data_frame

In [79]:
train_model = one_hot_features(train_model,
                                ['site_category',
                                 'app_category',
                                 'banner_pos'])


In [80]:
model_features = np.array(train_model.columns[train_model.columns!=model_target].tolist())

## Xgboost

In [93]:
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import classification_report


In [83]:
'''Create test train split for XGboost'''
x_train, x_valid, y_train, y_valid = train_test_split(
    train_model[model_features].values,
    train_model[model_target].values,
    test_size=0.3,
    random_state=42)

In [84]:
model = XGBClassifier()
xgb_clf = model

In [85]:
'''Run model with auc as eval metric'''
xgb_clf.fit(x_train, y_train, early_stopping_rounds=100,
            eval_metric="auc", eval_set=[(x_valid, y_valid)])

[0]	validation_0-auc:0.603814
Will train until validation_0-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.603349
[2]	validation_0-auc:0.605381
[3]	validation_0-auc:0.605381
[4]	validation_0-auc:0.605381
[5]	validation_0-auc:0.610638
[6]	validation_0-auc:0.610639
[7]	validation_0-auc:0.611114
[8]	validation_0-auc:0.612332
[9]	validation_0-auc:0.620312
[10]	validation_0-auc:0.620157
[11]	validation_0-auc:0.620161
[12]	validation_0-auc:0.623171
[13]	validation_0-auc:0.62249
[14]	validation_0-auc:0.622492
[15]	validation_0-auc:0.62318
[16]	validation_0-auc:0.623638
[17]	validation_0-auc:0.623656
[18]	validation_0-auc:0.62178
[19]	validation_0-auc:0.621885
[20]	validation_0-auc:0.621879
[21]	validation_0-auc:0.621921
[22]	validation_0-auc:0.623751
[23]	validation_0-auc:0.622504
[24]	validation_0-auc:0.626604
[25]	validation_0-auc:0.625421
[26]	validation_0-auc:0.625743
[27]	validation_0-auc:0.626652
[28]	validation_0-auc:0.625749
[29]	validation_0-auc:0.625493
[30]	validation_0-

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [86]:
'''Run model with aucpr as eval metric'''
xgb_clf.fit(x_train, y_train, early_stopping_rounds=100,
            eval_metric="aucpr", eval_set=[(x_valid, y_valid)])

[0]	validation_0-aucpr:0.243317
Will train until validation_0-aucpr hasn't improved in 100 rounds.
[1]	validation_0-aucpr:0.245373
[2]	validation_0-aucpr:0.248288
[3]	validation_0-aucpr:0.248288
[4]	validation_0-aucpr:0.248288
[5]	validation_0-aucpr:0.252658
[6]	validation_0-aucpr:0.253049
[7]	validation_0-aucpr:0.253598
[8]	validation_0-aucpr:0.253912
[9]	validation_0-aucpr:0.255782
[10]	validation_0-aucpr:0.256727
[11]	validation_0-aucpr:0.256765
[12]	validation_0-aucpr:0.257139
[13]	validation_0-aucpr:0.256575
[14]	validation_0-aucpr:0.256596
[15]	validation_0-aucpr:0.257176
[16]	validation_0-aucpr:0.257607
[17]	validation_0-aucpr:0.25762
[18]	validation_0-aucpr:0.257987
[19]	validation_0-aucpr:0.258043
[20]	validation_0-aucpr:0.25805
[21]	validation_0-aucpr:0.258065
[22]	validation_0-aucpr:0.257761
[23]	validation_0-aucpr:0.257025
[24]	validation_0-aucpr:0.261868
[25]	validation_0-aucpr:0.260257
[26]	validation_0-aucpr:0.26048
[27]	validation_0-aucpr:0.261937
[28]	validation_0-aucp

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [88]:
y_pred = xgb_clf.predict(x_valid)
predictions = [round(value) for value in y_pred]

In [90]:
from sklearn import metrics

print(metrics.confusion_matrix(y_valid, predictions))

[[24918    40]
 [ 4989    53]]


In [91]:
print(metrics.roc_auc_score(y_valid, predictions))


0.5044545045911165


In [92]:
print(metrics.accuracy_score(y_valid, predictions))

0.8323666666666667


The model appears to perform when looking at the evaluation metrics, but the confusion matrix suggests otherwise. The Roc Auc score hovers around 50, which suggests the mdoel is not doing a good job of predicting.