# Avazu CTR Prediction

## Imports

In [1]:
'''Specifying the necessary imports'''
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import sklearn
import matplotlib.dates as mdates
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score, auc

In [2]:
'''Ignore Warning Messages'''
import warnings
warnings.filterwarnings('ignore')

In [3]:
'''Set style to GGplot for asthetics'''
matplotlib.style.use('ggplot')


In [4]:
'''Helper function for date column which will be passed in when we import dataset'''
parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')

In [5]:
''' Sample from Training Data down to 1 million records'''

n = 40428966  #total number of records in the clickstream data 
sample_size = 100000
skip_values = sorted(random.sample(range(1,n), n-sample_size)) 

#Tracking the indices of rows to be skipped at random in the next stage i.e the LOADING stage 


In [6]:
'''
Memory optimization-by changing the int32 to int64-massiely reduced the size of the dataset for faster proccessing
'''
data_types = {
    'id': np.str,
    'click': np.bool_,
    'hour': np.str,
    'C1': np.uint16,
    'banner_pos': np.uint16,
    'site_id': np.object,
    'site_domain': np.object,
    'site_category': np.object,
    'app_id': np.object,
    'app_domain': np.object,
    'app_category': np.object,
    'device_id': np.object,
    'device_ip': np.object,
    'device_model': np.object,
    'device_type': np.uint16,
    'device_conn_type': np.uint16,
    'C14': np.uint16,
    'C15': np.uint16,
    'C16': np.uint16,
    'C17': np.uint16,
    'C18': np.uint16,
    'C19': np.uint16,
    'C20': np.uint16,
    'C21': np.uint16
}

train_data = pd.read_csv('train.csv', parse_dates = ['hour'],
                        date_parser = parse_date, skiprows = skip_values , 
                        dtype = data_types )




In [7]:
'''Create a separate data frame where clicks = 1'''

train_data_clicks = train_data[train_data['click']==1]

In [8]:
## Feature Engineering

In [9]:
''' Click through rate for entire dataset'''
rows = train_data.shape[0]

click_through_rate = train_data['click'].value_counts()/rows 

In [10]:
'''Dataset ranges from 2014-10-21 to 2014-10-30-about 9 days'''
train_data.hour.describe() 


count                  100001
unique                    240
top       2014-10-22 09:00:00
freq                     1102
first     2014-10-21 00:00:00
last      2014-10-30 23:00:00
Name: hour, dtype: object

In [11]:
'''Impressions V/S Clicks'''

df_impressions = train_data.groupby('hour').agg({'click':'sum'})

df_click = train_data[train_data['click']==1]

temp_click = df_click.groupby('hour').agg({'click' : 'sum'})



In [12]:
''' Create 3 new features based on the results from the cell above
     1. hour_in_day-The hour of the event
     2. weekday -- What day was the ad clicked
     3. Day_name -- Take the specific day from hour
'''

train_data['hour_in_day'] = train_data['hour'].apply(lambda val : val.hour)
train_data_clicks['hour_in_day'] = train_data_clicks['hour'].apply(lambda val : val.hour)

train_data['weekday'] = train_data['hour'].apply(lambda val: val.dayofweek)
train_data_clicks['weekday'] = train_data_clicks['hour'].apply(lambda val: val.dayofweek)

train_data['day_name'] = train_data['hour'].apply(lambda x: x.strftime('%A'))
train_data_clicks['day_name'] = train_data_clicks['hour'].apply(lambda x: x.strftime('%A'))



In [13]:
train_data.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'hour_in_day',
       'weekday', 'day_name'],
      dtype='object')

In [14]:
'''Create a new dataframe from hour'''

hour_df = pd.DataFrame() 

hour_df['hr'] = train_data_clicks[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['hour_in_day']


In [15]:
'''Cicks vs hour'''

hour_df['pos_clicks'] = train_data_clicks[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['click']
            

In [16]:
'''Clicks and impressions vs hours'''

hour_df['impressions_total'] = train_data[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['click']
            


In [17]:
'''
Create click through rate column-multiply by 100 for percentages
'''

hour_df['click_through_rate'] = 100*hour_df['pos_clicks']/hour_df['impressions_total']

In [18]:
device_temp = train_data[['device_type','click']].groupby(['device_type','click'])

In [19]:
app_features = ['app_id', 'app_domain', 'app_category']

In [20]:
'''Create area feature-subtraction gives best results'''
train_data['area'] = train_data['C15']-train_data['C16']

In [21]:
'''Create others variable to capture the variety of site category'''
train_data['site_category'] = train_data['site_category'].apply(lambda x: 'others' if x not in ['50e219e0','f028772b','28905ebd','3e814130'] else x)

In [22]:
'''Create others for app category as well'''

train_data['app_category'] = train_data['app_category'].apply(lambda x: 'others' if x not in ['07d7df22', '0f2161f8'] else x)

## Build models

### Data Preperation

In [23]:
model_features = ['weekday', 'hour_in_day',
                  'banner_pos', 'site_category',
                  'device_conn_type', 'app_category',
                  'device_type', 'area']

In [24]:
model_target = 'click'

In [25]:
'''Use a sample for processing-total entries is 30,000'''
train_model = train_data[model_features+[model_target]].sample(frac=0.1,random_state=42)

In [26]:
'''One hot encode features for modeling'''
def one_hot_features(data_frame, feature_set):
    new_data_frame = pd.get_dummies(data_frame,
                                     columns = feature_set,
                                    sparse = True)

    return new_data_frame

In [27]:
train_model = one_hot_features(train_model,
                                ['site_category',
                                 'app_category',
                                 'banner_pos'])


In [28]:
model_features = np.array(train_model.columns[train_model.columns!=model_target].tolist())

## Logistic Regression


In [91]:
from sklearn.linear_model import LogisticRegression

In [92]:
lr_clf = LogisticRegression(random_state=42, solver='lbfgs',
                         multi_class='ovr').fit(x_train, y_train)

In [93]:
y_pred = lr_clf.predict(x_valid)
predictions = [round(value) for value in y_pred]

In [94]:
print(metrics.confusion_matrix(y_valid, predictions))

[[2529    0]
 [ 471    0]]


In [95]:
print(metrics.roc_auc_score(y_valid, predictions))


0.5


In [96]:
print(metrics.accuracy_score(y_valid, predictions))

0.843


## Adaboost

In [97]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


In [109]:
bdt_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=500),
                         algorithm="SAMME",
                         n_estimators=200)

In [110]:
bdt_clf.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=500,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=200, random_state=None)

In [111]:
y_pred = bdt.predict(x_valid)
predictions = [round(value) for value in y_pred]

In [112]:
print(metrics.confusion_matrix(y_valid, predictions))

[[2389  140]
 [ 437   34]]


In [113]:
print(metrics.roc_auc_score(y_valid, predictions))


0.5084144937829458


In [114]:
print(metrics.accuracy_score(y_valid, predictions))

0.8076666666666666


## Xgboost

In [115]:
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import classification_report


In [116]:
'''Create test train split for XGboost'''
x_train, x_valid, y_train, y_valid = train_test_split(
    train_model[model_features].values,
    train_model[model_target].values,
    test_size=0.3,
    random_state=42)

In [253]:
model = XGBClassifier(max_depth=1000, n_estimators=200, objective="rank:pairwise", feature_selector="cyclic")
xgb_clf = model

In [259]:
'''Run model with auc as eval metric'''
xgb_clf.fit(x_train, y_train, early_stopping_rounds=100,
            eval_metric="auc", eval_set=[(x_valid, y_valid)])

[0]	validation_0-auc:0.555857
Will train until validation_0-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.557561
[2]	validation_0-auc:0.558001
[3]	validation_0-auc:0.558198
[4]	validation_0-auc:0.559687
[5]	validation_0-auc:0.55905
[6]	validation_0-auc:0.559393
[7]	validation_0-auc:0.558297
[8]	validation_0-auc:0.556325
[9]	validation_0-auc:0.556841
[10]	validation_0-auc:0.556184
[11]	validation_0-auc:0.555977
[12]	validation_0-auc:0.554887
[13]	validation_0-auc:0.55436
[14]	validation_0-auc:0.553196
[15]	validation_0-auc:0.553053
[16]	validation_0-auc:0.552482
[17]	validation_0-auc:0.55293
[18]	validation_0-auc:0.553518
[19]	validation_0-auc:0.552616
[20]	validation_0-auc:0.552182
[21]	validation_0-auc:0.553295
[22]	validation_0-auc:0.552929
[23]	validation_0-auc:0.552948
[24]	validation_0-auc:0.552928
[25]	validation_0-auc:0.553431
[26]	validation_0-auc:0.553153
[27]	validation_0-auc:0.552878
[28]	validation_0-auc:0.55263
[29]	validation_0-auc:0.553064
[30]	validation_0-a

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, feature_selector='cyclic', gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=1000,
       min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
       nthread=None, objective='rank:pairwise', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [260]:
y_pred = xgb_clf.predict(x_valid)
predictions = [round(value) for value in y_pred]

In [261]:
from sklearn import metrics

print(metrics.confusion_matrix(y_valid, predictions))

[[1565  964]
 [ 253  218]]


In [262]:
print(metrics.roc_auc_score(y_valid, predictions))


0.5408333396297219


In [263]:
print(metrics.accuracy_score(y_valid, predictions))

0.5943333333333334


The model appears to perform when looking at the evaluation metrics, but the confusion matrix suggests otherwise. The Roc Auc score hovers around 50, which suggests the mdoel is not doing a good job of predicting.