# Avazu CTR Prediction

## Imports

In [216]:
'''Specifying the necessary imports'''
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import sklearn
import matplotlib.dates as mdates
import random
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import f1_score, auc

In [217]:
'''Ignore Warning Messages'''
import warnings
warnings.filterwarnings('ignore')

In [218]:
'''Set style to GGplot for asthetics'''
matplotlib.style.use('ggplot')


In [219]:
'''Helper function for date column which will be passed in when we import dataset'''
parse_date = lambda val : pd.datetime.strptime(val, '%y%m%d%H')

In [220]:
''' Sample from Training Data down to 1 million records'''

n = 40428966  #total number of records in the clickstream data 
sample_size = 1000000
skip_values = sorted(random.sample(range(1,n), n-sample_size)) 

#Tracking the indices of rows to be skipped at random in the next stage i.e the LOADING stage 


In [221]:
'''
Memory optimization-by changing the int32 to int64-massiely reduced the size of the dataset for faster proccessing
'''
data_types = {
    'id': np.str,
    'click': np.bool_,
    'hour': np.str,
    'C1': np.uint16,
    'banner_pos': np.uint16,
    'site_id': np.object,
    'site_domain': np.object,
    'site_category': np.object,
    'app_id': np.object,
    'app_domain': np.object,
    'app_category': np.object,
    'device_id': np.object,
    'device_ip': np.object,
    'device_model': np.object,
    'device_type': np.uint16,
    'device_conn_type': np.uint16,
    'C14': np.uint16,
    'C15': np.uint16,
    'C16': np.uint16,
    'C17': np.uint16,
    'C18': np.uint16,
    'C19': np.uint16,
    'C20': np.uint16,
    'C21': np.uint16
}

train_data = pd.read_csv('train.csv', parse_dates = ['hour'],
                        date_parser = parse_date, skiprows = skip_values , 
                        dtype = data_types )




In [226]:
'''Create a separate data frame where clicks = 1'''

train_data_clicks = train_data[train_data['click']==1]

In [227]:
## Feature Engineering

In [228]:
''' Click through rate for entire dataset'''
rows = train_data.shape[0]

click_through_rate = train_data['click'].value_counts()/rows 

In [229]:
'''Dataset ranges from 2014-10-21 to 2014-10-30-about 9 days'''
train_data.hour.describe() 


count                 1000001
unique                    240
top       2014-10-22 09:00:00
freq                    11083
first     2014-10-21 00:00:00
last      2014-10-30 23:00:00
Name: hour, dtype: object

In [230]:
'''Impressions V/S Clicks'''

df_impressions = train_data.groupby('hour').agg({'click':'sum'})

df_click = train_data[train_data['click']==1]

temp_click = df_click.groupby('hour').agg({'click' : 'sum'})



In [231]:
''' Create 3 new features based on the results from the cell above
     1. hour_in_day-The hour of the event
     2. weekday -- What day was the ad clicked
     3. Day_name -- Take the specific day from hour
'''

train_data['hour_in_day'] = train_data['hour'].apply(lambda val : val.hour)
train_data_clicks['hour_in_day'] = train_data_clicks['hour'].apply(lambda val : val.hour)

train_data['weekday'] = train_data['hour'].apply(lambda val: val.dayofweek)
train_data_clicks['weekday'] = train_data_clicks['hour'].apply(lambda val: val.dayofweek)

train_data['day_name'] = train_data['hour'].apply(lambda x: x.strftime('%A'))
train_data_clicks['day_name'] = train_data_clicks['hour'].apply(lambda x: x.strftime('%A'))



In [232]:
train_data.columns

Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'hour_in_day',
       'weekday', 'day_name'],
      dtype='object')

In [233]:
'''Create a new dataframe from hour'''

hour_df = pd.DataFrame() 

hour_df['hr'] = train_data_clicks[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['hour_in_day']


In [235]:
'''Cicks vs hour'''

hour_df['pos_clicks'] = train_data_clicks[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['click']
            

In [236]:
'''Clicks and impressions vs hours'''

hour_df['impressions_total'] = train_data[['hour_in_day','click']].groupby(['hour_in_day']).count().reset_index().sort_values('click',ascending=False)['click']
            


In [237]:
'''
Create click through rate column-multiply by 100 for percentages
'''

hour_df['click_through_rate'] = 100*hour_df['pos_clicks']/hour_df['impressions_total']

In [239]:
banner_df['position'] = train_data_clicks[['banner_pos','click']].groupby(['banner_pos']).count().reset_index().sort_values('click',ascending=False)['banner_pos']

In [240]:
banner_df['pos_clicks'] = train_data_clicks[['banner_pos','click']].groupby(['banner_pos']).count().reset_index().sort_values('click',ascending=False)['click']

In [241]:
banner_df['total_impressions'] = train_data[['banner_pos','click']].groupby(['banner_pos']).count().reset_index().sort_values('click',ascending=False)['click']

In [242]:
banner_df['click_pct'] = 100*banner_df['pos_clicks']/banner_df['total_impressions']


In [243]:
list_of_banners = banner_df.sort_values(by='click_pct',ascending=False)['position'].tolist()

In [244]:
device_temp = train_data[['device_type','click']].groupby(['device_type','click'])

In [245]:
dev_type_df_total_imp.columns = ['device_type', 'click2']

merged_df = pd.merge(left = dev_type_df , right = dev_type_df_total_imp,
                    how = 'inner', on = 'device_type')



In [246]:
app_features = ['app_id', 'app_domain', 'app_category']

In [181]:
#Todo:Create 3 features

In [None]:
## Build models

In [247]:
### Data Preperation

In [248]:
model_features = ['weekday', 'hour_in_day',
                  'banner_pos', 'site_category',
                  'device_conn_type', 'app_category',
                  'device_type']

In [249]:
model_target = 'click'

In [187]:
'''Use a sample for processing-total entries is 30,000'''
train_model = train_data[model_features+[model_target]].sample(frac=0.1,random_state=42)

In [188]:
'''One hot encode features for modeling'''
def one_hot_features(data_frame, feature_set):
    new_data_frame = pd.get_dummies(data_frame,
                                     columns = feature_set,
                                    sparse = True)

    return new_data_frame

In [189]:
train_model = one_hot_features(train_model,
                                ['site_category',
                                 'app_category',
                                 'banner_pos'])


In [191]:
model_features = np.array(train_model.columns[train_model.columns!=model_target].tolist())

## Xgboost

In [250]:
import xgboost
from xgboost import XGBClassifier
from sklearn.metrics import classification_report


In [205]:
'''Create test train split for XGboost'''
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train,
    y_train,
    stratify=y_train,
    test_size=0.3,
    random_state=42)

In [206]:
model = XGBClassifier()
xgb_clf = model

In [207]:
'''Run model with auc as eval metric'''
xgb_clf.fit(x_train, y_train, early_stopping_rounds=100,
            eval_metric="auc", eval_set=[(x_valid, y_valid)])

[0]	validation_0-auc:0.604403
Will train until validation_0-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.604403
[2]	validation_0-auc:0.604403
[3]	validation_0-auc:0.604707
[4]	validation_0-auc:0.604707
[5]	validation_0-auc:0.613264
[6]	validation_0-auc:0.61299
[7]	validation_0-auc:0.612104
[8]	validation_0-auc:0.612029
[9]	validation_0-auc:0.613154
[10]	validation_0-auc:0.613154
[11]	validation_0-auc:0.613171
[12]	validation_0-auc:0.613402
[13]	validation_0-auc:0.615491
[14]	validation_0-auc:0.616976
[15]	validation_0-auc:0.616804
[16]	validation_0-auc:0.619002
[17]	validation_0-auc:0.618989
[18]	validation_0-auc:0.619663
[19]	validation_0-auc:0.619605
[20]	validation_0-auc:0.619919
[21]	validation_0-auc:0.619962
[22]	validation_0-auc:0.620568
[23]	validation_0-auc:0.620509
[24]	validation_0-auc:0.620681
[25]	validation_0-auc:0.620733
[26]	validation_0-auc:0.620812
[27]	validation_0-auc:0.619433
[28]	validation_0-auc:0.619994
[29]	validation_0-auc:0.62135
[30]	validation_0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [208]:
'''Run model with aucpr as eval metric'''
xgb_clf.fit(x_train, y_train, early_stopping_rounds=100,
            eval_metric="aucpr", eval_set=[(x_valid, y_valid)])

[0]	validation_0-aucpr:0.235358
Will train until validation_0-aucpr hasn't improved in 100 rounds.
[1]	validation_0-aucpr:0.235358
[2]	validation_0-aucpr:0.235358
[3]	validation_0-aucpr:0.235714
[4]	validation_0-aucpr:0.235714
[5]	validation_0-aucpr:0.24109
[6]	validation_0-aucpr:0.240976
[7]	validation_0-aucpr:0.240078
[8]	validation_0-aucpr:0.240065
[9]	validation_0-aucpr:0.241401
[10]	validation_0-aucpr:0.241401
[11]	validation_0-aucpr:0.241511
[12]	validation_0-aucpr:0.241581
[13]	validation_0-aucpr:0.240698
[14]	validation_0-aucpr:0.241801
[15]	validation_0-aucpr:0.241653
[16]	validation_0-aucpr:0.242363
[17]	validation_0-aucpr:0.241938
[18]	validation_0-aucpr:0.242467
[19]	validation_0-aucpr:0.242455
[20]	validation_0-aucpr:0.242605
[21]	validation_0-aucpr:0.242613
[22]	validation_0-aucpr:0.242542
[23]	validation_0-aucpr:0.242531
[24]	validation_0-aucpr:0.242759
[25]	validation_0-aucpr:0.242768
[26]	validation_0-aucpr:0.242782
[27]	validation_0-aucpr:0.242587
[28]	validation_0-au

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [209]:
y_pred = xgb_clf.predict(x_test)
predictions = [round(value) for value in y_pred]

In [254]:
from sklearn import metrics

print(metrics.confusion_matrix(y_test, predictions))

[[24849     3]
 [ 5143     5]]


In [251]:
print(metrics.roc_auc_score(y_test, predictions))


0.5004252681703188


The model appears to perform when looking at the evaluation metrics, but the confusion matrix suggests otherwise. The Roc Auc score hovers around 50, which suggests the mdoel is not doing a good job of predicting.