# Avazu CTR Boosting Models

1) Import, clean, engineer features, hashing
2) Intro to boosting
3) Run a baseline model-no custom parameters
4) Add custom hyperparameters to model to reduce overfitting
5) Run CV to improve results and find best params
6) Run model with the improved params we have found

### Import Libraries

In [22]:
import numpy as np
from sklearn.metrics import (roc_auc_score, mean_absolute_error, 
mean_squared_error, average_precision_score, confusion_matrix,
classification_report)
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import random

### Ignore Warnings

In [5]:
'''Ignore Warning Messages'''
import warnings
warnings.filterwarnings('ignore')

### Matplotlib Style

In [6]:
'''Set style to GGplot for asthetics'''
plt.style.use('ggplot')


In [16]:
df = pd.read_csv('/Users/Pierre/Desktop/repos/BidTime/data/samples/avazu_sample_300k.csv')

## Data Prepation

In [17]:
for c in df.columns:
    df[c]=df[c].apply(str)
    le=preprocessing.LabelEncoder().fit(df[c])
    df[c] =le.transform(df[c])
    pd.to_numeric(df[c]).astype(np.float)

In [18]:
X = df.drop(['click', 'id', 'hour', 'device_id', 'device_ip'], axis=1).values
Y = df['click'].values

In [19]:
n_train = 100000
X_train = X[:n_train]
Y_train = Y[:n_train]
X_test = X[n_train:]
Y_test = Y[n_train:]

In [21]:
enc = OneHotEncoder(handle_unknown='ignore')
X_train_enc = enc.fit_transform(X_train)
X_test_enc = enc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
DT = DecisionTreeClassifier(random_state=42)


In [28]:
print('Training samples: {0}'.format(n_train))
print('ROC of AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred_LR)))
print('Mean Absolute Error on testing set: {1:.3f}'.format(n_train, mean_absolute_error(Y_test, pred_LR)))
print('Root Mean Squared Error on testing set: {1:.3f}'.format(n_train, (mean_squared_error(Y_test, pred_LR)**2)))
print('Average Precision Recall Score on testing set: {1:.3f}'.format(n_train, (average_precision_score(Y_test, pred_LR))))



Training samples: 100000
ROC of AUC on testing set: 0.717
Mean Absolute Error on testing set: 0.255
Root Mean Squared Error on testing set: 0.017
Average Precision Recall Score on testing set: 0.333


In [40]:
rounded_pred_LR = [round(value) for value in pred_LR]
confusion_matrix(Y_test, rounded_pred_LR)

array([[163348,   2535],
       [ 31407,   2710]])

In [58]:
target_names = ['No Click','Click']
cr_pred_LR = LR.predict(X_test_enc)
print(str(classification_report(Y_test, cr_pred_LR,target_names=target_names)))

              precision    recall  f1-score   support

    No Click       0.84      0.98      0.91    165883
       Click       0.52      0.08      0.14     34117

   micro avg       0.83      0.83      0.83    200000
   macro avg       0.68      0.53      0.52    200000
weighted avg       0.78      0.83      0.77    200000



In [69]:
SGD = SGDClassifier(loss='log', random_state=42)
SGD.fit(X_train_enc.toarray(), Y_train)

pred_SGD = SGD.predict_proba(X_test_enc.toarray())[:, 1]


In [62]:
print('Training samples: {0}'.format(n_train))
print('ROC of AUC on testing set: {1:.3f}'.format(n_train, roc_auc_score(Y_test, pred_SGD)))
print('Mean Absolute Error on testing set: {1:.3f}'.format(n_train, mean_absolute_error(Y_test, pred_SGD)))
print('Root Mean Squared Error on testing set: {1:.3f}'.format(n_train, (mean_squared_error(Y_test, pred_SGD)**2)))
print('Average Precision Recall Score on testing set: {1:.3f}'.format(n_train, (average_precision_score(Y_test, pred_SGD))))



Training samples: 100000
ROC of AUC on testing set: 0.717
Mean Absolute Error on testing set: 0.260
Root Mean Squared Error on testing set: 0.017
Average Precision Recall Score on testing set: 0.335


In [63]:
rounded_pred_SGD = [round(value) for value in pred_SGD]
confusion_matrix(Y_test, rounded_pred_SGD)

array([[163073,   2810],
       [ 31117,   3000]])

In [72]:
cr_pred_SGD = SGD.predict(X_test_enc)
print(str(classification_report(Y_test, cr_pred_SGD, target_names=target_names)))

              precision    recall  f1-score   support

    No Click       0.84      0.98      0.91    165883
       Click       0.52      0.09      0.15     34117

   micro avg       0.83      0.83      0.83    200000
   macro avg       0.68      0.54      0.53    200000
weighted avg       0.78      0.83      0.78    200000

