In [35]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, roc_curve, auc
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate, GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from utils import *

In [5]:
# Read the gzip-compressed training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook runs elsewhere.
TRAIN_PATH = '/Users/dysson/Downloads/train.gz'
df_train = pd.read_csv(TRAIN_PATH, compression='gzip')

In [6]:
# Give three of the anonymised columns the descriptive names used by the
# feature selection further down. The result is reassigned instead of using
# inplace=True, so the cell contains no in-place mutation.
# NOTE(review): the meanings implied by the new names are not verifiable from
# this notebook; confirm them against the dataset documentation.
df_train = df_train.rename(columns={
    'C1': 'search_engine_type',
    'C14': 'product_type',
    'C15': 'advertiser_type',
})

In [7]:
# Split the frame into predictors and target:
# y is the 'click' column, X is every other column of df_train.
is_feature = df_train.columns != 'click'
X = df_train.loc[:, is_feature]
y = df_train['click']

In [8]:
# Keep only the 13 predictor columns used for modelling.
feature_cols = [
    'hour', 'search_engine_type', 'banner_pos', 'device_type',
    'device_conn_type', 'product_type', 'advertiser_type',
    'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]
X_reduce = X.loc[:, feature_cols]

In [9]:
# Hold out 20% of the rows as a test set. The fixed random_state makes the
# split repeatable from run to run.
split_parts = train_test_split(X_reduce, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### Decision tree classifier

In [10]:
# Fit an unconstrained decision tree on the training split and predict class
# labels for the test split.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split, so an unseeded tree can differ between runs. The value 0
# matches the seed already used for the train/test split.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [11]:
# Build the confusion matrix for the test predictions and name its four cells.
# scikit-learn lays the matrix out with true labels on the rows and predicted
# labels on the columns, so row 0 is (TN, FP) and row 1 is (FN, TP).
conf_matrix = confusion_matrix(y_test, y_pred)
(tn, fp), (fn, tp) = conf_matrix

print("confusion_matrix: \n", conf_matrix)
print("\nTN: %s, FP: %s, FN: %s, TP: %s" % (tn, fp, fn, tp))

confusion_matrix: 
 [[6617757   94278]
 [1269830  103929]]

TN: 6617757, FP: 94278, FN: 1269830, TP: 103929


In [12]:
# Recompute the test-set confusion matrix and unpack it, in row-major order,
# into TN, FP, FN, TP.
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.flatten()

In [13]:
# Estimate total return, total cost and ROI from the confusion-matrix counts.
# The calc_* helpers are not defined in this notebook; presumably they are
# provided by `from utils import *`. Verify their formulas there.
r = 0.2      # forwarded to calc_total_return as `r`
cost = 0.05  # forwarded to calc_total_cost as `cost`

total_return = calc_total_return(tp=tp, r=r)
total_cost = calc_total_cost(fp=fp, tp=tp, cost=cost)
roi = calc_roi(total_return=total_return, total_cost=total_cost)

# Round only for display; the unrounded values stay in the variables above.
summary = (round(total_return, 2), round(total_cost, 2), round(roi, 4))
print("Total return: %s \nTotal cost: %s \nROI: %s" % summary)

Total return: 20785.8 
Total cost: 9910.35 
ROI: 2.0974


In [14]:
# Precision and recall of the decision tree on the test split.
# average='weighted' was chosen because the classes are imbalanced: per the
# scikit-learn docs, metrics are computed for each label and then averaged
# with weights equal to each label's support (its number of true instances).
# This variant of 'macro' accounts for label imbalance and can yield an
# F-score that does not lie between precision and recall.
# Note: support-weighted recall is arithmetically the same as overall
# accuracy, so it does not isolate performance on the positive class.
weighted = {'average': 'weighted'}
prec = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
print("Precision: %s, Recall: %s" % (prec, recall))

Precision: 0.7855484472239275, Recall: 0.8312957268018453


#### Regularization

In [29]:
# Regularise the tree by capping its depth, and compare test-set metrics
# across several caps.
for max_depth_val in [2, 3, 5, 10, 15, 20]:
  # Create and fit model. The tree is seeded so repeated runs give the same
  # tree (scikit-learn permutes candidate features at each split).
  clf = DecisionTreeClassifier(max_depth = max_depth_val, random_state = 0)
  print("Evaluating tree with max_depth = %s" %(max_depth_val))
  clf.fit(X_train, y_train)
  y_pred_reg = clf.predict(X_test)

  # Evaluate confusion matrix, precision, recall
  print("Confusion matrix: ")
  print(confusion_matrix(y_test, y_pred_reg))
  # Very shallow trees can predict a single class, which leaves precision
  # undefined for the class that is never predicted. zero_division=0 scores
  # that case as 0, the same value the default produces, but without
  # emitting UndefinedMetricWarning.
  prec = precision_score(
    y_test, y_pred_reg, average = 'weighted', zero_division = 0)
  recall = recall_score(y_test, y_pred_reg, average = 'weighted')
  print("Precision: %s, Recall: %s" %(prec, recall))

Evaluating tree with max_depth = 2
Confusion matrix: 
[[6712035       0]
 [1373759       0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.6890695827491409, Recall: 0.8301021519964521
Evaluating tree with max_depth = 3
Confusion matrix: 
[[6712035       0]
 [1373759       0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.6890695827491409, Recall: 0.8301021519964521
Evaluating tree with max_depth = 5
Confusion matrix: 
[[6630569   81466]
 [1283819   89940]]
Precision: 0.7845971859152628, Recall: 0.8311501628658855
Evaluating tree with max_depth = 10
Confusion matrix: 
[[6639324   72711]
 [1280889   92870]]
Precision: 0.7911458677939663, Recall: 0.832595289961629
Evaluating tree with max_depth = 15
Confusion matrix: 
[[6634001   78034]
 [1271836  101923]]
Precision: 0.7927868545893796, Recall: 0.8330565928342968
Evaluating tree with max_depth = 20
Confusion matrix: 
[[6633361   78674]
 [1273656  100103]]
Precision: 0.7915212354082443, Recall: 0.8327523555509824


In [31]:
# Cross-validate a depth-limited decision tree on the training split.
# The estimator is defined here explicitly so the cell does not rely on a
# `clf` left over from an earlier cell. max_depth = 20 matches the last tree
# fitted by the depth sweep that precedes this cell in a top-to-bottom run.
clf = DecisionTreeClassifier(max_depth = 20, random_state = 0)

# Set up k-fold
k_fold = KFold(n_splits = 4, random_state = 0, shuffle = True)

# Evaluate precision and recall for each fold in a single pass:
# cross_validate scores every fitted fold with both metrics, so the folds
# are fitted once rather than once per metric.
cv_scores = cross_validate(
  clf, X_train, y_train, cv = k_fold,
  scoring = ['precision_weighted', 'recall_weighted'])
precision = cv_scores['test_precision_weighted']
recall = cv_scores['test_recall_weighted']
print("Precision scores: %s" %(precision,))
print("Recall scores: %s" %(recall,))

Precision scores: [0.79090516 0.79046005 0.79110235 0.79074976]
Recall scores: [0.83265169 0.83243548 0.83291732 0.83269977]


In [32]:
# Compare cross-validated precision (on the training split) with test-set
# precision at several tree depths.
# The splitter is the same for every depth, so it is built once outside the
# loop; with a fixed random_state it yields identical folds on every use.
k_fold = KFold(n_splits = 4, random_state = 0, shuffle = True)

for max_depth_val in [3, 5, 10]:
  # Seeded so the fitted tree is repeatable between runs.
  clf = DecisionTreeClassifier(max_depth = max_depth_val, random_state = 0)
  print("Evaluating Decision Tree for max_depth = %s" %(max_depth_val))
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)

  # Calculate precision for cross validation and test.
  # cross_val_score fits fresh clones of clf on each fold, so the fit above
  # only affects the test-set prediction.
  cv_precision = cross_val_score(
    clf, X_train, y_train, cv = k_fold, scoring = 'precision_weighted')
  # zero_division=0 gives the same score as the default when a class is never
  # predicted, without raising UndefinedMetricWarning. The string scorer used
  # for cross-validation above still uses the default and may warn.
  precision = precision_score(
    y_test, y_pred, average = 'weighted', zero_division = 0)
  print("Cross validation Precision: %s" %(cv_precision,))
  print("Test Precision: %s" %(precision))

Evaluating Decision Tree for max_depth = 3


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Cross validation Precision: [0.68913118 0.68891803 0.68962855 0.68936627]
Test Precision: 0.6890695827491409
Evaluating Decision Tree for max_depth = 5
Cross validation Precision: [0.78438442 0.78405002 0.78430679 0.78443541]
Test Precision: 0.7845971859152628
Evaluating Decision Tree for max_depth = 10
Cross validation Precision: [0.79115099 0.79007214 0.79030502 0.79121424]
Test Precision: 0.7911458677939663


### Random forest

In [34]:
# Create random forest classifier with specified params.
# random_state is fixed so the forest is repeatable between runs.
clf = RandomForestClassifier(n_estimators = 50, max_depth = 5, random_state = 0)

# Fit once, then take both outputs from that single fitted model. Fitting
# separately for each output would train two different forests, so the scores
# and the labels would not describe the same model.
clf.fit(X_train, y_train)

# predict_proba() returns per-class probabilities (used for the ROC curve);
# predict() returns the predicted class labels (used for precision/recall).
y_score = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)

In [None]:
# Get ROC curve metrics from the second column of the predicted probabilities.
# NOTE(review): column 1 is the positive class only if `click` is coded 0/1;
# confirm against clf.classes_.
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])

# Area under the ROC curve. The printed label previously had the two terms
# transposed ("ROC of AUC").
roc_auc = auc(fpr, tpr)
print("ROC AUC: %s" %(roc_auc))

In [None]:
# Support-weighted precision and recall of the predicted labels in y_pred,
# measured against the test labels. Precision is computed first, then recall.
precision, recall = (
  metric(y_test, y_pred, average = 'weighted')
  for metric in (precision_score, recall_score))
print("Precision: %s, Recall: %s" %(precision, recall))

In [None]:
# Create list of hyperparameters: every combination of these values is tried.
n_estimators = [10, 50]
max_depth = [5, 20]
param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}

# Use Grid search CV to find best parameters, ranked by ROC AUC.
# The base forest is seeded so that score differences between grid points
# reflect the hyperparameters and the search is repeatable between runs.
# `cv` is not passed, so GridSearchCV's default splitting strategy applies.
print("starting RF grid search.. ")
rf = RandomForestClassifier(random_state = 0)
clf = GridSearchCV(estimator = rf, param_grid = param_grid, scoring = 'roc_auc')
clf.fit(X_train, y_train)

# Report the best mean cross-validated score and the estimator that achieved it.
print("Best Score: ")
print(clf.best_score_)
print("Best Estimator: ")
print(clf.best_estimator_)