In [20]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import numpy as np
from utils import *

In [5]:
# Load the gzip-compressed training CSV into a DataFrame.
# NOTE(review): this is an absolute path on one machine; anyone else re-running
# the notebook has to edit it before this cell will work.
df_train = pd.read_csv(
    '/Users/dysson/Downloads/train.gz',
    compression='gzip',
)

In [6]:
# Swap three opaque column codes for descriptive names.
# The rename mutates df_train in place, so every later cell sees the new names.
df_train.rename(
    columns=dict(
        C1='search_engine_type',
        C14='product_type',
        C15='advertiser_type',
    ),
    inplace=True,
)

In [7]:
# Split the frame into the target (the 'click' column) and the features
# (every column whose name is not 'click').
y = df_train['click']
X = df_train.loc[:, df_train.columns != 'click']

In [8]:
# Keep only the feature columns used for modelling; the remaining columns of X are dropped.
X_reduce = X.loc[:, [
    'hour',
    'search_engine_type',
    'banner_pos',
    'device_type',
    'device_conn_type',
    'product_type',
    'advertiser_type',
    'C16',
    'C17',
    'C18',
    'C19',
    'C20',
    'C21',
]]

In [9]:
# Hold out 20% of the rows as a test set and train on the rest.
# random_state is fixed so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_reduce,
    y,
    test_size=0.2,
    random_state=0,
)

### Decision tree (DT) classifier

In [10]:
# Fit a decision tree on the training split, then predict labels for the test split.
# random_state is pinned because scikit-learn's tree builder randomly permutes
# features at each split; without a seed, equally good candidate splits can be
# chosen differently from run to run, which changes y_pred and every metric
# computed from it.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [11]:
# Build the confusion matrix for the test-split predictions and name its four cells.
# Row 0 / column 0 is read as TN, row 0 / column 1 as FP,
# row 1 / column 0 as FN and row 1 / column 1 as TP.
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp = conf_matrix[0, 0], conf_matrix[0, 1]
fn, tp = conf_matrix[1, 0], conf_matrix[1, 1]

# Show the raw matrix first, then the four labelled counts.
print("confusion_matrix: \n", conf_matrix)
print("\nTN: %s, FP: %s, FN: %s, TP: %s" % (tn, fp, fn, tp))

confusion_matrix: 
 [[6617757   94278]
 [1269830  103929]]

TN: 6617757, FP: 94278, FN: 1269830, TP: 103929


In [12]:
# Rebuild the confusion matrix and unpack its flattened cells as TN, FP, FN, TP.
# NOTE(review): this repeats the computation done in the preceding cell; one of
# the two cells is probably redundant.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
tn, fp, fn, tp = conf_matrix.ravel()

In [13]:
# Estimate total return, total cost and ROI from the confusion-matrix counts.
# The calc_* helpers come from the star-imported utils module and are not shown here.
# NOTE(review): the recorded output (20785.8 / 9910.35 / 2.0974) is consistent with
# return = tp * r, cost = (fp + tp) * cost and ROI = return / cost -- confirm in utils.
r = 0.2      # forwarded to calc_total_return as `r`
cost = 0.05  # forwarded to calc_total_cost as `cost`

total_return = calc_total_return(tp=tp, r=r)
total_cost = calc_total_cost(fp=fp, tp=tp, cost=cost)
roi = calc_roi(total_return=total_return, total_cost=total_cost)

# Money figures are rounded to 2 decimals, the ROI ratio to 4.
print("Total return: %s \nTotal cost: %s \nROI: %s" % (
    round(total_return, 2),
    round(total_cost, 2),
    round(roi, 4),
))

Total return: 20785.8 
Total cost: 9910.35 
ROI: 2.0974


In [14]:
# Evaluate precision and recall.
## average='weighted' computes the metric for each label and averages the results
## weighted by support (the number of true instances of each label). Per the
## scikit-learn docs this alters 'macro' to account for label imbalance, and it can
## result in an F-score that is not between precision and recall.
prec = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
print("Precision: %s, Recall: %s" %(prec, recall))

## Caveat: support-weighted recall is algebraically the same number as accuracy
## (sum of per-class correct predictions / total rows), so it is dominated by the
## majority class. In the recorded run the weighted recall is ~0.83, while the
## confusion matrix gives the click class a recall of only
## 103929 / (103929 + 1269830) ~= 0.076.
## average=None returns one score per label, in sorted label order, so the
## minority-class performance is visible next to the weighted summary.
prec_by_class = precision_score(y_test, y_pred, average = None)
recall_by_class = recall_score(y_test, y_pred, average = None)
print("Per-class precision: %s, Per-class recall: %s" %(prec_by_class, recall_by_class))

Precision: 0.7855484472239275, Recall: 0.8312957268018453
