In this session, we will build a spam classifier using an email dataset. Our goal is to develop optimal models that predict whether an email is spam or not spam based on the word characteristics of each email. 
We have to perform the following steps:


1.   Prepare data
2.   Find the optimal depth for the Decision Tree model and evaluate performance.
3. Fit the Bagging model using multiple bootstrapped datasets and ensemble.
4. Fit a Random Forest model.
5. Explore the Bias vs Variance tradeoff.

## Step 1: Prepare your dataset

In [None]:
import numpy as np
import pandas as pd
import math
import matplotlib
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score,roc_curve
%matplotlib inline
from tqdm import tqdm
from sklearn.model_selection import learning_curve

# Mount Google Drive at /content/drive so files stored there can be read below.
# The helper comes from the google.colab package, so this cell is expected to
# run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read the email dataset from the mounted Drive folder.
# NOTE(review): read_csv uses its default header handling here, so the first
# row of the file is consumed as the header -- confirm the CSV really has one.
df = pd.read_csv("drive/My Drive/Colab Notebooks/Lab3_dataset1.csv")

# Every column except the last gets a generic "Column_<k>" name (k starts at 1);
# the last column is named 'Spam' and is used as the response further down.
n_predictors = df.shape[1] - 1
columns = [f"Column_{k}" for k in range(1, n_predictors + 1)]
columns.append('Spam')
df.columns = columns

# Preview the first rows (shown as the cell output)
df.head()

In [None]:
# Random row-wise split: each row lands in the training set when its uniform
# draw is below 0.7, otherwise in the test set. Seeded for repeatability.
np.random.seed(10)
indx = np.random.rand(len(df)) < 0.7
print(indx)
df_train, df_test = df[indx], df[~indx]

# Separate the predictors from the 'Spam' response in each partition
y_train = df_train['Spam']
x_train = df_train.drop(['Spam'], axis=1)
y_test = df_test['Spam']
x_test = df_test.drop(['Spam'], axis=1)

print(df_train.shape)
print(df_test.shape)

# Share of rows labelled spam in each partition (sum of the response / row count)
pct_spam_train = 100*y_train.sum()/len(y_train)
pct_spam_test = 100*y_test.sum()/len(y_test)
print("Percentage of Spam in Training Set :", str(pct_spam_train)+'%')
print("Percentage of Spam in Testing Set :",str(pct_spam_test)+'%')



## Step 2: Train your Decision Tree model

In [None]:
# Tune max_depth: 5-fold cross-validated accuracy for every depth in
# [tree_start, tree_end).
depth = {}      # max_depth -> mean CV accuracy (read later to pick the best depth)
depth_std = {}  # max_depth -> standard deviation of the CV accuracy across folds
tree_start, tree_end = 3, 30
for i in range(tree_start, tree_end):
    model = DecisionTreeClassifier(max_depth=i)
    scores = cross_val_score(estimator=model, X=x_train, y=y_train, cv=5, n_jobs=-1)
    depth[i] = scores.mean()
    # Record the spread per depth. Taking scores.std() once after the loop
    # would only describe the last depth evaluated and draw the same band
    # width around every point.
    depth_std[i] = scores.std()

# Plot of results
print(depth)
depths_sorted = sorted(depth)
x = np.array(depths_sorted)
y = np.array([depth[d] for d in depths_sorted])
y_err = np.array([depth_std[d] for d in depths_sorted])
plt.ylabel("Cross Validation Accuracy")
plt.xlabel("Maximum Depth")
plt.title('Variation of Accuracy with Depth for Decision Tree Model')
plt.plot(x, y, 'k-', marker='o')
# Shaded band: mean accuracy +/- one standard deviation at each depth
plt.fill_between(x, y - y_err, y + y_err, color='grey', alpha=0.2)
plt.show()

In [None]:
# Keep the depth whose mean cross-validation accuracy is highest.
# On ties the depth that was inserted into `depth` first is kept.
best_depth = max(depth, key=depth.get)
print("The best depth is:", best_depth)

In [None]:
# Evaluate the performance at the selected depth: refit one tree on the
# whole training set.
model = DecisionTreeClassifier(max_depth=best_depth)
model.fit(x_train, y_train)

# Accuracy of spam detection on the training and test sets
acc_train = accuracy_score(y_train, model.predict(x_train))
acc_test = accuracy_score(y_test, model.predict(x_test))
print("Accuracy, Training Set: {:.1%}".format(acc_train))
print("Accuracy, Testing Set: {:.1%}".format(acc_test))

In [None]:
# Confusion matrix of the fitted tree on the test set.
# Predict once and reuse the result for both the plot and the table.
tree_pred_test = model.predict(x_test)

# Stored as `cm` rather than `confusion_matrix`, so the function imported
# from sklearn.metrics under that name is not overwritten by an array.
cm = metrics.confusion_matrix(y_test, tree_pred_test)

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)

cm_display.plot()
plt.show()

# Same counts as a table with row/column totals (shown as the cell output)
pd.crosstab(y_test, tree_pred_test, margins=True, rownames=['Actual'], colnames=['Predicted'])

In [None]:
# ROC analysis of the fitted tree on the test set.
# Column 1 of predict_proba is used as the score for the positive class.
y_proba = model.predict_proba(x_test)[:,1]
auc_value = roc_auc_score(y_test, y_proba, average='macro')
print("Roc AUC:", auc_value)

fpr, tpr, thresholds = roc_curve(y_test, y_proba)
plt.plot(fpr, tpr, label='Decision Tree')
# Dashed reference line for the chance level
plt.plot([0, 1], ls="--",label='Chance level')
# Light grey guide lines
plt.plot([0, 0], [1, 0] , c=".7")
plt.plot([1, 1] , c=".7")
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

## Step 3: Bagging

In [None]:
# Bagging by hand: refit one depth-5 tree on n_trees bootstrap resamples of
# the training data and store every tree's predictions column by column.
np.random.seed(0)
model = DecisionTreeClassifier(max_depth=5)

# One row per observation, one column per bootstrapped tree
n_trees = 100
predictions_train = np.zeros((df_train.shape[0], n_trees))
predictions_test = np.zeros((df_test.shape[0], n_trees))

for tree_idx in range(n_trees):
    # Resample the training rows with replacement, same size as the original
    boot = df_train.sample(frac=1, replace=True)
    boot_y = boot['Spam']
    boot_x = boot.drop(['Spam'], axis=1)
    model.fit(boot_x, boot_y)
    # Predictions are made on the original (non-resampled) train/test predictors
    predictions_train[:, tree_idx] = model.predict(x_train)
    predictions_test[:, tree_idx] = model.predict(x_test)

# Wrap the prediction matrices in labelled DataFrames
columns = [f"Bootstrap-Model_{k}" for k in range(1, n_trees + 1)]
predictions_train = pd.DataFrame(predictions_train, columns=columns)
predictions_test = pd.DataFrame(predictions_test, columns=columns)
print(predictions_train.shape)
print(predictions_test.shape)

In [None]:
# Responses as plain arrays, so the boolean masks below are applied by
# position rather than by the original row labels.
y_train = df_train['Spam'].values
y_test = df_test['Spam'].values

# Number of bootstrapped trees to average over (n_trees)
num_to_avg = 100

fig, axs = plt.subplots(1, 2, figsize=(14, 7))
panels = zip(
    axs,
    ('Train', 'Test'),
    (predictions_train, predictions_test),
    (y_train, y_test),
)
for ax, label, predictions, y in panels:
    # Per-observation mean of the first num_to_avg tree predictions
    mean_predictions = predictions.iloc[:, :num_to_avg].mean(axis=1)
    # One outline histogram per true class, spam first
    for class_value, class_label in ((1, 'Spam'), (0, 'Not-Spam')):
        mean_predictions[y == class_value].hist(
            density=True, histtype='step', range=[0,1],
            label=class_label, lw=2, ax=ax,
        )
    ax.legend(loc='upper center')
    ax.set_xlabel("Mean of ensemble predictions")
    ax.set_title(label)

In [None]:
def get_prediction(df, count=-1):
    """Ensemble the predictions of the bagged decision tree models.

    Parameters
    ----------
    df : DataFrame with one row per observation and one column per model.
    count : number of leading columns to average; -1 (the default) uses
        every column.

    Returns
    -------
    Boolean result of ``row mean > 0.5``: True only when the mean of the
    selected columns is strictly above 0.5, so a mean of exactly 0.5 is False.
    """
    if count == -1:
        count = df.shape[1]
    row_means = df.iloc[:, :count].mean(axis=1)
    return row_means > 0.5

# Performance metrics of the bagged ensemble on the test set.
# The majority-vote prediction is computed once and shared by all metrics.
bagged_pred_test = get_prediction(predictions_test, count=-1)

Accuracy = metrics.accuracy_score(y_test, bagged_pred_test)
Sensitivity = metrics.recall_score(y_test, bagged_pred_test)
# Specificity is the recall with class 0 treated as the positive label
Specificity = metrics.recall_score(y_test, bagged_pred_test, pos_label=0)
F1_score = metrics.f1_score(y_test, bagged_pred_test)

summary = {
    "Accuracy": Accuracy,
    "Sensitivity": Sensitivity,
    "Specificity": Specificity,
    "F1_score": F1_score,
}
print(summary)


In [None]:
# Confusion matrix of the bagged ensemble's majority vote on the test set.
# Stored as `cm` rather than `confusion_matrix`, so the function imported
# from sklearn.metrics under that name is not overwritten by an array.
cm = metrics.confusion_matrix(y_test, get_prediction(predictions_test, count=-1))

cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)

cm_display.plot()
plt.show()

In [None]:
#Fit a Random Forest Model

#Training
# Forest size (n_estimators) and the number of features tried at each split
# (max_features) are separate settings. The square root of the feature count
# is the usual choice for max_features, not for the number of trees. Using it
# as n_estimators would build only a handful of trees, so it goes to
# max_features and the forest uses 100 trees.
rf_n_estimators = 100
model = RandomForestClassifier(
    n_estimators=rf_n_estimators,
    max_features="sqrt",
    max_depth=best_depth,
)
model.fit(x_train, y_train)

#Predict
y_pred_train = model.predict(x_train)
y_pred_test = model.predict(x_test)

#Performance metrics (accuracy as a percentage)
train_score = accuracy_score(y_train, y_pred_train)*100
test_score = accuracy_score(y_test, y_pred_test)*100

print("Accuracy, Training Set :",str(train_score)+'%')
print("Accuracy, Testing Set :",str(test_score)+'%')

In [None]:
# Top features: importances of the fitted model, rescaled so that the most
# important feature scores 100.
raw_importance = model.feature_importances_
feature_importance = 100.0 * (raw_importance / raw_importance.max())

# Ascending order, so the most important feature ends up as the top bar
sorted_idx = feature_importance.argsort()
pos = 0.5 + np.arange(sorted_idx.shape[0])

#Plot
plt.figure(figsize=(10,12))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, x_train.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')

In [None]:
#Plot Learning Curve for training
# Forest size (n_estimators) and the per-split feature subset size
# (max_features) are separate settings. The square root of the feature count
# belongs to max_features; using it as n_estimators would give a forest of
# only a handful of trees.
estimator = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    max_depth=best_depth,
)
# NOTE: `title` is not used by the plot below, which sets its own title.
title = "Learning Curves (Random Forest)"

# 5-fold cross-validated accuracy at 50 training-set fractions from 1% to 100%
train_sizes, train_scores, test_scores = learning_curve(
    estimator, x_train, y_train,
    cv=5, scoring="accuracy", train_sizes=np.linspace(0.01, 1.0, 50),
)

# Mean and spread across the folds for every training-set size
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.subplots(1, figsize=(10,10))
plt.plot(train_sizes, train_mean, '--', color="red",  label="Training score")
plt.plot(train_sizes, test_mean, color="black", label="Cross-validation score")

# Shaded bands: mean +/- one standard deviation across folds
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, color="red", alpha=0.3)
plt.fill_between(train_sizes, test_mean - test_std, test_mean + test_std, color="gray",alpha=0.3)

plt.title("Learning Curve")
plt.xlabel("Training Set Size")
plt.ylabel("Accuracy Score")
plt.legend(loc="best")
plt.show()