# Model Training: This file is used to train models and tune the best performing model

In [1]:
import collections
import numpy as np
import pandas as pd
import csv
import itertools
import random
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler 
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb

In [2]:
df = pd.read_hdf('spam_processed_cv.h5')

In [3]:
df

Unnamed: 0,volume-dflt,spm-lbl-trgt,aa,aaa,aaaa,aaaaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaa,...,þàº,þàç,þàì,þàïà,þàïàº,þàïàô,þàïáö¼ò,þîñæ,þüg,ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿó
0,430,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3247,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2201,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2707,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1185,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11779,2525,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11780,3784,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11781,17801,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11782,2503,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df = df.drop(columns=['volume-dflt'])

In [5]:
df

Unnamed: 0,spm-lbl-trgt,aa,aaa,aaaa,aaaaaa,aaaaaaaaaa,aaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaa,...,þàº,þàç,þàì,þàïà,þàïàº,þàïàô,þàïáö¼ò,þîñæ,þüg,ÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿÿó
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11779,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11780,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11781,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11782,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
X = df.loc[:, df.columns != 'spm-lbl-trgt'].values
y = df['spm-lbl-trgt'].values

In [7]:
clf_nbg = GaussianNB()
clf_dtc = DecisionTreeClassifier()
clf_rf = RandomForestClassifier()
clf_lr = LogisticRegression(max_iter=500)
clf_abc = AdaBoostClassifier()
clf_xgb = xgb.XGBClassifier()

In [12]:
# write the results found for the hidden layer grid search
with open('rf_tuning.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["model/hyperparameter", "acc_score_train", "acc_score_test", "rec_score_train", "rec_score_test", "prec_score_train", "prec_score_test", "f1_score_train", "f1_score_test"])

In [8]:
scaler = StandardScaler()  

def train_model(clf, clf_name):
    kf = KFold(n_splits=5)
    acc_score_train = []
    acc_score_test = []
    rec_score_train = []
    rec_score_test = []
    prec_score_train = []
    prec_score_test = []
    f1_score_train = []
    f1_score_test = []
    count = 1
  
    for train_index , test_index in kf.split(X):
        # apply kfolding
        X_train , X_test = X[train_index,:],X[test_index,:]
        y_train , y_test = y[train_index] , y[test_index]
     
        print("Scaling...")
        # scale the training and test data
        scaler.fit(X_train)  
        X_train = scaler.transform(X_train)  
        X_test = scaler.transform(X_test)  

        print("Fitting...")
        # train the model using early stopping
        clf.fit(X_train,y_train)

        print("Predicting...")
        # predict results
        predict_values_train = clf.predict(X_train)
        predict_values_test = clf.predict(X_test)

        # compute accuracy
        acc_train = accuracy_score(predict_values_train , y_train)
        acc_score_train.append(acc_train)
        acc_test = accuracy_score(predict_values_test , y_test)
        acc_score_test.append(acc_test)

        # compute recall
        rec_train = recall_score(predict_values_train , y_train)
        rec_score_train.append(rec_train)
        rec_test = recall_score(predict_values_test , y_test)
        rec_score_test.append(rec_test)
        
        # compute precision
        prec_train = precision_score(predict_values_train , y_train)
        prec_score_train.append(prec_train)
        prec_test = precision_score(predict_values_test , y_test)
        prec_score_test.append(prec_test)
        
        # compute f1
        f1_train = f1_score(predict_values_train , y_train)
        f1_score_train.append(f1_train)
        f1_test = f1_score(predict_values_test , y_test)
        f1_score_test.append(f1_test)

        # confusion matrix
        #confusion += confusion_matrix(y_test, predict_values_test)
        
        print(str(count) + " fold")
        print("acc_score_train " + str(acc_train))
        print("acc_score_test " + str(acc_test))
        print("rec_score_train " + str(rec_train))
        print("rec_score_test " + str(rec_test))
        print("prec_score_train " + str(prec_train))
        print("prec_score_test " + str(prec_test))
        print("f1_score_train " + str(f1_train))
        print("f1_score_test " + str(f1_test))
        count += 1
      
    # calculate averages of the 5 rounds and write/print results
    avg_acc_score_train = sum(acc_score_train)/5
    avg_acc_score_test = sum(acc_score_test)/5
    avg_rec_score_train = sum(rec_score_train)/5
    avg_rec_score_test = sum(rec_score_test)/5
    avg_prec_score_train = sum(prec_score_train)/5
    avg_prec_score_test = sum(prec_score_test)/5
    avg_f1_score_train = sum(f1_score_train)/5
    avg_f1_score_test = sum(f1_score_test)/5
    print("Overall results: ")
    print("acc_score_train " + str(avg_acc_score_train))
    print("acc_score_test " + str(avg_acc_score_test))
    print("rec_score_train " + str(avg_rec_score_train))
    print("rec_score_test " + str(avg_rec_score_test))
    print("prec_score_train " + str(avg_prec_score_train))
    print("prec_score_test " + str(avg_prec_score_test))
    print("f1_score_train " + str(avg_f1_score_train))
    print("f1_score_test " + str(avg_f1_score_test))
    #print("confusion ", confusion)
    
    # write results to csv to make sure not losing the progress
    with open('rf_tuning.csv', 'a') as file:
        writer = csv.writer(file)
        writer.writerow([clf_name, avg_acc_score_train, avg_acc_score_test , avg_rec_score_train, avg_rec_score_test,avg_prec_score_train ,avg_prec_score_test, avg_f1_score_train, avg_f1_score_test])
        

In [10]:
train_model(clf_nbg, "clf_nbg")

Scaling...
Fitting...
Predicting...
1 fold
acc_score_train 0.9645698525511828
acc_score_test 0.8973271107339839
rec_score_train 1.0
rec_score_test 0.9312406576980568
prec_score_train 0.9404103479036574
prec_score_test 0.8925501432664756
f1_score_train 0.9692901802133137
f1_score_test 0.9114850036576445
Scaling...
Fitting...
Predicting...
2 fold
acc_score_train 0.9586294685477883
acc_score_test 0.8973271107339839
rec_score_train 1.0
rec_score_test 0.9295454545454546
prec_score_train 0.9306666666666666
prec_score_test 0.8917151162790697
f1_score_train 0.9640883977900552
f1_score_test 0.9102373887240357
Scaling...
Fitting...
Predicting...
3 fold
acc_score_train 0.9596902514055373
acc_score_test 0.8913873568095037
rec_score_train 1.0
rec_score_test 0.9318181818181818
prec_score_train 0.9322033898305084
prec_score_test 0.8810888252148997
f1_score_train 0.9649122807017544
f1_score_test 0.9057437407952872
Scaling...
Fitting...
Predicting...
4 fold
acc_score_train 0.9606449559775114
acc_score_

In [11]:
train_model(clf_dtc, "clf_dtc")

Scaling...
Fitting...
Predicting...
1 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9520577004666949
rec_score_train 0.9989306718944929
rec_score_test 0.9559346126510305
prec_score_train 1.0
prec_score_test 0.9634670487106017
f1_score_train 0.9994650499286732
f1_score_test 0.9596860506600071
Scaling...
Fitting...
Predicting...
2 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9431480695799745
rec_score_train 0.9989344698987747
rec_score_test 0.9467625899280575
prec_score_train 1.0
prec_score_test 0.9563953488372093
f1_score_train 0.9994669509594882
f1_score_test 0.9515545914678235
Scaling...
Fitting...
Predicting...
3 fold
acc_score_train 0.9995756868569003
acc_score_test 0.9503606279168434
rec_score_train 0.9992868604029239
rec_score_test 0.9487719298245614
prec_score_train 1.0
prec_score_test 0.9684813753581661
f1_score_train 0.9996433030140895
f1_score_test 0.9585253456221198
Scaling...
Fitting...
Predicting...
4 fold
acc_score_train 0.9994696085711255
acc_score_

In [12]:
train_model(clf_rf, "clf_rf")

Scaling...
Fitting...
Predicting...
1 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9817564700890963
rec_score_train 0.9989306718944929
rec_score_test 0.9828693790149893
prec_score_train 1.0
prec_score_test 0.9863896848137536
f1_score_train 0.9994650499286732
f1_score_test 0.9846263854129423
Scaling...
Fitting...
Predicting...
2 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9800593975392448
rec_score_train 0.9989344698987747
rec_score_test 0.9790915645277577
prec_score_train 1.0
prec_score_test 0.9869186046511628
f1_score_train 0.9994669509594882
f1_score_test 0.9829895041621425
Scaling...
Fitting...
Predicting...
3 fold
acc_score_train 0.9995756868569003
acc_score_test 0.9749681798896903
rec_score_train 0.9992868604029239
rec_score_test 0.9751243781094527
prec_score_train 1.0
prec_score_test 0.9828080229226361
f1_score_train 0.9996433030140895
f1_score_test 0.9789511237959329
Scaling...
Fitting...
Predicting...
4 fold
acc_score_train 0.9994696085711255
acc_score_

In [15]:
train_model(clf_lr, "clf_lr")

Scaling...
Fitting...
Predicting...
1 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9664828171404327
rec_score_train 0.9989306718944929
rec_score_test 0.9653710247349824
prec_score_train 1.0
prec_score_test 0.9785100286532952
f1_score_train 0.9994650499286732
f1_score_test 0.9718961223763786
Scaling...
Fitting...
Predicting...
2 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9673313534153585
rec_score_train 0.9989344698987747
rec_score_test 0.9622775800711744
prec_score_train 1.0
prec_score_test 0.9825581395348837
f1_score_train 0.9994669509594882
f1_score_test 0.9723121179431858
Scaling...
Fitting...
Predicting...
3 fold
acc_score_train 0.9995756868569003
acc_score_test 0.9741196436147646
rec_score_train 0.9992868604029239
rec_score_test 0.9717314487632509
prec_score_train 1.0
prec_score_test 0.9849570200573066
f1_score_train 0.9996433030140895
f1_score_test 0.9782995375311277
Scaling...
Fitting...
Predicting...
4 fold
acc_score_train 0.9994696085711255
acc_score_

In [17]:
train_model(clf_xgb, "clf_xgb")

Scaling...
Fitting...
Predicting...
1 fold
acc_score_train 0.9924684417099819
acc_score_test 0.9728468392023759
rec_score_train 0.9883515707730322
rec_score_test 0.9670406732117812
prec_score_train 0.9991079393398751
prec_score_test 0.9878223495702005
f1_score_train 0.9937006476798864
f1_score_test 0.9773210489014884
Scaling...
Fitting...
Predicting...
2 fold
acc_score_train 0.9920441285668824
acc_score_test 0.9753924480271532
rec_score_train 0.9878691983122363
rec_score_test 0.9653954802259888
prec_score_train 0.9989333333333333
prec_score_test 0.9934593023255814
f1_score_train 0.9933704587642535
f1_score_test 0.9792263610315186
Scaling...
Fitting...
Predicting...
3 fold
acc_score_train 0.9919380502811075
acc_score_test 0.9787865931268561
rec_score_train 0.9874801622288838
rec_score_test 0.9752824858757062
prec_score_train 0.9991079393398751
prec_score_test 0.9892550143266475
f1_score_train 0.9932600212841433
f1_score_test 0.9822190611664295
Scaling...
Fitting...
Predicting...
4 fold


In [9]:
train_model(clf_abc , "clf_abc ")

Scaling...
Fitting...
Predicting...
1 fold
acc_score_train 0.9393232205367561
acc_score_test 0.927874416631311
rec_score_train 0.9323140353891084
rec_score_test 0.9221763085399449
prec_score_train 0.968242640499554
prec_score_test 0.9591690544412608
f1_score_train 0.9499387362156485
f1_score_test 0.9403089887640451
Scaling...
Fitting...
Predicting...
2 fold
acc_score_train 0.9401718468229553
acc_score_test 0.9316928298684769
rec_score_train 0.930137684854666
rec_score_test 0.918100481761872
prec_score_train 0.9728
prec_score_test 0.9694767441860465
f1_score_train 0.9509906152241919
f1_score_test 0.943089430894309
Scaling...
Fitting...
Predicting...
3 fold
acc_score_train 0.9378381245359075
acc_score_test 0.9418752651675859
rec_score_train 0.9293413173652695
rec_score_test 0.9374565670604587
prec_score_train 0.9691347011596788
prec_score_test 0.9663323782234957
f1_score_train 0.9488209606986899
f1_score_test 0.9516754850088184
Scaling...
Fitting...
Predicting...
4 fold
acc_score_train 0

In [None]:
# perform hyperparameter search for random forest

n_estimators = [50, 100, 200, 300]
max_features = ['log2', 'sqrt']
max_depth = [None, 80, 90, 100, 110]

for est in n_estimators:
    for max_f in max_features:
        for max_d in max_depth:
            clf_rf = RandomForestClassifier(n_estimators=est, max_features=max_f, max_depth=max_d)
            clf_name = "n_estimators=" + str(est) + " max_features=" + str(max_f) + " max_depth=" + str(max_d)
            print(clf_name)
            train_model(clf_rf, clf_name)

n_estimators=300 max_features=log2 max_depth=None
Scaling...
Fitting...
Predicting...
1 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9838778107764107
rec_score_train 0.9989306718944929
rec_score_test 0.9801980198019802
prec_score_train 1.0
prec_score_test 0.9928366762177651
f1_score_train 0.9994650499286732
f1_score_test 0.9864768683274022
Scaling...
Fitting...
Predicting...
2 fold
acc_score_train 0.9993635302853506
acc_score_test 0.9855748833262622
rec_score_train 0.9989344698987747
rec_score_test 0.9820402298850575
prec_score_train 1.0
prec_score_test 0.9934593023255814
f1_score_train 0.9994669509594882
f1_score_test 0.9877167630057805
Scaling...
Fitting...
Predicting...
3 fold
acc_score_train 0.9995756868569003
acc_score_test 0.9809079338141705
rec_score_train 0.9992868604029239
rec_score_test 0.9773851590106007
prec_score_train 1.0
prec_score_test 0.9906876790830945
f1_score_train 0.9996433030140895
f1_score_test 0.983991462113127
Scaling...
Fitting...
Predicting...
4 f

In [None]:
# save the scalar to file
scaler = StandardScaler()
scaler.fit(X)  
X = scaler.transform(X)
pickle.dump(scaler, open('final_pj_scaler', 'wb'))

# train the model with best hyperparameters and save to use it for the spam filter app
clf = RandomForestClassifier(n_estimators=300, max_features='log2', max_depth=None)
clf.fit(X,y)
pickle.dump(clf, open('final_pj_model', 'wb'))

# save the features to file
new_df = pd.DataFrame(columns=df.columns)
new_df = new_df.drop(columns=['spm-lbl-trgt'])
new_df.to_pickle("final_pj_features")