# Try a classifier with cross-validation and record the accuracy

In [1]:
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
import text_package

# String constants for the e-mail data set.
# CLASS and BOTH are used below as column keys of `train_list`.
# NOTE(review): HAM/SPAM are presumably the label values and SUBJECT/CONTENT
# the remaining column names; they are not used in this part of the notebook,
# so confirm against text_package.
HAM = 'ham'
SPAM = 'spam'
CLASS = 'Class'
SUBJECT = 'Subject'
CONTENT = 'Content'
BOTH = 'Both'

# Load the whole training set from 'train.csv'
# (indexed by column name below and exposes `.values`, so presumably a
# pandas DataFrame; confirm in text_package.get_text_from_csv)
train_list = text_package.get_text_from_csv('train.csv')
# Create the CountVectorizer object with its default settings
simple_vectorizer = CountVectorizer()
# First try it on the BOTH column (the merger of the content and the subject):
# learn the vocabulary and build the document-term count matrix in one step
counts = simple_vectorizer.fit_transform(train_list[BOTH].values)
# And first try with a logistic regression (default hyper-parameters);
# this single estimator instance is shared by all the cross-validation runs below
log_reg = LogisticRegression()


# Create a function to try some n values for cross validation and get the accuracy
def get_cross_val_accuracy_time(n_fold):
    """Cross-validate ``log_reg`` with ``n_fold`` folds and print accuracy and timing.

    Uses the module-level ``log_reg``, ``counts`` and ``train_list[CLASS]``.
    Prints the per-fold accuracies, their mean, the time spent inside
    ``cross_val_score`` and the mean accuracy divided by that time.

    Args:
        n_fold: Value forwarded to ``cross_val_score`` as its ``cv`` argument.

    Returns:
        None. All results are printed.
    """
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() can jump if the system clock changes.
    start_time = time.perf_counter()
    # Cross-validate the model with the data, with 'n_fold' cv
    scores = cross_val_score(log_reg, counts, train_list[CLASS], cv=n_fold, scoring='accuracy')
    # Sample the clock exactly once, right after the computation, so that the
    # printed time and the ratio below use the same duration and the cost of
    # printing is not counted.
    elapsed = time.perf_counter() - start_time
    mean_accuracy = np.mean(scores)

    print('CV = ', n_fold)
    print('['+', '.join('{0:5.3%}'.format(k) for k in scores)+']')
    # Print the accuracy
    print('Accuracy: {0:11.9%}'.format(mean_accuracy))
    # Print the computation time
    print("Time: {:6.4} seconds".format(elapsed))
    # Print the ratio Accuracy calculated/Time it took to calculate it
    print("Accuracy/Time ratio: {0:5.3%}/s\n".format(mean_accuracy / elapsed))

# Try every number of cv folds from 2 to 9 inclusive (range's upper bound 10 is exclusive)
# and print the accuracy and timing report for each one
for i in range(2, 10):
    get_cross_val_accuracy_time(i)

CV =  2
[98.424%, 98.203%]
Accuracy: 98.313549974%
Time:  5.945 seconds
Accuracy/Time ratio: 16.538%/s

CV =  3
[98.526%, 98.652%, 98.385%]
Accuracy: 98.521186060%
Time:  12.29 seconds
Accuracy/Time ratio: 8.015%/s

CV =  4
[98.509%, 98.932%, 98.610%, 98.423%]
Accuracy: 98.618640455%
Time:  21.98 seconds
Accuracy/Time ratio: 4.487%/s

CV =  5
[98.517%, 98.919%, 98.708%, 98.665%, 98.432%]
Accuracy: 98.648301466%
Time:  25.18 seconds
Accuracy/Time ratio: 3.918%/s

CV =  6
[98.551%, 98.780%, 99.008%, 98.551%, 98.602%, 98.551%]
Accuracy: 98.673739176%
Time:  34.82 seconds
Accuracy/Time ratio: 2.834%/s

CV =  7
[98.428%, 98.754%, 99.021%, 98.813%, 98.546%, 98.190%, 98.695%]
Accuracy: 98.635580623%
Time:  41.41 seconds
Accuracy/Time ratio: 2.382%/s

CV =  8
[98.407%, 98.712%, 98.882%, 99.017%, 98.373%, 98.440%, 98.474%, 98.813%]
Accuracy: 98.639818899%
Time:  49.44 seconds
Accuracy/Time ratio: 1.995%/s

CV =  9
[98.399%, 98.704%, 98.780%, 99.123%, 98.932%, 98.360%, 98.627%, 98.398%, 98.779%]

# Get the number of correctly classified and misclassified emails

In [2]:
# Function to get the accuracy in terms of number of emails, instead of ratio
def get_cross_val_accuracy(n_fold, n_emails=None):
    """Print the cross-validated accuracy as e-mail counts instead of ratios.

    Runs ``cross_val_score`` on the module-level ``log_reg``, ``counts`` and
    ``train_list[CLASS]``, then scales each fold accuracy, and the mean
    accuracy, by the size of the data set.

    Args:
        n_fold: Value forwarded to ``cross_val_score`` as its ``cv`` argument.
        n_emails: Number of e-mails the accuracies are scaled by. Defaults to
            the number of rows of ``counts``, so the figures stay correct if
            the training file changes (this was previously a hard-coded 23600).

    Returns:
        None. All results are printed.
    """
    if n_emails is None:
        # One row of the count matrix per e-mail in the training set
        n_emails = counts.shape[0]
    scores = cross_val_score(log_reg, counts, train_list[CLASS], cv=n_fold, scoring='accuracy')
    mean_accuracy = np.mean(scores)
    # Each fold accuracy extrapolated to the whole data set
    print(np.around(scores * n_emails, decimals=1))
    # Print the number of correctly classified and misclassified emails
    print('# Correctly classified: ', round(mean_accuracy * n_emails, 0))
    print('# Misclassified: ', round((1 - mean_accuracy) * n_emails, 0))
    
# Report the correctly classified / misclassified counts for 4-fold cross-validation
get_cross_val_accuracy(4)

[ 23248.1  23348.   23272.   23227.9]
# Correctly classified:  23274.0
# Misclassified:  326.0


# Investigate the errors by computing some scores

In [3]:
# Print the precision, recall and f1 scores
# Evaluate all three weighted metrics in ONE 4-fold cross-validation run:
# cross_validate fits the model once per fold and scores every requested
# metric on it, instead of refitting the same folds once per metric.
from sklearn.model_selection import cross_validate

weighted_scores = cross_validate(
    log_reg, counts, train_list[CLASS], cv=4,
    scoring=['precision_weighted', 'recall_weighted', 'f1_weighted'])
# Per-fold results are stored under 'test_<scorer name>'; report their means
print('Precision: ', np.mean(weighted_scores['test_precision_weighted']))
print('Recall:    ', np.mean(weighted_scores['test_recall_weighted']))
print('F1:        ', np.mean(weighted_scores['test_f1_weighted']))

Precision:  0.986288321708
Recall:     0.986186404548
F1:         0.986183715999
