<h1> Spam filter </h1>
Artificial Intelligence I - Continuous Assessment. Solution by Marinara Marcato. Student no: 115105971

Given a labeled dataset containing examples of ham (1650) and spam (1248) emails, create a spam filter using machine learning techniques in order to learn to correctly classify ham/spam emails. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from pprint import pprint

from timeit import default_timer as timer
    
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words

from sklearn.decomposition import TruncatedSVD

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin


#shuffle the dataset and reset index
def Shuffle_DF (df):
    """Return a copy of ``df`` with its rows in random order and a fresh 0..n-1 index.

    The row order is drawn from ``np.random.permutation``, so it follows the
    global NumPy random state. The dataframe passed in is left untouched.
    """
    row_order = np.random.permutation(len(df))
    shuffled = df.take(row_order)
    # drop=True discards the old positions instead of keeping them as a column
    return shuffled.reset_index(drop = True)


def Build_DF (dir, label):
    """Read every e-mail file of each directory into four shuffled, labeled dataframes.

    Parameters
    ----------
    dir : sequence of str
        Directories to read. Every entry of each directory is opened as a
        UTF-8 text file. (The name shadows the builtin ``dir``; it is kept so
        existing callers keep working.)
    label : sequence
        ``label[j]`` is the class label given to every file of ``dir[j]``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, df_filtered, df_balanced, df_filtered_balanced)``, each with the
        columns ``['email', 'label']`` and each shuffled by ``Shuffle_DF``:

        * ``df``: the entire file content;
        * ``df_filtered``: only the text after the first blank line;
        * ``df_balanced`` / ``df_filtered_balanced``: the same two views,
          restricted to the first ``min_ex`` files of every directory, where
          ``min_ex`` is the size of the smallest directory.
    """
    data, data_filtered, data_balanced, data_filtered_balanced = [], [], [], []
    # number of files read from each directory, in the order of ``dir``
    ex_count = []

    for j, folder in enumerate(dir):
        # os.listdir(folder): names of the files in folder, listed once so the
        # count used for balancing always matches what was actually read
        file_names = os.listdir(folder)
        ex_count.append(len(file_names))
        for file_name in file_names:
            with open (os.path.join(folder, file_name), "r", encoding="utf8") as file:
                entire_email = file.read()
            data.append([entire_email, label[j]])
            # The body starts after the first blank line. partition() yields an
            # empty body when there is no blank line, where split(...)[1]
            # raised an IndexError.
            body = entire_email.partition('\n\n')[2]
            data_filtered.append([body, label[j]])

    min_ex = min(ex_count) if ex_count else 0

    # The examples of directory i start at the cumulative count of all previous
    # directories; using the previous directory's own count as the offset is
    # only correct when there are exactly two directories.
    start = 0
    for count in ex_count:
        data_balanced.extend( data[ start : start + min_ex ] )
        data_filtered_balanced.extend( data_filtered[ start : start + min_ex ] )
        start += count

    # transform the lists into labeled pandas dataframes
    df = Shuffle_DF(pd.DataFrame(data, columns = ['email', 'label']))
    df_filtered = Shuffle_DF(pd.DataFrame(data_filtered, columns = ['email', 'label']))

    df_balanced = Shuffle_DF(pd.DataFrame(data_balanced, columns = ['email', 'label']))
    df_filtered_balanced = Shuffle_DF(pd.DataFrame(data_filtered_balanced, columns = ['email', 'label']))
    return df, df_filtered, df_balanced, df_filtered_balanced

def Validate_Pipeline (pipelines, X, y, cross_val, title, label): 
    """Cross-validate several pipelines, print their scores and plot them side by side.

    Parameters
    ----------
    pipelines : sequence of estimators
        Pipelines to evaluate.
    X, y : sequences
        ``X[i]`` and ``y[i]`` are the examples and labels used for
        ``pipelines[i]``. The confusion matrix is unpacked as a 2x2 matrix, so
        the labels must contain exactly two classes.
    cross_val : cross-validation splitter
        Passed unchanged as ``cv`` to ``cross_val_predict`` and ``cross_validate``.
    title : str
        Title of the bar chart.
    label : sequence of str
        ``label[i]`` names ``pipelines[i]`` in the printout and in the legend.

    Returns
    -------
    list of list of float
        One row per pipeline holding, in this order: accuracy, precision,
        recall, F1, mean test score, mean train score (all in %), mean fit
        time and mean score time (in seconds).
    """
    pipe_performance = []
    parameters = ('Accuracy (%)', 'Precision (%)', 'Recall (%)', 'F1(%)', 'test_score (%)', 'train_score (%)', 'fit_time (s)', 'score_time (s)' )
    print (parameters)
    if len(pipelines) == 0:
        # nothing to evaluate or plot; also avoids dividing by zero for the bar width
        return pipe_performance

    for i in range (len(pipelines)):
        [[TN, FP], [FN, TP]] = confusion_matrix( y[i], (cross_val_predict(pipelines[i], X[i], y[i], cv= cross_val)))

        # performance measurements: accuracy, precision, recall and f1 are calculated based on the confusion matrix
        # http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        # http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py
        accuracy = (TN+TP)/(TN+TP+FN+FP) 
        precision = (TP)/(TP+FP) 
        recall = (TP)/(TP+FN) 
        f1 = 2*precision*recall/(precision + recall) 

        # second cross-validation pass: test/train scores and the timings
        score = cross_validate(pipelines[i], X[i], y[i], cv= cross_val, return_train_score=True )

        pipe_performance.append([100*accuracy, 100*precision, 100*recall, 100*f1, 
                                 100*np.mean(score['test_score']), 100*np.mean(score['train_score']), 
                                 np.mean(score['fit_time']), np.mean(score['score_time'])])

        print(label[i], '\t', ["%.5f" % elem for elem in pipe_performance[i]])

    # Plotting the graph for visual performance comparison 
    width = .9/len(pipelines)
    # one tick per metric, so the number of ticks matches the number of tick labels
    index = np.arange(len(parameters))
    colour = ['b', 'r', 'g', 'y', 'm', 'c', 'k']

    # the figure size has to be given when the figure is created; calling
    # plt.figure(figsize=...) afterwards only opens a second, empty figure
    fig, ax = plt.subplots(figsize=(10,20))
    # percentages go on the left axis, timings on a second y axis
    ax2 = ax.twinx()
    for i in range (len(pipelines)):
        # cycle through the colours so more than len(colour) pipelines can be plotted
        bar_colour = colour[i % len(colour)]
        ax.bar(index[0:6] + width*i, pipe_performance[i][0:6], width, color = bar_colour, label = label[i])  
        ax2.bar(index[6:8] + width*i, pipe_performance[i][6:8], width, color = bar_colour, label = label[i])  

    # centre each tick under its group of bars
    ax.set_xticks(index + width*(len(pipe_performance)-1) / 2)
    ax.set_xticklabels(parameters, rotation=45)
    ax.legend()
    ax.set_ylabel('Percentage (%)')
    ax.set_ylim([0,110])
    ax2.set_ylabel('Time (s)')
    plt.title(title)
    plt.show()

    return (pipe_performance)       

def Validate_GridSearchCV(pipe, params, X, y, title=None):
    """Grid-search ``pipe`` over ``params``, report every candidate and return the best pipeline.

    The search uses a 10-fold ``StratifiedKFold``, scores accuracy, precision,
    recall and F1, and refits the candidate with the best precision.

    Parameters
    ----------
    pipe : estimator
        Pipeline whose parameters are searched.
    params : dict
        Parameter grid, passed as ``param_grid`` to ``GridSearchCV``.
    X, y :
        Examples and labels used to fit the search.
    title : str, optional
        Title of the bar chart. When omitted, a module-level variable named
        ``title`` is used if one exists (the function used to read that
        variable implicitly), otherwise the chart has an empty title.

    Returns
    -------
    estimator
        ``best_estimator_`` of the fitted search, i.e. the pipeline refitted on
        all of ``X`` with the parameters that gave the best precision.
    """
    if title is None:
        # keep working for callers that set ``title`` at module level before calling
        title = globals().get('title', '')

    kf_st = StratifiedKFold(n_splits = 10)

    g = GridSearchCV(pipe, param_grid = params, cv= kf_st, refit='precision', scoring= ['accuracy', 'precision', 'recall', 'f1'])
    gs = g.fit(X,y)
    gs_results = gs.cv_results_
    n_candidates = len(gs_results['params'])

    gs_test = ['mean_test_accuracy', 'mean_test_precision', 'mean_test_recall', 'mean_test_f1']
    gs_time = ['mean_fit_time', 'mean_score_time']
    gs_all = gs_test + gs_time 

    # one row per reported measurement, one column per parameter combination
    pipe_performance = np.asarray([gs_results[parameter] for parameter in gs_all])
    pipe_performance = pipe_performance.reshape(len(gs_all), n_candidates)

    # column of the refitted (best precision) candidate
    best = gs.best_index_

    print('Best estimator\n Params:\t', gs.best_params_, 'Best precision score: \t', gs.best_score_)
    print( 'Performance:\t',gs_all)
    print('\t\t', pipe_performance[:,best])

    # Plotting the graph for visual performance comparison 
    width = .9/n_candidates
    # one tick per measurement, so the number of ticks matches the number of tick labels
    index = np.arange(len(gs_all))
    colour = ['b', 'r', 'g', 'y', 'm', 'c', 'k']

    # the figure size has to be given when the figure is created; calling
    # plt.figure(figsize=...) afterwards only opens a second, empty figure
    fig, ax = plt.subplots(figsize=(10,20))
    # scores go on the left axis, timings on a second y axis
    ax2 = ax.twinx()
    for i in range (n_candidates):
        # cycle through the colours so grids of any size can be plotted
        bar_colour = colour[i % len(colour)]
        candidate = str(gs_results['params'][i])
        ax.bar(index[0:4] + width*i, pipe_performance[0:4,i], width, color = bar_colour, label = candidate)  
        ax2.bar(index[4:6] + width*i, pipe_performance[4:6,i], width, color = bar_colour, label = candidate)  

    # centre each tick under its group of bars
    ax.set_xticks(index + width * (n_candidates-1)  / 2)
    ax.set_xticklabels(gs_all, rotation=45)
    ax.legend( loc='upper center', bbox_to_anchor=(1.9, 0.5))
    ax.set_ylabel('Performance')
    ax2.set_ylabel('Time (s)')
    plt.title(title)
    plt.show()

    return g.best_estimator_




<h1> Dataframe</h1>
The very first step is to concatenate the emails, which are stored in .txt files, into a labeled dataframe. The function Build_DF was created for that reason. 

It basically takes as an input the directory where the text files are located and the label associated with it and creates two labeled dataframes where the first column contains the text found in the .txt file and the second column contains the labels, in this case ham = 0, spam = 1. One of the dataframes is composed of the body of the email, while the other also carries the email header. This is because I want to investigate whether the routing information contained in the header would be useful in classifying emails, although I wouldn't expect it to be true.

The separation between header and body of the email was implemented taking into account the protocol definition that specifies: "A message consists of header fields and, optionally, a body. The body is simply a sequence of lines containing ASCII characters. It is separated from the headers by a null line (i.e., a line with nothing preceding the CRLF)." (RFC822:  Standard for ARPA Internet Text Messages, https://www.w3.org/Protocols/rfc822/). Keeping only the body also implies that fewer features will be extracted from the examples, causing the computation associated with the classification algorithm to be much faster.

In [2]:
# Read the 'ham' (label 0) and 'spam' (label 1) directories into the four shuffled dataframes
df, df_filtered, df_balanced, df_filtered_balanced  = Build_DF(['ham', 'spam'], [0, 1])

# encoding label does not make difference in this case
# entire e-mails (header and body), every example
X, y = df['email'].values, df["label"].values

# text after the first blank line only, every example
X_filtered, y_filtered = df_filtered['email'].values, df_filtered['label'].values

# Note that ham = 1650 examples and spam = 1248 (57% examples are ham)
    # Ideally we should use the same number of examples in each class in order to unbias the classifier
        # even though the difference is not very significant in this case, it might improve the classification performance
# entire e-mails, same number of examples per class
X_balanced, y_balanced = df_balanced['email'].values, df_balanced['label'].values

# text after the first blank line only, same number of examples per class
X_filtered_balanced, y_filtered_balanced = df_filtered_balanced['email'].values, df_filtered_balanced['label'].values

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'ham'

<h1> Pipelines </h1>

Pipelines implement a series of transform operations and a final estimator. They are going to be chosen based on the 
dataset (labels (provided/missing), size (no. of examples), type (numeric/nominal/text)) and the type of problem (regression, binary/multiclass classification).

Given that we are provided with a dataset constituted of two classes, this problem is considered a case of binary classification and, as the dataset is labeled, it will be solved using supervised learning techniques.

The feature extraction step takes into account the fact that the data type is text and therefore Vectorization/Tokenizing methods are employed. There are few options already implemented in scikit-learn:
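- CountVectorizer(): separates words into tokens (tokenization) and counts their occurrences in each example; outputs a sparse matrix. 
- TfidfVectorizer(): separates words into tokens (tokenization), counts their occurrences in each example and re-weights the counts by inverse document frequency; outputs a normalized sparse matrix. 
- HashingVectorizer(): separates words into tokens (tokenization) and maps each token to one of a fixed number of columns (n_features) with a hash function, so no vocabulary has to be stored; outputs a sparse matrix.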

This is binary classification, therefore there is no difference between one-versus-one and one-versus-rest estimator, as there are only two categories. 

In [0]:
# Pipelines that implement different vectorization methods; the scaler and the
# estimator are the same in all three, so only the vectorizer differs.
# Because the input to StandardScaler is a sparse matrix, with_mean has to be set to False
pipe_tfidf = Pipeline ([
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LogisticRegression())  
])
pipe_count = Pipeline ([
    ("vectorizer", CountVectorizer(stop_words = 'english', dtype = float)),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LogisticRegression())  
])
pipe_hash = Pipeline ([
    # NOTE(review): n_features was saved as ``2**1*``, which is a syntax error.
    # 2**18 is the presumed intended value ('*' is Shift+8) -- confirm against
    # the run that produced the reported results.
    ("vectorizer", HashingVectorizer(stop_words = 'english', n_features = 2**18, dtype = float)),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LogisticRegression())  
])  


<h1> Performance measurements </h1>
<h2> Cross-validation </h2>
It is important to investigate the ability of the classifier to correctly identify spam and ham email. For that reason, it is necessary to select the most appropriate cross-validation method based on the amount of examples in the dataset.

k-Fold cross-validation was implemented in order to evaluate the performance of multiple pipelines, as the dataset is large enough: a minimum of 30 examples per fold is recommended, and in this case there are approx. 3k examples / 10 folds = 300 examples per fold. Holdout was disregarded because the dataset is not big enough for it: it would deliver different performance values each time due to the shuffle split. 

Stratification is implemented as it is important to make sure that the proportion of examples of each class in the overall dataset is respected while partitioning into training and test sets for performance measurement purposes.

Therefore, the algorithm has to:
 - Partition the dataset (df) into training (X_train, Y_train) and test (X_test, Y_test) sets respecting the proportion of each class (k-Fold stratification)
 - Train the estimator on the training set (X_train, Y_train) and predict the classes (ŷ_test) of the examples in the test set (X_test)
 - Evaluate the estimator's performance by comparing the predictions with the test labels (Y_test)
 - Repeat the steps above K times in the case of K-fold. 

Fortunately, Stratified k-Fold cross-validation is already implemented in scikit-learn in the cross_validate, cross_val_predict and cross_val_score built-in functions for example. 

<h2> Performance Parameters </h2>
- Accuracy: ability of the classifier to correctly classify spam and ham, in other words, the percentage of ham and spam emails that were correctly classified when held out for testing.
- Precision: ability of the classifier to not label a ham email as spam. This is probably the most important parameter for us, as labelling ham as spam is the worst thing a spam filter could do.
- Recall: ability of the classifier to find spam emails
- F1: the harmonic mean of precision and recall

- Test_score: the accuracy in predicting the test data
- Train_score: the accuracy in predicting the train data 

- Fit_time: the averaged amount of time taken by the algorithm to fit the train data 
- Score_time: the averaged amount of time taken by the algorithm to score the test data in each fold


<h1> Impact of the DataFrame choice on the performance </h1> 

Four different dataframes were created as discussed above. Now, we are going to compare how they affect the performance of the classification algorithm. For that purpose, the performance using TFIDF, Count and Hashing Vectorizers and Logistic Regression will be analysed for each of the four dataframes.


<h2> Pipeline: TFIDF Vectorizer + Standard Scaler + Logistic Regression Dataframe </h2> 

The chart below shows a comparison between the impact that different dataframes have on the performance data. As discussed above, the data was slightly unbalanced (57% of the examples were ham), therefore if we build a classifier using all the ham and spam examples, because there are more ham examples, the classifier will tend to classify more spam as ham than the opposite. This explains why the precision was increased for unbalanced data, as the precision is the ability of the classifier to not label ham as spam.

All the results are optimal for filtered and balanced data, the most important one being F1 because it is the harmonic mean of the precision and recall. It is possible to observe that it also provides the fastest fit and score times, as there are fewer examples (balanced) and fewer features (filtered). The filtered features proved to be more meaningful as there was an increase in accuracy compared to the unfiltered data for both balanced and unbalanced datasets.

Conclusively, only the filtered and balanced dataset will be used below to study the effect that the pipeline elements have on the performance parameters as filtering improved the performance and balancing is known for being the best practice in classification problems. Another approach that could be taken is separate the data in two dataframes: one containing the head and the other the body of the email and process them in different pipelines/estimators and then combine their results using ensemble methods.


In [0]:
# Compare the four dataframe variants with one and the same pipeline
# (tf-idf + StandardScaler + Logistic Regression). The banner and the chart
# title used to be copied from the vectorizer comparison and described a
# different experiment.
print("\n Stratified k-Fold cross-validation with Tf-idf Vectorizer and Logistic Regression on the four dataframes")
kf_st = StratifiedKFold(n_splits = 10)
label = ['Unfiltered & Unbalanced', 'Filtered & Unbalanced', 'Unfiltered & Balanced', 'Filtered & Balanced']
# the same pipeline is evaluated once per dataframe
pipe = [pipe_tfidf] * len(label)
title = 'Tf-idf + Standard Scaler + Logistic Regression on each dataframe'
# Xi[k] / yi[k] are the examples and labels matching label[k]
Xi = [X , X_filtered , X_balanced , X_filtered_balanced]
yi = [y, y_filtered, y_balanced, y_filtered_balanced]

LR = Validate_Pipeline (pipe, Xi, yi, kf_st, title, label)

In [0]:
# Vectorizer and scaler only (no estimator): used to look at the size of the
# feature matrix produced from two of the dataframes
transformers = Pipeline (steps = [
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("scaler", StandardScaler(with_mean=False)),
])
# fit_transform is called twice, so the transformers are fitted again for the second dataframe
X_uu = transformers.fit_transform(X, y)
X_fb = transformers.fit_transform(X_filtered_balanced, y_filtered_balanced)
print(
    'Unfiltered and unbalanced feature matrix shape:', X_uu.shape,
    '\n Filtered and Balanced feature matrix shape:', X_fb.shape,
)

<h2> TFIDF, Count and Hashing Vectorizers + Standard Scaler + Logistic Regression  </h2>

From the chart below, it is possible to observe that tfidf vectorizer outperformed the others in terms of Accuracy, Recall, F1, fit time and score time, while the best precision was achieved by Count Vectorizer, offering an improvement of 1.25%. 

Note that the fit time was significantly higher for hashing as there is a lot of processing involved. Therefore, it is not suitable for this application as the feature vector isn't too big for the others to handle. I tried making the n_features parameters a bit smaller to improve the processing time, but it lowered the other metrics 

In [0]:
# Compare the three vectorizer pipelines on the filtered and balanced dataset
print("\n Stratified k-Fold cross-validation with Tf-idf, Count and Hashing Vectorizers and Logistic Regression")
title = 'Logistic Regression with tfidf, count and hashing Vectorizes (using filtered and balanced dataset)'
label = ['tfidf', 'count', 'hashing']
pipe = [pipe_tfidf, pipe_count, pipe_hash]
# every pipeline is evaluated on the same examples and labels
Xi = [X_filtered_balanced for _ in pipe]
yi = [y_filtered_balanced for _ in pipe]
kf_st = StratifiedKFold(n_splits = 10)
LR = Validate_Pipeline (pipe, Xi, yi, kf_st, title, label)

<h2> Applying GridSearch Cross-Validation to find best parameters for TF-IDF   </h2>

In order to find the Tfidf parameter 'norm' that produces the most accurate results in terms of precision, we are going to perform Grid Search CV. The function Validate_GridSearchCV finds the best parameter or combination of parameters, displays it in text and graph format and returns the best pipeline.

In [0]:
# Grid search over the tf-idf normalisation of pipe_tfidf
kf_st = StratifiedKFold(n_splits = 10)
param_grid = dict(vectorizer__norm=[None, 'l1', 'l2'])
# Validate_GridSearchCV reads the module-level ``title`` for its chart, so it
# has to be set before the call
title = 'Tfidf using different normalization methods + StandardScaler + Logistic Regression'
# the helper is named Validate_GridSearchCV; the previous call to
# ``Validate_GridSearch`` raised a NameError
best_tfidf = Validate_GridSearchCV(pipe_tfidf, param_grid, X_filtered_balanced, y_filtered_balanced)

<h2> TFIDF and Count Vectorizers + LSA + Standard Scaler + Logistic Regression  </h2>

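Another useful transformer to try on this type of data, where the number of features is much higher than the number of examples, is dimensionality reduction. It is implemented here with truncated SVD (TruncatedSVD), known as latent semantic analysis (LSA) when applied to term matrices, which plays the same role as Principal Component Analysis (PCA).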

Again, tfidf vectorizer outperformed the others in all performance parameters measured. Note that the duration increase much more for tfidf than the others, it also increased significantly for Count while it remained almost the same for Hashing. This shows the ability and suitability of the Hashing vectorizer to work with large datasets. It is possible to see that there was a small increase in the other parameters (Accuracy, Precision, Recall and F1), however this wouldn't be significant taking into account the extra processing time required.

In [0]:
# Pipelines that add dimensionality reduction (TruncatedSVD) between the
# vectorizer and the scaler. StandardScaler is configured with
# with_mean=False, as in the other pipelines of this notebook.
pipe_tfidf_lsa = Pipeline (steps = [
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("dimens", TruncatedSVD(n_components=500, n_iter=7, random_state=42)),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LogisticRegression()),
])

pipe_count_lsa = Pipeline (steps = [
    ("vectorizer", CountVectorizer(stop_words = 'english', dtype = float)),
    ("dimens", TruncatedSVD(n_components=500, n_iter=7, random_state=42)),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LogisticRegression()),
])

print("\n Stratified k-Fold cross-validation with Tf-idf and Count Vectorizers + LSA + Logistic Regression")
title = 'Tf-idf and count +  LSA + Standard Scaler + Logistic Regression '
label = ['tfidf', 'count']
pipe = [pipe_tfidf_lsa, pipe_count_lsa]
# both pipelines are evaluated on the filtered and balanced dataset
Xi = [X_filtered_balanced for _ in pipe]
yi = [y_filtered_balanced for _ in pipe]
kf_st = StratifiedKFold(n_splits = 10)
LR = Validate_Pipeline (pipe, Xi, yi, kf_st, title, label)

<h1> TFIDF, Count and Hashing Vectorizers + LSA + Logistic Regression</h1> 

Using Grid Search to find the pipeline parameters that produce the best precision.

In [0]:
# Tf-idf + TruncatedSVD pipeline whose number of components is left to the grid search
pipe_tfidf_lsa = Pipeline (steps = [
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("dimens", TruncatedSVD(n_iter=7, random_state=42)),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LogisticRegression()),
])
# 3 normalisations x 4 numbers of components = 12 candidates
param_grid = {
    'vectorizer__norm': [None, 'l1', 'l2'],
    'dimens__n_components': [110, 300, 500, 700],
}

gs_tfidf = Validate_GridSearchCV(pipe_tfidf_lsa, param_grid, X_filtered_balanced, y_filtered_balanced)

In [0]:
# Report the grid-search results for the tf-idf + LSA pipeline.
# NOTE(review): this cell reads best_params_, best_score_, best_index_ and
# cv_results_, i.e. it expects gs_tfidf to be a fitted GridSearchCV object.
# Validate_GridSearchCV returns g.best_estimator_ instead -- confirm which
# object gs_tfidf holds before re-running.

# Parameters of the estimator that produced the best performance
print('Parameters of the estimator that produced the best performance\n \t', gs_tfidf.best_params_)
# Precision was considered the most important parameter as we want a system that avoids misclassifying ham as spam 
print('Precision was considered the most important parameter as we want a system that avoids misclassifying ham as spam \n Best precision score:\t', gs_tfidf.best_score_) 
gs_tfidf_results = gs_tfidf.cv_results_
gs_tfidf_time = ['mean_fit_time', 'mean_score_time']
# test parameters are more important, so the train scores are left out
gs_tfidf_train = []
gs_tfidf_test = ['mean_test_accuracy', 'mean_test_precision', 'mean_test_recall', 'mean_test_f1']
gs_tfidf_rank = ['rank_test_accuracy', 'rank_test_precision', 'rank_test_recall', 'rank_test_f1']
gs_tfidf_all = gs_tfidf_time + gs_tfidf_train + gs_tfidf_test + gs_tfidf_rank
# Row of the best candidate, taken from the search itself; the previous
# hard-coded row 2 only matched one particular earlier run.
gs_tfidf_best = gs_tfidf.best_index_
print('\n Other performance measurements for the best estimator:')
for parameter in gs_tfidf_all: print ('\t %s \t' % parameter, gs_tfidf_results[parameter][gs_tfidf_best])

In [0]:
# Count vectorizer + TruncatedSVD pipeline; the number of components and the
# Logistic Regression solver are left to the grid search
pipe_count_lsa = Pipeline (steps = [
    ("vectorizer", CountVectorizer(stop_words = 'english', dtype = float)),
    ("dimens", TruncatedSVD( n_iter=7, random_state=42)),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LogisticRegression()),
])
# 4 numbers of components x 2 solvers = 8 candidates
param_grid = {
    'dimens__n_components': [110, 300, 500, 700],
    'estimator__solver': ['newton-cg', 'liblinear'],
}

# using gridsearchCV to find the best parameters for lsa given the dictionary and the pipeline above 
best_count_lsa = Validate_GridSearchCV(pipe_count_lsa, param_grid,X_filtered_balanced, y_filtered_balanced )


In [0]:
# Report the grid-search results for the count + LSA pipeline.
# NOTE(review): gs_count is not assigned in any visible cell (the search above
# is stored in best_count_lsa), and this cell expects a fitted GridSearchCV
# object while Validate_GridSearchCV returns g.best_estimator_ -- confirm
# where gs_count should come from before re-running.

# Parameters of the estimator that produced the best performance
print('Parameters of the estimator that produced the best performance\n \t', gs_count.best_params_)
# Precision was considered the most important parameter as we want a system that avoids misclassifying ham as spam 
print('Precision was considered the most important parameter as we want a system that avoids misclassifying ham as spam \n Best precision score:\t', gs_count.best_score_) 
gs_count_results = gs_count.cv_results_
gs_count_time = ['mean_fit_time', 'mean_score_time']
# test parameters are more important, so the train scores are left out
gs_count_train = []
gs_count_test = ['mean_test_accuracy', 'mean_test_precision', 'mean_test_recall', 'mean_test_f1']
gs_count_rank = ['rank_test_accuracy', 'rank_test_precision', 'rank_test_recall', 'rank_test_f1']
gs_count_all = gs_count_time + gs_count_train + gs_count_test + gs_count_rank
# Row of the best candidate, taken from the search itself; the previous
# hard-coded row 4 only matched one particular earlier run.
gs_count_best = gs_count.best_index_
print('\n Other performance measurements for the best estimator:')
for parameter in gs_count_all: print ('\t %s \t' % parameter, gs_count_results[parameter][gs_count_best])

<h2> TFIDF + Standard Scaler + Logistic Regression , Linear SVC, RF, MultinomialNB, BernoulliNB, kNN </h2>

We can compare the performance of different classifiers such as Logistic Regression, Linear Support Vector Classification, Random Forest, Multinomial Naive Bayes, Bernoulli Naive Bayes and k Nearest Neighbors. 

In [0]:
# One single-step pipeline per classifier to compare; each holds only the
# estimator, without any vectorizer or scaler
pipe_log = Pipeline (steps = [("estimator", LogisticRegression())])
pipe_svc = Pipeline (steps = [("estimator", LinearSVC(penalty='l2', dual=False, tol=1e-3))])
pipe_rf = Pipeline (steps = [("estimator", RandomForestClassifier(n_estimators=100))])
pipe_mnb = Pipeline (steps = [("estimator", MultinomialNB(alpha=.01))])
pipe_bnb = Pipeline (steps = [("estimator", BernoulliNB(alpha=.01))])
pipe_knn = Pipeline (steps = [("estimator", KNeighborsClassifier(n_neighbors=10))])


In [0]:
print("\n Stratified k-Fold cross-validation with Tf-idf + Standard Scaler + Logistic Regression, Linear SVC, RF, MultinomialNB, BernoulliNB, kNN")
kf_st = StratifiedKFold(n_splits = 10)
transformers = Pipeline ([
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("scaler", StandardScaler(with_mean=False)),
])

estimators = [pipe_log, pipe_svc, pipe_rf, pipe_mnb, pipe_bnb, pipe_knn ]
# The vectorizer and the scaler are placed inside every cross-validated
# pipeline, so they are fitted on the training folds only. Transforming the
# whole dataset once before cross-validation let the idf weights and the
# scaling see the test folds. Fit and score times now include the
# transformers, as they do for the other pipelines of this notebook.
pipe = [
    Pipeline ([
        ("vectorizer", TfidfVectorizer(stop_words = 'english')),
        ("scaler", StandardScaler(with_mean=False)),
        ("estimator", estimator),
    ])
    for estimator in estimators
]
label = ['LR', 'SVC', 'RF', 'MNB', 'BNB', 'kNN']
title = 'Tf-idf + Standard Scaler +  LR, SVC, RF, MNB, BNB, kNN'
# every pipeline receives the raw filtered and balanced e-mails
Xi = [X_filtered_balanced] * len(pipe)
yi = [y_filtered_balanced] * len(pipe)
LR = Validate_Pipeline (pipe, Xi, yi, kf_st, title, label)

In [0]:
# Grid search for LinearSVC: tf-idf normalisation and regularisation parameter C
pipe_SVC = Pipeline (steps = [
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", LinearSVC()),
])
# 3 normalisations x 3 values of C = 9 candidates
param_grid = {
    'vectorizer__norm': [None, 'l1', 'l2'],
    'estimator__C': [1, 10, 100],
}
best_linearSVC = Validate_GridSearchCV(pipe_SVC, param_grid ,X_filtered_balanced, y_filtered_balanced)

In [0]:
# Grid search for Random Forest: tf-idf normalisation, number of trees and
# minimum number of samples needed to split a node
pipe_RF = Pipeline (steps = [
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", RandomForestClassifier()),
])
# 3 x 2 x 2 = 12 candidates
param_grid = {
    'vectorizer__norm': [None, 'l1', 'l2'],
    'estimator__n_estimators': [10, 50],
    'estimator__min_samples_split': [2, 7],
}
best_RF = Validate_GridSearchCV(pipe_RF, param_grid , X_filtered_balanced, y_filtered_balanced)


In [0]:
# Grid search for k Nearest Neighbors: tf-idf normalisation and number of neighbours
pipe_knn = Pipeline (steps = [
    ("vectorizer", TfidfVectorizer(stop_words = 'english')),
    ("scaler", StandardScaler(with_mean=False)),
    ("estimator", KNeighborsClassifier()),
])
# 3 normalisations x 3 numbers of neighbours = 9 candidates
param_grid = {
    'vectorizer__norm': [None, 'l1', 'l2'],
    'estimator__n_neighbors': [2, 10, 100],
}
best_knn = Validate_GridSearchCV(pipe_knn, param_grid,  X_filtered_balanced, y_filtered_balanced)

<h1> Conclusion </h1>
The pipeline that provided the best performance was the one implementing Tfidf vectorizer + LSA + Standard Scaler + Logistic Regression (with {'dimens__n_components': 500, 'vectorizer__norm': 'l1'}, producing a precision score of 99.5%). The performance achieved by SVC was very close to the Logistic Regression, with an improved fit time. Because of the amount of processing involved in LSA, the algorithm was slow to fit and score compared to the others. GridSearchCV is a great function to use in order to find the best parameters for a given pipeline, as setting the parameters correctly does make a great difference.

P.S. I wrote the function Validate_GridSearchCV so I wouldn't have to repeat a lot of code lines; however, it takes a long time to process some of the pipelines, so some of the results shown were generated by the code I had before. With time and patience you can re-run the cells and check the results yourself - some of the cells took more than 20 minutes. 