In [26]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score

In [16]:
# Load the spam/ham e-mail dataset from the working directory and preview the first rows.
data = pd.read_csv(filepath_or_buffer="spam_ham_dataset.csv")
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [17]:
# Remove the numeric label duplicate and the leftover 'Unnamed: 0' column.
# errors='ignore' keeps this cell re-runnable: `data` is reassigned from itself,
# so on a second execution the columns are already gone and the default
# errors='raise' would fail with a KeyError.
data = data.drop(columns=['label_num', 'Unnamed: 0'], errors='ignore')


In [18]:
# Preview the frame again after the column drop.
data.head(n=5)

Unnamed: 0,label,text
0,ham,Subject: enron methanol ; meter # : 988291\r\n...
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see..."
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar..."
3,spam,"Subject: photoshop , windows , office . cheap ..."
4,ham,Subject: re : indian springs\r\nthis deal is t...


For this exercise, three pipelines are created, each pairing a text vectorizer with one classification algorithm: Multinomial Naive Bayes, Logistic Regression, and Complement Naive Bayes. Each pipeline is first fitted on a training split and evaluated on a held-out test split. Cross-validation is then run on all three pipelines to compare their accuracies and select the best model for prediction purposes.

In [19]:
# Pipeline 1: unigram + bigram token counts fed into Multinomial Naive Bayes.
pipeline_1 = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('multinomial_naive_bayes', MultinomialNB()),
])

# Pipeline 2: TF-IDF features fed into Logistic Regression.
# The step names are the keys used to address the steps, so they are kept exactly as written.
pipeline_2 = Pipeline(steps=[
    ('tfid_vectorizeer', TfidfVectorizer()),
    ('regressor', LogisticRegression()),
])

# Pipeline 3: same count features as pipeline 1, fed into Complement Naive Bayes.
pipeline_3 = Pipeline(steps=[
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    ('complement_naive_bayes', ComplementNB()),
])

In [20]:
# Features are the raw message text; the target is the ham/spam label column.
X, target = data['text'], data['label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    target,
    test_size=0.3,
    random_state=42,
)


In [21]:
# Fit pipeline 1 on the training split and evaluate it on the held-out split.
pipeline_1.fit(X_train, y_train)
# Predict once and reuse the result in the report; the original called
# predict() a second time and left `preds` unused.
preds = pipeline_1.predict(X_test)
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1121
        spam       0.99      0.93      0.96       431

    accuracy                           0.98      1552
   macro avg       0.98      0.96      0.97      1552
weighted avg       0.98      0.98      0.98      1552



In [22]:
# Train pipeline 2 and predict on the held-out split in one chain
# (fit() returns the fitted pipeline), then print the per-class metrics.
preds = pipeline_2.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1121
        spam       0.97      0.97      0.97       431

    accuracy                           0.98      1552
   macro avg       0.98      0.98      0.98      1552
weighted avg       0.98      0.98      0.98      1552



In [23]:
# Train pipeline 3 and predict on the held-out split in one chain
# (fit() returns the fitted pipeline), then print the per-class metrics.
preds_3 = pipeline_3.fit(X_train, y_train).predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds_3))

              precision    recall  f1-score   support

         ham       0.98      0.99      0.99      1121
        spam       0.99      0.94      0.96       431

    accuracy                           0.98      1552
   macro avg       0.98      0.97      0.97      1552
weighted avg       0.98      0.98      0.98      1552



In [30]:
# Apply 5-fold cross-validation on the full dataset to select the most appropriate model.
# All three pipelines are scored, in order, before anything is printed.
val_score_1, val_score_2, val_score_3 = (
    cross_val_score(candidate, X, target, cv=5)
    for candidate in (pipeline_1, pipeline_2, pipeline_3)
)

# Report the mean score across the folds for each pipeline.
print("Score for Pipeline 1: ", val_score_1.mean(), sep="")
print("Score for Pipeline 2: ", val_score_2.mean(), sep="")
print("Score for pipeline 3: ", val_score_3.mean(), sep="")

Score for Pipeline 1: 0.9767949616423252
Score for Pipeline 2: 0.9831751371251833
Score for pipeline 3: 0.9781489268260776


In [31]:
# From the results above, Pipeline 2 (TF-IDF + Logistic Regression) has the highest mean cross-validation accuracy of the three pipelines. The combination of Term Frequency-Inverse Document Frequency features and Logistic Regression is widely used in NLP tasks.