In [22]:
#Necessary imports
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report
import pandas as pd

In [23]:
#Defining the train, test and validation data filepaths
train_path = "C:/CMI/MSc/SEM2/AML/Assignment1/train.csv"
val_path ="C:/CMI/MSc/SEM2/AML/Assignment1/val.csv"
test_path = "C:/CMI/MSc/SEM2/AML/Assignment1/test.csv"

In [24]:
#Loading the train, test and validation data
train_data = pd.read_csv(train_path)
val_data = pd.read_csv(val_path)
test_data = pd.read_csv(test_path)

In [25]:
#Considering only the transformed text for training our model
x_train,y_train=train_data["transformed_text"], train_data["Spam"]
x_val,y_val=val_data["transformed_text"],val_data["Spam"]
x_test,y_test=test_data["transformed_text"],test_data["Spam"]

In [26]:
x_train.shape,x_val.shape,x_test.shape

((3986,), (854,), (855,))

We use CountVectorizer and TF-IDF transformer to extract features from the emails for training our model

In [27]:
Bag_of_words = CountVectorizer()
Bag_of_words.fit(x_train)

In [28]:
train_vocab = Bag_of_words.vocabulary_
x_train_bow = Bag_of_words.transform(x_train)
x_val_bow = Bag_of_words.transform(x_val)
x_test_bow = Bag_of_words.transform(x_test)

len(train_vocab), x_train_bow.shape, x_val_bow.shape, x_test_bow.shape

(31765, (3986, 31765), (854, 31765), (855, 31765))

In [29]:
tfidf_transformer = TfidfTransformer().fit(x_train_bow)

x_train_tf = tfidf_transformer.transform(x_train_bow)
x_val_tf = tfidf_transformer.transform(x_val_bow)
x_test_tf = tfidf_transformer.transform(x_test_bow)

x_train_tf.shape, x_val_tf.shape, x_test_tf.shape

((3986, 31765), (854, 31765), (855, 31765))

We choose SVM as our model on the train data and find the most suitable value of 'C' based on the model's performance on our validation data.

In [30]:
for i in [0.01, 0.1, 1, 10, 20]:
    sv = SVC(C = i)
    sv.fit(x_train_tf,y_train)
    y_pred = sv.predict(x_val_tf)
    print(i, "\n", classification_report(y_val, y_pred, zero_division=1))

0.01 
               precision    recall  f1-score   support

           0       0.76      1.00      0.86       647
           1       1.00      0.00      0.00       207

    accuracy                           0.76       854
   macro avg       0.88      0.50      0.43       854
weighted avg       0.82      0.76      0.65       854

0.1 
               precision    recall  f1-score   support

           0       0.80      1.00      0.89       647
           1       1.00      0.23      0.38       207

    accuracy                           0.81       854
   macro avg       0.90      0.62      0.63       854
weighted avg       0.85      0.81      0.77       854

1 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       647
           1       0.99      0.96      0.98       207

    accuracy                           0.99       854
   macro avg       0.99      0.98      0.98       854
weighted avg       0.99      0.99      0.99       854

10

We choose 'C'=10 as our best fit model

In [31]:
sv = SVC(C = 10)
sv.fit(x_train_tf,y_train)

Now we test our model on the test data

In [32]:
y_pred = sv.predict(x_test_tf)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       622
           1       1.00      0.96      0.98       233

    accuracy                           0.99       855
   macro avg       0.99      0.98      0.99       855
weighted avg       0.99      0.99      0.99       855




Now we test 3 benchmark models on the test data



#### Decision Tree


In [33]:
DT = DecisionTreeClassifier(criterion = "gini", random_state = 123,min_samples_leaf=6)
DT.fit(x_train_tf,y_train)
y_pred = DT.predict(x_test_tf)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       622
           1       0.94      0.87      0.90       233

    accuracy                           0.95       855
   macro avg       0.95      0.92      0.93       855
weighted avg       0.95      0.95      0.95       855



#### Naive Bayes

In [34]:
NB=MultinomialNB()
NB.fit(x_train_tf,y_train)
y_pred = NB.predict(x_test_tf)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91       622
           1       1.00      0.50      0.66       233

    accuracy                           0.86       855
   macro avg       0.92      0.75      0.79       855
weighted avg       0.88      0.86      0.85       855



#### Logisitic Regression


In [35]:
log_reg_model = LogisticRegression()
log_reg_model.fit(x_train_tf, y_train)
y_pred = log_reg_model.predict(x_test_tf)
print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       622
           1       1.00      0.88      0.94       233

    accuracy                           0.97       855
   macro avg       0.98      0.94      0.96       855
weighted avg       0.97      0.97      0.97       855



Since we have an unbalanced dataset (with nearly 70% 0s and 30% 1s) we should try to maximize both the F1 score and the Accuracy.

So, based on the Accuracy and F1 score, our SVM model has the best performance and the Logistic Regression model has the best performance out of the the 3 benchmark models.