In [18]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import warnings
warnings.filterwarnings('ignore')

### Reading the csv files created in preparation part

In [19]:
train_df = pd.read_csv('./sms+spam+collection/train.csv')
valid_df = pd.read_csv('./sms+spam+collection/valid.csv')
test_df = pd.read_csv('./sms+spam+collection/test.csv')

In [20]:
# There are some empty sms filed, hence removing them
train_df.dropna(inplace=True)

### Processing the text

In [21]:
bow_transformer = CountVectorizer(analyzer=lambda x: x.split()).fit(train_df['processed sms'])
sms_bow = bow_transformer.transform(train_df['processed sms'])
sms_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 35911 stored elements and shape (4171, 6952)>

In [22]:
tfidf_transformer = TfidfTransformer().fit(sms_bow)

### Creating TF-IDF embeddings for train, valid and test datasets

In [23]:
train_sms_tfidf = tfidf_transformer.transform(sms_bow)
train_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 35911 stored elements and shape (4171, 6952)>

In [24]:
valid_sms_tfidf = tfidf_transformer.transform(bow_transformer.transform(valid_df['processed sms']))
valid_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5134 stored elements and shape (697, 6952)>

In [25]:
test_sms_tfidf = tfidf_transformer.transform(bow_transformer.transform(test_df['processed sms']))
test_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5363 stored elements and shape (697, 6952)>

### Fine-tunning Support Vector Classifier

Since dataset is imbalanced, I am using `recall` to decide the parameter of the best model

In [9]:
parameters = {
    'C': [0.01, 0.1, 1, 10, 20],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2,3,4,5],
    'gamma': ['scale', 'auto']
}

svc = SVC()
clf = GridSearchCV(svc, param_grid=parameters, scoring='recall')

y = train_df.label.map({'ham': 0, 'spam': 1}).values
clf.fit(train_sms_tfidf, y)

In [10]:
# Best parameters after fine-tunning
clf.best_estimator_

Evaluating on test dataset

In [11]:
y_pred_test = clf.predict(test_sms_tfidf)

y_true_test = test_df.label.map({'ham': 0, 'spam': 1}).values
print(classification_report(y_true_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       604
           1       0.95      0.95      0.95        93

    accuracy                           0.99       697
   macro avg       0.97      0.97      0.97       697
weighted avg       0.99      0.99      0.99       697



### Fine-tunning Logistic Regression model

In [12]:
parameters = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.001, 0.1, 1.0, 10, 20],
}

lr = LogisticRegression()
clf = GridSearchCV(lr, param_grid=parameters, scoring='recall')

y = train_df.label.map({'ham': 0, 'spam': 1}).values
clf.fit(train_sms_tfidf, y)

In [13]:
# Best parameters after fine-tunning
clf.best_estimator_

Evaluating on test dataset

In [14]:
y_pred_test = clf.predict(test_sms_tfidf)

y_true_test = test_df.label.map({'ham': 0, 'spam': 1}).values
print(classification_report(y_true_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       604
           1       0.93      0.90      0.92        93

    accuracy                           0.98       697
   macro avg       0.96      0.95      0.95       697
weighted avg       0.98      0.98      0.98       697



### Fine-tunning Decision Tree Classifier

In [35]:
parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_features': ['sqrt', 'log2', None, 1, 2]
}

tree = DecisionTreeClassifier()
clf = GridSearchCV(tree, param_grid=parameters, scoring='recall')

y = train_df.label.map({'ham': 0, 'spam': 1}).values
clf.fit(train_sms_tfidf, y)

In [36]:
# Best parameters after fine-tunning
clf.best_estimator_

Evaluating on test dataset

In [37]:
y_pred_test = clf.predict(test_sms_tfidf)

y_true_test = test_df.label.map({'ham': 0, 'spam': 1}).values
print(classification_report(y_true_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97       604
           1       0.83      0.82      0.82        93

    accuracy                           0.95       697
   macro avg       0.90      0.90      0.90       697
weighted avg       0.95      0.95      0.95       697



Comparing F1-Scores, Support Vector Classifier gives the best results and achieves $95\%$ F1 score for `spam` class.