In [38]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import warnings
warnings.filterwarnings('ignore')

### Reading the csv files created in preparation part

In [39]:
train_df = pd.read_csv('./sms+spam+collection/train.csv')
valid_df = pd.read_csv('./sms+spam+collection/valid.csv')
test_df = pd.read_csv('./sms+spam+collection/test.csv')

In [40]:
# There are some empty sms filed, hence removing them
train_df.dropna(inplace=True)

### Processing the text

In [41]:
bow_transformer = CountVectorizer(analyzer=lambda x: x.split()).fit(train_df['processed sms'])
sms_bow = bow_transformer.transform(train_df['processed sms'])
sms_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 35911 stored elements and shape (4171, 6952)>

In [42]:
tfidf_transformer = TfidfTransformer().fit(sms_bow)

### Creating TF-IDF embeddings for train, valid and test datasets

In [43]:
y_train = train_df.label.map({'ham': 0, 'spam': 1}).values

train_sms_tfidf = tfidf_transformer.transform(sms_bow)
train_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 35911 stored elements and shape (4171, 6952)>

In [44]:
y_true_val = valid_df.label.map({'ham': 0, 'spam': 1}).values

valid_sms_tfidf = tfidf_transformer.transform(bow_transformer.transform(valid_df['processed sms']))
valid_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5134 stored elements and shape (697, 6952)>

In [45]:
y_true_test = test_df.label.map({'ham': 0, 'spam': 1}).values

test_sms_tfidf = tfidf_transformer.transform(bow_transformer.transform(test_df['processed sms']))
test_sms_tfidf

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5363 stored elements and shape (697, 6952)>

In [46]:
def fit_model(model, x_train, y_train):
    """Fit the model on training data."""
    model.fit(x_train, y_train)
    return model


def score_model(model, data):
    """Score the model on given data."""
    return model.predict(data)


def evaluate_predictions(y_true, y_pred):
    """Evaluate model predictions using accuracy and classification report."""
    return classification_report(y_true, y_pred)


def validate_model(model, x_train, y_train, x_val, y_val, param_grid=None):
    """Validate the model by training, scoring, evaluating, and tuning hyperparameters."""
    # Fit model on training data
    model = fit_model(model, x_train, y_train)
    
    # Score on train and validation data
    y_train_pred = score_model(model, x_train)
    y_val_pred = score_model(model, x_val)
    
    # Evaluate on train and validation data
    train_report = evaluate_predictions(y_train, y_train_pred)
    val_report = evaluate_predictions(y_val, y_val_pred)
    
    print("Train Report:\n", train_report)
    print("Validation Report:\n", val_report)
    
    # Hyperparameter tuning if param_grid is provided
    if param_grid:
        # Since dataset is imbalanced, I am using `recall` to decide the parameter of the best model
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1')
        grid_search.fit(x_train, y_train)
        model = grid_search.best_estimator_
        print("Best Parameters:", grid_search.best_params_)
    
    return model

### Fine-tunning Support Vector Classifier

In [None]:
svc = SVC()
parameters = {
    'C': [0.01, 0.1, 1, 10, 20],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2,3,4,5],
    'gamma': ['scale', 'auto']
}

best_svc = validate_model(
    model=svc, 
    x_train=train_sms_tfidf, 
    y_train=y_train, 
    x_val=valid_sms_tfidf, 
    y_val=y_true_val, 
    param_grid=parameters
)

Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3611
           1       1.00      0.99      0.99       560

    accuracy                           1.00      4171
   macro avg       1.00      0.99      1.00      4171
weighted avg       1.00      1.00      1.00      4171

Validation Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       603
           1       0.99      0.82      0.90        94

    accuracy                           0.97       697
   macro avg       0.98      0.91      0.94       697
weighted avg       0.97      0.97      0.97       697

Best Parameters: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}


Evaluating on test dataset

In [48]:
y_pred_test = score_model(best_svc, test_sms_tfidf)
print(evaluate_predictions(y_true_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       604
           1       0.95      0.95      0.95        93

    accuracy                           0.99       697
   macro avg       0.97      0.97      0.97       697
weighted avg       0.99      0.99      0.99       697



### Fine-tunning Logistic Regression model

In [49]:
lr = LogisticRegression()
parameters = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    'C': [0.001, 0.1, 1.0, 10, 20],
}

best_lr = validate_model(
    model=lr, 
    x_train=train_sms_tfidf, 
    y_train=y_train, 
    x_val=valid_sms_tfidf, 
    y_val=y_true_val, 
    param_grid=parameters
)

Train Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      3611
           1       0.99      0.78      0.87       560

    accuracy                           0.97      4171
   macro avg       0.98      0.89      0.93      4171
weighted avg       0.97      0.97      0.97      4171

Validation Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       603
           1       0.97      0.71      0.82        94

    accuracy                           0.96       697
   macro avg       0.96      0.85      0.90       697
weighted avg       0.96      0.96      0.96       697

Best Parameters: {'C': 0.001, 'penalty': None}


Evaluating on test dataset

In [50]:
y_pred_test = score_model(best_lr, test_sms_tfidf)
print(evaluate_predictions(y_true_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       604
           1       0.93      0.90      0.92        93

    accuracy                           0.98       697
   macro avg       0.96      0.95      0.95       697
weighted avg       0.98      0.98      0.98       697



### Fine-tunning Decision Tree Classifier

In [51]:
tree = DecisionTreeClassifier(random_state=0)
parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_features': ['sqrt', 'log2', None, 1, 2]
}

best_tree = validate_model(
    model=tree, 
    x_train=train_sms_tfidf, 
    y_train=y_train, 
    x_val=valid_sms_tfidf, 
    y_val=y_true_val, 
    param_grid=parameters
)

Train Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      3611
           1       1.00      1.00      1.00       560

    accuracy                           1.00      4171
   macro avg       1.00      1.00      1.00      4171
weighted avg       1.00      1.00      1.00      4171

Validation Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.97       603
           1       0.87      0.79      0.83        94

    accuracy                           0.96       697
   macro avg       0.92      0.88      0.90       697
weighted avg       0.95      0.96      0.95       697

Best Parameters: {'criterion': 'entropy', 'max_features': None, 'splitter': 'random'}


Evaluating on test dataset

In [52]:
y_pred_test = score_model(best_tree, test_sms_tfidf)
print(evaluate_predictions(y_true_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97       604
           1       0.84      0.77      0.80        93

    accuracy                           0.95       697
   macro avg       0.90      0.88      0.89       697
weighted avg       0.95      0.95      0.95       697



Comparing F1-Scores, Support Vector Classifier gives the best results and achieves $95\%$ F1 score for `spam` class.