# Reading data

In [27]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# Suppress specific warnings (e.g., DeprecationWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)


In [1]:
import pandas as pd

train_data = pd.read_csv('/content/train_data.csv')
test_data = pd.read_csv('/content/test_data.csv')

In [2]:
train_data.dropna(inplace=True)
test_data.dropna(inplace=True)

# Vectorizing using  Term Frequency-Inverse Document Frequency

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert text to Bag of Words (BoW) representation
vectorizer = CountVectorizer(analyzer=lambda x: x.split())
train_bow = vectorizer.fit_transform(train_data['processed msg'])

In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

# Transform BoW into TF-IDF representation
tfidf_converter = TfidfTransformer()
train_tfidf = tfidf_converter.fit_transform(train_bow)


In [5]:
# Transform validation and test sets using fitted vectorizer and transformer

test_bow = vectorizer.transform(test_data['processed msg'])
test_tfidf = tfidf_converter.transform(test_bow)


In [6]:
y_train = train_data.Label.map({'ham': 0, 'spam': 1}).values
y_test = test_data['Label'].map({'ham': 0, 'spam': 1}).values

# Finetuning for best Random Forest Model

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV

# Define the model
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Define hyperparameters to tune
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Use F1-score as the metric
f1_scorer = make_scorer(f1_score)

# Perform GridSearchCV
grid_search = GridSearchCV(rf, param_grid, scoring=f1_scorer, cv=5, n_jobs=-1, verbose=1)
grid_search.fit(train_tfidf, y_train)

# Get the best model
best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}


In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to print metrics
def print_metrics(y_true, y_pred, dataset_name):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"{dataset_name} Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print("-" * 30)

In [32]:
y_test_pred = best_rf.predict(test_tfidf)
print_metrics(y_test, y_test_pred, "Test Set")

Test Set Metrics:
Accuracy: 0.9847
Precision: 0.9848
Recall: 0.9847
F1 Score: 0.9844
------------------------------


# Finetuning for best Support Vector Classifier

In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Compute class weights to handle imbalance
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# Define the SVC model
svc = SVC(kernel='linear', class_weight=class_weight_dict)

# Define hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'kernel': ['linear', 'rbf'],  # Kernel type
    'gamma': ['scale', 'auto']  # Kernel coefficient for 'rbf'
}

# Perform GridSearchCV with F1-score optimization
grid_search = GridSearchCV(svc, param_grid, scoring='f1', cv=5, n_jobs=-1, verbose=1)
grid_search.fit(train_tfidf, y_train)

# Best model
best_svc = grid_search.best_estimator_
print(f'Best Parameters: {grid_search.best_params_}')

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}


In [14]:
y_test_pred = best_svc.predict(test_tfidf)
print_metrics(y_test, y_test_pred, "Test Data")

Test Data Metrics:
Accuracy: 0.9820
Precision: 0.9818
Recall: 0.9820
F1 Score: 0.9818
------------------------------


# Fine tuning for best Logistic regression

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score

# Define parameter grid for tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 20],  # Regularization strength
    'penalty': ['l1', 'l2', 'elasticnet'],  # Type of regularization
    'solver' : ['saga']
}

# Initialize Logistic Regression model
log_reg = LogisticRegression()

grid_search = GridSearchCV(log_reg, param_grid, scoring='f1', cv=5, n_jobs=-1)
grid_search.fit(train_tfidf, y_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 20, 'penalty': 'l2', 'solver': 'saga'}


In [31]:
y_test_pred = best_model.predict(test_tfidf)
print_metrics(y_test, y_test_pred, "Test Data")

Test Data Metrics:
Accuracy: 0.9829
Precision: 0.9831
Recall: 0.9829
F1 Score: 0.9825
------------------------------


| Model             | Accuracy | Precision | Recall | F1 Score |
|-------------------|----------|-----------|--------|----------|
| **RandomForest**   | 0.9847   | 0.9848    | 0.9847 | 0.9844   |
| **SVC**            | 0.9820   | 0.9818    | 0.9820 | 0.9818   |
| **Logistic Regression** | 0.9829   | 0.9831    | 0.9829 | 0.9825   |

### Conclusion:
Based on the metrics, **RandomForest** outperforms both **SVC** and **Logistic Regression** in terms of accuracy, precision, recall, and F1 score, making it the best-performing model for this particular dataset. The slight differences between the models indicate that RandomForest might be more robust for this problem.
