1. Import Libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

2. Load and Preprocess Data

# 1. Load dataset
raw_mail_data = pd.read_csv('mail_l7_dataset.csv')

# 2. Handle missing values: every null cell becomes an empty string
mail_data = raw_mail_data.where(pd.notnull(raw_mail_data), '')

# 3. Label Encoding: spam = 0, ham = 1
# Build the encoded column from a mapping instead of writing integers into the
# text column in place: in-place assignment of ints only works while the column
# is object dtype and raises TypeError under pandas' dedicated string dtype.
label_codes = {'spam': 0, 'ham': 1}
encoded_category = mail_data['Category'].map(label_codes)

# Labels outside the mapping (including the '' used for missing values) come
# back as NaN; fail here with a readable message rather than in the int cast.
unmapped = encoded_category.isna()
if unmapped.any():
    unexpected = sorted(mail_data.loc[unmapped, 'Category'].astype(str).unique())
    raise ValueError(f"Unexpected Category labels: {unexpected}")
mail_data['Category'] = encoded_category.astype('int')

# 4. Features (X) and Target (y)
X = mail_data['Message']
y = mail_data['Category'].astype('int')

# 5. Split Data (80% Train, 20% Test); the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

3. Text Feature Extraction (TF-IDF)

# Vectorizer configuration: lowercase the text, drop English stop words, and
# keep every term that occurs in at least one document (min_df=1).
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

# Fit on the training messages only, then reuse the fitted vectorizer
# to transform the held-out test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

4. Model Training & Evaluation

def evaluate_model(name, model):
    """Fit ``model`` on the training features and print its test-set metrics.

    Reads the module-level ``X_train_features``, ``y_train``,
    ``X_test_features`` and ``y_test``.

    Args:
        name: Label printed in the report header.
        model: Estimator exposing ``fit`` and ``predict``; its ``fit`` method
            is called with the training data.

    Returns:
        The same ``model`` object that was passed in, after ``fit`` was called.
    """
    model.fit(X_train_features, y_train)
    prediction = model.predict(X_test_features)

    # Scorers are called with their default arguments, in report order.
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )

    print(f"\n{name} Performance:")
    for label, scorer in scorers:
        # Left-align each label in a 10-character field so the colons line up.
        print(f"  {label:<10}: {scorer(y_test, prediction):.3f}")
    print(f"  Confusion Matrix:\n{confusion_matrix(y_test, prediction)}")
    return model

# Train all three models
lr_model = evaluate_model("Logistic Regression", LogisticRegression())
# Seed the forest so its randomised tree construction, and therefore the
# reported metrics, are repeatable from run to run like the seeded split.
rf_model = evaluate_model("Random Forest", RandomForestClassifier(n_estimators=100, random_state=3))
nb_model = evaluate_model("Naive Bayes", MultinomialNB())

5. Sanity Checks (Predictions)


# Hand-written messages used to eyeball each fitted model's prediction.
test_messages = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket"
]

print("\n--- Sanity Check Results ---")
for msg in test_messages:
    # The vectorizer expects an iterable of documents, hence the one-item list.
    input_data = feature_extraction.transform([msg])

    # Query the models in report order; label 1 is shown as "Ham",
    # any other predicted label as "Spam".
    res_lr, res_rf, res_nb = [
        "Ham" if clf.predict(input_data)[0] == 1 else "Spam"
        for clf in (lr_model, rf_model, nb_model)
    ]

    print(f"Message: {msg}")
    print(f"LR: {res_lr} | RF: {res_rf} | NB: {res_nb}\n")
    