In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
import string
import zipfile
import requests
from io import BytesIO
import joblib

In [2]:
# Download the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
response = requests.get(url)
zip_file = zipfile.ZipFile(BytesIO(response.content))

In [3]:
# Read the SMSSpamCollection file
with zip_file.open('SMSSpamCollection') as file:
    df = pd.read_csv(file, sep='\t', header=None, names=['label', 'message'])

In [4]:
# Map labels to numerical values: 'ham' -> 0, 'spam' -> 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [5]:
# Download stopwords from NLTK
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mayur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Function to clean text: lowercase, remove punctuation, remove stop words
def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [7]:
# Split the data into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
# Apply text cleaning to both training and testing data
train_df['message'] = train_df['message'].apply(clean_text)
test_df['message'] = test_df['message'].apply(clean_text)

In [9]:
# Feature extraction using TF-IDF (fit on training data, transform on both)
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['message']).toarray()
X_test = vectorizer.transform(test_df['message']).toarray()
y_train = train_df['label'].values
y_test = test_df['label'].values

In [10]:
# Train the Complement Naive Bayes model
clf = ComplementNB()
clf.fit(X_train, y_train)

ComplementNB()

In [11]:
# Predict on the test set
y_pred = clf.predict(X_test)

In [12]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9686
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.97      0.98       966
           1       0.85      0.93      0.89       149

    accuracy                           0.97      1115
   macro avg       0.92      0.95      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Confusion Matrix:
[[941  25]
 [ 10 139]]


In [13]:
# Save the trained model and vectorizer using joblib
joblib.dump(clf, 'spam_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("Model and vectorizer saved as 'spam_model.pkl' and 'tfidf_vectorizer.pkl'")

Model and vectorizer saved as 'spam_model.pkl' and 'tfidf_vectorizer.pkl'
