In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import string
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\manth\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\manth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data = pd.read_csv('spam.csv')

print(data.head())

  Category                                            Message
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...


In [3]:
data['Category'] = data['Category'].map({'ham': 0, 'spam': 1})

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

data['Message'] = data['Message'].apply(preprocess_text)

print(data.head())

   Category                                            Message
0         0  go jurong point crazy available bugis n great ...
1         0                            ok lar joking wif u oni
2         1  free entry 2 wkly comp win fa cup final tkts 2...
3         0                u dun say early hor u c already say
4         0           nah dont think go usf life around though


In [4]:
vectorizer = TfidfVectorizer(max_features=5000)

X = vectorizer.fit_transform(data['Message']).toarray()
y = data['Category']

print("Features shape:", X.shape)

Features shape: (5572, 5000)


In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Training set size:", X_train.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 3900
Testing set size: 1672


In [6]:
model = MultinomialNB()
model.fit(X_train, y_train)

print("Model training completed.")

Model training completed.


In [7]:
y_pred = model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99      1448
           1       1.00      0.81      0.89       224

    accuracy                           0.97      1672
   macro avg       0.99      0.90      0.94      1672
weighted avg       0.98      0.97      0.97      1672

Accuracy: 0.9742822966507177


In [8]:
import pickle

with open('spam_classifier.pkl', 'wb') as file:
    pickle.dump(model, file)

with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

print("Model and vectorizer saved successfully.")

Model and vectorizer saved successfully.


In [9]:
with open('spam_classifier.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

new_messages = ["Free entry in a contest to win $1000!", 
                "Hey, are you coming to the meeting today?"]

new_messages_preprocessed = [preprocess_text(msg) for msg in new_messages]
new_messages_tfidf = loaded_vectorizer.transform(new_messages_preprocessed).toarray()

predictions = loaded_model.predict(new_messages_tfidf)

for msg, pred in zip(new_messages, predictions):
    print(f"Message: {msg}\nPrediction: {'Spam' if pred == 1 else 'Ham'}\n")

Message: Free entry in a contest to win $1000!
Prediction: Spam

Message: Hey, are you coming to the meeting today?
Prediction: Ham

