In [1]:
!pip install pandas numpy scikit-learn nltk joblib



In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import joblib

In [20]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
data = pd.read_csv("spam.csv")
data = data.drop_duplicates()
data.columns = ['label', 'text']
data['label'] = data['label'].map({1: 1, 0: 0})  # Ensure numeric labels
print(f"Data shape: {data.shape}")
print(f"Label distribution:\n{data['label'].value_counts()}")
print(f"Missing values:\n{data.isnull().sum()}")

Data shape: (83448, 2)
Label distribution:
label
1    43910
0    39538
Name: count, dtype: int64
Missing values:
label    0
text     0
dtype: int64


In [None]:
# Add neutral short messages to improve model (NOT spam examples)
neutral_messages = [
    "Mounika is a girl",
    "Ravi is absent today",
    "Meeting postponed",
    "Call me later",
    "Where are you",
    "What is your name",
    "See you tomorrow",
    "Thanks for your help",
    "Can you help me",
    "I am busy now",
    "Talk to you soon",
    "Have a good day",
    "What time is it",
    "I am at home",
    "Going to office",
    "Lunch break time",
    "Document ready",
    "Project completed",
    "Email sent",
    "Order placed"
]

neutral_df = pd.DataFrame({
    "label": 0,
    "text": neutral_messages
})

data = pd.concat([data, neutral_df], ignore_index=True)

# Shuffle the data to avoid bias
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Updated Data shape: {data.shape}")
print(f"Label distribution after adding neutral messages:\n{data['label'].value_counts()}")

Updated Data shape: (83499, 2)
Label distribution after adding neutral messages:
label
1    43910
0    39589
Name: count, dtype: int64


In [13]:
def preprocess_text(text, stop_words):
    text = text.lower()
    # Keep letters, numbers, and spaces; remove other special characters
    text = ''.join([c if c.isalnum() or c.isspace() else '' for c in text])
    text = ' '.join(text.split())
    # Remove stopwords
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return text

In [14]:
# Create custom stopwords
stop_words_nltk = set(stopwords.words('english'))
exclude_words = {'hi', 'i', 'you', 'please'}
stop_words = stop_words_nltk - exclude_words

# Convert label to binary: 1 = spam, 0 = ham
data['label'] = data['label'].astype(int)

# Apply preprocessing
data['text'] = data['text'].apply(lambda x: preprocess_text(x, stop_words))
print(f"Preprocessing complete. Sample texts:\n{data['text'].head()}")

Preprocessing complete. Sample texts:
0    ounce feather bowl hummingbird opec moment ala...
1    wulvob get medircations online qnb ikud viagra...
2    computer connection cnn com wednesday escapenu...
3    university degree obtain prosperous future mon...
4    thanks answers guys i know i checked rsync man...
Name: text, dtype: object


In [15]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['text'])
y = data['label'].values  # Use the already-converted numeric labels

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [17]:
model = MultinomialNB(alpha=1.0)
model.fit(X_train, y_train)

In [18]:
y_pred = model.predict(X_test)

In [19]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9649700598802395

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      7863
           1       0.97      0.96      0.97      8837

    accuracy                           0.96     16700
   macro avg       0.96      0.97      0.96     16700
weighted avg       0.97      0.96      0.96     16700



In [20]:
joblib.dump(model, "spam_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [30]:
test_messages = [
    "Hi Rahul, I'll be late to the meeting today because of traffic.",
    "Congratulations! You have won ₹10,00,000. Click the link now to claim your prize!"
]

In [76]:
test_messages_clean = [preprocess_text(msg, stop_words) for msg in test_messages]
X_test_new = tfidf.transform(test_messages_clean)
y_pred_new = model.predict(X_test_new)

In [77]:
for msg, label in zip(test_messages, y_pred_new):
    prediction = "Spam" if label == 1 else "Not Spam"
    print(f"Message: {msg}\nPrediction: {prediction}\n")

Message: Hi Rahul, I'll be late to the meeting today because of traffic.
Prediction: Spam

Message: Congratulations! You have won ₹10,00,000. Click the link now to claim your prize!
Prediction: Spam



In [18]:
data['text'] = data['text'].str.replace(r'[^a-z\s]', '', regex=True)
print(data['text'].head())


0    ounce feather bowl hummingbird opec moment ala...
1    wulvob get your medircations online qnb ikud v...
2     computer connection from cnn com wednesday es...
3    university degree obtain a prosperous future m...
4    thanks for all your answers guys i know i shou...
Name: text, dtype: object


In [19]:
data['text'] = data['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
print(data['text'].head())


0    ounce feather bowl hummingbird opec moment ala...
1    wulvob get your medircations online qnb ikud v...
2    computer connection from cnn com wednesday esc...
3    university degree obtain a prosperous future m...
4    thanks for all your answers guys i know i shou...
Name: text, dtype: object


In [21]:
pip install nltk

Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2026.1.15-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting tqdm (from nltk)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached nltk-3.9.2-py3-none-any.whl (1.5 MB)
Downloading regex-2026.1.15-cp312-cp312-win_amd64.whl (277 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, regex, nltk

   ------------- -------------------------- 1/3 [regex]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   -------------------------- ------------- 2/3 [nltk]
   ----------

In [22]:
import nltk
from nltk.corpus import stopwords


In [23]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kvmou\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [None]:
stop_words = set(stopwords.words('english')) - {'hi','i','you','please'}

print(list(stop_words)[:10])


NameError: name 'stopwords' is not defined

In [25]:
data['text'] = data['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

print(data['text'].head())


0    ounce feather bowl hummingbird opec moment ala...
1    wulvob get medircations online qnb ikud viagra...
2    computer connection cnn com wednesday escapenu...
3    university degree obtain prosperous future mon...
4    thanks answers guys know checked rsync manual ...
Name: text, dtype: object


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [27]:
tfidf = TfidfVectorizer(max_features=5000)


In [29]:
X = tfidf.fit_transform(data['text'])
y = data['label']


In [30]:
print(X.shape)
print(y.shape)


(83448, 5000)
(83448,)


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['text'])
y = data['label']

print(X.shape)
print(y.shape)


(81551, 5000)
(81551,)


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(65240, 5000)
(16311, 5000)
(65240,)
(16311,)


In [39]:
from sklearn.naive_bayes import MultinomialNB


In [40]:
model = MultinomialNB()
model.fit(X_train, y_train)


In [41]:
y_pred = model.predict(X_test)


In [42]:
from sklearn.metrics import accuracy_score

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9620501502053829


In [43]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.95      0.97      0.96      7692
           1       0.97      0.96      0.96      8619

    accuracy                           0.96     16311
   macro avg       0.96      0.96      0.96     16311
weighted avg       0.96      0.96      0.96     16311



In [44]:
import joblib


In [45]:
joblib.dump(model, "spam_model.pkl")


['spam_model.pkl']

In [46]:
joblib.dump(tfidf, "tfidf_vectorizer.pkl")


['tfidf_vectorizer.pkl']

In [47]:
import os
print(os.listdir())


['.github', '.venv', 'app.py', 'model.ipynb', 'README.md', 'requirements.txt', 'Sample.png', 'spam.csv', 'spam_model.pkl', 'templates', 'tfidf_vectorizer.pkl']


In [48]:
pip install flask


Note: you may need to restart the kernel to use updated packages.
