1️ DATASET COLLECTION

In [None]:
from google.colab import files
uploaded = files.upload()

Saving SMSSpamCollection to SMSSpamCollection


In [None]:
import pandas as pd

df = pd.read_csv("SMSSpamCo inllection", sep='\t', names=["label", "message"])

df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [21]:
import pandas as pd

df = pd.read_csv("SMSSpamCollection",
                 sep='\t',
                 names=["label", "message"])

df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


2️ DATA CLEANING

In [22]:
# Check null values
print(df.isnull().sum())

# Remove duplicates
df.drop_duplicates(inplace=True)

print("Dataset Shape:", df.shape)

label      0
message    0
dtype: int64
Dataset Shape: (5169, 2)


3 TEXT PREPROCESSING

In [23]:
import nltk
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

def preprocess(text):
    text = text.lower()
    text = "".join([char for char in text if char not in string.punctuation])
    words = text.split()
    words = [word for word in words if word not in stopwords.words('english')]
    return " ".join(words)

df['cleaned_message'] = df['message'].apply(preprocess)

df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,label,message,cleaned_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though


4️ TF-IDF VECTORIZATION

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_message'])

print("TF-IDF Shape:", X.shape)


TF-IDF Shape: (5169, 9437)


5️ MODEL TRAINING

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = MultinomialNB()
model.fit(X_train, y_train)

print("Model Training Completed")


Model Training Completed


6️ MODEL TESTING

In [26]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9632495164410058

Confusion Matrix:
 [[894   0]
 [ 38 102]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       894
        spam       1.00      0.73      0.84       140

    accuracy                           0.96      1034
   macro avg       0.98      0.86      0.91      1034
weighted avg       0.96      0.96      0.96      1034



7️ PREDICTION OUTPUT

In [27]:
sample = ["Congratulations! You have won a free iPhone"]
sample_vector = vectorizer.transform(sample)
prediction = model.predict(sample_vector)

print("Message:", sample[0])
print("Prediction:", prediction[0])


Message: Congratulations! You have won a free iPhone
Prediction: ham


In [28]:
def predict_spam(message):
    message_clean = preprocess(message)
    vector = vectorizer.transform([message_clean])
    result = model.predict(vector)
    return result[0]

# Test UI
msg = input("Enter message: ")
print("Prediction:", predict_spam(msg))

Enter message: Hi, Karan this side
Prediction: ham
