In [1]:
import pandas as pd

In [6]:
df = pd.read_csv('SMSSpamCollection')

ParserError: Error tokenizing data. C error: Expected 2 fields in line 12, saw 4


 getting a ParserError because the SMSSpamCollection dataset is tab-separated, but I'm using the default comma separator in pd.read_csv().
 fix using explicitly specify sep='\t'

In [7]:
df = pd.read_csv('SMSSpamCollection' , sep='\t')

In [8]:
df.head()

Unnamed: 0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [9]:
df.columns = ['label', 'message']

In [10]:
df.head()

Unnamed: 0,label,message
0,ham,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,U dun say so early hor... U c already then say...
3,ham,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [12]:
# Features and labels
X = df['message']
y = df['label']

 # Converting text to feature vectors 
 (like using CountVectorizer or TfidfVectorizer) is necessary when using Naive Bayes for text classification — because Naive Bayes models work with numerical inputs, not raw text.

In [13]:
# Text vectorization (BoW)
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_vectorized, y, test_size=0.25, random_state=42)

In [15]:
# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB is used for count data.

In [16]:
y_pred = model.predict(X_test)

In [17]:
y_pred

array(['ham', 'spam', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [18]:
# Evaluation
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Confusion Matrix:
 [[1187   16]
 [  10  180]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1203
        spam       0.92      0.95      0.93       190

    accuracy                           0.98      1393
   macro avg       0.96      0.97      0.96      1393
weighted avg       0.98      0.98      0.98      1393

Accuracy Score: 0.9813352476669059


In [23]:
# Sample input message
sample_msg = ["Congratulations! You have won a $1000 Walmart gift card. Click here to claim now.",
             "hey! nice to meet you"]


In [24]:
# Preprocess and vectorize the message using the same vectorizer used during training
sample_msg_vectorized = vectorizer.transform(sample_msg)

# Predict the label
prediction = model.predict(sample_msg_vectorized)

In [26]:
print("Predicted Label:", prediction[1])

Predicted Label: ham


In [48]:
sample_messages = [
    "Congratulations! You have won a free cruise to the Bahamas!",
    "Are we still on for dinner tonight?",
    "Urgent! Call this number to claim your prize now.",
    "Hey, can you send me the notes from class?",
    "Get cheap products mesho.",
    "We tried reaching you for  shipment MYSC11687. Please call delivery executive today on 08045177777",
    "Dear User Last chance to shop at price drop! ",
    "Dear User Last chance to shop at price drop! Catch 2kg Biozyme Performance Range @ Rs. 4999 today. MuscleBlaze"
]

In [49]:
sample_vectorized = vectorizer.transform(sample_messages)
predictions = model.predict(sample_vectorized)

In [50]:
for msg, label in zip(sample_messages, predictions):
    print(f"Message: '{msg}'\n Predicted Label: {label}\n")

Message: 'Congratulations! You have won a free cruise to the Bahamas!'
 Predicted Label: spam

Message: 'Are we still on for dinner tonight?'
 Predicted Label: ham

Message: 'Urgent! Call this number to claim your prize now.'
 Predicted Label: spam

Message: 'Hey, can you send me the notes from class?'
 Predicted Label: ham

Message: 'Get cheap products mesho.'
 Predicted Label: ham

Message: 'We tried reaching you for  shipment MYSC11687. Please call delivery executive today on 08045177777'
 Predicted Label: ham

Message: 'Dear User Last chance to shop at price drop! '
 Predicted Label: spam

Message: 'Dear User Last chance to shop at price drop! Catch 2kg Biozyme Performance Range @ Rs. 4999 today. MuscleBlaze'
 Predicted Label: ham

