In [13]:
# Import necessary libraries and modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


In [4]:
# Load the dataset: a tab-separated file with no header row, read from the URL below

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
# Read the raw file and name its columns in one chained expression
df = (
    pd.read_csv(url, sep='\t', header=None)
    .set_axis(['label', 'message'], axis=1)
)
df.head(100)

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
95,spam,Your free ringtone is waiting to be collected....
96,ham,Watching telugu movie..wat abt u?
97,ham,i see. When we finish we have loads of loans t...
98,ham,Hi. Wk been ok - on hols now! Yes on for a bit...


In [5]:
# Encode the labels: ham = 0, spam = 1.
# Series.map turns every value that is missing from the mapping into NaN, so
# mapping a column that is already 0/1 (i.e. running this cell a second time)
# would wipe out all labels. Only map while the column still holds raw strings.
label_codes = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].map(label_codes)

# A label outside the mapping would have become NaN; fail here rather than
# later inside the classifier.
assert df['label'].notna().all(), "label column contains values other than 'ham'/'spam'"
df.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Hold out 20% of the messages for testing. A fixed random_state makes the
# shuffle deterministic, so the split (and every metric computed from it) is
# reproducible across kernel restarts instead of changing on each run.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

In [8]:
# Feature extraction: TF-IDF weights with English stop words removed.
# The vectorizer is fitted on the training messages only; both splits are then
# projected with that same fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words='english').fit(X_train)
X_train_transformed, X_test_transformed = (
    vectorizer.transform(messages) for messages in (X_train, X_test)
)

In [9]:
# Fit a multinomial Naive Bayes model on the TF-IDF training features
classifier = MultinomialNB()
classifier.fit(X=X_train_transformed, y=y_train)

In [10]:
# Predict a label for every message in the transformed test split
predictions = classifier.predict(X=X_test_transformed)

In [11]:
# Compare the predictions against the true test labels and print the accuracy
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy", test_accuracy)


Accuracy 0.9730941704035875


In [14]:
# Per-class precision / recall / F1 for the test-set predictions.
# The result is stored under its own name: assigning it to
# `classification_report` would rebind the imported function to a string, and
# running this cell a second time would then fail with
# "'str' object is not callable".
report = classification_report(y_test, predictions)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       968
           1       0.99      0.80      0.89       147

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [16]:

# Try the fitted vectorizer and classifier on two hand-written messages
sample_messages = ["Congratulations! You've won a free ticket to the Bahamas. Text WIN to claim.",
                   "Hey, are we still meeting for lunch tomorrow?"]

# Vectorize with the already-fitted vectorizer, then classify
sample_transformed = vectorizer.transform(sample_messages)
sample_predictions = classifier.predict(sample_transformed)

# Walk messages and predictions in lockstep; a truthy prediction prints as Spam
for message, predicted in zip(sample_messages, sample_predictions):
    verdict = 'Spam' if predicted else 'Ham'
    print(f"\nMessage: {message}\nPrediction: {verdict}")



Message: Congratulations! You've won a free ticket to the Bahamas. Text WIN to claim.
Prediction: Spam

Message: Hey, are we still meeting for lunch tomorrow?
Prediction: Ham
