In [3]:
import joblib
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [4]:
# Read the raw mail dataset and replace every missing cell with an empty string.
df = pd.read_csv("mail_data.csv")
data = df.where(df.notna(), "")

# Encode the text labels in place as integers: spam -> 0, ham -> 1.
category_codes = {'spam': 0, 'ham': 1}
for category_name, category_code in category_codes.items():
    data.loc[data['Category'] == category_name, 'Category'] = category_code

# Features are the message text; targets are the encoded categories.
X, Y = data['Message'], data['Category']

In [5]:
# Two-stage text classifier: TF-IDF feature extraction feeding a logistic regression.
tfidf_step = ('tfidf', TfidfVectorizer(min_df=1, stop_words='english', lowercase=True))
classifier_step = ('clf', LogisticRegression())
pipeline = Pipeline(steps=[tfidf_step, classifier_step])

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)
# Cast both label series to integer dtype.
Y_train, Y_test = (labels.astype('int') for labels in (Y_train, Y_test))

In [7]:
# Train on the training split, then predict labels for the held-out split.
# fit() returns the pipeline itself, so predict() can be chained onto it.
predictions = pipeline.fit(X_train, Y_train).predict(X_test)

# Fraction of held-out messages whose predicted label matches the true label.
accuracy = accuracy_score(Y_test, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9659192825112107


In [8]:
new_email = "Congratulations! You've won a free trip to the Bahamas!"

# predict() takes a collection of documents, so the single message is wrapped in a list.
prediction = pipeline.predict([new_email])

# Show the raw label of the only prediction returned.
print("Prediction:", prediction[0])

# Label 0 is reported as spam; any other label is reported as ham.
print(
    "The email is predicted as spam."
    if prediction[0] == 0
    else "The email is predicted as ham."
)

Prediction: 1
The email is predicted as ham.


In [9]:
new_email = "This is the 2nd time weve tried to contact u. 2 claim is easy, just call 0983 9339 7645!"

# predict() takes a collection of documents, so the single message is wrapped in a list.
prediction = pipeline.predict([new_email])

# Show the raw label of the only prediction returned.
print("Prediction:", prediction[0])

# Label 0 is reported as spam; any other label is reported as ham.
print(
    "The email is predicted as spam."
    if prediction[0] == 0
    else "The email is predicted as ham."
)

Prediction: 0
The email is predicted as spam.


In [10]:
# Persist the fitted pipeline (TF-IDF vectorizer + classifier) to disk.
# No install step is needed: joblib is a required dependency of scikit-learn,
# which this notebook already imports, and it is imported in the top import cell.
# NOTE: joblib files are pickles; only load 'spam_classifier.pkl' back from a
# trusted location, because unpickling can execute arbitrary code.
joblib.dump(pipeline, 'spam_classifier.pkl')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


['spam_classifier.pkl']

In [11]:
new_email = "We r trying to reach u for 2 long. Call 9383 2383 9292 for help. Thx u"

# predict() takes a collection of documents, so the single message is wrapped in a list.
prediction = pipeline.predict([new_email])

# Show the raw label of the only prediction returned.
print("Prediction:", prediction[0])

# Label 0 is reported as spam; any other label is reported as ham.
print(
    "The email is predicted as spam."
    if prediction[0] == 0
    else "The email is predicted as ham."
)

Prediction: 1
The email is predicted as ham.
