# Spam Mail Classification using Machine Learning

In [13]:
import numpy as np
import pandas as pd


import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 

In [15]:
data = pd.read_csv('spam.csv')
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [17]:
data.tail()

Unnamed: 0,Category,Message
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [21]:
print("No. of Rows:", data.shape[0])
print("No. of Columns:", data.shape[1])

No. of Rows: 5572
No. of Columns: 2


In [23]:
data.loc[data['Category'] == 'ham', 'Category',] = 1
data.loc[data['Category'] == 'spam', 'Category',] = 0
data.head()

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [112]:
input_col = 'Message'
output_col = 'Category'

X = data[input_col]
Y = data[output_col]

print("X shape", X.shape)
print("Y shape", Y.shape)

X shape (5572,)
Y shape (5572,)


In [43]:
vectorizer = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X = vectorizer.fit_transform(data['Message'])
Y = data['Category'].astype('int')

In [45]:
X_train, X_test, Y_train, Y_test =  train_test_split(X, Y, test_size=0.2 ,random_state=42)

In [49]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(4457, 8440)
(1115, 8440)
(4457,)
(1115,)


In [109]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
logreg_pred = logreg.predict(X_train)
training_logreg_acc = accuracy_score(logreg_pred, Y_train)

print("Accuracy: {:.2f}%".format(training_logreg_acc*100))

Accuracy: 96.79%


In [103]:
logreg_pred = logreg.predict(X_test)
logreg_acc = accuracy_score(logreg_pred, Y_test)

print("Accuracy: {:.2f}%".format(logreg_acc*100))

Accuracy: 95.70%


In [91]:
print("Confusion Matrix:\n",confusion_matrix(Y_test, logreg_pred))

Confusion Matrix:
 [[101  48]
 [  0 966]]


In [95]:
print("Classification Report:\n", classification_report(Y_test, logreg_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.68      0.81       149
           1       0.95      1.00      0.98       966

    accuracy                           0.96      1115
   macro avg       0.98      0.84      0.89      1115
weighted avg       0.96      0.96      0.95      1115



In [122]:
import joblib
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(logreg, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']

In [124]:
import gradio as gr

vectorizer = joblib.load('tfidf_vectorizer.pkl')
model = joblib.load('logistic_regression_model.pkl')

def predict_message(message):
    transformed_message = vectorizer.transform([message])
    prediction = model.predict(transformed_message)
    return "Spam" if prediction[0] == 0 else "Ham"

interface = gr.Interface(fn=predict_message, 
                          inputs=gr.Textbox(label="Enter your message"), 
                          outputs="text")
interface.launch()


* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


