Email Spam Classification using Naive Bayes
   - Objective: Classify emails as spam or not spam using a Naive Bayes classifier.
   - Algorithm: Naive Bayes
   - Steps:
     - Use a labeled dataset of emails.
     - Perform text preprocessing (tokenization, stemming, removing stop words).
     - Train a Naive Bayes model.
     - Measure accuracy, precision, recall, and F1-score.
     - Integrate with a Flask or FastAPI API to classify emails via a REST endpoint.


In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [3]:
df = pd.read_csv('email_spam_classification.csv')

In [4]:
df

Unnamed: 0,email_text,label
0,Let’s schedule a call for next week.,not spam
1,"Congratulations, you have won a free iPhone!",spam
2,You’ve been selected for a free cruise trip!,spam
3,Here are the notes from our last meeting.,not spam
4,Get rich quick with this one simple trick!,spam
...,...,...
9995,You’ve been selected for a free cruise trip!,spam
9996,Let’s schedule a call for next week.,not spam
9997,"Dear user, your account has been compromised, ...",spam
9998,Win a $1000 gift card now!,spam


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   email_text  10000 non-null  object
 1   label       10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [6]:
df.info

<bound method DataFrame.info of                                              email_text     label
0                  Let’s schedule a call for next week.  not spam
1          Congratulations, you have won a free iPhone!      spam
2          You’ve been selected for a free cruise trip!      spam
3             Here are the notes from our last meeting.  not spam
4            Get rich quick with this one simple trick!      spam
...                                                 ...       ...
9995       You’ve been selected for a free cruise trip!      spam
9996               Let’s schedule a call for next week.  not spam
9997  Dear user, your account has been compromised, ...      spam
9998                         Win a $1000 gift card now!      spam
9999               Please review the attached document.  not spam

[10000 rows x 2 columns]>

In [7]:
df.describe

<bound method NDFrame.describe of                                              email_text     label
0                  Let’s schedule a call for next week.  not spam
1          Congratulations, you have won a free iPhone!      spam
2          You’ve been selected for a free cruise trip!      spam
3             Here are the notes from our last meeting.  not spam
4            Get rich quick with this one simple trick!      spam
...                                                 ...       ...
9995       You’ve been selected for a free cruise trip!      spam
9996               Let’s schedule a call for next week.  not spam
9997  Dear user, your account has been compromised, ...      spam
9998                         Win a $1000 gift card now!      spam
9999               Please review the attached document.  not spam

[10000 rows x 2 columns]>

In [8]:
df.groupby('label').describe()

Unnamed: 0_level_0,email_text,email_text,email_text,email_text
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
not spam,4904,8,Don't forget the team meeting tomorrow.,656
spam,5096,8,You’ve been selected for a free cruise trip!,673


In [9]:
df1 = pd.get_dummies(df, columns=['label'], dtype='int')

In [10]:
df1

Unnamed: 0,email_text,label_not spam,label_spam
0,Let’s schedule a call for next week.,1,0
1,"Congratulations, you have won a free iPhone!",0,1
2,You’ve been selected for a free cruise trip!,0,1
3,Here are the notes from our last meeting.,1,0
4,Get rich quick with this one simple trick!,0,1
...,...,...,...
9995,You’ve been selected for a free cruise trip!,0,1
9996,Let’s schedule a call for next week.,1,0
9997,"Dear user, your account has been compromised, ...",0,1
9998,Win a $1000 gift card now!,0,1


In [39]:
X = df1['email_text']

In [41]:
X

0                    Let’s schedule a call for next week.
1            Congratulations, you have won a free iPhone!
2            You’ve been selected for a free cruise trip!
3               Here are the notes from our last meeting.
4              Get rich quick with this one simple trick!
                              ...                        
9995         You’ve been selected for a free cruise trip!
9996                 Let’s schedule a call for next week.
9997    Dear user, your account has been compromised, ...
9998                           Win a $1000 gift card now!
9999                 Please review the attached document.
Name: email_text, Length: 10000, dtype: object

In [45]:
y = df1[['label_spam']]

In [47]:
y

Unnamed: 0,label_spam
0,0
1,1
2,1
3,0
4,1
...,...
9995,1
9996,0
9997,1
9998,1


In [49]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.25)

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:10]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,

In [None]:
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

df = pd.read_csv('email_spam_classification.csv')
df1 = pd.get_dummies(df, columns=['label'], dtype='int')
X = df1['email_text']
y = df1.drop[['label_spam']]
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.25)
from sklearn.feature_extraction.text import CountVectorizer
v = CountVectorizer()
X_train_count = v.fit_transform(X_train)
X_train_count.toarray()[:3]
"""

In [55]:
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Read dataset
df = pd.read_csv('email_spam_classification.csv')

# One-hot encoding for the 'label' column
df1 = pd.get_dummies(df, columns=['label'], dtype='int')

# Define X and y properly
X = df1['email_text']  # Use only the text column for X
y = df1[['label_spam']]  # Use one of the one-hot encoded columns for y (label_spam)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

# Vectorize the text data
v = CountVectorizer()
X_train_count = v.fit_transform(X_train)

# Show the first 3 vectors as an array
print(X_train_count.toarray()[:3])

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count,y_train)

emails = [
    "Let’s schedule a call for next week.",
    "Congratulations, you have won a free iPhone!"
]
email_cnt = v.transform(emails)
model.predict(email_cnt)

X_test_count = v.transform(X_test)
model.score(X_test_count,y_test)

import pickle
with open('Email_Span.pkl','wb') as file:
    pickle.dump(model, file)


"""

"\nimport pandas as pd\nimport numpy as np\nfrom sklearn.preprocessing import OneHotEncoder\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.feature_extraction.text import CountVectorizer\n\n# Read dataset\ndf = pd.read_csv('email_spam_classification.csv')\n\n# One-hot encoding for the 'label' column\ndf1 = pd.get_dummies(df, columns=['label'], dtype='int')\n\n# Define X and y properly\nX = df1['email_text']  # Use only the text column for X\ny = df1[['label_spam']]  # Use one of the one-hot encoded columns for y (label_spam)\n\n# Split into training and testing sets\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)\n\n# Vectorize the text data\nv = CountVectorizer()\nX_train_count = v.fit_transform(X_train)\n\n# Show the first 3 vectors as an array\nprint(X_train_count.toarray()[:3])\n"

In [57]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_count,y_train)

  y = column_or_1d(y, warn=True)


In [60]:
emails = [
    "Let’s schedule a call for next week.",
    "Congratulations, you have won a free iPhone!"
]
email_cnt = v.transform(emails)
model.predict(email_cnt)

array([0, 1])

In [62]:
X_test_count = v.transform(X_test)
model.score(X_test_count,y_test)

1.0

In [64]:
y_pred = model.predict(X_test_count)

In [66]:
y_pred

array([1, 1, 1, ..., 0, 1, 1])

In [68]:
y_pred.shape

(2500,)

In [70]:
y_train.shape

(7500, 1)

In [78]:
import pickle
with open('Email_Span.pkl','wb') as file:
    pickle.dump(model, file)


In [80]:
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(v, vec_file)