# Problem Definition:Goal:
### Build a machine learning model that can classify emails as spam or not spam (ham) based on their content.


In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import string
import matplotlib.pyplot as plt

#  Data Collection

In [82]:
df=pd.read_csv("enron_spam_data[1].csv")
df

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29
33713,33713,the next generation online pharmacy .,are you ready to rock on ? let the man in you ...,spam,2005-07-30
33714,33714,bloow in 5 - 10 times the time,learn how to last 5 - 10 times longer in\nbed ...,spam,2005-07-30


# Data Cleaning & Preprocessing

## here message ID ,date has no use but i not remove it from datasets

In [83]:
df.nunique()

Message ID    33716
Subject       24206
Message       29779
Spam/Ham          2
Date           1527
dtype: int64

### total sample size is 33716 but in subject and message has sme duplicates data so we need to remove it 

In [84]:
df = df.drop_duplicates()
df = df.drop_duplicates(subset='Message')
df = df.drop_duplicates(subset='Subject')
# df = df.drop_duplicates(subset=['text_len', 'sub_len'])
print("Final shape:", df.shape)


Final shape: (23575, 5)


### final data is (23575, 5) after removing the duplicate

In [85]:
df.nunique()

Message ID    23575
Subject       23574
Message       23574
Spam/Ham          2
Date           1513
dtype: int64

In [86]:
df

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date
0,0,christmas tree farm pictures,,ham,1999-12-10
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14
...,...,...,...,...,...
33706,33706,= ? iso - 8859 - 2 ? q ? my _ proposition ? =,from : dr . dan . nkanga\nzenith international...,spam,2005-07-29
33707,33707,make big bucks in the medical field bait - exc...,having problems seeing the graphics ? please g...,spam,2005-07-29
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29


In [87]:
df['Spam/Ham'].value_counts()

Spam/Ham
ham     12656
spam    10919
Name: count, dtype: int64

In [88]:
df.shape

(23575, 5)

In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23575 entries, 0 to 33714
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Message ID  23575 non-null  int64 
 1   Subject     23574 non-null  object
 2   Message     23574 non-null  object
 3   Spam/Ham    23575 non-null  object
 4   Date        23575 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.1+ MB


In [90]:
df.isna().sum()

Message ID    0
Subject       1
Message       1
Spam/Ham      0
Date          0
dtype: int64

In [91]:
df.dropna(inplace=True)


In [92]:
df.shape

(23573, 5)

In [93]:
df['Target'] = df['Spam/Ham'].map({'spam': 0, 'ham': 1})


In [94]:
df

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Target
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13,1
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14,1
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14,1
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14,1
5,5,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham,1999-12-14,1
...,...,...,...,...,...,...
33706,33706,= ? iso - 8859 - 2 ? q ? my _ proposition ? =,from : dr . dan . nkanga\nzenith international...,spam,2005-07-29,0
33707,33707,make big bucks in the medical field bait - exc...,having problems seeing the graphics ? please g...,spam,2005-07-29,0
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29,0
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29,0


In [95]:
df['text_len']=df.Message.str.len()

In [96]:
df.describe()

Unnamed: 0,Message ID,Target,text_len
count,23573.0,23573.0,23573.0
mean,15826.450134,0.536843,1333.553048
std,9651.328996,0.498651,2793.689463
min,1.0,0.0,1.0
25%,7390.0,0.0,317.0
50%,15488.0,1.0,677.0
75%,23337.0,1.0,1505.0
max,33714.0,1.0,178837.0


In [97]:
df[df['text_len']==178837.0000000]

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Target,text_len
13901,13901,enron mentions - 11 / 09 / 01 - 11 / 10 / 01,"rival to buy enron , top energy trader , after...",ham,2001-11-12,1,178837


In [98]:
df[df['text_len']==2]


Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Target,text_len
7136,7136,elena chilkina,hi,ham,2000-10-04,1,2
9751,9751,. jif,.\n,spam,2002-07-25,0,2
17258,17258,so . . . you were looking for a one night stan...,dc,ham,2001-11-26,1,2
18649,18649,re : your free pay - per view,",\n",spam,2004-04-10,0,2
33649,33649,1,1\n,spam,2005-07-23,0,2


In [99]:
df['sub_len']=df.Subject.str.len()


In [100]:
df.describe()

Unnamed: 0,Message ID,Target,text_len,sub_len
count,23573.0,23573.0,23573.0,23573.0
mean,15826.450134,0.536843,1333.553048,35.520723
std,9651.328996,0.498651,2793.689463,26.417425
min,1.0,0.0,1.0,1.0
25%,7390.0,0.0,317.0,21.0
50%,15488.0,1.0,677.0,31.0
75%,23337.0,1.0,1505.0,46.0
max,33714.0,1.0,178837.0,1355.0


In [101]:
df[df['sub_len']==2]

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Target,text_len,sub_len
380,380,43,- - - - - original message - - - - -\nfrom : t...,ham,2000-02-29,1,4264,2
3725,3725,ok,new offshore pharmacy - not a single medical q...,spam,2004-01-11,0,1038,2
4315,4315,hi,"apology sorry company bent , summer moved air ...",spam,2004-09-25,0,1735,2
13177,13177,ny,"hi laura ,\ni hate to keep bothering you about...",ham,2001-10-05,1,474,2
14343,14343,ip,consensus now is that all gas and power system...,ham,2001-12-11,1,514,2
15621,15621,fw,yo !\nyou will benefit much if purchase today ...,spam,2004-12-28,0,371,2
16336,16336,yo,first only doctor approved pen . is pills and\...,spam,2005-06-12,0,207,2
19497,19497,ge,"arcane , was already breaking , ritz , the ent...",spam,2004-08-04,0,132,2
21615,21615,gr,³ ? · s ? { ¦ µl\n³ ? · s ± µ ¦ ? 2\nµ @\n,spam,2005-03-31,0,39,2
25028,25028,tl,get 200 $ for free\nno more\n,spam,2004-11-10,0,27,2


# first do for Subject columns 

### here remove non-alphabetic characters

In [102]:
import re

# Define a function to clean subject text
def clean_subject(text):
    if pd.isnull(text):
        return ''
    # Remove non-alphabetic characters, keep only letters and spaces
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    # Convert to lowercase and strip extra spaces
    return text.lower().strip()

# Apply to the Subject column
df['Subject_new'] = df['Subject'].apply(clean_subject)


In [103]:
df

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Target,text_len,sub_len,Subject_new
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13,1,4282,24,vastar resources inc
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14,1,38,28,calpine daily gas nomination
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14,1,1171,10,re issue
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14,1,1124,25,meter nov allocation
5,5,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham,1999-12-14,1,534,24,mcmullen gas for
...,...,...,...,...,...,...,...,...,...
33706,33706,= ? iso - 8859 - 2 ? q ? my _ proposition ? =,from : dr . dan . nkanga\nzenith international...,spam,2005-07-29,0,3363,45,iso q my proposition
33707,33707,make big bucks in the medical field bait - exc...,having problems seeing the graphics ? please g...,spam,2005-07-29,0,549,61,make big bucks in the medical field bait exc...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29,0,281,82,iso q good news c edaliss...
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29,0,803,99,all prescript medicines are on special to be...


### after remove non-alphabetic characters there are same duplicate so removing it 

In [104]:
# df = df.drop_duplicates()
df = df.drop_duplicates(subset='Subject_new')
print("Final shape:", df.shape)

Final shape: (21672, 9)


### removing stopwords in the Subject

In [105]:
import nltk


In [106]:
from nltk.corpus import stopwords
nltk.download("stopwords")
print(stopwords.words("english"),len(stopwords.words("english")))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mvara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### CountVectorizer is a tool from scikit-learn that converts text documents (like emails) into a bag-of-words representation—a format that ML models can understand.

In [107]:
cv=CountVectorizer(stop_words="english")

In [108]:
cv.fit(df['Subject_new'])

In [109]:
X=cv.transform(df['Subject_new'])

In [110]:
X.shape

(21672, 15953)

In [111]:
y=df.Target

In [112]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=22)

In [113]:
X_train

<16254x15953 sparse matrix of type '<class 'numpy.int64'>'
	with 60643 stored elements in Compressed Sparse Row format>

# Naive Bayes classifier 

In [114]:
nb=MultinomialNB()

In [115]:
nb.fit(X_train,y_train)

In [116]:
pred=nb.predict(X_test)
pred

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [117]:
y_test

22656    1
11238    1
31966    0
20625    0
22626    1
        ..
19795    0
1883     1
24088    0
5916     1
10586    0
Name: Target, Length: 5418, dtype: int64

In [118]:
accuracyScore = accuracy_score(pred,y_test)*100
accuracyScore

89.79328165374677

In [119]:
from sklearn.metrics import confusion_matrix,classification_report
confusion_matrix(pred,y_test)

array([[2178,  150],
       [ 403, 2687]], dtype=int64)

In [120]:
print(classification_report(pred,y_test))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89      2328
           1       0.95      0.87      0.91      3090

    accuracy                           0.90      5418
   macro avg       0.90      0.90      0.90      5418
weighted avg       0.90      0.90      0.90      5418



# for Message columns

### here remove non-alphabetic charactersm

In [121]:
import re

# Define a function to clean subject text
def clean_subject(text):
    if pd.isnull(text):
        return ''
    # Remove non-alphabetic characters, keep only letters and spaces
    text = re.sub(r'[^a-zA-Z ]', ' ', text)
    # Convert to lowercase and strip extra spaces
    return text.lower().strip()

# Apply to the Subject column
df['Message_new'] = df['Message'].apply(clean_subject)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Message_new'] = df['Message'].apply(clean_subject)


In [137]:
df.nunique()

Message ID     21501
Subject        21501
Message        21501
Spam/Ham           2
Date            1502
Target             2
text_len        4523
sub_len          201
Subject_new    21501
Message_new    21501
dtype: int64

In [138]:
df = df.drop_duplicates(subset='Message_new')
print("Final shape:", df.shape)

Final shape: (21501, 10)


In [139]:
df

Unnamed: 0,Message ID,Subject,Message,Spam/Ham,Date,Target,text_len,sub_len,Subject_new,Message_new
1,1,"vastar resources , inc .","gary , production from the high island larger ...",ham,1999-12-13,1,4282,24,vastar resources inc,gary production from the high island larger ...
2,2,calpine daily gas nomination,- calpine daily gas nomination 1 . doc,ham,1999-12-14,1,38,28,calpine daily gas nomination,calpine daily gas nomination doc
3,3,re : issue,fyi - see note below - already done .\nstella\...,ham,1999-12-14,1,1171,10,re issue,fyi see note below already done stella ...
4,4,meter 7268 nov allocation,fyi .\n- - - - - - - - - - - - - - - - - - - -...,ham,1999-12-14,1,1124,25,meter nov allocation,fyi ...
5,5,mcmullen gas for 11 / 99,"jackie ,\nsince the inlet to 3 river plant is ...",ham,1999-12-14,1,534,24,mcmullen gas for,jackie since the inlet to river plant is s...
...,...,...,...,...,...,...,...,...,...,...
33706,33706,= ? iso - 8859 - 2 ? q ? my _ proposition ? =,from : dr . dan . nkanga\nzenith international...,spam,2005-07-29,0,3363,45,iso q my proposition,from dr dan nkanga zenith international ...
33707,33707,make big bucks in the medical field bait - exc...,having problems seeing the graphics ? please g...,spam,2005-07-29,0,549,61,make big bucks in the medical field bait exc...,having problems seeing the graphics please g...
33711,33711,= ? iso - 8859 - 1 ? q ? good _ news _ c = eda...,"hello , welcome to gigapharm onlinne shop .\np...",spam,2005-07-29,0,281,82,iso q good news c edaliss...,hello welcome to gigapharm onlinne shop pr...
33712,33712,all prescript medicines are on special . to be...,i got it earlier than expected and it was wrap...,spam,2005-07-29,0,803,99,all prescript medicines are on special to be...,i got it earlier than expected and it was wrap...


In [140]:
y=df.Target

In [141]:
from nltk.corpus import stopwords
nltk.download("stopwords")
print(stopwords.words("english"),len(stopwords.words("english")))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mvara\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [142]:
cv.fit(df['Message_new'])

In [143]:
X1=cv.transform(df['Subject_new'])

In [144]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X1,y,test_size=0.25,random_state=22)

In [145]:
nb.fit(X_train,y_train)

In [146]:
pred2=nb.predict(X_test)
pred2

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [147]:
y_test

4713     0
26328    0
15569    0
15820    0
3647     1
        ..
25564    0
2622     1
27404    0
32039    0
15216    0
Name: Target, Length: 5376, dtype: int64

In [148]:
accuracy_score(pred2,y_test)*100

89.0625

In [149]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()

In [150]:
# rfc.fit(X_train,y_train)

In [151]:
from sklearn.linear_model import LogisticRegression
lg=LogisticRegression()

In [152]:
 lg.fit(X_train,y_train)

In [153]:
pred3=lg.predict(X_test)

In [154]:
confusion_matrix(pred3,y_test)

array([[2420,  317],
       [ 188, 2451]], dtype=int64)

In [155]:
print(classification_report(pred3,y_test))

              precision    recall  f1-score   support

           0       0.93      0.88      0.91      2737
           1       0.89      0.93      0.91      2639

    accuracy                           0.91      5376
   macro avg       0.91      0.91      0.91      5376
weighted avg       0.91      0.91      0.91      5376



 # both the colunmes has best accuracy using logistic and MultinomialNB that is 90 -91%