In [54]:
# Loading Libraries
import pandas as pd
import numpy as np

In [2]:
# Load Dataset
df = pd.read_csv('emails.csv')
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


In [3]:
df.shape

(5728, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5728 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5728 non-null   object
 1   spam    5728 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 89.6+ KB


In [5]:
df.spam.value_counts()

0    4360
1    1368
Name: spam, dtype: int64

In [6]:
# Checking Duplicated rows
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5723    False
5724    False
5725    False
5726    False
5727    False
Length: 5728, dtype: bool

In [7]:
# Duplicated summery
df.duplicated().value_counts()

False    5695
True       33
dtype: int64

In [8]:
(~df.duplicated(subset=['text','spam'])).sum()

5695

In [9]:
# Total Duplicated rows
df.duplicated(subset=['text','spam']).sum()

33

In [10]:
# Duplicated Rows
df[df.duplicated()]

Unnamed: 0,text,spam
2155,Subject: research allocations to egm hi becky...,0
2260,Subject: departure of grant masson the resear...,0
2412,"Subject: re : schedule and more . . jinbaek ,...",0
2473,"Subject: day off tuesday stinson , i would l...",0
2763,"Subject: re : your mail zhendong , dr . kami...",0
3123,"Subject: re : grades pam , the students rese...",0
3152,Subject: tiger evals - attachment tiger hosts...,0
3248,"Subject: re : i am zhendong zhendong , thank...",0
3249,Subject: hello from enron dear dr . mcmullen ...,0
3387,"Subject: term paper dr . kaminski , attached...",0


In [11]:
# Dropping Duplicated rows
df.drop_duplicates(keep=False,inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5662 entries, 0 to 5727
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5662 non-null   object
 1   spam    5662 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 132.7+ KB


In [13]:
df.spam.value_counts()

0    4294
1    1368
Name: spam, dtype: int64

In [14]:
df.spam.unique()

array([1, 0], dtype=int64)

## Text Preprocessing

In [15]:
# Loading Text libraries
import nltk
import string
from nltk.corpus import stopwords, words
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [16]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [18]:
# Bangali stopwords
stopwords.words('bengali')

['অতএব',
 'অথচ',
 'অথবা',
 'অনুযায়ী',
 'অনেক',
 'অনেকে',
 'অনেকেই',
 'অন্তত',
 'অন্য',
 'অবধি',
 'অবশ্য',
 'অর্থাত',
 'আই',
 'আগামী',
 'আগে',
 'আগেই',
 'আছে',
 'আজ',
 'আদ্যভাগে',
 'আপনার',
 'আপনি',
 'আবার',
 'আমরা',
 'আমাকে',
 'আমাদের',
 'আমার',
 'আমি',
 'আর',
 'আরও',
 'ই',
 'ইত্যাদি',
 'ইহা',
 'উচিত',
 'উত্তর',
 'উনি',
 'উপর',
 'উপরে',
 'এ',
 'এঁদের',
 'এঁরা',
 'এই',
 'একই',
 'একটি',
 'একবার',
 'একে',
 'এক্',
 'এখন',
 'এখনও',
 'এখানে',
 'এখানেই',
 'এটা',
 'এটাই',
 'এটি',
 'এত',
 'এতটাই',
 'এতে',
 'এদের',
 'এব',
 'এবং',
 'এবার',
 'এমন',
 'এমনকী',
 'এমনি',
 'এর',
 'এরা',
 'এল',
 'এস',
 'এসে',
 'ঐ',
 'ও',
 'ওঁদের',
 'ওঁর',
 'ওঁরা',
 'ওই',
 'ওকে',
 'ওখানে',
 'ওদের',
 'ওর',
 'ওরা',
 'কখনও',
 'কত',
 'কবে',
 'কমনে',
 'কয়েক',
 'কয়েকটি',
 'করছে',
 'করছেন',
 'করতে',
 'করবে',
 'করবেন',
 'করলে',
 'করলেন',
 'করা',
 'করাই',
 'করায়',
 'করার',
 'করি',
 'করিতে',
 'করিয়া',
 'করিয়ে',
 'করে',
 'করেই',
 'করেছিলেন',
 'করেছে',
 'করেছেন',
 'করেন',
 'কাউকে',
 'কাছ',
 'কাছে',
 'কাজ',
 'কাজে',
 'কারও',
 '

In [19]:
# English Stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
# Total Bangali Stopwords
len(stopwords.words('bengali'))

398

In [21]:
def clean_text_data(text):
#     removing punctuation
    rem_punc = [char for char in text if char not in string.punctuation]
    rem_punc = ''.join(rem_punc)
    
#     remove all valueless stop words
    words = [word for word in rem_punc.split() if word.lower not in stopwords.words('english')]
    return words

In [22]:
df.text.head()

0    Subject: naturally irresistible your corporate...
1    Subject: the stock trading gunslinger  fanny i...
2    Subject: unbelievable new homes made easy  im ...
3    Subject: 4 color printing special  request add...
4    Subject: do not have money , get software cds ...
Name: text, dtype: object

In [23]:
# Applying function to all rows
df.text.head().apply(clean_text_data)

0    [Subject, naturally, irresistible, your, corpo...
1    [Subject, the, stock, trading, gunslinger, fan...
2    [Subject, unbelievable, new, homes, made, easy...
3    [Subject, 4, color, printing, special, request...
4    [Subject, do, not, have, money, get, software,...
Name: text, dtype: object

## Feature Extraction

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
 tfidf = TfidfVectorizer(analyzer=clean_text_data)

In [90]:
all_texts = tfidf.fit_transform(df.text)
all_texts

<5662x37356 sparse matrix of type '<class 'numpy.float64'>'
	with 732429 stored elements in Compressed Sparse Row format>

In [91]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(all_texts,df.spam,train_size=.7,random_state=43)

In [38]:
xtrain.shape

(3963, 37356)

In [39]:
ytrain.shape

(3963,)

### Multinomial Naive Bayes

In [40]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

In [41]:
mnb.fit(xtrain,ytrain)

MultinomialNB()

In [42]:
mnb.score(xtest,ytest)

0.8658034137728076

### Bernoulli Naive Bayes

In [43]:
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

In [44]:
bnb.fit(xtrain,ytrain)

BernoulliNB()

In [45]:
bnb.score(xtest,ytest)

0.9846968805179518

### Gaussian Naive Bayes

In [81]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [92]:
all_texts2 = all_texts.toarray()
all_texts2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [93]:
xtrain,xtest,ytrain,ytest = train_test_split(all_texts2,df.spam,train_size=.7,random_state=43)

In [94]:
gnb.fit(xtrain,ytrain)

GaussianNB()

In [95]:
gnb.score(xtest,ytest)

0.951736315479694