#### Importing Libraries

In [131]:
import pandas as pd
import numpy as np
import seaborn as sns

#### Import Dataset

In [132]:
dataset= pd.read_csv('emails.csv')
dataset.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


#### Dataset describe

In [133]:
dataset.shape

(5726, 2)

In [134]:
dataset.isnull().sum()

text    0
spam    0
dtype: int64

In [135]:
dataset.describe()

Unnamed: 0,spam
count,5726.0
mean,0.238736
std,0.426348
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [136]:
dataset['spam'].value_counts()

0    4359
1    1367
Name: spam, dtype: int64

In [137]:
dataset.duplicated().sum()

33

In [138]:
dataset.drop_duplicates(inplace=True)

In [139]:
dataset.shape

(5693, 2)

#### Separate the dataset

In [140]:
x= dataset['text'].values


In [141]:
y=dataset['spam'].values
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

#### Splitting the dataset into training set and test set

In [142]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(x,y,test_size=0.20)

#### Data Preprocessing

In [143]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [144]:
X_train= cv.fit_transform(Xtrain)
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [145]:
X_test= cv.transform(Xtest)
X_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

## ML Algorithm

#### Fitting Naive Bayes using MultinomialNB Classifier

In [146]:
from sklearn.naive_bayes import MultinomialNB
MNBclassifier = MultinomialNB()
MNBclassifier.fit(X_train,Ytrain)

#### Predict the result

In [149]:
Ypred= MNBclassifier.predict(X_test)
Ypred

array([1, 0, 0, ..., 0, 0, 0], dtype=int64)

#### Confusion Matrix

In [150]:
from sklearn.metrics import confusion_matrix
cnf=confusion_matrix(Ytest,Ypred)
cnf

array([[845,   4],
       [  4, 286]], dtype=int64)

#### Accuracy Score

In [154]:
from sklearn.metrics import accuracy_score
print("Accuracy Score: ",accuracy_score(Ytest,Ypred)*100,"%")

Accuracy Score:  99.29762949956101 %


#### Check Spam or Not from user input

In [183]:
mail=['Subject: photoshop , windows , office . cheap . main trending abasements darer prudently fortuitous undergone lighthearted charm orinoco taster railroad affluent pornographic cuvierirvin parkhouse blameworthy chlorophyll robed diagrammatic fogarty clears bayda inconveniencing managing represented smartness hashish academies shareholders unload badness danielson pure caffein spaniard chargeable levin','Subject: enron methanol ; meter # : 988291 this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop  s daily volume { presently zero } to reflect daily activity you can obtain from gas control .this change is needed asap for economics purposes .']

In [184]:
cv_mail=cv.transform(mail)

In [185]:
MNBclassifier.predict(cv_mail)

array([1, 0], dtype=int64)