# Email Classifier using Naive Bayes ML Algorithm in Python.

## Step - 1 : import necessary libraries

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

## Step - 2 : importing the csv file (input file)

In [2]:
df = pd.read_csv('spam.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Step - 3 : Converting String to Numerical Data for ML Y-variable.

In [5]:
df['spam'] = df['Category'].apply([lambda x: 1 if x=='spam' else 0])
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


## Step - 4 : splitting dataset into train and test data.

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.Message,df.spam,test_size=0.25)

## Step - 5 : Converting email Message into binary vector for ML modeling.

In [11]:
v = CountVectorizer()
X_train_count = v.fit_transform(X_train)
X_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

## Step - 6 : Fitting ML model.

In [14]:
model = MultinomialNB()
model.fit(X_train_count,y_train)

MultinomialNB()

## Step - 7 : Transforming test Email messages into vector for ML.

In [16]:
X_test_count = v.transform(X_test)

## Step - 8 : Checking accuracy of our ML Model.

In [17]:
model.score(X_test_count,y_test)

0.9870782483847811

## Step - 9 : sample Email for predicting our model output.

In [21]:
email = [
    'Hey meet, can we get together to watch UEFA tomorrow?',
    'Upto 20% discount on parking,exclusive offer for you. Dont miss this reward '
]

In [22]:
emails_count = v.transform(email)

In [23]:
model.predict(emails_count)

array([0, 1], dtype=int64)

### With the accuracy of 98.7% our ML Model predict that, the first mail was ham and second one is spam, which is correct.

- - - - - - 

# We can follow yet another approach by using pipeline from step - 5.

## Step - 5 : importing pipeline library

In [25]:
from sklearn.pipeline import Pipeline

## Step - 6 : Creating Pipeline  

In [26]:
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('nb',MultinomialNB())
])

## Step - 7 : Using pipeline for model fitting and find accuracy.

### By using Pipeline, we can omitt manually converting Email Message into Binary Vector, then modelling the ML Model.

In [27]:
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.9870782483847811

## Step - 8 : Predicting model  with our sample Email.

In [39]:
clf.predict(email)

array([0, 1], dtype=int64)

### With the accuracy of 98.7% our ML Model predict that, the first mail was ham and second one is spam, which is correct.