# Email Spam Classifier

## Importing the libraries

In [52]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [67]:
dataset = pd.read_csv('spam.csv')

## Analysing the dataset

In [68]:
dataset

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [69]:
dataset.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


In [70]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [71]:
dataset.shape

(5572, 2)

## Checking for missing values

In [72]:
dataset.isnull().sum()

Category    0
Message     0
dtype: int64

In [73]:
dataset['Spam']=dataset['Category'].apply(lambda x:1 if x=='spam' else 0)
dataset

Unnamed: 0,Category,Message,Spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


## Cleaning the text

## Cleaning the stopwords

In [74]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 5572):
    review = re.sub('[^a-zA-z]',' ',dataset['Message'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
    review = ' '.join(review)

    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mdehteshamansari00/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Removing https tags

In [75]:
import re
processed_data=[]
for i in range(5572):
    corpus[i] = re.sub(r"http\S+", "",corpus[i])
    processed_data.append(corpus[i])

## Removing HTML tags

In [76]:
from bs4 import BeautifulSoup
data=[]
for i in range(5572): 
    soup = BeautifulSoup(processed_data[i], 'lxml')
    text = soup.get_text()
    data.append(text)
    



## Decontraction of text

In [77]:
def decontracted(phrase):
    phrase = re.sub(r"won't","will not",phrase)
    phrase = re.sub(r"can\'t","can not",phrase)
    
    phrase = re.sub(r"n\'t"," not",phrase)
    phrase = re.sub(r"\'re"," are",phrase)
    phrase = re.sub(r"\'s"," is",phrase)
    phrase = re.sub(r"\'d"," would",phrase)
    phrase = re.sub(r"\'ll"," will",phrase)
    phrase = re.sub(r"n\'t"," not",phrase)
    phrase = re.sub(r"\'ve"," have",phrase)
    phrase = re.sub(r"\'m"," am",phrase)
    return phrase

In [78]:
for i in range(5572):
    data[i] = decontracted(data[i])

## Creating Bag of Words model

In [81]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 5572)
x = cv.fit_transform(data).toarray()
y = dataset.iloc[:,2].values

## Splitting the dataset into the Training set and Test set

In [84]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 1)

## Naive Bayes Classifier

In [86]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train,y_train)

GaussianNB()

In [87]:
y_pred = classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[0 0]
 [0 0]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [88]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[834 134]
 [ 16 131]]


0.8654708520179372

## Checking for the sample email inputs

In [93]:
new_review = "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, ¬£1.50 to rcv"
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
if new_y_pred==1:
    print(new_y_pred,"Spam Email")
elif new_y_pred==0:
    print(new_y_pred,"Not a Spam Email")


[1] Spam Email
