# Spam Ham classifier

In this notebook, we train a classifier that labels each email as spam or ham (non-spam).

Download the dataset from the following URL and save it as `spam.csv` in the same directory as this notebook:

https://www.kaggle.com/uciml/sms-spam-collection-dataset#spam.csv

In [1]:
import pandas as pd

# Load the raw dataset; the CSV is decoded as Latin-1.
csv_path = "spam.csv"
data = pd.read_csv(csv_path, encoding="latin1")

# Show which columns the file provides.
data.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [2]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the label column (v1: ham/spam) and the message text column (v2);
# the remaining "Unnamed" columns are dropped.
wanted_columns = ['v1', 'v2']
data = data[wanted_columns]
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Give the two remaining columns descriptive names: v1 -> Target, v2 -> Email.
new_names = {"v1": "Target", "v2": "Email"}
data = data.rename(columns=new_names)
data.head()

Unnamed: 0,Target,Email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Import the libraries

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
from nltk.stem import SnowballStemmer, PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import os
from textblob import TextBlob, Word
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
import sklearn.feature_extraction.text as text

# Preprocessing

In [6]:
# Text normalisation, applied to every message in a single pass:
#   1. lower-case each whitespace-separated token
#   2. drop English stop words
#   3. stem with the Porter stemmer
#   4. lemmatize with TextBlob's Word
# NOTE: this cell overwrites data['Email'], so running it twice re-processes
# text that has already been stemmed.
stop = set(stopwords.words('english'))   # set: constant-time lookup per token instead of a list scan
st = PorterStemmer()


def clean_message(message):
    """Return `message` lower-cased, stop-word filtered, stemmed and lemmatized.

    Tokens come from ``str.split()``, so punctuation stays attached to the
    neighbouring word. The surviving tokens are re-joined with single spaces.
    """
    lowered = (token.lower() for token in message.split())
    kept = (token for token in lowered if token not in stop)
    stemmed = (st.stem(token) for token in kept)
    return " ".join(Word(token).lemmatize() for token in stemmed)


data['Email'] = data['Email'].apply(clean_message)
data.head()

Unnamed: 0,Target,Email
0,ham,"go jurong point, crazy.. avail bugi n great wo..."
1,ham,ok lar... joke wif u oni...
2,spam,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,u dun say earli hor... u c alreadi say...
4,ham,"nah think goe usf, live around though"


# Splitting the data

In [7]:
# Hold out a test set. A fixed random_state makes the split, and therefore every
# accuracy reported further down, reproducible under Restart & Run All.
# Stratifying on the label keeps the ham/spam proportion the same in both parts.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    data['Email'],
    data['Target'],
    random_state=42,
    stratify=data['Target'],
)

# Feature generation

In [9]:
# Encode the string labels as integers. The encoder is fitted on the training
# labels only and then re-used (transform, not fit_transform) for the test
# labels, so both splits are guaranteed to share one label -> integer mapping.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

# TF-IDF features over word tokens, capped at 5000 features. The vectorizer is
# fitted on the training text only and then applied to both splits.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(train_x)
xtrain_tfidf = tfidf_vect.transform(train_x)
xtest_tfidf = tfidf_vect.transform(test_x)

# Model training

In [10]:
def model_classifier(classifier, x_train, y_train, x_test, y_test):
    """Fit `classifier` on the training data and return its accuracy on the test data.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    x_train, y_train
        Training features and labels, passed to ``classifier.fit``.
    x_test, y_test
        Held-out features passed to ``classifier.predict`` and the true labels
        that the predictions are scored against.

    Returns
    -------
    The value returned by ``metrics.accuracy_score(y_test, predictions)``.
    """
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    # accuracy_score takes (y_true, y_pred): the ground truth goes first.
    return metrics.accuracy_score(y_test, predictions)

# Naive Bayes Classifier

In [12]:
# Train a Multinomial Naive Bayes model on the TF-IDF features and report
# its accuracy on the held-out test set.
nb = naive_bayes.MultinomialNB(alpha=0.2)
nb_accuracy = model_classifier(
    classifier=nb,
    x_train=xtrain_tfidf,
    y_train=train_y,
    x_test=xtest_tfidf,
    y_test=test_y,
)
nb_message = "Accuracy of the spam classifier with Naive Bayes algorithm is: {:.4f}".format(nb_accuracy)
print(nb_message)

Accuracy of the spam classifier with Naive Bayes algorithm is: 0.9856


In [14]:
# Train a Logistic Regression model on the same TF-IDF features and report
# its accuracy on the held-out test set.
log_reg = linear_model.LogisticRegression()
lr_accuracy = model_classifier(
    classifier=log_reg,
    x_train=xtrain_tfidf,
    y_train=train_y,
    x_test=xtest_tfidf,
    y_test=test_y,
)
lr_message = "Accuracy of the spam classifier with Logistic Regression algorithm is: {:.4f}".format(lr_accuracy)
print(lr_message)

Accuracy of the spam classifier with Logistic Regression algorithm is: 0.9770
