# Spam Classifier using Python NLP 💻

### Importing libraries

In [20]:
import pandas as pd
import re
import nltk

### Reading the dataset

In [6]:
# Load the raw SMS dataset from the CSV file next to the notebook.
# NOTE(review): encoding='latin' is presumably needed because the file is not valid UTF-8 — confirm.
messages = pd.read_csv('spamDataset.csv',encoding='latin')

In [25]:
# Preview the first five rows of the loaded data.
# NOTE(review): the recorded output below shows only v1/v2, so it was captured after the
# 'Unnamed: *' columns had been dropped in a later cell; a fresh top-to-bottom run shows all five columns here.
messages.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
# Checking the number of records and columns as (rows, columns).
# NOTE(review): the recorded (5572, 2) was captured after the 'Unnamed: *' columns were dropped
# in a later cell; on a fresh top-to-bottom run the frame still has five columns at this point.
messages.shape

(5572, 2)

### Cleaning the dataset before implementing NLP

In [8]:
# List the column names of the loaded frame to spot columns that are not needed
messages.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [9]:
# Count the missing values in each column
messages.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [10]:
# Drop the three almost entirely empty 'Unnamed' columns, keeping only v1 and v2.
# errors='ignore' keeps this cell safe to re-run: without it a second execution raises
# KeyError because the columns have already been removed from `messages`.
messages = messages.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [11]:
# Preview the first five rows after removing the unnamed columns
messages.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [14]:
# Give the two remaining columns descriptive names: v1 becomes 'label', v2 becomes 'message'.
# rename() returns a new frame, so `messages` itself is left unchanged.
dataset = messages.rename(
    columns={
        'v1': 'label',
        'v2': 'message',
    }
)

In [17]:
# Preview the first five rows with the renamed columns
dataset.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Data cleaning and preprocessing for NLP

In [22]:
# Download NLTK's 'stopwords' package; stop words are the very common words
# which are not necessary for the classification
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nazhim\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word lookup once, before the loop. stopwords.words('english') returns a
# freshly built list on every call, so calling it for each token made this cell needlessly
# slow; a set also gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# corpus is the name given to a collection of texts: here, one cleaned string per message
corpus = []

# Loop over the message texts themselves (rather than dataset['message'][i]) so the loop
# does not depend on the DataFrame having a default 0..n-1 index.
for message in dataset['message']:

    # replace every character that is not an ASCII letter with a blank space
    review = re.sub('[^a-zA-Z]', ' ', message)

    # convert to lowercase so that e.g. 'Free' and 'free' become the same token
    review = review.lower()

    # split the message into individual words
    review = review.split()

    # drop the stop words (not useful for the classification) and stem the remaining words
    review = [ps.stem(word) for word in review if word not in english_stopwords]

    # join the stemmed words back into one cleaned sentence
    review = ' '.join(review)

    # add the cleaned sentence to the corpus
    corpus.append(review)
    

In [31]:
# Print every cleaned message in the corpus, one per line
for cleaned_message in corpus:
    print(cleaned_message)

go jurong point crazi avail bugi n great world la e buffet cine got amor wat
ok lar joke wif u oni
free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli
u dun say earli hor u c alreadi say
nah think goe usf live around though
freemsg hey darl week word back like fun still tb ok xxx std chg send rcv
even brother like speak treat like aid patent
per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun
winner valu network custom select receivea prize reward claim call claim code kl valid hour
mobil month u r entitl updat latest colour mobil camera free call mobil updat co free
gonna home soon want talk stuff anymor tonight k cri enough today
six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info
urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw
search right word thank breather promis wont take help grant fulfil promis wonder bles

miss vday parachut doubl coin u must know well
sorri call later
sister got place birla soft da
free entri weekli comp chanc win ipod txt pod get entri std txt rate c appli detail
wah oki oki muz make use e unlimit haha
peopl mu tabl lambda
stop old man get build snowman snow angel snowbal fight
ello babe u ok
hello beauti r u ok kinda ad row wiv walk pub want night wiv u miss u
u go ikea str aft dat
becoz lt gt jan whn al post ofic holiday cn go fr post ofic got duffer
lol grr mom take forev prescript pharmaci like minut away ugh
real tho suck even cook whole electr hungri
want go
new textbuddi chat horni guy ur area p free receiv search postcod gaytextbuddi com txt one name rpl stop cnl
time month mid time
fffff text kadeem far gone
leav yet ok lor go elsewher n eat u thk
fujitsu seri lifebook good
yar want scold u yest late alreadi got zhong se qing u ask b ask go w u lor n u still act real
dont know bring food
current food alon also
sch fr dun haf da book sch home
hello go villag pu

repli win weekli profession sport tiger wood play send stop end servic
see see mayb reboot ym seen buzz
still grinder
polyphon tone ur mob everi week txt pt st tone free get txtin tell ur friend p tone repli hl info
love decis feel could decid love life would much simpler less magic
hot live fantasi call p per min ntt ltd po box croydon cr wb nation rate call
k see k
list buyer
idea guess work hour suppos leav sinc usual nobodi interest figur shit last second
mm entir sure understood text hey ho weekend
releas vday shirt u put make bottom half nake instead white underwear
know watch film comput
b thursday
oh phone phone disconnect
id onluy matter get offcampu
messag free welcom new improv sex dog club unsubscrib servic repli stop msg p
excel see riley plan
see half hour
tkt euro cup final cash collect call b pobox ppm
ew one
also hi wesley
ah see lingo let know wot earth finish make
loan purpos homeown tenant welcom previous refus still help call free text back help
updat mth half pric

u attend ur drive lesson mani time wk n day
uncl g check reward month
hello boytoy geeee miss today like send tm remind think love love kiss
think two still need get cash def readi
hey gal u wanna meet dinner n te
dear xxxxxxx u invit xchat final attempt contact u txt chat p msgrcvdhg suit land row w j hl ldn yr
babe talk think good boy miss love
great offic today
cool last littl get time soon
sad puppi nois
ye possibl dint tri pl dont tell one k
anyway holla whenev around need excus go creep peopl sarasota
happen
gonna ask lol think
ur cash balanc current pound maxim ur cash send go p meg cc hg suit land row w j hl
privat account statement show un redeem point call identifi code xx expir
go chase run cross street
like tell deepest darkest fantasi call p min stop text call nat rate
come friday leav pongal get news work place
hey inconveni si huh
ok vl u know got adsens approv
realli good see day dudett miss
want go perumbavoor
mani time told stage use laugh listen aha
believ true incre

### Using Bag of Words (BoW)

In [33]:
# Creating the Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

# Initialise the count vectorizer, keeping only the 5000 most frequently used words,
# because rare words (such as the name of a person) are unlikely to be useful.
cv = CountVectorizer(max_features=5000)

# Learn the vocabulary and build the dense matrix of word counts (one row per message).
# NOTE(review): the vocabulary is fitted on the full corpus before the train/test split
# further down, so test messages influence the feature set; consider fitting on the
# training split only.
X = cv.fit_transform(corpus).toarray()

# Encode the label explicitly as integers so the model gets 0 = 'ham' and 1 = 'spam'.
# Taking the second pd.get_dummies column relied on alphabetical column order and
# yields booleans instead of 0/1 on pandas >= 2.0.
y = (dataset['label'] == 'spam').astype(int).values


### Performing the Train Test Split

In [34]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)


### Training model using Naive Bayes Classifier

In [35]:
# We are selecting the Naive Bayes classifier because it works well with NLP;
# the multinomial variant is fitted here on the word-count features of the training split.
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X_train, y_train)


### Making predictions

In [37]:
# Predict a label for every message in the held-out test set
y_pred = spam_detect_model.predict(X_test)


### Creating the Confusion Matrix 

In [39]:
from sklearn.metrics import confusion_matrix

# Store the result under its own name: assigning it back to `confusion_matrix` shadowed
# the imported function and left that name bound to an array for every later cell.
# Rows are the true classes and columns are the predicted classes.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)


[[964  13]
 [ 10 128]]


### Checking the accuracy

In [50]:
from sklearn.metrics import accuracy_score

# Share of test messages whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy as a percentage rounded to two decimal places
print(f"Accuracy: {round(accuracy * 100, 2)}%")

Accuracy: 97.94%
