In [72]:
import pandas as pd

import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv("spam.csv", encoding='latin')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [5]:
df.shape

(5572, 5)

In [10]:
# No of duplicate entries
df.duplicated().sum()

403

In [11]:
# Remove duplicate entries
df = df.drop_duplicates()

In [12]:
# No of null entries
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5126
Unnamed: 3    5159
Unnamed: 4    5164
dtype: int64

In [13]:
df.drop(df.columns[2:], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(df.columns[2:], axis=1, inplace=True)


In [14]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [42]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]", " ", text)
    words_list = [word for word in word_tokenize(text) if len(word)!=1 and word not in stopwords.words('english')]
    processed_text = ' '.join(words_list)
    return processed_text

In [44]:
sentences = []
for text in df['v2']:
    processed_text = preprocess(text)
    sentences.append(processed_text)

In [45]:
for sent in sentences[:10]:
    print(sent)

go jurong point crazy available bugis great world la buffet cine got amore wat
ok lar joking wif oni
free entry wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry question std txt rate apply 08452810075over18
dun say early hor already say
nah think goes usf lives around though
freemsg hey darling week word back like fun still tb ok xxx std chgs send 50 rcv
even brother like speak treat like aids patent
per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune
winner valued network customer selected receivea 900 prize reward claim call 09061701461 claim code kl341 valid 12 hours
mobile 11 months entitled update latest colour mobiles camera free call mobile update co free 08002986030


In [49]:
vectorizer = TfidfVectorizer(max_features = 5000)
X = vectorizer.fit_transform(sentences).toarray()
X.shape

(5169, 5000)

In [52]:
df['Label'] = df['v1'].map({'ham':0,'spam':1})
df.drop('v1', axis=1, inplace=True)

In [55]:
df['Text'] = sentences
df.drop('v2', axis=1, inplace=True)

In [56]:
df.head()

Unnamed: 0,Label,Text
0,0,go jurong point crazy available bugis great wo...
1,0,ok lar joking wif oni
2,1,free entry wkly comp win fa cup final tkts 21s...
3,0,dun say early hor already say
4,0,nah think goes usf lives around though


## Train test split

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, df['Label'], test_size=0.2, random_state=42)

In [67]:
X_train.shape

(4135, 5000)

In [68]:
y_train.shape

(4135,)

In [69]:
X_test.shape

(1034, 5000)

In [70]:
y_test.shape

(1034,)

## Multinomial Naive Bayes classifier

In [73]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
accuracy_score(y_test, y_pred)

0.9748549323017408

<br>We've got a great accuracy of 97.5%