In [47]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [95]:
import warnings
warnings.filterwarnings('ignore')

In [48]:

# Load the raw ham/spam message table from the working directory.
# latin-1 is requested explicitly - presumably the file is not valid UTF-8
# (TODO confirm against the source file).
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)

### Explore the data

In [50]:
# Peek at the first five raw rows to see which columns the CSV produced.
data.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [51]:
# Class balance of the label column: how many ham vs. spam rows there are.
data.loc[:, 'v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [73]:
# Look at the first 20 values of one of the unnamed columns to judge whether
# it carries anything useful.
data['Unnamed: 2'].iloc[:20]

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
10    NaN
11    NaN
12    NaN
13    NaN
14    NaN
15    NaN
16    NaN
17    NaN
18    NaN
19    NaN
Name: Unnamed: 2, dtype: object

In [56]:
# Summary statistics for every column (count / unique / top / freq); the
# non-null counts show how sparsely the unnamed columns are populated.
data.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""",GE,"GNT:-)"""
freq,4825,30,3,2,2


### It seems we do not need all of these columns, so let's rebuild the dataset

1. Drop the columns Unnamed: 2, Unnamed: 3, Unnamed: 4 

2. Rename column headers

3. Encode ham and spam as a binary label (ham = 0, spam = 1)

In [67]:
# Keep only the label (v1) and message text (v2) columns, selected by name
# rather than by position so the result does not depend on column order.
# .copy() makes data_trimmed an independent frame: a later cell adds a column
# to it, and writing into a slice of `data` would trigger pandas'
# SettingWithCopyWarning and leave it ambiguous whether `data` is modified.
data_trimmed = data.loc[:, ['v1', 'v2']].copy()

In [69]:
# Check that only the label and message columns remain.
data_trimmed.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
# Replace the opaque CSV headers with descriptive ones (positional: the
# first column is the ham/spam label, the second the message text).
data_trimmed.columns = [
    'SPAM',      # was v1
    'Messages',  # was v2
]

In [75]:
# Confirm the new column headers took effect.
data_trimmed.head(n=5)

Unnamed: 0,SPAM,Messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [96]:
# Encode the text label as an integer target: ham -> 0, spam -> 1.
# assign() returns a new frame instead of writing into data_trimmed in place,
# so the cell is safe to re-run and cannot raise SettingWithCopyWarning even
# when data_trimmed was created as a slice of another frame.
label_to_binary = {'ham': 0, 'spam': 1}
data_trimmed = data_trimmed.assign(BINARY=data_trimmed['SPAM'].map(label_to_binary))

# map() yields NaN for any label that is missing from the dictionary; fail
# loudly here rather than let unlabeled rows flow into training.
assert data_trimmed['BINARY'].notna().all(), "unexpected label in SPAM column"

In [85]:
# Spot-check the encoding on the first ten rows (both classes appear here).
data_trimmed.head(n=10)

Unnamed: 0,SPAM,Messages,BINARY
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
5,spam,FreeMsg Hey there darling it's been 3 week's n...,1
6,ham,Even my brother is not like to speak with me. ...,0
7,ham,As per your request 'Melle Melle (Oru Minnamin...,0
8,spam,WINNER!! As a valued network customer you have...,1
9,spam,Had your mobile 11 months or more? U R entitle...,1


In [90]:
# Final modelling table: the message text and its 0/1 target, picked by name
# instead of by position (iloc[:, 1:3] would silently select the wrong
# columns if the column order of data_trimmed ever changed).
# .copy() detaches the result from data_trimmed.
data_final = data_trimmed.loc[:, ['Messages', 'BINARY']].copy()

In [91]:
# Preview the table that will be fed into the train/test split.
data_final.head(n=5)

Unnamed: 0,Messages,BINARY
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


#### Build the train/test split

In [98]:
# Total number of messages available before splitting.
population = len(data_final['Messages'])
print(population)


5572


In [97]:
from sklearn.model_selection import train_test_split

In [100]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
# NOTE(review): the label counts earlier in the notebook are 4825 ham vs.
# 747 spam and this split is not stratified - consider passing
# stratify=data_final["BINARY"] (this would change the reported scores).
X_train, X_test, y_train, y_test = train_test_split(
    data_final["Messages"],
    data_final["BINARY"],
    test_size=0.3,
    random_state=10,
)

In [101]:
# Sanity check: features and labels must have matching sizes in each split.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(3900,)
(1672,)
(3900,)
(1672,)


### Now we need to transform the text into numeric features so the models can use it for prediction

In [103]:
from sklearn.feature_extraction.text import CountVectorizer

In [104]:
# Token-count vectorizer created with scikit-learn's default settings.
vect = CountVectorizer()

In [105]:
# Learn the vocabulary from the training messages only, so nothing about the
# held-out messages influences the feature space.
vect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [106]:
# Convert the training messages to count features using the fitted vocabulary.
# NOTE(review): despite the `_df` suffix this is presumably a sparse matrix
# rather than a DataFrame - confirm before using DataFrame methods on it.
X_train_df = vect.transform(X_train)

In [107]:
# Apply the same train-fitted vocabulary to the held-out messages.
X_test_df = vect.transform(X_test)

### Let's first try a Naive Bayes model

In [108]:
# Baseline model: multinomial Naive Bayes on the token counts.
# fit() returns the estimator itself, so the chained call binds the fitted model.
model = MultinomialNB().fit(X_train_df, y_train)

# Evaluate on the held-out split. Assuming score() reports accuracy, read it
# against the class balance: with 4825 ham vs. 747 spam overall, always
# predicting ham would already score roughly 0.87.
score = model.score(X_test_df, y_test)
print("Classification score for NB {}".format(score))

Classification score for NB 0.9892344497607656


### Let's now try the AdaBoostClassifier as a comparison
#### AdaBoost is an ensemble method

In [19]:
from sklearn.ensemble import AdaBoostClassifier

In [109]:
# Second model for comparison: AdaBoost with otherwise default settings,
# trained and scored on the same splits as the Naive Bayes model.
# random_state is fixed (same seed as the train/test split) so that
# re-running the notebook reproduces the same fitted ensemble and score.
model_ada = AdaBoostClassifier(random_state=10)
model_ada.fit(X_train_df,y_train)
print("Score for AdaBoost model", model_ada.score(X_test_df,y_test))


Score for AdaBoost model 0.9688995215311005
