In [1]:
import pandas as pd
import numpy as np

In [11]:
# load in the data
df = pd.read_csv('/Users/RyanMburu/Documents/DS-Coursework/Module-I/week-4/bayes-theorem/SMSSpamCollection 2', sep='\t', header=None, names=['label', 'message'], encoding='latin-1')
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ã¼ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Pre processing / Cleaning for ML

### 1. Converting label column from string to binary values for classifying

In [12]:
#hot encoding / replace values with binary 1 and 0
df['label'] = df.label.map({'ham' : 0, 'spam' : 1})

In [14]:
#Preview new df
df

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ã¼ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


### 2. Convert all characters in message to lowercase

In [15]:
df['message'] = df.message.map(lambda x : x.lower())
df.head()

Unnamed: 0,label,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


### 3. Removing any punctuations in the messages

In [16]:
df['message'] = df.message.str.replace('[^\w\s]', '')
df.head()

  df['message'] = df.message.str.replace('[^\w\s]', '')


Unnamed: 0,label,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


### 4. Tokenizing messages into single letters

-> will import ntlk module  (Natural language tool kit)

In [18]:
import nltk

nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/RyanMburu/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/RyanMburu/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/RyanMburu/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/RyanMburu/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/RyanMburu/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/RyanMburu/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | 

True

In [19]:
# perform tokenizing
df['message'] = df['message'].apply(nltk.word_tokenize)
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


### 5. Word Stemming

what stemming does is it normalizes all words to ones with same meaning, regardless of the tense

Will use Porter Stemmer algorithm, a popular stemming algo

In [21]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

In [22]:
# stem our messages

df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])
df.head()

Unnamed: 0,label,message
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, 2, a, wkli, comp, to, win, f..."
3,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,0,"[nah, i, dont, think, he, goe, to, usf, he, li..."


### 6. Transforming the data into occurences

-> features we will feed into our model

-> will use count vectorizer from feature extraction in skitlearn lib

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# will convert our lists of words into space separated strings

df['message'] = df['message'].apply(lambda x: ' '.join(x))
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i dont think he goe to usf he live around ...


In [25]:
count_vector = CountVectorizer()
counts = count_vector.fit_transform(df['message'])
df.head()

Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i dont think he goe to usf he live around ...


### Lets use td-idf

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer().fit(counts)

counts = transformer.transform(counts)
df.head()


Unnamed: 0,label,message
0,0,go until jurong point crazi avail onli in bugi...
1,0,ok lar joke wif u oni
2,1,free entri in 2 a wkli comp to win fa cup fina...
3,0,u dun say so earli hor u c alreadi then say
4,0,nah i dont think he goe to usf he live around ...


## Training our model

In [28]:
# Divide data into training and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(counts, df['label'], test_size=0.1, random_state=69)

In [29]:
# Initialize Naive bayes and fit the data
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB().fit(X_train, y_train)

## Model Evaluation

In [30]:
predicted = model.predict(X_test)

print(np.mean(predicted == y_test))

0.9480286738351255


Our Model has 0.94 accuracy / 94.8% accuracy