In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

In [2]:
# load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/mohitgupta-omg/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv', encoding = 'iso-8859-1')

In [3]:
data = df.iloc[:, [0, 1]]

In [4]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data['v2'][435]

'The message sent is askin for  &lt;#&gt; dollars. Shoul i pay  &lt;#&gt;  or  &lt;#&gt; ?'

### Steps to clean the text for classification task

1. Remove all the punctuation marks
2. Lower case all the letters
3. Remove all the stopwords
4. Lemmetize the data
4. find a way to convert the text data to numerical data
5. pass it to the machine learning model

In [6]:
import string
df['v2'] = df['v2'].apply(lambda x: ''.join([i for i in x if i not in string.punctuation]))

In [7]:
df['v2'] = df['v2'].apply(lambda x: x.lower())

In [8]:
# removing all the stopwords from our vocabulary as it carries no meaning for our classification task
# Remember that stopwords are common to both ham and spam class

import nltk
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['v2'] = df['v2'].apply(lambda x: ' '.join([i for i in x.split() if i not in stop]))

In [9]:
df['v2'][2]

'free entry 2 wkly comp win fa cup final tkts 21st may 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s'

In [10]:
from spacy.lang.en.stop_words import STOP_WORDS
df['v2'] = df['v2'].apply(lambda x: ' '.join([i for i in x.split() if i not in STOP_WORDS]))

In [11]:
df['v2'][2]

'free entry 2 wkly comp win fa cup final tkts 21st 2005 text fa 87121 receive entry questionstd txt ratetcs apply 08452810075over18s'

In [12]:
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Microsoft\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Microsoft\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [13]:
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [14]:
df['v2'] = df['v2'].apply(lambda x: ' '.join([lemmatizer.lemmatize(i[0], get_wordnet_pos(i[1])) for i in nltk.pos_tag(x.split())]))

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(max_features=4000)
tfidf_vectors = vec.fit_transform(df['v2'])

In [16]:
tfidf_vectors

<5572x4000 sparse matrix of type '<class 'numpy.float64'>'
	with 37611 stored elements in Compressed Sparse Row format>

In [17]:
y = df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})

In [18]:
y.value_counts()

0    4825
1     747
Name: v1, dtype: int64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_vectors, y, test_size = 0.2)

In [20]:
X_train.shape

(4457, 4000)

In [21]:
new_df = pd.DataFrame(tfidf_vectors.toarray(), columns = vec.get_feature_names())
new_df.head(30)

Unnamed: 0,008704050406,01223585334,020603,0207,02073162414,020903,021,050703,0578,07090298926,...,yun,yunny,yuo,yup,zed,zoe,åð,ìll,ìï,ûò
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
model = GaussianNB()
model.fit(X_train.toarray(), y_train)

GaussianNB()

In [23]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test.toarray())

In [24]:
score = accuracy_score(y_pred, y_test)
print(score)

0.8618834080717489


In [25]:
model1 = MultinomialNB()
model1.fit(X_train.toarray(), y_train)

MultinomialNB()

In [26]:
y_pred_m = model1.predict(X_test.toarray())
score_m = accuracy_score(y_pred_m, y_test)
print(score_m)

0.9668161434977578
