In [51]:
# Standard library
import os
import re
from collections import defaultdict

# Linear algebra / data manipulation libraries
import numpy as np
import pandas as pd

# NLP resources
from nltk.corpus import stopwords

# scikit-learn building blocks
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone package and fall back to the vendored copy
# only on old installations where `joblib` is not importable on its own.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Printing files in input folder
print(os.listdir("./input"))

['spam.csv']


# Loading Data

In [2]:
# Read the raw CSV, decoding its bytes as latin-1, then preview the first rows.
data = pd.read_csv(
    "./input/spam.csv",
    encoding="latin-1",
)
data.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# Keep only the two populated columns and give them descriptive names:
# 'v1' holds the class label and 'v2' the message text.
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

In [4]:
# Preview the first rows of the frame after the column selection/renaming.
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Encode the target numerically: ham -> 0, spam -> 1.
# NOTE: this cell is not idempotent. Running it a second time maps the
# already-encoded integers (which are not keys of the dict) to NaN.
label_codes = {'ham': 0, 'spam': 1}
data['label'] = data['label'].map(label_codes)

In [6]:
# Balance the two classes by undersampling: draw as many ham rows as there are
# spam rows, stack both groups, then shuffle and renumber the index.
# Both random steps use the fixed seed 5.
spam = data.loc[data['label'] == 1]
ham = data.loc[data['label'] == 0]

new_ham = ham.sample(n=len(spam), random_state=5)
new_data = pd.concat([new_ham, spam], axis=0)

data = shuffle(new_data, random_state=5).reset_index(drop=True)

In [7]:
# Punctuation stripped before tokenising, compiled once instead of on every
# call. Inside the character class ",-." parses as a range that covers exactly
# ',', '-' and '.', and "\]" is an escaped ']' -- so a literal backslash is
# NOT part of the set and survives in the tokens.
_PUNCTUATION_RE = re.compile('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]')

# Lazily filled cache so `stopwords.words("english")` is evaluated at most once
# per kernel instead of once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def textParser(text, stop_words=None):
    """Turn a raw message into a list of lower-cased, stop-word-free tokens.

    Steps: remove the punctuation characters matched by ``_PUNCTUATION_RE``,
    split on single spaces, lower-case every piece, then drop empty pieces and
    stop words.

    Parameters
    ----------
    text : str
        The message to tokenise.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to NLTK's English stop-word list, which is
        cached after first use.

    Returns
    -------
    list of str
        Remaining tokens in their original order.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    cleaned = _PUNCTUATION_RE.sub("", text)
    # Split on " " only (not on arbitrary whitespace), exactly as before;
    # consecutive spaces yield empty strings, which `if token` discards.
    return [
        token
        for token in map(str.lower, cleaned.split(" "))
        if token and token not in stop_words
    ]

In [8]:
# Build the bag-of-words count matrix, tokenising each message with textParser.
# NOTE(review): the vocabulary is fitted on every row of `data`, i.e. before
# the train/test split performed further down, so rows that later land in the
# test set contribute to it.
bow_vectorizer = CountVectorizer(analyzer=textParser)
bow_data = bow_vectorizer.fit_transform(data['text'])

In [9]:
# Re-weight the raw counts with TF-IDF.
# NOTE(review): like the count matrix it consumes, this is fitted on all rows
# before the train/test split.
tfidf_transformer = TfidfTransformer()
tfidf_data = tfidf_transformer.fit_transform(bow_data)

In [10]:
# Hold out 30% of the rows for testing (fixed seed 5). The target is passed as
# a one-column DataFrame (note the double brackets), which is why later cells
# flatten it with `np.array(...).ravel()`.
X_train, X_test, Y_train, Y_test = train_test_split(
    tfidf_data,
    data[['label']],
    test_size=0.3,
    random_state=5,
)

In [16]:
# Multinomial naive Bayes classifier with its default hyper-parameters.
model = MultinomialNB()

In [12]:
# Fit on the TF-IDF training rows. MultinomialNB accepts sparse matrices, so
# the features are passed as-is instead of being expanded with `.toarray()`,
# which would allocate a dense (rows x vocabulary) array for no benefit.
# The one-column label frame is flattened to the 1-D array `fit` expects.
fitted_model = model.fit(X_train, np.array(Y_train).ravel())

In [13]:
# Predict on the held-out rows. The sparse test matrix is passed directly:
# MultinomialNB handles sparse input, so densifying it with `.toarray()` only
# costs memory.
pred = fitted_model.predict(X_test)

# Accuracy against the flattened true labels; left as the last expression so
# the notebook displays it.
acc_MNB = accuracy_score(np.array(Y_test).ravel(), pred)
acc_MNB

0.9287305122494433

In [14]:
# Per-class precision / recall / F1 for the predictions made above.
y_true = np.array(Y_test).ravel()
report = classification_report(y_true, pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.92      0.93       218
           1       0.92      0.94      0.93       231

   micro avg       0.93      0.93      0.93       449
   macro avg       0.93      0.93      0.93       449
weighted avg       0.93      0.93      0.93       449



In [22]:
# Chain the same three stages used step by step above into one estimator:
# token counts -> TF-IDF weighting -> multinomial naive Bayes.
# NOTE(review): the step name 'tfdif' looks like a typo of 'tfidf'. It is kept
# unchanged because step names are part of the pipeline's parameter names.
pipeline_steps = [
    ('bow', CountVectorizer(analyzer=textParser)),
    ('tfdif', TfidfTransformer()),
    ('model', MultinomialNB()),
]
training_pipe = Pipeline(steps=pipeline_steps)

In [42]:
# Split the raw text and labels 70/30 with seed 5. This rebinds X_train,
# X_test, Y_train and Y_test, replacing the TF-IDF based split made earlier,
# so the pipeline's vectorisers are fitted on training rows only.
X_train, X_test, Y_train, Y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    random_state=5,
)

In [50]:
# Train the whole pipeline on the raw training messages.
training_pipe.fit(X_train, Y_train)

# Score the held-out messages in two ways: once via accuracy_score on explicit
# predictions and once via the pipeline's own `score` method.
pred_test_MNB = training_pipe.predict(X_test)
acc_MNB = accuracy_score(Y_test, pred_test_MNB)

print(acc_MNB)
print(training_pipe.score(X_test, Y_test))

0.9487750556792873
0.9487750556792873


In [53]:
# Persist the fitted pipeline to disk.
# NOTE: the pipeline's CountVectorizer holds a reference to `textParser`, and
# pickle stores functions by name, so `textParser` must be defined/importable
# wherever this file is loaded.
filename = 'training_pipeline.sav'
joblib.dump(training_pipe, filename)

['training_pipeline.sav']

In [56]:
# Reload the pipeline that was just written to `filename`.
# NOTE: joblib files are pickle-based; only load files from a trusted source,
# because unpickling can execute arbitrary code.
new = joblib.load(filename)

In [73]:
# Show the test message whose index *label* is 993 (a label lookup, not the
# 993rd position). This only works while that row is part of the test split.
X_test.loc[993]

'You are guaranteed the latest Nokia Phone, a 40GB iPod MP3 player or a å£500 prize! Txt word: COLLECT to No: 83355! IBHltd LdnW15H 150p/Mtmsgrcvd18+'