In [1]:
import pandas as pd
import nltk
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.linear_model import SGDClassifier
from sklearn.calibration import CalibratedClassifierCV

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
import warnings
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import time
import numpy as np
warnings.filterwarnings("ignore")


In [2]:
data=pd.read_csv('spam.csv', encoding = 'latin-1')
data.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [3]:
data.shape

(5572, 5)

In [4]:
data=data.rename(columns={'v1':'tags','v2':'message'}) 
data.head(10)
                 

Unnamed: 0,tags,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [5]:
data=data.loc[:,['tags','message']]
data.head(5)

Unnamed: 0,tags,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data['tags'].describe()

count     5572
unique       2
top        ham
freq      4825
Name: tags, dtype: object

In [7]:
data['message'].describe()

count                       5572
unique                      5169
top       Sorry, I'll call later
freq                          30
Name: message, dtype: object

In [8]:
data['tags'].value_counts()

ham     4825
spam     747
Name: tags, dtype: int64

In [9]:
data['tags']=data['tags'].map({'ham':0,'spam':1})
data.head()

Unnamed: 0,tags,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
data.isnull().any()

tags       False
message    False
dtype: bool

In [11]:
data.duplicated().any()

True

In [12]:
data['message'].head(10)

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: message, dtype: object

In [15]:
#preprocessing 
#removing stop words
stop_words = set(stopwords.words('english'))
print ('list of stop words:', stop_words)

def pre_process(text, index, column):
    if type(text) is not int:
        string = ""
        for words in text.split():
            # remove the special chars  like '"#$@!%^&*()_+-~?>< etc.
            word = ("".join(s for s in words if s.isalnum()))
            # Conver all letters to lower-case
            word = word.lower()
            # stop-word removal
            if not word in stop_words:
                string += word + " "
        data[column][index] = string

list of stop words: {'yours', 'can', 'so', 'herself', 'wouldn', 'no', 'after', 'was', 'itself', 'o', 're', "should've", "hasn't", 'doing', 'between', 'below', 'that', 'few', 'her', 'be', 'me', 'about', 'ours', 'have', 'the', 'do', 'll', "that'll", 'these', 'from', 'its', 's', "won't", 'it', 'aren', 'down', 'both', 'are', 'she', "hadn't", 'into', 've', 'himself', 'all', 'couldn', 'should', 'or', 'i', 'they', 'why', 'before', 'ourselves', 'at', 'their', "she's", 'if', 'own', 'your', 'mustn', 'than', "don't", 'will', "weren't", 'themselves', 'does', 'some', 'there', 'which', 'of', 'he', "mustn't", 'but', 'during', 't', 'just', 'when', 'we', 'had', 'only', "couldn't", "you're", 'whom', "you'll", 'most', 'myself', 'd', 'ain', 'those', 'a', "aren't", 'having', 'off', 'don', 'hasn', 'further', 'once', 'him', 'other', 'our', 'now', 'needn', 'against', 'on', 'being', 'my', 'weren', 'very', 'in', "haven't", 'been', 'as', "it's", 'hadn', 'yourselves', 'same', 'm', 'an', 'did', 'such', 'how', 'bec

In [16]:
import time
start_time = time.clock()
# we take each title and we text-preprocess it.
for index, row in data.iterrows():
    pre_process(row['message'], index, 'message')
# we print the time it took to preprocess whole titles 
print(time.clock() - start_time, "seconds")


238.657935 seconds


In [18]:
data['message'].head(10)

0    go jurong point crazy available bugis n great ...
1                             ok lar joking wif u oni 
2    free entry 2 wkly comp win fa cup final tkts 2...
3                 u dun say early hor u c already say 
4         nah dont think goes usf lives around though 
5    freemsg hey darling 3 weeks word back id like ...
6      even brother like speak treat like aids patent 
7    per request melle melle oru minnaminunginte nu...
8    winner valued network customer selected receiv...
9    mobile 11 months u r entitled update latest co...
Name: message, dtype: object

In [19]:
target = pd.DataFrame(data['tags'])
target.columns = ['tags']
data = data.drop('tags',axis=1)
print(target.head())
print(data.columns)

   tags
0     0
1     0
2     1
3     0
4     0
Index(['message'], dtype='object')


In [20]:
target.shape

(5572, 1)

In [21]:
Counter(target['tags'])

Counter({0: 4825, 1: 747})

In [22]:
data.shape

(5572, 1)

In [25]:
x_train,x_test,y_train,y_test=train_test_split(data,target,stratify=target,test_size=0.2)



In [26]:
print(x_train.shape)
print(x_test.shape)




(4457, 1)
(1115, 1)


In [27]:
print(y_train.shape)
print(y_test.shape)

(4457, 1)
(1115, 1)


In [28]:
#Bag Of Words
count=CountVectorizer()
vector_message_train=count.fit_transform(x_train['message'])
vector_message_test=count.transform(x_test['message'])


In [29]:
vector_message_train.shape

(4457, 8289)

In [30]:
count_tfidf=TfidfVectorizer()
vector_tfidf_train=count_tfidf.fit_transform(x_train['message'])
vector_tfidf_test=count_tfidf.transform(x_test['message'])


In [31]:
vector_tfidf_train.shape

(4457, 8289)

In [36]:
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB 
from sklearn import tree
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression


# Train KNeighborsClassifier Model
KNN_Classifier_model = KNeighborsClassifier(n_jobs=-1)
KNN_Classifier_model.fit(vector_tfidf_train, y_train); 

# Train LogisticRegression Model
LGR_Classifier_model = LogisticRegression(n_jobs=-1, random_state=0)
LGR_Classifier_model.fit(vector_tfidf_train, y_train);

# Train Gaussian Naive Baye Model
BNB_Classifier_model = BernoulliNB()
BNB_Classifier_model.fit(vector_tfidf_train, y_train)
            
# Train Decision Tree Model
DTC_Classifier_model = tree.DecisionTreeClassifier(criterion='entropy', random_state=0)
DTC_Classifier_model.fit(vector_tfidf_train, y_train);
   

In [39]:
from sklearn import metrics

models = []
models.append(('KNeighborsClassifier', KNN_Classifier_model))
models.append(('Naive Baye Classifier', BNB_Classifier_model))
models.append(('Decision Tree Classifier', DTC_Classifier_model))
models.append(('LogisticRegression', LGR_Classifier_model))
for i, v in models:
    scores = cross_val_score(v, vector_tfidf_train, y_train, cv=10)
    accuracy = metrics.accuracy_score(y_train, v.predict(vector_tfidf_train))
    confusion_matrix = metrics.confusion_matrix(y_train, v.predict(vector_tfidf_train))
    classification = metrics.classification_report(y_train, v.predict(vector_tfidf_train))
    print('{} Model evaluation'.format(i))
    print()
    print ("Cross Validation Mean Score:" "\n", scores.mean())
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print()
    print("Confusion matrix:" "\n", confusion_matrix)
    print()
    print("Classification report:" "\n", classification) 
    print()

KNeighborsClassifier Model evaluation

Cross Validation Mean Score:
 0.903075432464

Model Accuracy:
 0.919901278887

Confusion matrix:
 [[3859    0]
 [ 357  241]]

Classification report:
              precision    recall  f1-score   support

          0       0.92      1.00      0.96      3859
          1       1.00      0.40      0.57       598

avg / total       0.93      0.92      0.90      4457


Naive Baye Classifier Model evaluation

Cross Validation Mean Score:
 0.973296683593

Model Accuracy:
 0.98474310074

Confusion matrix:
 [[3857    2]
 [  66  532]]

Classification report:
              precision    recall  f1-score   support

          0       0.98      1.00      0.99      3859
          1       1.00      0.89      0.94       598

avg / total       0.98      0.98      0.98      4457


Decision Tree Classifier Model evaluation

Cross Validation Mean Score:
 0.961633961517

Model Accuracy:
 1.0

Confusion matrix:
 [[3859    0]
 [   0  598]]

Classification report:
         

In [40]:
for i, v in models:
    accuracy = metrics.accuracy_score(y_test, v.predict(vector_tfidf_test))
    confusion_matrix = metrics.confusion_matrix(y_test, v.predict(vector_tfidf_test))
    classification = metrics.classification_report(y_test, v.predict(vector_tfidf_test))
    print('{} Model Test Results '.format(i))
    print()
    print ("Model Accuracy:" "\n", accuracy)
    print("Confusion matrix:" "\n", confusion_matrix)
    print("Classification report:" "\n", classification) 
          

KNeighborsClassifier Model Test Results 

Model Accuracy:
 0.911210762332
Confusion matrix:
 [[966   0]
 [ 99  50]]
Classification report:
              precision    recall  f1-score   support

          0       0.91      1.00      0.95       966
          1       1.00      0.34      0.50       149

avg / total       0.92      0.91      0.89      1115

Naive Baye Classifier Model Test Results 

Model Accuracy:
 0.966816143498
Confusion matrix:
 [[964   2]
 [ 35 114]]
Classification report:
              precision    recall  f1-score   support

          0       0.96      1.00      0.98       966
          1       0.98      0.77      0.86       149

avg / total       0.97      0.97      0.97      1115

Decision Tree Classifier Model Test Results 

Model Accuracy:
 0.95067264574
Confusion matrix:
 [[951  15]
 [ 40 109]]
Classification report:
              precision    recall  f1-score   support

          0       0.96      0.98      0.97       966
          1       0.88      0.73      0