In [1]:
import sys
import nltk
import sklearn
import numpy
import pandas


In [None]:
# dataset is taken from https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [2]:
import pandas as pd
import numpy as np

# load the dataset
df= pd.read_table('SMSSpamCollection', header=None, encoding='utf-8')

In [6]:
print(df.info) # or print(df.describe) for this case

<bound method DataFrame.info of          0                                                  1
0      ham  Go until jurong point, crazy.. Available only ...
1      ham                      Ok lar... Joking wif u oni...
2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
3      ham  U dun say so early hor... U c already then say...
4      ham  Nah I don't think he goes to usf, he lives aro...
...    ...                                                ...
5567  spam  This is the 2nd time we have tried 2 contact u...
5568   ham               Will ü b going to esplanade fr home?
5569   ham  Pity, * was in mood for that. So...any other s...
5570   ham  The guy did some bitching but I acted like i'd...
5571   ham                         Rofl. Its true to its name

[5572 rows x 2 columns]>


In [9]:
# check class distribution

classes= df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [12]:
# Preprocess the data

# convert ham and spam to 0 and 1 respectively
from sklearn.preprocessing import LabelEncoder
encoder= LabelEncoder()
Y= encoder.fit_transform(classes)
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [13]:
# store the SMS message data
text_messages= df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [15]:
# to create expression for email and other things, we can use regular expression website 
# with those expressions we will replace email, mob nos, urls, symbols, etc

# replacing emails with 'emailaddr'
processed= text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')

# replace urls with 'webaddress'
processed= processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

# replace money symbols with 'moneysymb'
processed= processed.str.replace(r'£|\$', 'moneysymb')

# replace 10 digit phone numbers with 'phonenumber'
processed= processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')

# replace normal numbers with 'numbr'
processed= processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# remove punctuation
processed= processed.str.replace(r'[^\w\d\s]', ' ')

# replace whitespace between terms with single space
processed= processed.str.replace(r'\s+', ' ')

# replace leading and trailing whitespace
processed= processed.str.replace(r'^\s+|\s+?$', '')

  processed= text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddr')
  processed= processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')
  processed= processed.str.replace(r'£|\$', 'moneysymb')
  processed= processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')
  processed= processed.str.replace(r'\d+(\.\d+)?', 'numbr')
  processed= processed.str.replace(r'[^\w\d\s]', ' ')
  processed= processed.str.replace(r'\s+', ' ')
  processed= processed.str.replace(r'^\s+|\s+?$', '')


In [20]:
# changing the words to lower case so that HELLO, Hello, hello, are all same thing.
processed= processed.str.lower()

In [59]:
# remove stopwords from messages
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))

processed= processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [60]:
# remove word stems using porter stemmer
ps= nltk.PorterStemmer()

processed= processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))

In [61]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
                              ...                        
5567    numbrnd time tri numbr contact u u moneysymbnu...
5568                              ü b go esplanad fr home
5569                                    piti mood suggest
5570    guy bitch act like interest buy someth el next...
5571                                       rofl true name
Name: 1, Length: 5572, dtype: object


In [62]:
from nltk.tokenize import word_tokenize

# creating a bag-of-words
all_words= []

for message in processed:
    words= word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words= nltk.FreqDist(all_words)

# print total no of words and 15 most common words
print('total words: {}'.format(len(all_words)))
print('most common: {}'.format(all_words.most_common(15)))

total words: 6530
most common: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [93]:
# no of words used as features
word_features= list(all_words.keys())[:] # add no of top common words you want to use as features. i am using all of them.

In [94]:
# define a find features function
def find_features(message):
    words= word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
        
    return features

# lets see an example
features=find_features(processed[0])
for key, value in features.items():
    if value==True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [95]:
# Now lets do it for all the messages
messages = list(zip(processed, Y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [96]:
# split training and testing data sets using sklearn
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size=0.25, random_state=seed)

print('training: {}'.format(len(training)))
print('testing: {}'.format(len(testing)))

training: 4179
testing: 1393


In [97]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [98]:
# defining models to train

names=['K Nearest Neighbours', 'Decision Tree', 'Random Forect', 'Logistic Regression', 
       'SGD Classifier', 'Naive Bayes', 'SVM Linear']
classifier=[KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier(),
           LogisticRegression(), SGDClassifier(max_iter=100), MultinomialNB(),
           SVC(kernel='linear')]

models=zip(names, classifier)
#print(list(models))

In [99]:
# wrap models in NLTK

from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model, testing)* 100
    print('{} : Accuracy : {}'.format(name, accuracy))

K Nearest Neighbours : Accuracy : 92.89303661162958
Decision Tree : Accuracy : 97.34386216798278
Random Forect : Accuracy : 97.70279971284997
Logistic Regression : Accuracy : 97.5592246949031
SGD Classifier : Accuracy : 97.70279971284997
Naive Bayes : Accuracy : 97.91816223977028
SVM Linear : Accuracy : 97.70279971284997


In [101]:
# ensemble method - Voting Classifier
from sklearn.ensemble import VotingClassifier

# defining models to train

names=['K Nearest Neighbours', 'Decision Tree', 'Random Forect', 'Logistic Regression', 
       'SGD Classifier', 'Naive Bayes', 'SVM Linear']
classifier=[KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier(),
           LogisticRegression(), SGDClassifier(max_iter=100), MultinomialNB(),
           SVC(kernel='linear')]

models=list(zip(names, classifier))

nltk_ensemble= SklearnClassifier(VotingClassifier(estimators= models, voting= 'hard', n_jobs=-1))
nltk_ensemble.train(training)
accuracy= nltk.classify.accuracy(nltk_ensemble, testing) * 100
print('Ensemble Method Accuracy: {}'.format(accuracy))

Ensemble Method Accuracy: 97.5592246949031


In [103]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)
prediction= nltk_ensemble.classify_many(txt_features)

In [106]:
# print a confusion matrix and classification report
print(classification_report(labels, prediction))

pd.DataFrame(
confusion_matrix(labels, prediction),
index=[['actual', 'actual'], ['ham', 'spam']],
columns=[['predicted', 'predicted'], ['ham', 'spam']])

              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1212
           1       0.99      0.82      0.90       181

    accuracy                           0.98      1393
   macro avg       0.98      0.91      0.94      1393
weighted avg       0.98      0.98      0.97      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1210,2
actual,spam,32,149
