In [1]:
import sys
import nltk
import sklearn
import pandas
import numpy

In [2]:
print('Python: {}'.format(sys.version))
print('NLTK: {}'.format(nltk.__version__))
print('Scikit-learn: {}'.format(sklearn.__version__))
print('Pandas: {}'.format(pandas.__version__))
print('Numpy: {}'.format(numpy.__version__))

Python: 3.7.2 (default, Dec 29 2018, 00:00:04) 
[Clang 4.0.1 (tags/RELEASE_401/final)]
NLTK: 3.3
Scikit-learn: 0.19.2
Pandas: 0.23.4
Numpy: 1.15.1


In [3]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## 1. Load the Dataset

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the dataset of sms messages
df = pd.read_table('./data/smsspamcollection/SMSSpamCollection',
                 header=None, encoding='utf-8')

In [6]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [7]:
#check class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


* Sometimes the skewed ration has a negative impact on the machine learning model, but in this case we   will not normalize it

# 2. Preprocess the Data

In [8]:
#convert class labels to binary values, 0 = ham, 1 = spam
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [9]:
# store the sms message data
text_message  = df[1]
print(text_message[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


Using regular expression we will replace email id's with email, weburls with web address, money symbols, phone numbers and other numbers with generic terms. Individual values will not help in learning the algorithm as they are unique values. In comparision to generic terms will help in learning

In [10]:
# use regular expression for the above mentioned unique terms
# replace email addresses with 'emailaddr'
processed = text_message.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

#replace web address
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                 'webaddress')

#replace money symbols 
processed = processed.str.replace(r'£|\$', 'moneysymb')

# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')

#replace numbers
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

In [11]:
# remove punctuations
processed = processed.str.replace(r'[^\w\d\s]', ' ')

#replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [12]:
# change words to lower case - Hello and HELLO are the same words
processed = processed.str.lower()
print(processed[:10])

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
5    freemsg hey there darling it s been numbr week...
6    even my brother is not like to speak with me t...
7    as per your request melle melle oru minnaminun...
8    winner as a valued network customer you have b...
9    had your mobile numbr months or more u r entit...
Name: 1, dtype: object


Now we will remove stopwords 

In [13]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words ))

In [14]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

print(processed[:10])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
5    freemsg hey darl numbr week word back like fun...
6        even brother like speak treat like aid patent
7    per request mell mell oru minnaminungint nurun...
8    winner valu network custom select receivea mon...
9    mobil numbr month u r entitl updat latest colo...
Name: 1, dtype: object


# 3. Generating Features #
Feature engineering is the process of using domain knowledge of the data to create features for machine learning algorithms. In this project, the words in each text message will be our features. For this purpose, it will be necessary to tokenize each word. We will use the 1500 most common words as features.

In [15]:
from nltk.tokenize import word_tokenize

# create bag-of-words
all_words = []

for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)
        

all_words = nltk.FreqDist(all_words)

In [16]:
# Print the total number of words and the 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [19]:
# use the 1500 most common words as features
word_features = list(dict(all_words.most_common(1500)).keys())
# You can choose to use all words as it will help in the performance of the model

In [20]:
# Deine find_features function that will deftermine which of the 1500 word featurews are contained in the message
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
        
    return features

# Let's run it
features = find_features(processed[0])
for key,value in features.items():
    if value == True:
        print(key)

go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine


In [47]:
# Now lets do it for all the messages
messages = list(zip(processed,Y))

# define a seed for reproducibility
# seed = 1
# np.random.seed = seed
# np.random.shuffle(messages)

# Call find_features function for each SMS message
featuresets = [(find_features(text),label) for text,label in messages]

In [48]:
#We can split the featuresets in train and test datasets using sklearn
from sklearn import model_selection
seed = 1
training, testing = model_selection.train_test_split(featuresets, test_size=0.25, random_state=seed)

In [49]:
print("Training length: {}".format(len(training)))
print("Testing lenght: {}".format(len(testing)))

Training length: 4179
Testing lenght: 1393


# 4. Scikit-Learn Classifiers with NLTK
Now that we have our dataset, we   can start building algorithms. Let's start with simple linear support vecrot classifier, then expand to other algorithms. We will need to import each algorithm we plan on using from sklearn. We also need to import some performance metricds, such as accuracy_score and classification_report

In [50]:
# We can use sklearn algorithms in NLTK
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel ='linear'))

#train the model on training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.77961234745155


In [51]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


# Define models to train

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [KNeighborsClassifier(),
              DecisionTreeClassifier(),
              RandomForestClassifier(),
              LogisticRegression(),
              SGDClassifier(max_iter=100),
              MultinomialNB(),
              SVC(kernel = 'linear')]

models = list(zip(names, classifiers))

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))


K Nearest Neighbors Accuracy: 94.54414931801867
Decision Tree Accuracy: 97.70279971284997
Random Forest Accuracy: 97.91816223977028
Logistic Regression Accuracy: 98.77961234745155
SGD Classifier Accuracy: 98.7078248384781
Naive Bayes Accuracy: 99.06676238334529
SVM Linear Accuracy: 98.77961234745155


In [52]:
#print(classifiers)

In [55]:
# Build an ensemble method - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [KNeighborsClassifier(),
              DecisionTreeClassifier(),
              RandomForestClassifier(),
              LogisticRegression(),
              SGDClassifier(max_iter=100),
              MultinomialNB(),
              SVC(kernel = 'linear')]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting= 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_ensemble, testing) *100
print('Ensemble method accuracy: {}'.format(accuracy))

Ensemble method accuracy: 98.92318736539842


  if diff:


In [56]:
# The Naive Bayes classifier was the best and ensemble method does not imporve in this case

In [58]:
# make class label prediction for test set
txt_features, labels = list(zip(*testing))

prediction = nltk_ensemble.classify_many(txt_features)

  if diff:


In [60]:
# print a confustion matrix and a classification report
print(classification_report(labels,prediction))

pd.DataFrame(confusion_matrix(labels,prediction),
            index = [['actual','actual'],['ham','spam']],
            columns = [['predicted','predicted'],['ham','spam']])

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1208
          1       0.99      0.92      0.96       185

avg / total       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1207,1
actual,spam,14,171


In [61]:
# Model performs really well. It only marked an actual message as a spam. Event though accuracy is not very high