##  1. Import Libraries



In [1]:
import sys
import nltk
import sklearn
import pandas as pd 
import numpy as np
import re

## 2. Importing the Dataset

In [2]:
# Read the dataset using Pandas
# The dataset is collected from the UCI Machine Learning Repository 
# "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/"
sms = pd.read_csv('SMSSpamCollection', sep='\t', names=["label", "message"])

In [3]:
# print necessary information of the dataset
print (sms.info())
print (sms.head())
print (sms['label'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
label      5572 non-null object
message    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
ham     4825
spam     747
Name: label, dtype: int64


## 3. Preprocessing and cleaning the dataset

In [4]:
# This part consist with removal of punctuation marks, convert to lowercase, tokeninzing, lemmatizing, stopwords removing.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mdutp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
ps = nltk.PorterStemmer()
corpus = []

In [6]:
for i in range (0, len(sms)):
    dataset = re.sub('[^a-zA-Z]', ' ', sms['message'][i])
    dataset = dataset.lower()
    dataset = dataset.split()
    dataset = [ps.stem(word) for word in dataset if not word in stopwords.words('english')]
    dataset = ' '.join(dataset)
    corpus.append(dataset)

In [7]:
from nltk.tokenize import word_tokenize

# The Bag of Words model
word_bag = []

for text in corpus:
    words = word_tokenize(text)
    for j in words:
        word_bag.append(j)
        
word_bag = nltk.FreqDist(word_bag)

In [8]:
print('Total words: {}'.format(len(word_bag)))
print('Common words: {}'.format(word_bag.most_common(50)))

Total words: 6312
Common words: [('u', 1228), ('call', 695), ('go', 462), ('get', 458), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 305), ('ok', 293), ('day', 293), ('free', 288), ('know', 275), ('love', 266), ('like', 261), ('time', 254), ('got', 253), ('want', 248), ('good', 248), ('text', 233), ('send', 214), ('txt', 197), ('need', 190), ('p', 188), ('one', 185), ('today', 181), ('n', 177), ('take', 174), ('see', 173), ('stop', 173), ('r', 171), ('home', 167), ('think', 166), ('repli', 164), ('lor', 162), ('k', 160), ('sorri', 160), ('still', 158), ('tell', 158), ('mobil', 157), ('back', 153), ('da', 152), ('dont', 149), ('make', 148), ('phone', 142), ('week', 141), ('pleas', 141), ('say', 140), ('hi', 140), ('work', 136), ('new', 136)]


## 4. Setup featureset

In [9]:
# As featureset use 2000 most common words and featureset function will return those feature
common_word_feature = list(word_bag.keys())[:2000]
def featureset(txt):
    words = word_tokenize(txt)
    feature = {}
    for word in common_word_feature:
        feature[word] = (word in words)

    return feature


# Convert HAM and SPAM as binary value for the model
from sklearn.preprocessing import LabelEncoder

binary = sms['label']
encode = LabelEncoder()
Y = encode.fit_transform(binary)

print(Y[:15])


[0 0 1 0 0 1 0 0 1 1 0 1 1 0 0]


In [10]:
all_sms = list(zip(corpus, Y))
seed = 1
np.random.seed = seed
np.random.shuffle(all_sms)

featuresets = [(featureset(text), label) for (text, label) in all_sms]

## 5. Split the dataset into training set and testing set

In [11]:
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.30, random_state=seed)

## 6. Use different classification algorithms for the same training and testing datasets

In [None]:
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from nltk.classify.scikitlearn import SklearnClassifier


classification_algorithms = ["SVM Linear", "Naive Bayes", "Decision Tree", "K Nearest Neighbors",  "Random Forest", 
                             "Logistic Regression", "SGD Classifier",]

classifier_set = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(classification_algorithms, classifier_set)

for classification_algorithms, model in models:
    final_model = SklearnClassifier(model)
    final_model.train(training)
    accuracy = nltk.classify.accuracy(final_model, testing)*100
    print("{} Accuracy: {}".format(classification_algorithms, accuracy))

SVM Linear Accuracy: 92.04545454545455
Naive Bayes Accuracy: 96.5909090909091
