## 1.Loading the Dataset!!!

In [2]:
import sys
import nltk
import sklearn
import numpy 
import pandas 

print("Python: {}".format(sys.version))
print("NLTK: {}".format(nltk.__version__))
print("Scikit-learn: {}".format(sklearn.__version__))
print("Numpy :{}".format(numpy.__version__))
print("Pandas: {}".format(pandas.__version__))

Python: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) 
[GCC 7.3.0]
NLTK: 3.4
Scikit-learn: 0.20.3
Numpy :1.15.4
Pandas: 0.24.1


In [3]:
#load the sms messagges dataset
import numpy as np
import pandas as pd
df=pd.read_csv("SMSSpamCollection", header=None, encoding='utf-8',sep='\t')  #we dont want to give a name so header is none and since it is a text file encoding='utf-8'



In [4]:
#lets see some useful info
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [5]:
#let's check class distributions
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Pre-Process the Data

In [6]:
#let's convert the class labels to binary values, 0 for ham and 1 for spam
from sklearn.preprocessing import LabelEncoder   #labelencoder will do above task

encoder = LabelEncoder()  #making instance

Y=encoder.fit_transform(classes)

#let's check 
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [7]:
#store the SMS msg data

text_messages= df[1]    #since 2nd col contain msgs
print(text_messages[:10])  #let's print first 10 of them

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [8]:
#now we have to use regular exp to replace email addr,url,phone no.,other num,symbols
#we can find more reg exp online

#replace email address to 'emailaddr'
processed= text_messages.str.replace(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$','emaladdr')

#replace urls with 'webaddress'
processed= processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

#replace money symbols with 'moneysymb'
processed=processed.str.replace(r'£/\$','moneysymb')

#replace 10 digit phone number with 'phonenumbr'
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phonenumbr')

#replace normal numbers with 'numbr'
processed=processed.str.replace(r'\d+(\.\d+)?','numbr')
                                    




In [9]:
#replace punctuations
processed=processed.str.replace(r'[^\w\d\s]',' ')

#replace whitespaces between terms with single space
processed=processed.str.replace(r'\s+',' ')

#replace leading and trailing whitespaces
processed=processed.str.replace(r'^\s+/\s+?$','')




In [10]:
#now let's change all words to lowercase
processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                ok lar joking wif u oni 
2       free entry in numbr a wkly comp to win fa cup ...
3            u dun say so early hor u c already then say 
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been numbr week...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile numbr months or more u r entit...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from numbr to numbr nu...
12      urgent you have won a numbr week free membersh...
13      i ve been searching for the right words to tha...
14                     i have a date on sunday with will 
15      xxxmobilemovieclub to use your credit click th...
16                                oh k i m watching here 
17      eh u r

In [11]:
#remove stop words from text msgs
from nltk.corpus import stopwords

stop_words =set(stopwords.words('english'))

processed=processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))  #x is above msgs




In [12]:
#remove word stem using Porter stemmer ....porter is a type of stemming technique
ps=nltk.PorterStemmer()

processed=processed.apply(lambda x:' '.join(ps.stem(term) for term in x.split()))


In [13]:
print(processed)

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri numbr wkli comp win fa cup final tk...
3                     u dun say earli hor u c alreadi say
4                    nah think goe usf live around though
5       freemsg hey darl numbr week word back like fun...
6           even brother like speak treat like aid patent
7       per request mell mell oru minnaminungint nurun...
8       winner valu network custom select receivea num...
9       mobil numbr month u r entitl updat latest colo...
10      gonna home soon want talk stuff anymor tonight...
11      six chanc win cash numbr numbr numbr pound txt...
12      urgent numbr week free membership numbr numbr ...
13      search right word thank breather promis wont t...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                             oh k watch
17      eh u r

In [14]:
#now we have to generate feature using feature engg.
#feature engg. is the process to use domain knowledge of data to create features

from nltk.tokenize import word_tokenize

#creating a bag of words model .....i.e. simply extracting all the words as tokens
all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_words.append(w)
        
all_words=nltk.FreqDist(all_words)


In [51]:
#print total no. of words and most common 15 words
print("Total num of words: {}".format(len(all_words)))
print("Most common words: {}".format(all_words.most_common(15)))

Total num of words: 6569
Most common words: [('numbr', 2975), ('u', 1207), ('call', 679), ('go', 456), ('get', 452), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266), ('like', 261)]


In [52]:
#use the 3000 most common words as features....we can take 1500 or more words
word_features=list(all_words.keys())[:3000]   


In [53]:
#define find features function
def find_features(message):
    words=word_tokenize(message)
    features = {}
    for word in word_features:
        features[word]=(word in words)
        
    return features

#let's see some results
features=find_features(processed[0])
for key,value in features.items():
    if value==True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat


In [54]:
#find features for all messages
messages= list(zip(processed,Y))

#define a seed for reproducibility
seed=1
np.random.seed=seed
np.random.shuffle(messages)

#call find_features function for each sms message
featuresets= [(find_features(text),label) for(text,label) in messages]

In [55]:
#let's split the data now using scikit-learn
from sklearn import model_selection

training,testing=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

In [56]:
print("Training: {}".format(len(training)))
print("Testing: {}".format(len(testing)))


Training: 4179
Testing: 1393


## 4. Using Scikit-learn Classifiers

In [57]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [58]:
#Define models to train
names=['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression','SGDClassifier','Naive-Bayes','SVM Linear']

classifiers={
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(solver='lbfgs',tol=0.0001),
    SGDClassifier(max_iter=100,tol=1e-3),
    MultinomialNB(),
    SVC(kernel='linear')
}

models=list(zip(names,classifiers))

#print(models)


In [59]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{} :Accuracy:{}'.format(name,accuracy))

#result may be different every iteration

K Nearest Neighbors :Accuracy:97.34386216798278
Decision Tree :Accuracy:98.27709978463747
Random Forest :Accuracy:91.95979899497488
Logistic Regression :Accuracy:97.91816223977028
SGDClassifier :Accuracy:98.06173725771716
Naive-Bayes :Accuracy:98.27709978463747
SVM Linear :Accuracy:97.91816223977028


## ensemble method - Voting classifier...
E.g., if the prediction for a given sample is

classifier 2 -> class 1

classifier 3 -> class 2

the VotingClassifier   (with voting="hard") 

would classify the sample as--> “class 1” 

based on the majority class label.
In the cases of a tie, the VotingClassifier will select the class based on the ascending sort order. E.g., in the following scenario

classifier 1 -> class 2

classifier 2 -> class 1

-->the class label 1 will be assigned to the sample.

-->Here class 1,2 are spam,ham...

In contrast to majority voting (hard voting), 
-->soft voting" returns the class label as argmax of the sum of predicted probabilities.

The weighted average probabilities for a sample would then be calculated as follows:

classifier ------class 1-------class 2---------class 3

classifier 1 --- w1 * 0.2------w1 * 0.5 -------w1 * 0.3

classifier 2 --- w2 * 0.6 -----w2 * 0.3 -------w2 * 0.1

classifier 3 ----w3 * 0.3 -----w3 * 0.4 -------w3 * 0.3

weighted avg ---- 0.37   -------0.4  ------------0.23

-->Here, the predicted class label is 2, since it has the highest average probability.

If your algorithms are optimized then go for soft

In [60]:
from sklearn.ensemble import VotingClassifier

#Define models to train
names=['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression','SGDClassifier','Naive-Bayes','SVM Linear']

classifiers={
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=100),
    LogisticRegression(solver='lbfgs',tol=0.0001),
    SGDClassifier(max_iter=100,tol=1e-3),
    MultinomialNB(),
    SVC(kernel='linear')
}

models=list(zip(names,classifiers))

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=None))    #n_jobs--no. of cores to be used of cpu or else -1 means all
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100
print('Ensemble Method Accuracy:{}'.format(accuracy))



Ensemble Method Accuracy:98.06173725771716


In [61]:
#let's make some class label predictions for testing set
txt_features,labels=zip(*testing)

prediction=nltk_ensemble.classify_many(txt_features)

In [62]:
#print a confusion matrix and classification report
print(classification_report(labels,prediction))

pd.DataFrame(
        confusion_matrix(labels,prediction),
        index=[['actual','actual'],['ham','spam']],
        columns=[['predicted','predicted'],['ham','spam']]
            )

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1214
           1       1.00      0.85      0.92       179

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1214,0
actual,spam,27,152
