In [6]:
import sys
import numpy as np
import nltk
import sklearn
import pandas as pd

## 1.loading dataset

In [4]:
##load the dataset
import os
os.chdir("F:\\spam filter")

In [7]:
df=pd.read_table("SMSSpamCollection",header=None,encoding='utf-8')

  """Entry point for launching an IPython kernel.


In [9]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None


In [10]:
#checking class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. preprocess the data

In [11]:
#converting labels into binary
from  sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
Y=encoder.fit_transform(classes)

print(classes[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object


In [12]:
#store the messages
text_messages=df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


## 3.regular expression

In [15]:
#use regular expression to replace email,numbers,symbols,urls,other numbers
processed=text_messages.str.replace(r'^,+@[^\.].*\.[a-z]{2,}$','emailaddr')
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\,]+\.[a-zA-Z]{2,3}(/\S*)?$','webaddress')

In [16]:
processed=processed.str.replace(r'£|\$','moneysymb')

In [17]:
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\d]{3}[\s-]?[\d]{4}$','phonenumber')

In [18]:
processed=processed.str.replace(r'^\d+{\.\d+}?','numbr')

In [20]:
#remove punctuation
processed=processed.str.replace(r'[^\w\d\s]',' ')
#replacing double space with single space
processed=processed.str.replace(r'\s+',' ')
#replacing leading and trailing whitespaces
processed=processed.str.replace(r'^\s+|\s+?$','')


In [22]:
#changing words to lowecase hello HELLO Hello all are same words
processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in 2 a wkly comp to win fa cup fina...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
5       freemsg hey there darling it s been 3 week s n...
6       even my brother is not like to speak with me t...
7       as per your request melle melle oru minnaminun...
8       winner as a valued network customer you have b...
9       had your mobile 11 months or more u r entitled...
10      i m gonna be home soon and i don t want to tal...
11      six chances to win cash from 100 to 20 000 pou...
12      urgent you have won a 1 week free membership i...
13      i ve been searching for the right words to tha...
14                      i have a date on sunday with will
15      xxxmobilemovieclub to use your credit click th...
16                                 oh k i m watching here
17      eh u r

In [25]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LOHANI\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [26]:
#stop words removal
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
processed=processed.apply(lambda x:' '.join(term for term in x.split() if term not in stop_words))

In [27]:
#stemming using porter stemmer
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x:' '.join(term for term in x.split()))

In [29]:
print (processed)

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4                  nah think goes usf lives around though
5       freemsg hey darling 3 week word back like fun ...
6          even brother like speak treat like aids patent
7       per request melle melle oru minnaminunginte nu...
8       winner valued network customer selected receiv...
9       mobile 11 months u r entitled update latest co...
10      gonna home soon want talk stuff anymore tonigh...
11      six chances win cash 100 20 000 pounds txt csh...
12      urgent 1 week free membership moneysymb100 000...
13      searching right words thank breather promise w...
14                                            date sunday
15      xxxmobilemovieclub use credit click wap link n...
16                                          oh k watching
17      eh u r

In [31]:
#tokenizing
from nltk.tokenize import word_tokenize
#creating word bag
all_words=[]

for message in processed:
    words=word_tokenize(message)
    for w  in words:
        all_words.append(w)
        
all_words=nltk.FreqDist(all_words)        

In [32]:
print(len(all_words))

8642


In [33]:
print(all_words.most_common(15))

[('u', 1207), ('call', 593), ('2', 519), ('ur', 391), ('get', 391), ('4', 323), ('gt', 318), ('lt', 316), ('ok', 293), ('free', 284), ('go', 283), ('know', 262), ('got', 252), ('like', 247), ('good', 247)]


In [34]:
#use 1500 most common words as features
word_features=list(all_words.keys())[:1500]

In [94]:
#define find features function
def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word] = (word in words)
    
    return features        

In [97]:
features=find_features(processed[0])

In [98]:
features

{'go': True,
 'jurong': True,
 'point': True,
 'crazy': True,
 'available': True,
 'bugis': True,
 'n': True,
 'great': True,
 'world': True,
 'la': True,
 'e': True,
 'buffet': True,
 'cine': True,
 'got': True,
 'amore': True,
 'wat': True,
 'ok': False,
 'lar': False,
 'joking': False,
 'wif': False,
 'u': False,
 'oni': False,
 'free': False,
 'entry': False,
 '2': False,
 'wkly': False,
 'comp': False,
 'win': False,
 'fa': False,
 'cup': False,
 'final': False,
 'tkts': False,
 '21st': False,
 'may': False,
 '2005': False,
 'text': False,
 '87121': False,
 'receive': False,
 'question': False,
 'std': False,
 'txt': False,
 'rate': False,
 'c': False,
 'apply': False,
 '08452810075over18': False,
 'dun': False,
 'say': False,
 'early': False,
 'hor': False,
 'already': False,
 'nah': False,
 'think': False,
 'goes': False,
 'usf': False,
 'lives': False,
 'around': False,
 'though': False,
 'freemsg': False,
 'hey': False,
 'darling': False,
 '3': False,
 'week': False,
 'word': 

In [102]:
#find features for all messages
messages=list(zip(processed,Y))

seed=1
np.random.seed=seed
np.random.shuffle(messages)

featuresets=[(find_features(text),label) for (text,label) in messages]

In [104]:
from sklearn import model_selection
training,testing=model_selection.train_test_split(featuresets,test_size=0.25,random_state=seed)

## 4.scikit learn classifier with nltk

In [106]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [109]:
#define models to train
names=['K Nearest Neighbors','decision tree','random forest','logistic regression','sgd classifier','naive bays','svm linear']
classifier=[KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression(),SGDClassifier(),MultinomialNB(),SVC(kernel='linear')]

models=list(zip(names,classifier))
print(models)

[('K Nearest Neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')), ('decision tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')), ('random forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf

In [110]:
#wrap models in nltk
from nltk.classify.scikitlearn import SklearnClassifier

for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{}:accuracy:{}'.format(name,accuracy))#

91.17013639626705
96.91313711414213




97.48743718592965




97.84637473079684
97.70279971284997
98.34888729361091
97.70279971284997


In [112]:
#build ensemble-voting classifier
from sklearn.ensemble import VotingClassifier

names=['K Nearest Neighbors','decision tree','random forest','logistic regression','sgd classifier','naive bays','svm linear']
classifier=[KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),LogisticRegression(),SGDClassifier(),MultinomialNB(),SVC(kernel='linear')]

models=list(zip(names,classifier))

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100

print('{}:ensemble accuracy:{}'.format(name,accuracy))

svm linear:ensemble accuracy:98.20531227566404


In [113]:
#prediction on test
txt_features,labels=list(zip(*testing))

prediction=nltk_ensemble.classify_many(txt_features)


In [116]:
#confusion matrix
print(classification_report(labels,prediction))
pd.DataFrame(confusion_matrix(labels,prediction),
index=[['actual','actual'],['ham','spam']],
columns=[['predicted','predicted'],['predicted','predicted']])

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1201
           1       0.99      0.88      0.93       192

    accuracy                           0.98      1393
   macro avg       0.98      0.94      0.96      1393
weighted avg       0.98      0.98      0.98      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,predicted,predicted.1
actual,ham,1199,2
actual,spam,23,169
