In [1]:
import sklearn
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics



In [41]:
email = load_files('email',shuffle = True,encoding="utf-8")
x = email.data
y = email.target
x_train ,x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=101)

cv = CountVectorizer()

#fit and transform the training data
cv.fit(x_train)
transformed_train = cv.transform(x_train)
print('shape of transformed training',transformed_train.shape)


# transform the testing data
transformed_test = cv.transform(x_test)
print('shape of transformed testing',transformed_test.shape)


shape of transformed training (2314, 53636)
shape of transformed testing (579, 53636)


In [46]:
#Multinomial NaiveBayes
MNB = MultinomialNB()

MNB.fit(transformed_train, y_train)
predMNB = MNB.predict(transformed_test)

print('Accuracy:',metrics.accuracy_score(y_test,predMNB) *100 , '%')
print(metrics.classification_report(y_test, predMNB,target_names=email.target_names))

Accuracy: 99.481865285 %
             precision    recall  f1-score   support

        ham       1.00      1.00      1.00       486
       spam       0.98      0.99      0.98        93

avg / total       0.99      0.99      0.99       579



In [47]:
#K-neighbours Classifier
neigh = KNeighborsClassifier()

neigh.fit(transformed_train, y_train)
predNeigh = neigh.predict(transformed_test)

print('Accuracy:',metrics.accuracy_score(y_test,predNeigh) *100 , '%')
print(metrics.classification_report(y_test, predNeigh,target_names=email.target_names))

Accuracy: 94.4732297064 %
             precision    recall  f1-score   support

        ham       0.97      0.96      0.97       486
       spam       0.80      0.87      0.84        93

avg / total       0.95      0.94      0.95       579



In [49]:
#Random Forest Classifier
RFC = RandomForestClassifier(random_state=0)

RFC.fit(transformed_train, y_train)
predRFC = RFC.predict(transformed_test)

print('Accuracy:',metrics.accuracy_score(y_test,predRFC) *100 , '%')
print(metrics.classification_report(y_test, predRFC,target_names=email.target_names))

Accuracy: 97.0639032815 %
             precision    recall  f1-score   support

        ham       0.97      1.00      0.98       486
       spam       1.00      0.82      0.90        93

avg / total       0.97      0.97      0.97       579



In [50]:
#Features Methods

#Word contains a digit
def contains_digit(s):
    return any(i.isdigit() for i in s)

#The number of sentences in an email.
def numberSentences(s):
    return len(nltk.sent_tokenize(s))

#The number of words found in spam list.
def numberSpam(s, spam):
    count =0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s)
    for w in words:
        if w in spam:
            count +=1            
    return count

#The number of words containing numeric characters.
def numberNumSentences(s):
    numericW =0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s)
    for w in words:
        if (contains_digit(w)):
            numericW +=1
            
    return numericW

#The number of words containing alphabetical characters.
def numberAlphaSentences(s):
    AlphaW =0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s)
    for w in words:
        if not(contains_digit(w)):
            AlphaW +=1            
    return AlphaW

#The number of words containing both numeric and alphabetical characters. 
def numberANSentences(s):
    count =0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s)
    for w in words:
        if re.match('[A-Z0-9]',w):
            count+=1
    return count

#The number of verbs in an email.
def numberVerbs(s):
    verbs =0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s)
    tags = nltk.pos_tag(words)
    for t in tags:
        if t[1] == 'VB':
            verbs+=1
    return verbs

#The number of syllables in a word.
def sylco(word) :
 
    word = word.lower()
    exception_add = ['serious','crucial']
    exception_del = ['fortunately','unfortunately']
    co_one = ['cool','coach','coat','coal','count','coin','coarse','coup','coif','cook','coign','coiffe','coof','court']
    co_two = ['coapt','coed','coinci']
    pre_one = ['preach'] 
    syls = 0 #added syllable number
    disc = 0 #discarded syllable number 

    if len(word) <= 3 :
        syls = 1
        return syls
  
    if word[-2:] == "es" or word[-2:] == "ed" :
        doubleAndtripple_1 = len(re.findall(r'[eaoui][eaoui]',word))
        if doubleAndtripple_1 > 1 or len(re.findall(r'[eaoui][^eaoui]',word)) > 1 :
            if word[-3:] == "ted" or word[-3:] == "tes" or word[-3:] == "ses" or word[-3:] == "ied" or word[-3:] == "ies" :
                pass
            else :
                disc+=1

    le_except = ['whole','mobile','pole','male','female','hale','pale','tale','sale','aisle','whale','while']
 
    if word[-1:] == "e" :
        if word[-2:] == "le" and word not in le_except :
            pass
 
        else :
            disc+=1

    doubleAndtripple = len(re.findall(r'[eaoui][eaoui]',word))
    tripple = len(re.findall(r'[eaoui][eaoui][eaoui]',word))
    disc+=doubleAndtripple + tripple
 
    numVowels = len(re.findall(r'[eaoui]',word))
 
    if word[:2] == "mc" :
        syls+=1
 
    if word[-1:] == "y" and word[-2] not in "aeoui" :
        syls +=1
 
    for i,j in enumerate(word) :
        if j == "y" :
            if (i != 0) and (i != len(word)-1) :
                if word[i-1] not in "aeoui" and word[i+1] not in "aeoui" :
                    syls+=1
  
    if word[:3] == "tri" and word[3] in "aeoui" :
        syls+=1
 
    if word[:2] == "bi" and word[2] in "aeoui" :
        syls+=1
  
    if word[-3:] == "ian" : 
        if word[-4:] == "cian" or word[-4:] == "tian" :
            pass
        else :
            syls+=1
  
    if word[:2] == "co" and word[2] in 'eaoui' :
 
        if word[:4] in co_two or word[:5] in co_two or word[:6] in co_two :
            syls+=1
        elif word[:4] in co_one or word[:5] in co_one or word[:6] in co_one :
            pass
        else :
            syls+=1
  
    if word[:3] == "pre" and word[3] in 'eaoui' :
        if word[:6] in pre_one :
            pass
        else :
            syls+=1 
    negative = ["doesn't", "isn't", "shouldn't", "couldn't","wouldn't"]
 
    if word[-3:] == "n't" :
        if word in negative :
            syls+=1
        else :
            pass  

    return numVowels - disc + syls

#The number of words in an email that have more than 3 syllables. 
def moreThanThree(s):
    count =0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s)
    for w in words:
        if sylco(w) >3:
            count+=1
    return count

#The average number of syllables of words in an email. 
def averageSyllables(s):
    syllables =0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(s)
    for w in words:
            syllables+= sylco(w)
    return syllables/len(words)


with open('spam.txt') as f:
    spam = f.read().splitlines()
        
#print(spam)

f1 = [numberSentences(e) for e in x] # number of sentences of a document.
f2 = [numberVerbs(e) for e in x] # The number of verbs in an email.
f3 = [numberANSentences(e) for e in x] #The number of words containing both numeric and alphabetical characters. 
f4 = [numberSpam(e,spam) for e in x] # The number of words in an email that are found in the spam list.
f5 = [moreThanThree(e) for e in x]  # The number of words in an email that have more than 3 syllables. 
f6 = [averageSyllables(e) for e in x]  # The average number of syllables of words in an email. 

feat_matrix = [[f1[i], f2[i], f3[i],f4[i],f5[i],f6[i]] for i in range(len(x))]
print(len(feat_matrix))
print(len(feat_matrix[1]))
##FOR TESTING, N.B every function tested and working
##par  = "run, kick or push the wall, This is a parag2raph  So1 it is! Ok, there are 3 sentences. ability"
##averageSyllables(par)

2893
6


In [52]:
dividing = int(len(feat_matrix) *0.8)
feat_train = feat_matrix[:dividing]
feat_test = feat_matrix[dividing:]
print('length of featureTrainMatrix' , len(feat_train))
print('length of featureTestMatrix' , len(feat_test))

length of featureTrainMatrix 2314
length of featureTestMatrix 579


In [53]:
#Feeding the feature matrix and labels to NaiveBayes
MNB = MultinomialNB()

MNB.fit(feat_train, y_train)
predMNB = MNB.predict(feat_test)

print('Accuracy:',metrics.accuracy_score(y_test,predMNB) *100 , '%')
print(metrics.classification_report(y_test, predMNB,target_names=email.target_names))

Accuracy: 80.310880829 %
             precision    recall  f1-score   support

        ham       0.84      0.94      0.89       486
       spam       0.23      0.10      0.14        93

avg / total       0.75      0.80      0.77       579



In [54]:
#Feeding the feature matrix and labels to KNeighbour
neigh = KNeighborsClassifier()

neigh.fit(feat_train, y_train)
predNeigh = neigh.predict(feat_test)

print('Accuracy:',metrics.accuracy_score(y_test,predNeigh) *100 , '%')
print(metrics.classification_report(y_test, predNeigh,target_names=email.target_names))

Accuracy: 81.3471502591 %
             precision    recall  f1-score   support

        ham       0.84      0.96      0.90       486
       spam       0.20      0.05      0.08        93

avg / total       0.74      0.81      0.77       579



In [55]:
#Feeding the feature matrix and labels to RandomForest
RFC = RandomForestClassifier(random_state=0)

RFC.fit(feat_train, y_train)
predRFC = RFC.predict(feat_test)

print('Accuracy:',metrics.accuracy_score(y_test,predRFC) *100 , '%')
print(metrics.classification_report(y_test, predRFC,target_names=email.target_names))

Accuracy: 81.6925734024 %
             precision    recall  f1-score   support

        ham       0.84      0.97      0.90       486
       spam       0.07      0.01      0.02        93

avg / total       0.71      0.82      0.76       579

