In [1]:
import re
import numpy as np
import pandas as pd
from sklearn import svm
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import CountVectorizer

# Read the corpus: each usable line is "<label>\t<message>".
# - `with` guarantees the file is closed even if reading fails part-way.
# - Splitting on the FIRST tab only keeps any further tabs inside the message,
#   so every row has exactly two fields and the 2-D slicing below stays valid.
# - Lines without a tab (e.g. a blank trailing line) cannot be split into two
#   fields and are skipped rather than producing a ragged array.
# The trailing newline of each message is kept, as before.
with open("Email_spam.txt") as f:
    lines = [l.split("\t", 1) for l in f if "\t" in l]
lines = np.array(lines)

# Column 0 is used as the classification target, column 1 as the raw text.
target = lines[:, 0]
text = lines[:, 1]

In [2]:
# Peek at the raw message array; NumPy abbreviates the middle entries with "...".
text

array([ 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n',
       'Ok lar... Joking wif u oni...\n',
       "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n",
       ..., 'Pity, * was in mood for that. So...any other suggestions?\n',
       "The guy did some bitching but I acted like i'd be interested in buying something else next week and he gave it to us for free\n",
       'Rofl. Its true to its name\n'],
      dtype='<U911')

In [3]:
# English stopword list, held as a set for fast membership tests in clean_data.
stops = set(stopwords.words('english'))

def get_simple_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag is treated as
    a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wn_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wn_pos
    # Unrecognised tags fall back to noun.
    return wordnet.NOUN
    
# Single shared lemmatizer instance, reused by clean_data for every token.
lemmatizer = WordNetLemmatizer()

def clean_data(data):
    """Drop stopwords from a token sequence and lemmatize what remains.

    Parameters
    ----------
    data : iterable of str
        Tokens of one message.

    Returns
    -------
    str
        The surviving tokens, lemmatized and joined with single spaces.

    Notes
    -----
    The stopword test is case-insensitive, but each token is handed to the
    tagger and the lemmatizer with its original casing. Every token is tagged
    on its own (as a one-element list), so the tagger sees no sentence context.
    """
    def _lemma(token):
        # pos_tag returns [(token, tag)]; only the tag is needed.
        tag = pos_tag([token])[0][1]
        return lemmatizer.lemmatize(token, pos = get_simple_pos(tag))

    kept = (token for token in data if token.lower() not in stops)
    return " ".join(_lemma(token) for token in kept)

# Delete every character that is not an ASCII letter, digit or space (removed
# characters are dropped, not replaced, so the fragments around them merge),
# then tokenize and clean each message in its original order.
non_alnum = re.compile('[^A-Za-z0-9 ]+')
clean_text = [
    clean_data(word_tokenize(non_alnum.sub('', message)))
    for message in text
]

In [4]:
# Inspect the cleaned messages produced by the preprocessing loop above.
clean_text

['Go jurong point crazy Available bugis n great world la e buffet Cine get amore wat',
 'Ok lar Joking wif u oni',
 'Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 Text FA 87121 receive entry questionstd txt rateTCs apply 08452810075over18s',
 'U dun say early hor U c already say',
 'Nah dont think go usf life around though',
 'FreeMsg Hey darling 3 week word back Id like fun still Tb ok XxX std chgs send 150 rcv',
 'Even brother like speak treat like aid patent',
 'per request Melle Melle Oru Minnaminunginte Nurungu Vettam set callertune Callers Press 9 copy friend Callertune',
 'WINNER value network customer select receivea 900 prize reward claim call 09061701461 Claim code KL341 Valid 12 hour',
 'mobile 11 month U R entitle Update late colour mobile camera Free Call Mobile Update Co FREE 08002986030',
 'Im gon na home soon dont want talk stuff anymore tonight k Ive cry enough today',
 'SIX chance win CASH 100 20000 pound txt CSH11 send 87575 Cost 150pday 6days 16 TsandCs

In [5]:
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(clean_text, target, test_size = 0.2, random_state = 42)

# Bag-of-words counts over unigrams and bigrams, with the vocabulary capped at 4500 terms.
count_vec = CountVectorizer(analyzer = "word", max_features = 4500, ngram_range = (1, 2))

# Learn the vocabulary from the training split only, then reuse it for the test split.
train_transformed = count_vec.fit_transform(x_train)
test_transformed = count_vec.transform(x_test)



In [6]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
clf = MultinomialNB().fit(train_transformed, y_train)

# Mean accuracy on the held-out split; as the last expression it becomes the cell output.
clf.score(test_transformed, y_test)

0.98026905829596411

In [7]:
import pickle

# The context manager closes the file even if pickle.dump raises, so a failed
# dump cannot leave an open handle behind.
# NOTE(review): only the classifier is saved here; the fitted count_vec is also
# needed to transform new text before calling clf.predict — consider persisting it too.
with open("EmailSpam_MNB.pickle", "wb") as save_classifier:
    pickle.dump(clf, save_classifier)