In [1]:
import sys
import numpy as np
import math
import pickle 
from tqdm import tqdm
import nltk, re, pprint
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance used by every cell that splits a message into tokens.
wpt = WordPunctTokenizer()
# Print arrays in full instead of eliding the middle with "...".
# The threshold must be a number: the string 'nan' is rejected by current
# NumPy releases, and sys.maxsize is the documented "never summarise" value.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
def get_data(file_name):
    """Read a text file and return its contents as a list of lines.

    Args:
        file_name: path of the file to read (opened in text mode with the
            platform's default settings).

    Returns:
        list of str, one entry per line with the line terminators removed
        (the result of ``str.splitlines``).
    """
    # The context manager closes the handle even when read() raises, which the
    # explicit open()/close() pair did not.
    with open(file_name) as myfile:
        mytext = myfile.read()
    return mytext.splitlines()


In [3]:
# Load the training split as a list of raw lines (see get_data); later cells
# treat each line as one labelled message.
mytxt = get_data("SMSSpamCollection.train")


In [4]:
# Load the development split as a list of raw lines (see get_data).
mytxt_dev = get_data("SMSSpamCollection.devel")

In [5]:
# Load the held-out test split as a list of raw lines (see get_data).
mytxt_test = get_data("SMSSpamCollection.test")

In [6]:
# Collect every token that occurs in the training messages (duplicates kept).
vocab_list = []
for line in mytxt:
    # The first token of each line is dropped: message_vectors reads it as the
    # label, so it is not a feature. Slicing (rather than `del token_line[0]`)
    # also tolerates a blank line, which tokenizes to an empty list.
    vocab_list.extend(wpt.tokenize(line)[1:])

# Persist the raw token list; the context manager guarantees the pickle is
# flushed and the handle closed.
with open("vocab_list.p", "wb") as vocab_file:
    pickle.dump(vocab_list, vocab_file)


In [7]:
# Vocabulary = the distinct training tokens; position in this sequence is the
# feature index used by message_vectors and by theta.
# Sorted so the feature order is deterministic: iteration order of a set of
# strings can change between interpreter sessions (hash randomization), which
# would make a pickled theta impossible to line up with its features later.
ult_vocab = sorted(set(vocab_list))


In [8]:
# Initial weights: one zero per vocabulary entry, plus one trailing zero that
# acts as the bias weight.
theta = [0] * (len(ult_vocab) + 1)
theta_array = np.asarray(theta)

In [9]:
#getting message vectors
def message_vectors(data):
    """Convert labelled message lines into bag-of-words count vectors.

    Each line is tokenized with the module-level ``wpt`` tokenizer. The first
    token is read as the label; the remaining tokens are counted against the
    module-level vocabulary ``ult_vocab``.

    Every returned array has the layout::

        [label, count(vocab_0), ..., count(vocab_n-1), 1]

    where ``label`` is 1 when the first token equals "spam" and 0 otherwise,
    and the trailing constant 1 is the input paired with the bias weight.

    Args:
        data: iterable of message lines (str).

    Returns:
        list of 1-D numpy arrays, one per line that yields at least one token.
        Lines that tokenize to nothing (e.g. blank lines) carry no label and
        are skipped instead of raising IndexError.
    """
    from collections import Counter

    list_of_message_vectors = []
    for line in tqdm(data):
        token_line = wpt.tokenize(line)
        if not token_line:
            # No tokens means no label, so there is nothing to build.
            continue

        label = 1 if token_line[0] == "spam" else 0

        # Count the message's tokens once; the previous `in` + `.count()` pair
        # rescanned the token list twice for every vocabulary word.
        token_counts = Counter(token_line[1:])

        message_vector = [label]
        # Counter returns 0 for words that do not occur in the message.
        message_vector.extend(token_counts[item] for item in ult_vocab)
        message_vector.append(1) #adding "fake feature" to counter bias
        list_of_message_vectors.append(np.asarray(message_vector))
    return list_of_message_vectors

In [10]:
# Build the training feature vectors and display how many were produced.
# (A bare `list_of_message_vectors[5]` used to sit between these two lines; its
# value was discarded because only a cell's last expression is displayed, and
# it raised IndexError on datasets with fewer than six messages.)
list_of_message_vectors = message_vectors(mytxt)
len(list_of_message_vectors)

100%|██████████| 3345/3345 [00:13<00:00, 242.88it/s]


3345

In [11]:
# Build feature vectors for the development split, using the same
# training-derived vocabulary (message_vectors reads the global ult_vocab).
list_of_message_vectors_dev = message_vectors(mytxt_dev)


100%|██████████| 1115/1115 [00:04<00:00, 240.22it/s]


In [12]:
# Build feature vectors for the test split, using the same training-derived
# vocabulary (message_vectors reads the global ult_vocab).
list_of_message_vectors_test = message_vectors(mytxt_test)

100%|██████████| 1114/1114 [00:04<00:00, 251.49it/s]


In [13]:
def train(message_vectors, theta_array, learning_rate, mini_batch, epoch):
    """Fit logistic-regression weights with mini-batch gradient ascent.

    Args:
        message_vectors: sequence of 1-D arrays laid out as
            ``[label, feature_0, ..., feature_n]``; element 0 is the 0/1 label
            and the rest is the feature vector (including any bias input).
        theta_array: initial weights, same length as one feature vector.
            It is not modified; each update produces a new array.
        learning_rate: step size applied to every example's gradient.
        mini_batch: number of examples averaged into one weight update.
        epoch: number of passes over the data.

    Returns:
        list of weight arrays, one per mini-batch update in chronological
        order, so ``result[-1]`` holds the final weights. The list is empty
        when there is no data or ``epoch`` is 0.

    Notes:
        * Every example is used once per epoch. The earlier loop bound
          ``len(message_vectors) - mini_batch`` skipped the last full batch
          (and any remainder); a trailing partial batch is now averaged over
          its actual size.
        * The visiting order is randomised through an index permutation, so
          the caller's ``message_vectors`` is no longer reordered in place.
    """
    list_of_thetas = []
    n_examples = len(message_vectors)
    for _ in tqdm(range(epoch)):
        # Fresh random visiting order each epoch, without touching the input.
        order = np.random.permutation(n_examples)
        for start in range(0, n_examples, mini_batch):
            batch = order[start:start + mini_batch]
            adjust = 0
            for idx in batch:
                example = message_vectors[idx]
                y = float(example[0])
                features = example[1:]
                z = np.dot(theta_array, features)
                # Numerically stable logistic function: only ever exponentiate
                # a non-positive number, so math.exp cannot overflow.
                if z >= 0:
                    sigmoid = 1.0 / (1.0 + math.exp(-z))
                else:
                    exp_z = math.exp(z)
                    sigmoid = exp_z / (1.0 + exp_z)
                adjust = adjust + learning_rate * (y - sigmoid) * features
            # Average the accumulated step over the examples actually seen.
            theta_array = theta_array + adjust / len(batch)
            list_of_thetas.append(theta_array)
    return list_of_thetas

In [14]:
# Train on the training vectors: learning rate 0.5, mini-batches of 5, 20 epochs.
list_of_thetas = train(list_of_message_vectors, theta_array, 0.5, 5, 20)

# train returns the weights after every update; the last entry is the final model.
model = list_of_thetas[-1]
# Save the model; the context manager guarantees the pickle is flushed and the
# file handle closed.
with open("theta_array.p", "wb") as model_file:
    pickle.dump(model, model_file)


100%|██████████| 20/20 [00:03<00:00,  5.39it/s]


In [15]:
def testing(message_vectors, theta_array, learning_rate):
    accuracy = []
    correct_list = []
    wrong_list = []
    for message in message_vectors:
        message_vector = message[1:]
        z = np.dot(theta_array, message_vector)
        sigmoid = 1 / (1 + math.exp(-z))
        y = message[0]
        if sigmoid >= 0.5:
            decision = 1
        else:
            decision =  0
        if decision == y:
            correct_list.append (1)
        else:
            wrong_list.append(0)

    accuracy = float(len(correct_list))/ float(len(message_vectors))
    return accuracy

In [16]:
# Score the final weights on the test vectors. The third argument fills
# testing's learning_rate parameter, which that function does not use.
accuracy = testing(list_of_message_vectors_test, model, 0.5)
# Last expression of the cell, so the notebook displays the value.
accuracy

0.9856373429084381