In [1]:
import sys
import numpy as np
import math
import pickle 
from tqdm import tqdm
import nltk, re, pprint
from nltk.tokenize import WordPunctTokenizer
wpt = WordPunctTokenizer()
np.set_printoptions(threshold='nan')

In [2]:
def get_data(file_name):
    myfile = open(file_name)
    mytext = myfile.read()
    mytxt = mytext.splitlines()
    myfile.close()
    return mytxt


In [3]:
#get training data
mytxt = get_data("SMSSpamCollection.train")


In [4]:
#get dev data
mytxt_dev = get_data("SMSSpamCollection.devel")

In [5]:
#get test data
mytxt_test = get_data("SMSSpamCollection.test")

In [6]:
#all the words in training
vocab_list = []
for line in mytxt:
    token_line = wpt.tokenize(line) 
    del token_line[0]
    for token in token_line:
        vocab_list.append(token)

pickle.dump(vocab_list, open ("vocab_list.p", "wb"))


In [7]:
#getting the vocab list (features)
ult_vocab = set(vocab_list)


In [8]:
#getting the original theta vector

theta = []
for i in range(0, len(ult_vocab)):
	theta.append(0)
theta.append(0) #add bias
theta_array = np.asarray(theta)

In [9]:
#getting message vectors
def message_vectors(data):
    list_of_message_vectors = []
    for line in tqdm(data):
        message_vector = []
        token_line = wpt.tokenize(line) 
        if token_line[0] == "spam":
            message_vector.append(1)
        else:
            message_vector.append(0)
        
        hamless = token_line[1:]
        for item in ult_vocab:
            if item in hamless:
                message_vector.append(hamless.count(item))
            else:
                message_vector.append(0)
        message_vector.append(1) #adding "fake feature" to counter bias
        message_vector_array = np.asarray(message_vector)
        list_of_message_vectors.append(message_vector_array)
    return list_of_message_vectors

In [10]:
#getting the training data message vectors
list_of_message_vectors = message_vectors(mytxt)
list_of_message_vectors[5]
len(list_of_message_vectors)

100%|██████████| 3345/3345 [00:13<00:00, 245.53it/s]


3345

In [11]:
#getting the devel data message vectors
list_of_message_vectors_dev = message_vectors(mytxt_dev)


100%|██████████| 1115/1115 [00:04<00:00, 254.87it/s]


In [29]:
def train_tune(train_set, dev_set, theta_array, epochs, learning_rate):
    list_of_thetas = []
    accuracy_train = []
    key_list = []
    value_list = []
    for mini_batch in tqdm(range(5, 25, 5)):
        for epoch in tqdm(range(epochs)):

            correct_list = []
            wrong_list = []
            np.random.shuffle(train_set)

            for i in range(0, len(train_set) - mini_batch, mini_batch):
                adjust = 0
                for j in range(0, mini_batch):
                    y = float(train_set[i + j][0])
                    message_vector = train_set[i + j][1:]


                    z = np.dot(theta_array, message_vector)
                    sigmoid = 1 / (1 + math.exp(-z))
                    adjust = adjust + learning_rate*(y - sigmoid)*message_vector	
                    if sigmoid >= 0.5:
                        decision = 1
                    else:
                        decision =  0
                    if decision == y:
                        correct_list.append (1)
                    else:
                        wrong_list.append(0) 


                adjust = adjust/mini_batch
                theta_array = theta_array + adjust
                list_of_thetas.append(theta_array)

            accuracy_train = float(len(correct_list))/ float(len(train_set))

            accuracy_dev = []


            correct_list_dev = []
            wrong_list_dev = []
            for message in dev_set:
                message_vector = message[1:]
                z = np.dot(theta_array, message_vector)
                sigmoid = 1 / (1 + math.exp(-z))
                y = message[0]
                if sigmoid >= 0.5:
                    decision = 1
                else:
                    decision =  0
                if decision == y:
                    correct_list_dev.append (1)
                else:
                    wrong_list_dev.append(0)

            accuracy_dev = 100 * float(len(correct_list_dev))/ float(len(dev_set))
            value_list.append(accuracy_dev)
            key = str(epoch) + " " + str (mini_batch) + " " + str(learning_rate)
            key_list.append(key)

    
    dictionary = dict(zip(key_list, value_list))
    return dictionary, key_list 


In [32]:
dictionary, key_list = train_tune(list_of_message_vectors, list_of_message_vectors_dev, theta_array, 50, 0.5)

#printing epoch, mini-batch size, learning rate, and accuracy with accuracies over 98%
for key in key_list:
    if float(dictionary[key]) > 98.0:
        print key, dictionary[key]
    else:
        continue

  0%|          | 0/4 [00:00<?, ?it/s]
  0%|          | 0/50 [00:00<?, ?it/s][A
  2%|▏         | 1/50 [00:00<00:10,  4.83it/s][A
  4%|▍         | 2/50 [00:00<00:09,  4.95it/s][A
  6%|▌         | 3/50 [00:00<00:09,  5.02it/s][A
  8%|▊         | 4/50 [00:00<00:09,  5.06it/s][A
 10%|█         | 5/50 [00:00<00:08,  5.08it/s][A
 12%|█▏        | 6/50 [00:01<00:08,  5.10it/s][A
 14%|█▍        | 7/50 [00:01<00:08,  5.12it/s][A
 16%|█▌        | 8/50 [00:01<00:08,  5.12it/s][A
 18%|█▊        | 9/50 [00:01<00:08,  5.12it/s][A
 20%|██        | 10/50 [00:01<00:07,  5.13it/s][A
 22%|██▏       | 11/50 [00:02<00:07,  5.13it/s][A
 24%|██▍       | 12/50 [00:02<00:07,  5.13it/s][A
 26%|██▌       | 13/50 [00:02<00:07,  5.14it/s][A
 28%|██▊       | 14/50 [00:02<00:07,  5.14it/s][A
 30%|███       | 15/50 [00:02<00:06,  5.15it/s][A
 32%|███▏      | 16/50 [00:03<00:06,  5.14it/s][A
 34%|███▍      | 17/50 [00:03<00:06,  5.14it/s][A
 36%|███▌      | 18/50 [00:03<00:06,  5.13it/s][A
 38%|███▊  

 10%|█         | 5/50 [00:01<00:15,  2.85it/s][A
 12%|█▏        | 6/50 [00:02<00:14,  2.95it/s][A
 14%|█▍        | 7/50 [00:02<00:14,  3.05it/s][A
 16%|█▌        | 8/50 [00:02<00:13,  3.11it/s][A
 18%|█▊        | 9/50 [00:02<00:12,  3.17it/s][A
 20%|██        | 10/50 [00:03<00:13,  3.01it/s][A
 22%|██▏       | 11/50 [00:03<00:12,  3.01it/s][A
 24%|██▍       | 12/50 [00:03<00:12,  3.05it/s][A
 26%|██▌       | 13/50 [00:04<00:11,  3.09it/s][A
 28%|██▊       | 14/50 [00:04<00:11,  3.03it/s][A
 30%|███       | 15/50 [00:04<00:11,  3.08it/s][A
 32%|███▏      | 16/50 [00:05<00:10,  3.11it/s][A
 34%|███▍      | 17/50 [00:05<00:10,  3.14it/s][A
 36%|███▌      | 18/50 [00:05<00:10,  3.11it/s][A
 38%|███▊      | 19/50 [00:06<00:10,  3.07it/s][A
 40%|████      | 20/50 [00:06<00:09,  3.10it/s][A
 42%|████▏     | 21/50 [00:06<00:09,  3.12it/s][A
 44%|████▍     | 22/50 [00:06<00:08,  3.15it/s][A
 46%|████▌     | 23/50 [00:07<00:08,  3.10it/s][A
 48%|████▊     | 24/50 [00:07<00:08,

7 5 0.5 98.0269058296
12 5 0.5 98.0269058296



