In [112]:
import sys
import os
import pandas as pd
import numpy as np

In [113]:
# Class labels as they appear in the data files; the first space-separated token
# of every document line is treated as its label by the functions below.
# NOTE(review): presumably '1' = spam and '0' = not spam -- confirm against the dataset.
CLASSES = ['0', '1']

In [114]:
############### QUESTION 1 ###############

def read_text_file(file_path):
    """
    Returns each line of a text file as a list of strings.

    Lines are returned exactly as read, i.e. each one keeps its trailing
    newline character when the file has one.

    Parameters
    ----------
    file_path : str
        Path of the file to read.

    Returns
    -------
    list of str
        One entry per line, in file order.

    Raises
    ------
    FileNotFoundError
        If file_path does not point to an existing regular file.
    """

    # Fail loudly with a clear message instead of printing and then crashing
    # inside open() a few lines later.
    if not os.path.isfile(file_path):
        raise FileNotFoundError("File {} doesn't exist".format(file_path))

    with open(file_path) as fp:
        return fp.readlines()


def train_val_split_sequential(file_path, ratio):
    """
    Reads a file and cuts its lines, in order, into a training part and a
    validation part.

    The first int(number_of_lines * ratio) lines form the training split and
    all remaining lines form the validation split; no shuffling is done.
    Returns a (train_lines, validation_lines) tuple of lists of strings.
    """

    lines = read_text_file(file_path)

    # Index of the first line that belongs to the validation split.
    cut = int(len(lines) * ratio)

    return (lines[:cut], lines[cut:])

In [115]:
# Sequential split: the first 80% of the lines become the training set and the
# remaining lines become the validation set.
train_data, val_data = train_val_split_sequential('../data/email_spam/spam_train.txt', 0.8)

Creating a validation set allows us to tune the hyperparameters of the model to increase its accuracy without touching the test set.
This tuning cannot be done on the test set because, ideally, the test set should be used purely to report the final performance of the model. Using the test set during training or tuning biases the model towards it, so the reported performance becomes optimistic and the model may not generalize well beyond the given datasets.

In [116]:
############### QUESTION 2 ###############

def build_vocab(data, threshold=0):
    """
    Counts how often each token occurs in a list of strings.

    Every string is split on single spaces and each resulting token is
    counted. Tokens equal to one of the class labels in CLASSES are removed
    from the result. When threshold is positive, tokens whose count is below
    it are removed as well.

    Returns a dict mapping token -> number of occurrences.
    """

    counts = {}

    for text in data:
        for token in text.split(' '):
            counts[token] = counts.get(token, 0) + 1

    # the class labels are not vocabulary words
    for label in CLASSES:
        counts.pop(label, None)

    # keep only the tokens that occur at least `threshold` times
    if threshold > 0:
        counts = {token: seen for token, seen in counts.items() if seen >= threshold}

    return counts


def transform_doc_to_features(doc, vocab_keys):
    """
    Transforms a single text document into a feature vector.

    The document is split on single spaces. The first token is taken as the
    class label and the remaining tokens as the words of the document.
    The vectorizer is a binary vectorizer i.e. a feature takes value 1 if the
    word is present in the text and 0 if the word is absent.

    Parameters
    ----------
    doc : str
        One document line of the form "<label> <word> <word> ...".
    vocab_keys : iterable of str
        Vocabulary words, in the order the features should be emitted.

    Returns
    -------
    list
        [label, f_1, ..., f_n] where label is the first token of doc (a
        string) and f_i is 1 if the i-th vocabulary word occurs in doc,
        else 0.
    """

    tokens = doc.split(' ')
    doc_class = tokens[0]

    # A set gives constant-time membership tests; with a list every vocabulary
    # word would trigger a full scan of the document.
    doc_words = set(tokens[1:])

    ret_list = [doc_class]
    ret_list.extend(1 if key in doc_words else 0 for key in vocab_keys)

    return ret_list

def transform_text_to_features(data, master_vocab_keys=None, threshold=30):
    """
    Transforms text data into binary word-presence feature vectors.

    If no vocabulary is supplied, one is built from data with build_vocab,
    keeping only words that occur at least `threshold` times. If a vocabulary
    is supplied, it is used as-is so that another dataset (e.g. a validation
    set) gets the same feature columns.
    The vectorizer is a binary vectorizer i.e. a feature takes value 1 if the
    word is present in the text and 0 if the word is absent.

    Parameters
    ----------
    data : list of str
        Documents, one per entry, each of the form "<label> <word> ...".
    master_vocab_keys : list of str, optional
        Vocabulary words to use as features. When None or empty, the
        vocabulary is built from data.
    threshold : int, optional
        Minimum occurrence count for a word to enter a newly built
        vocabulary. Ignored when master_vocab_keys is supplied.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The dataframe has a '__target__' column holding the label of each
        document followed by one 0/1 column per vocabulary word, in sorted
        order. The list contains the sorted vocabulary words.
    """

    if master_vocab_keys is None or len(master_vocab_keys) == 0:
        vocab_keys = list(build_vocab(data, threshold).keys())
    else:
        # Copy so that sorting below does not reorder the caller's list.
        vocab_keys = list(master_vocab_keys)

    vocab_keys.sort()
    features = ['__target__'] + vocab_keys

    transformed_list = [transform_doc_to_features(doc, vocab_keys) for doc in data]

    return pd.DataFrame(transformed_list, columns=features), vocab_keys

In [117]:
# Build the vocabulary from the training data only and vectorize the training set.
train_df, vocab_keys = transform_text_to_features(train_data)
# Pass the training vocabulary along; the intent is for val_df to use the same
# feature columns as train_df.
val_df, _ = transform_text_to_features(val_data, vocab_keys)

In [118]:
############### QUESTION 3 ###############

def perceptron_train(data, max_iterations=None):
    """
    Trains a perceptron model on the input dataframe.

    The model has no bias term. A row x counts as a mistake when its label is
    0 and np.dot(w, x) >= 0, or when its label is 1 and np.dot(w, x) < 0.
    On a mistake the weights are updated immediately (w - x for label 0,
    w + x for label 1). Rows are visited in dataframe order, and full passes
    over the data are repeated until a pass produces no mistake.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a '__target__' column whose values convert to int 0 or 1;
        every other column is used as a numeric feature.
    max_iterations : int, optional
        Upper bound on the number of passes over the data. The default (None)
        keeps iterating until convergence, which never terminates if the data
        is not linearly separable.

    Returns
    -------
    (numpy.ndarray, int, int)
        The weight vector, the total number of weight updates (mistakes made
        while learning) and the number of passes over the data.

    Raises
    ------
    ValueError
        If max_iterations is given and smaller than 1, or if data has no
        '__target__' column.
    """

    if max_iterations is not None and max_iterations < 1:
        raise ValueError("max_iterations must be a positive integer or None")

    train_features = list(data.columns)
    train_features.remove('__target__')

    # Extract features and labels once; iterating dataframe rows and slicing
    # each one by column name is far slower and yields object-dtype arrays.
    x_matrix = data[train_features].to_numpy(dtype=float)
    labels = [int(label) for label in data['__target__']]

    w = np.zeros(len(train_features))
    num_iterations = 0
    num_updates = 0

    while True:
        error_flag = False

        for x, y in zip(x_matrix, labels):
            activation = np.dot(w, x)

            if y == 0 and activation >= 0:
                w = w - x
                error_flag = True
                num_updates += 1
            elif y == 1 and activation < 0:
                w = w + x
                error_flag = True
                num_updates += 1

        num_iterations += 1

        # Converged: a full pass without a single mistake.
        if not error_flag:
            break
        # Safety stop for data that is not linearly separable.
        if max_iterations is not None and num_iterations >= max_iterations:
            break

    return (w, num_updates, num_iterations)


def perceptron_test(w, data):
    """
    Tests a trained perceptron model with weights w on a dataset.

    A row x is misclassified when its label is 0 and np.dot(w, x) >= 0, or
    when its label is 1 and np.dot(w, x) < 0.

    Parameters
    ----------
    w : array-like
        Weight vector with one entry per feature column of data.
    data : pandas.DataFrame
        Must contain a '__target__' column whose values convert to int 0 or 1;
        every other column is used as a numeric feature.

    Returns
    -------
    float
        The error rate, i.e. the number of misclassified rows divided by the
        total number of rows (a value between 0 and 1, not a count).

    Raises
    ------
    ValueError
        If data has no rows or no '__target__' column.
    """

    num_samples = len(data.index)
    if num_samples == 0:
        raise ValueError("cannot compute the error rate of an empty dataset")

    train_features = list(data.columns)
    train_features.remove('__target__')

    x_matrix = data[train_features].to_numpy(dtype=float)
    labels = np.array([int(label) for label in data['__target__']])

    # One matrix-vector product gives the score of every row; coercing w to
    # float also accepts weight vectors stored with object dtype.
    scores = x_matrix.dot(np.asarray(w, dtype=float))

    misclassified = ((labels == 0) & (scores >= 0)) | ((labels == 1) & (scores < 0))

    return int(np.count_nonzero(misclassified)) / num_samples


In [119]:
############### QUESTION 4 ###############

# Train on the training split; perceptron_train returns the weights, the number
# of weight updates made while learning and the number of passes over the data.
print("learning weights from the Perceptron Algorithm...")
weights, num_mistakes, num_iterations = perceptron_train(train_df)
print("number of mistakes by the Perceptron Algorithm while learning = {}\n".format(num_mistakes))

# NOTE(review): perceptron_test divides the misclassification count by the number
# of rows, so the value printed below is an error rate, not a number of mistakes.
print("checking the error on the training set...")
training_error = perceptron_test(weights, train_df)
print("number of mistakes the training set after learning = {}\n".format(training_error))

# Error rate of the learned weights on the held-out validation split.
print("testing the Perceptron Algorithm on the validation set...")
validation_error = perceptron_test(weights, val_df)
print("error on the validation set = {}\n".format(validation_error))

learning weights from the Perceptron Algorithm...
number of mistakes by the Perceptron Algorithm while learning = 392

checking the error on the training set...
number of mistakes the training set after learning = 0.0

testing the Perceptron Algorithm on the validation set...
error on the validation set = 0.015

