In [27]:
import sys
import os
import pandas as pd

In [28]:
############### QUESTION 1 ###############

def read_text_file(file_path):
    """
    Returns each line of file as a list of strings.

    Lines are returned exactly as read from the file, so each one keeps its
    trailing newline character when it has one.

    Raises FileNotFoundError if file_path is not an existing file.
    """

    # Fail up front with a clear message instead of printing a warning and
    # then letting open() fail on the same path.
    if not os.path.isfile(file_path):
        raise FileNotFoundError("File {} doesn't exist".format(file_path))

    with open(file_path) as fp:
        # Iterating a text file yields one string per line.
        return list(fp)


def train_val_split_sequential(file_path, ratio):
    """
    Splits a file into training and validation data by a specified ratio.

    The split is sequential (no shuffling): the first int(n_lines * ratio)
    lines become the training data and the remaining lines become the
    validation data.
    Train-validation splits are returned as a tuple of two lists of strings.

    Raises ValueError if ratio is not between 0 and 1 (inclusive).
    """

    # A negative ratio would turn train_size into a negative slice index and
    # a ratio above 1 would silently leave the validation set empty, so
    # reject both instead of returning a misleading split.
    if not 0 <= ratio <= 1:
        raise ValueError("ratio must be between 0 and 1, got {}".format(ratio))

    data = read_text_file(file_path)
    train_size = int(len(data) * ratio)

    return (data[:train_size], data[train_size:])

In [29]:
# Sequential split of the spam training file: first 80% of lines for training, rest for validation.
train_data, val_data = train_val_split_sequential(
    file_path='../data/email_spam/spam_train.txt',
    ratio=0.8,
)

Creation of a validation set allows us to tune the hyperparameters of the model to increase the accuracy.
This cannot be done on the test set, because ideally the test set should be used purely to report the final performance of the model. Using the test set during training or tuning biases the model towards it, so the reported performance becomes overly optimistic and the model may not generalize well beyond the given datasets.

In [45]:
############### QUESTION 2 ###############

def build_vocab(data, threshold=0):
    """
    Builds a dictionary mapping each distinct word to its occurrence count.

    Input is a list of strings; every string is tokenized by splitting on
    single space characters and tokens are counted across the whole list.
    When threshold is positive, words counted fewer than threshold times
    are left out of the returned dictionary.
    """

    counts = {}

    for text in data:
        for token in text.split(' '):
            counts[token] = counts.get(token, 0) + 1

    # nothing to prune unless a positive threshold was requested
    if not threshold > 0:
        return counts

    # keep only the words that reach the threshold
    return {token: seen for token, seen in counts.items() if not seen < threshold}


def transform_doc_to_features(doc, vocab_keys):
    """
    Transforms a single text document into a feature vector.
    The vectorizer is a binary vectorizer i.e. the features takes value 1 if the word is
        present in the text and 0 if the word is absent.

    doc is split on single space characters. The returned list has one
    entry per element of vocab_keys, in the same order.
    """

    # Build the set once so each vocabulary lookup is a hash lookup instead
    # of a scan over every word of the document.
    doc_words = set(doc.split(' '))

    return [1 if key in doc_words else 0 for key in vocab_keys]

def transform_text_to_features(data, threshold=30):
    """
    Transforms text data into feature vectors by building a dictionary.
    The vectorizer is a binary vectorizer i.e. the features takes value 1 if the word is
        present in the text and 0 if the word is absent.
    Input is a list of strings.
    threshold is the minimum number of occurrences (across data) a word needs
        to be kept as a feature; it is passed on to build_vocab and defaults to 30.
    Output is a dataframe of word features, one row per input string and one
        column per kept word, with columns in sorted order.

    The vocabulary is built from the same data that is transformed.
    """

    # Sorting gives a column order that does not depend on the order in
    # which words were first encountered.
    vocab_keys = sorted(build_vocab(data, threshold))

    rows = [transform_doc_to_features(doc, vocab_keys) for doc in data]

    return pd.DataFrame(rows, columns=vocab_keys)

In [51]:
# Binary word-presence feature matrix built from the training split.
df = transform_text_to_features(data=train_data)