# Header

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor
from sklearn.linear_model import LinearRegression
import os

# Run the shared project header in this notebook's namespace (presumably the
# source of helpers such as processed_root used below -- confirm in header.py).
# NOTE(review): exec runs whatever the file contains; keep header.py trusted.
# The context manager closes the file handle instead of leaking it.
with open("../header.py") as header_file:
    exec(header_file.read())

Header initialized


# Import

In [9]:
# The threshold value picks which pre-split dataset folder is read.
threshold = 30
data_folder = processed_root(f"02-train-validation-test-split/threshold-{threshold}/")

In [10]:
# Display the resolved input folder as a sanity check on the path.
data_folder

'../../data/processed/02-train-validation-test-split/threshold-30/'

In [11]:
# Load the train / validation / test splits from the selected folder.
train_data, val_data, test_data = (
    pd.read_csv(data_folder + file_name)
    for file_name in ("train_data.csv", "val_data.csv", "test_data.csv")
)

# Clean

## Bag of words functions

In [70]:
def extract_words_from_text(texts, threshold = 5):
    '''
    Purpose: Helper function for bag_of_words; builds the vocabulary.
    Input: texts - iterable of word collections, one per text. A word is
                   counted once each time it is yielded while iterating a
                   text, so passing sets gives a per-text document frequency.
           threshold - minimum count a word needs in order to be kept
                       (default 5)
    Output: list of words whose count is at least threshold, ordered by
            first appearance
    '''

    word_counts = {}

    for text in texts:
        for word in text:
            word_counts[word] = word_counts.get(word, 0) + 1

    # Filter in a single pass instead of copying the dict and popping from it.
    return [word for word, count in word_counts.items() if count >= threshold]

In [71]:
def extract_text(data, text_column):
    '''
    Purpose: Helper function for bag_of_words
    Input: data - DataFrame holding the texts
           text_column - name of the column that contains each text as a string
    Output: Series (same index as data) holding one set of tokens per row
            (sets don't allow duplicates)

    Tokens come from splitting on a single space only, so newlines and
    punctuation stay attached to the neighbouring word.
    '''

    # Build the Series straight from the column rather than through a row-wise
    # DataFrame.apply: apply(axis = 1) hands back a DataFrame instead of a
    # Series when data has no rows.
    token_sets = [set(text.split(' ')) for text in data[text_column]]
    return pd.Series(token_sets, index = data.index, dtype = object)

In [72]:
# Preview the per-row token sets extracted from the training texts.
extract_text(train_data, 'content')

0      {features\ntoward, Grand, land,\nremand, heart...
1      {of, deep, birds,, vermilion, sweet, play., th...
2      {waters.\nThe, of, not, way, where\nthis, alwa...
3      {my, Hock, thirst, Lip,\nNor, beg,, not, this,...
4      {Wings, noted, –\n\nSome, of, Church,\nOur, ne...
                             ...                        
452    {of, espionage?, be, destiny., happened, bride...
453    {of, though, not, when, thou, stay,\nAnd, hath...
454    {of, woman’s, born, them, upon, whom, what, di...
455    {of, that,, lovely, might, way, be, Dr., econo...
456    {killing, sacrilege,, of, that,, say'st, honor...
Length: 457, dtype: object

In [73]:
def bag_of_words(data, word_data = None):
    '''
    Purpose: Converts a dataset to bag of words format.
    Input: data - dataset with 'content' and 'author' columns
           word_data - optional dataset whose 'content' column supplies the
                       vocabulary; when omitted the vocabulary is taken from
                       data itself
    Output: DataFrame with 'poetry_text' first, then one 0/1 column per
            vocabulary word, then 'poetry_author'
    '''

    texts = extract_text(data, 'content')

    # Choose which token sets define the vocabulary.
    vocabulary_texts = texts if word_data is None else extract_text(word_data, 'content')
    bag = extract_words_from_text(vocabulary_texts)

    indicators = words_in_texts(bag, texts)

    # Use a fresh 0..n-1 index so the source columns line up with the
    # rows of the indicator matrix.
    source = data.reset_index(drop = True)

    result = pd.DataFrame(indicators, columns = bag)
    result.insert(0, 'poetry_text', source['content'])
    result['poetry_author'] = source['author']

    return result

In [74]:
def words_in_texts(words, texts):
    '''
    Args:
        words (list-like): words to find
        texts (Series): sets of words to search in

    Returns:
        NumPy int32 array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words. Entry [i, j] is 1
        when words[j] is a member of the i-th text.

        Only considers whole words, not partial (given that each text is
        a set of tokens).
    '''
    words = list(words)
    texts = list(texts)

    membership = [[word in text for word in words] for text in texts]

    # The explicit reshape keeps the documented (n, p) shape even when
    # there are no words or no texts.
    return np.array(membership, dtype = 'int32').reshape(len(texts), len(words))

## Run bag of words

In [75]:
# The vocabulary always comes from the training split, so validation and
# test get the same word columns as train.
bag_train_data = bag_of_words(train_data)
bag_val_data, bag_test_data = (
    bag_of_words(split, word_data = train_data)
    for split in (val_data, test_data)
)

In [76]:
# Report (rows, columns) of each bag-of-words table.
print(f"Bag Train: {bag_train_data.shape}")
print(f"Bag Val: {bag_val_data.shape}")
print(f"Bag Test: {bag_test_data.shape}")

Bag Train: (457, 1810)
Bag Val: (128, 1810)
Bag Test: (58, 1810)


# Save dataset

In [77]:
def save_datasets(df_dict, save_folder):
    '''
    Purpose: Writes every DataFrame in df_dict to a CSV file in save_folder.
    Input: df_dict - dict mapping file name (e.g. 'x.csv') to DataFrame
           save_folder - destination directory; created, including any
                         missing parent directories, if it does not exist
    Output: None (files are written without the index column)
    '''
    # Create the folder up front instead of waiting for to_csv to fail:
    # recent pandas raises a plain OSError (not FileNotFoundError) for a
    # missing parent directory, and os.mkdir cannot create nested folders.
    os.makedirs(save_folder, exist_ok = True)

    for file_name, df in df_dict.items():
        df.to_csv(os.path.join(save_folder, file_name), index = False)

In [78]:
# Output file name -> bag-of-words table to write under that name.
dfs_to_save = {
    'bow_train_data.csv': bag_train_data,
    'bow_val_data.csv': bag_val_data,
    'bow_test_data.csv': bag_test_data,
}

save_datasets(dfs_to_save, processed_root("03-bag-of-words"))