# Header

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor
from sklearn.linear_model import LinearRegression
import os

# Run the shared project header inside this notebook's namespace.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE(review): exec runs whatever ../header.py contains - keep that file trusted.
with open("../header.py") as header_file:
    exec(header_file.read())

Header initialized


# Import

In [52]:
# Number of distinct authors in the training split.
# NOTE(review): this cell was executed as In [52], i.e. after train_data had
# been loaded by the In [4] cell that follows it. On a fresh
# Restart & Run All it raises NameError here - move it below the load cell.
len(train_data['author'].unique())

39

In [4]:
# Load the train / validation / test splits from the
# 02-train-validation-test-split folder.
# NOTE(review): processed_root is not defined in this notebook; presumably it
# comes from ../header.py and resolves a path under the processed-data
# directory - confirm.
train_data = pd.read_csv(processed_root("02-train-validation-test-split/train_data.csv"))
val_data = pd.read_csv(processed_root("02-train-validation-test-split/val_data.csv"))
test_data = pd.read_csv(processed_root("02-train-validation-test-split/test_data.csv"))

In [5]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,author,title,poetry_foundation_id,content,author_poem_count,author_poem_index,author_poem_pct
0,Percy sshe Shelley,from\n \n Queen Mab: Part VI,45137,"(excerpt)\n""Throughout these infinite orbs of ...",43,0,0.0
1,Thomas Hardy,'According to the Mighty Working',57342,I\n\nWhen moiling seems at cease\nIn the vague...,38,0,0.0
2,Rae Armantrout,Our Nature,54881,The very flatness\nof portraits\nmakes for nos...,62,0,0.0
3,Walt Whitman,For You O Democracy,51567,"Come, I will make the continent indissoluble,\...",41,0,0.0
4,William Butler Yeats,The Magi,12892,Now as at all times I can see in the mind's ey...,47,0,0.0


# Clean

## Bag of words functions

In [8]:
def extract_words_from_text(texts, threshold = 5):
    '''
    Purpose: Helper function for bag_of_words. Builds the vocabulary by
        counting, for every word, how many times it is seen across texts.
    Input:
        texts: iterable of word collections, one per text. When each
            collection is a set (as produced by extract_text) a word is
            counted at most once per text, so the count is the number of
            texts that contain the word.
        threshold: minimum count (inclusive) a word needs in order to be
            kept. Defaults to 5, the value that used to be hard-coded.
    Output: list of words whose count is at least threshold, in the order in
        which they were first encountered
    '''

    word_counts = {}

    for text in texts:
        for word in text:
            word_counts[word] = word_counts.get(word, 0) + 1

    # Filter in a single pass instead of copying the dict and popping from the
    # copy. Dicts keep insertion order, so the order of the result is the same.
    return [word for word, count in word_counts.items() if count >= threshold]

In [9]:
def extract_text(data, text_column, sep = ' '):
    '''
    Purpose: Helper function for bag_of_words
    Input:
        data: DataFrame holding one text per row
        text_column: name of the column that contains the raw text
        sep: delimiter handed to str.split. The default ' ' keeps the
            existing tokenisation, which splits on single spaces only, so two
            words separated by a line break stay glued together as one token.
            Pass sep = None to split on any run of whitespace instead.
    Output: Series (same index as data) of sets of words, one set per text
        (sets don't allow duplicates)
    '''

    # Map over the text column only instead of applying a function row-wise
    # over the whole frame. The result is the same for non-empty data, and an
    # empty frame now yields an empty Series (row-wise apply hands back a
    # DataFrame in that case).
    word_sets = data[text_column].map(lambda text: set(text.split(sep)))

    # Row-wise apply returned an unnamed Series; keep that for callers.
    word_sets.name = None

    return(word_sets)

In [11]:
# Sanity check of the tokenisation on the training split.
# NOTE(review): the output contains tokens with an embedded newline (for
# example "graves," and "And" joined by a line break) because extract_text
# splits on single spaces only.
extract_text(train_data, 'content')

0       {graves,\nAnd, not, pressing, sweeping, minute...
1       {quick-cued, night-time,\nAnd, fear, of, vague...
2       {of, lip, our, flatness\nof, be, recklessness,...
3       {love, indissoluble,\nI, of, will, each, Democ...
4       {stiff,, eye,\nIn, disappear, of, uncontrollab...
                              ...                        
1118    {its, of, though, perhaps, not, thou, be, depa...
1119    {floor,\nAnd, not, bookshelves,\nThe, heavines...
1120    {of, carpets, carved, shady, them—aye,\nClocks...
1121    {they’re, secret, of, guarding, never, virtue,...
1122    {looking-glass, inhabit, fear, of, fresh, her\...
Length: 1123, dtype: object

In [45]:
def bag_of_words(data, word_data = None):
    '''
    Purpose: Turns a dataset of texts into its bag of words representation.
    Input:
        data: Dataset with a 'content' column (the text) and an 'author'
            column
        word_data: optional dataset with a 'content' column whose texts
            define the vocabulary. When None, the vocabulary is built from
            data itself.
    Output: DataFrame with 'poetry_text' as first column, one indicator
        column per vocabulary word, and 'poetry_author' as last column
    '''

    texts = extract_text(data, 'content')

    # The vocabulary comes from word_data when it is given, otherwise from
    # the texts of data itself.
    vocabulary_texts = texts if word_data is None else extract_text(word_data, 'content')
    bag = extract_words_from_text(vocabulary_texts)

    indicators = words_in_texts(bag, texts)

    # new_data gets a fresh 0..n-1 index, so the source rows are renumbered
    # the same way before their columns are copied over.
    rows = data.reset_index(drop = True)

    new_data = pd.DataFrame(data = indicators, columns = bag)
    new_data.insert(0, 'poetry_text', rows['content'])
    new_data['poetry_author'] = rows['author']

    return(new_data)

In [46]:
def words_in_texts(words, texts):
    '''
    Args:
        words (list-like): words to find
        texts (Series or other iterable): sets of words to search in

    Returns:
        NumPy int32 array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words. Entry (i, j) is 1
        when words[j] is contained in texts[i].

        Only considers whole words, not partial, as long as every text is a
        set of words (a plain string would be searched by substring).
    '''
    words = list(words)

    # One row of membership flags per text.
    rows = [[word in text for word in words] for text in texts]

    # The explicit reshape keeps the documented (n, p) shape when there are
    # no words or no texts; without it an empty word list gave shape (0,).
    return np.array(rows, dtype = 'int32').reshape(len(rows), len(words))

## Run bag of words

In [47]:
# The vocabulary always comes from the training split: the first call builds
# it from train_data itself (word_data is None), the other two pass
# word_data = train_data. All three frames therefore share the same word
# columns and no words are learned from the validation or test texts.
bag_train_data = bag_of_words(train_data)
bag_val_data = bag_of_words(val_data, word_data = train_data)
bag_test_data = bag_of_words(test_data, word_data = train_data)

In [48]:
# Report (rows, columns) of every bag-of-words frame; the column counts
# should agree because the three frames share one vocabulary.
for label, frame in (("Bag Train:", bag_train_data),
                     ("Bag Val:", bag_val_data),
                     ("Bag Test:", bag_test_data)):
    print(label, frame.shape)

Bag Train: (1123, 4829)
Bag Val: (317, 4829)
Bag Test: (136, 4829)


# Save dataset

In [50]:
def save_datasets(df_dict, save_folder):
    '''
    Purpose: Writes every dataset in df_dict to save_folder as a CSV file.
    Input:
        df_dict: dict that maps a file name (e.g. 'bow_train_data.csv') to
            the DataFrame to store under that name
        save_folder: directory to write into. It is created, together with
            any missing parent directories, when it does not exist yet.
    Output: None. The files are written without the index column.
    '''

    # makedirs (unlike mkdir) also creates missing parent directories, and
    # exist_ok keeps re-runs from failing on an existing folder. Creating the
    # folder up front removes the duplicated write in a try/except.
    os.makedirs(save_folder, exist_ok = True)

    for file_name, df in df_dict.items():
        df.to_csv(os.path.join(save_folder, file_name), index = False)

In [51]:
# Output file name -> bag-of-words frame, one entry per split.
dfs_to_save = dict(zip(
    ('bow_train_data.csv', 'bow_val_data.csv', 'bow_test_data.csv'),
    (bag_train_data, bag_val_data, bag_test_data),
))

# Write the three frames into the 03-bag-of-words folder.
bow_folder = processed_root("03-bag-of-words")
save_datasets(dfs_to_save, save_folder = bow_folder)