# Header

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor
from sklearn.linear_model import LinearRegression
import os

exec(open("../header.py").read())

Header initialized


# Import

In [2]:
threshold = 40
data_folder = processed_root("02-train-validation-test-split/threshold-"+str(threshold)+"/")

In [3]:
data_folder

'../../data/processed/02-train-validation-test-split/threshold-30/'

In [4]:
train_data = pd.read_csv(data_folder+"train_data.csv")
val_data = pd.read_csv(data_folder+"val_data.csv")
test_data = pd.read_csv(data_folder+"test_data.csv")

# Clean

## Bag of words functions

In [5]:
def extract_words_from_text(texts):
    '''
    Purpose: Helper function for bag_of_words
    Input: texts
    Output: list of words that occur in more than threshold texts
    '''
    
    threshold = 5
    word_counts = {}
    
    for text in texts:
        for word in text:
            if word in word_counts:
                word_counts[word] += 1
            else:
                word_counts[word] = 1 
                
    filtered_word_counts = word_counts.copy()

    for i in word_counts:
        if filtered_word_counts[i] < threshold:
            filtered_word_counts.pop(i)
            
    return list(filtered_word_counts.keys())

In [6]:
def extract_text(data, text_column):
    '''
    Purpose: Helper function for bag_of_words
    Input: Dataset
    Output: array of email sets of words (sets don't allow duplicates)
    '''
    
    return(data.apply(lambda x:set(x[text_column].split(' ')), axis = 1))

In [7]:
extract_text(train_data, 'clean_content')

0       {wind\nand, main\n\nnor, sportively\nglanc, qu...
1       {sun, weary\nno, old, sound\nwhil, merri, dark...
2       {say, mine\nmer, bitter\na, stronger\ni, fully...
3       {cling, shambl, enough, nose\nh, wide, man\nsh...
4       {felt, –\nmen, heaven, aw, behind\nhunt, shudd...
                              ...                        
1118    {“behold, drink, mouth, “thinkest, world glory...
1119    {light\nof, meet\n\nbut, thi, coy, sweet\nus, ...
1120    {commandment”\n\nand, astonish, marveling\ngre...
1121    {be\ntray, true, relat, isn't, alon, pear, mig...
1122    {, genit, true, youth, unreason, hill, visual,...
Length: 1123, dtype: object

In [8]:
def bag_of_words(data, content_column, word_data = None):
    '''
    Purpose: Converts a dataset to bag of words format.
    Input: Dataset
    Output: Bag of words version of the data
    '''
    
    texts = extract_text(data, content_column)
        
    if word_data is None:
        bag = extract_words_from_text(texts)
    else:
        bag = extract_words_from_text(extract_text(word_data, content_column))
    
    word_occurence = words_in_texts(bag, texts)
    
    data = data.reset_index(drop = True)
    
    new_data = pd.DataFrame(data = word_occurence, columns = bag)
    new_data.insert(0, 'poetry_text', data[content_column])
    new_data['poetry_author'] = data['author']
        
    return(new_data)

In [9]:
def words_in_texts(words, texts):
    '''
    Args:
        words (list-like): words to find
        texts (Series): sets of words to search in
    
    Returns:
        NumPy array of 0s and 1s with shape (n, p) where n is the
        number of texts and p is the number of words.
        
        Only considers whole words, not partial.
    '''
    indicator_array = np.array([texts.map(lambda x:word in x) for word in words]).T
    return indicator_array.astype('int32')

# Run bag of words for each threshold

In [13]:
def save_datasets(df_dict, save_folder):
    for i in df_dict:
        try:
            df_dict[i].to_csv(save_folder + "/" + i, index = False)
        except FileNotFoundError:
            os.mkdir(save_folder)
            df_dict[i].to_csv(save_folder + "/" + i, index = False)

In [15]:
def bag_of_words_for_threshold(threshold):
    data_folder = processed_root("02-train-validation-test-split/threshold-"+str(threshold)+"/")

    train_data = pd.read_csv(data_folder+"train_data.csv")
    val_data = pd.read_csv(data_folder+"val_data.csv")
    test_data = pd.read_csv(data_folder+"test_data.csv")

    bag_train_data = bag_of_words(train_data, content_column = 'clean_content')
    bag_val_data = bag_of_words(val_data, 
                                content_column = 'clean_content', 
                                word_data = train_data)
    bag_test_data = bag_of_words(test_data, 
                                 content_column = 'clean_content',
                                 word_data = train_data)
    print("Threshold:", threshold)
    print("Bag Train:", bag_train_data.shape)
    print("Bag Val:", bag_val_data.shape)
    print("Bag Test:", bag_test_data.shape)
    
    dfs_to_save = {'bow_train_data.csv':bag_train_data,
               'bow_val_data.csv':bag_val_data,
               'bow_test_data.csv':bag_test_data}

    save_datasets(dfs_to_save, save_folder = processed_root("03-bag-of-words/threshold-"+str(threshold)))

In [16]:
bag_of_words_for_threshold(30)
bag_of_words_for_threshold(40)
bag_of_words_for_threshold(50)

Threshold: 30
Bag Train: (1123, 4192)
Bag Val: (313, 4192)
Bag Test: (140, 4192)
Threshold: 40
Bag Train: (457, 1899)
Bag Val: (127, 1899)
Bag Test: (59, 1899)
Threshold: 50
Bag Train: (241, 944)
Bag Val: (69, 944)
Bag Test: (31, 944)
