# Header

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor
from sklearn.linear_model import LinearRegression
import os

# Execute the shared header script in this notebook's namespace. The context
# manager closes the file handle as soon as the source text has been read.
# NOTE(review): raw_root / processed_root used further down are not defined in
# this notebook, so they presumably come from header.py — confirm.
with open("../header.py") as header_file:
    exec(header_file.read())

Header initialized


# Import

In [3]:
# Load the raw Poetry Foundation export into a DataFrame.
# raw_root is not defined in this notebook (presumably provided by ../header.py
# to resolve paths inside the raw-data folder — confirm).
raw_data = pd.read_csv(raw_root("01-poetryfoundation/poetry_foundation.csv"))

In [4]:
# Preview the first rows of the raw table.
raw_data.head()

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [5]:
# List the raw column names before any cleaning is applied.
raw_data.columns

Index(['Unnamed: 0', 'Author', 'Title', 'Poetry Foundation ID', 'Content'], dtype='object')

# Clean

In [6]:
# Drop the 'Unnamed: 0' column (it appears to be a saved row index), then let
# pyjanitor's clean_names() normalise the headers
# (e.g. 'Poetry Foundation ID' -> 'poetry_foundation_id').
data = (
    raw_data
    .drop(columns=['Unnamed: 0'])
    .clean_names()
)

In [7]:
# Preview the cleaned table (snake_case column names, index column removed).
data.head()

Unnamed: 0,author,title,poetry_foundation_id,content
0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [8]:
# (rows, columns) of the cleaned table.
data.shape

(15652, 4)

## Clean duplicate authors

In [9]:
# TODO (not implemented in this notebook yet): fuzzy-match every pair of author
# names to catch typos in the author field. For example, hypothetically,
# "Authur Spellamn" and "Author Spellman" should be merged into a single author.

## Threshold

In [16]:
def threshold_authors_by_count(data, count, random_state = None):
    """Keep poems by sufficiently prolific, named authors and rank them randomly.

    Authors with fewer than ``count`` poems, and the author "Anonymous", are
    removed. The remaining rows are shuffled and every poem receives a
    per-author running index, which downstream code can use to cut each
    author's poems into train/validation/test portions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``author`` and ``content``.
    count : int
        Minimum number of poems (rows with non-null ``content``) an author
        needs in order to be kept.
    random_state : int, numpy.random.RandomState or None, default None
        Seed forwarded to ``DataFrame.sample`` for the shuffle. ``None`` leaves
        the shuffle unseeded (the previous behaviour); pass a fixed value to
        make the resulting split reproducible across kernel restarts.

    Returns
    -------
    pandas.DataFrame
        The surviving rows in shuffled order with a fresh 0..n-1 index, plus:

        * ``author_poem_count`` - number of poems the row's author has,
        * ``author_poem_index`` - 0-based position of the poem among that
          author's poems in the shuffled order,
        * ``author_poem_pct``   - ``author_poem_index / author_poem_count``,
          i.e. a value in [0, 1).
    """
    count_by_authors = (
        data
        .groupby('author', as_index = False)
        .agg({'content': 'count'})
        .rename({'content': 'author_poem_count'}, axis = 1)
        .loc[lambda x: x.author_poem_count >= count, :]
        .loc[lambda x: x.author != "Anonymous", :]
    )

    # The inner join both filters to the qualifying authors and attaches their
    # poem counts. sample(frac=1) + reset_index(drop=True) is the pandas-native
    # equivalent of pyjanitor's shuffle() and lets the seed be passed through.
    thresholded_data = (
        data
        .merge(count_by_authors, how = "inner", on = ["author"])
        .sample(frac = 1, random_state = random_state)
        .reset_index(drop = True)
    )

    # Number each author's poems 0, 1, 2, ... in the shuffled order.
    thresholded_data['author_poem_index'] = thresholded_data\
        .groupby('author')\
        .cumcount()

    thresholded_data['author_poem_pct'] = (
        thresholded_data['author_poem_index'] / thresholded_data['author_poem_count']
    )

    return thresholded_data

In [17]:
# Build one thresholded dataset per minimum-poems-per-author cut-off.
data_threshold_30 = threshold_authors_by_count(data = data, count = 30)
data_threshold_40 = threshold_authors_by_count(data = data, count = 40)
data_threshold_50 = threshold_authors_by_count(data = data, count = 50)

In [18]:
# Report (rows, columns) for each cut-off, strictest last.
for thresholded in (data_threshold_30, data_threshold_40, data_threshold_50):
    print(thresholded.shape)

(1576, 7)
(643, 7)
(341, 7)


In [19]:
# Preview the thresholded table, including the per-author count/index/pct columns.
data_threshold_30.head()

Unnamed: 0,author,title,poetry_foundation_id,content,author_poem_count,author_poem_index,author_poem_pct
0,W. S. Di Piero,The One-Year-Old Lemon Tree,46741,Its small celestial reach stops\nwhere the cou...,34,0,0.0
1,Henry Wadsworth Longfellow,The Building of the Ship,44626,"""Build me straight, O worthy Master!\nStanch a...",38,0,0.0
2,W. S. Di Piero,"Walt, the Wounded",46739,"The whole world was there, plucking their line...",34,1,0.029412
3,Ben Jonson,To Heaven,44465,"Good and great God, can I not think of thee\nB...",34,0,0.0
4,Edmund Spenser,An Hymn In Honour Of Beauty,45214,"AH whither, Love, wilt thou now carry me?\nWha...",33,0,0.0


## Train-validation-test split

True

In [39]:
def train_val_test_split(data, splits = (0.7, 0.2, 0.1)):
    """Cut ``data`` into train/validation/test frames by ``author_poem_pct``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``author_poem_pct`` column.
    splits : sequence of float, default (0.7, 0.2, 0.1)
        Train, validation and test fractions, in that order. They must sum
        to 1 (within floating-point tolerance).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_data, val_data, test_data)`` where

        * train holds rows with ``pct <= splits[0]``,
        * validation holds rows with ``splits[0] < pct <= splits[0] + splits[1]``,
        * test holds rows with ``pct > splits[0] + splits[1]``.

        Each frame carries a ``name`` attribute with its target file name
        ('train_data.csv', 'val_data.csv', 'test_data.csv').

    Raises
    ------
    RuntimeError
        If ``splits`` does not sum to 1.
    """
    # Validate the fractions that were actually passed in, not a fixed literal.
    if not np.isclose(sum(splits), 1):
        raise RuntimeError("Splits must sum to 1.")
    train_pt = splits[0]
    val_pt = splits[0] + splits[1]

    train_data = data.loc[lambda x:x.author_poem_pct <= train_pt]
    val_data = data.loc[lambda x:
                        (x.author_poem_pct > train_pt)&
                        (x.author_poem_pct <= val_pt)]
    test_data = data.loc[lambda x:x.author_poem_pct > val_pt]

    # `name` is an ad-hoc Python attribute on the frame (not a column); it is
    # the file name the frame is meant to be written under.
    train_data.name = 'train_data.csv'
    val_data.name = 'val_data.csv'
    test_data.name = 'test_data.csv'

    return train_data, val_data, test_data

In [40]:
# Split each thresholded dataset with the default 70/20/10 proportions.
train_30, val_30, test_30 = train_val_test_split(data = data_threshold_30)
train_40, val_40, test_40 = train_val_test_split(data = data_threshold_40)
train_50, val_50, test_50 = train_val_test_split(data = data_threshold_50)

# Save datasets

In [41]:
def save_datasets(dfs, save_folder):
    """Write each frame in ``dfs`` to ``save_folder`` as a CSV without the index.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Every frame must have a ``name`` attribute holding its file name
        (e.g. 'train_data.csv').
    save_folder : str
        Destination directory. It is created, together with any missing parent
        directories, if it does not exist yet.
    """
    # Create the folder up front instead of reacting to a failed write:
    # os.mkdir cannot create nested missing parents, and the exception type
    # to_csv raises for a missing directory is not guaranteed to be
    # FileNotFoundError.
    os.makedirs(save_folder, exist_ok = True)
    for df in dfs:
        df.to_csv(os.path.join(save_folder, df.name), index = False)

In [42]:
# Persist the threshold-30 splits under the processed-data tree.
save_datasets(
    dfs = [train_30, val_30, test_30],
    save_folder = processed_root("02-train-validation-test-split/threshold-30"),
)

In [43]:
# Persist the threshold-40 splits under the processed-data tree.
save_datasets(
    dfs = [train_40, val_40, test_40],
    save_folder = processed_root("02-train-validation-test-split/threshold-40"),
)

In [44]:
# Persist the threshold-50 splits under the processed-data tree.
save_datasets(
    dfs = [train_50, val_50, test_50],
    save_folder = processed_root("02-train-validation-test-split/threshold-50"),
)