# Header

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor
from sklearn.linear_model import LinearRegression
import os

# Execute the shared header script in this notebook's namespace. Helpers such
# as raw_root / processed_root used further down are presumably defined there
# — TODO confirm. The context manager closes the file handle once the source
# has been read instead of leaving that to the garbage collector.
with open("../header.py") as header_file:
    exec(header_file.read())

Header initialized


# Import

In [3]:
# Load the raw Poetry Foundation CSV. `raw_root` is not defined in this
# notebook; presumably it comes from ../header.py and resolves a path under
# the raw-data directory — TODO confirm.
raw_data = pd.read_csv(raw_root("01-poetryfoundation/poetry_foundation.csv"))

In [4]:
# Preview the first rows to eyeball the raw columns and text formatting.
raw_data.head()

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [5]:
# List the raw column names before they are dropped/renamed in the Clean step.
raw_data.columns

Index(['Unnamed: 0', 'Author', 'Title', 'Poetry Foundation ID', 'Content'], dtype='object')

# Clean

In [6]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV) and
# let pyjanitor's clean_names() rewrite the remaining headers to snake_case.
data = (
    raw_data
    .drop(columns=['Unnamed: 0'])
    .clean_names()
)

In [7]:
# Check the result of the cleaning step: column names should now be snake_case.
data.head()

Unnamed: 0,author,title,poetry_foundation_id,content
0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [8]:
# (rows, columns) of the cleaned frame.
data.shape

(15652, 4)

## Clean duplicate authors

In [18]:
# TODO (not implemented in this cell): fuzzy-match every pair of author names
# to catch typos in the author field. For example, hypothetically,
# "Authur Spellamn" and "Author Spellman" should be merged into one author.

## Threshold

In [34]:
# Number of poems (non-null `content` values) per author, restricted to
# authors with at least 30 poems and excluding the catch-all "Anonymous".
count_by_authors = (
    data
    .groupby('author', as_index=False)
    .agg({'content': 'count'})
    .rename(columns={'content': 'author_poem_count'})
    .loc[
        lambda counts: (counts.author_poem_count >= 30)
        & (counts.author != "Anonymous"),
        :,
    ]
)

count_by_authors.head()

Unnamed: 0,author,author_poem_count
79,"Alfred, Lord Tennyson",78
80,Algernon Charles Swinburne,39
90,Alice Notley,30
273,Ben Jonson,34
302,Billy Collins,32


In [57]:
# Keep only poems whose author passed the count threshold (inner join), then
# shuffle the rows so the per-author ordering used for the split is random.
# A fixed random_state makes the shuffle, and therefore the resulting
# train/validation/test membership, reproducible across kernel restarts.
thresholded_data = (
    data
    .merge(count_by_authors, how="inner", on=["author"])
    .shuffle(random_state=42)
)

thresholded_data.head()

Unnamed: 0,author,title,poetry_foundation_id,content,author_poem_count
0,Sir Philip Sidney,"Astrophil and Stella 90:\n \n Stella, think...",50083,"Stella, think not that I by verse seek fame,\n...",39
1,Billy Collins,The Invention of the Saxophone,148012,"It was Adolphe Sax, remember,\nnot Saxo Gramma...",32
2,John Milton,L'Allegro,44731,"Hence loathed Melancholy,\nOf Cerberus, and bl...",34
3,Dean Young,Bronzed,12564,"That dusty bubble gum, once ubiquitous as star...",37
4,John Ashbery,The Tennis Court Oath,47767,What had you been thinking about\nthe face stu...,46


In [60]:
# Zero-based running index of each row within its author's rows, following the
# current (shuffled) row order.
thresholded_data['author_poem_index'] = (
    thresholded_data.groupby('author', as_index=False).cumcount()
)

# Position expressed as a fraction of the author's poem count.
thresholded_data['author_poem_pct'] = thresholded_data['author_poem_index'].div(
    thresholded_data['author_poem_count']
)

## Train-validation-test split

In [65]:
# author_poem_pct is (zero-based index) / (author poem count), so it starts at
# 0 and stays below 1 when the count matches the author's row count. Half-open
# intervals [0, 0.7), [0.7, 0.9), [0.9, ...) therefore give each author an
# approximately 70/20/10 split. Inclusive upper bounds would push one extra
# poem into train and shrink the test set: an author with 30 poems would be
# split 22/6/2 instead of 21/6/3.
train_data = thresholded_data.loc[lambda x: x.author_poem_pct < 0.7]
val_data = thresholded_data.loc[lambda x:
                                (x.author_poem_pct >= 0.7) &
                                (x.author_poem_pct < 0.9)]
test_data = thresholded_data.loc[lambda x: x.author_poem_pct >= 0.9]

# Save datasets

In [66]:
def test(**kwargs):
    """Print every keyword argument as ``name value``, one pair per line."""
    for key, value in kwargs.items():
        print(key, value)

In [68]:
# Scratch call exercising **kwargs iteration; `test` is not referenced again
# in the visible part of this notebook.
test(hello='bye', whee = 'whoo')

hello bye
whee whoo


In [71]:
def save_datasets(df_dict, save_folder):
    """Write every DataFrame in ``df_dict`` to ``<save_folder>/<name>.csv``.

    Parameters
    ----------
    df_dict : dict
        Maps a file stem (str) to the DataFrame saved under that name.
    save_folder : str or os.PathLike
        Destination directory. It is created, together with any missing
        parent directories, if it does not exist yet.

    Returns
    -------
    None
        Files are written without the DataFrame index.
    """
    # Create the target directory up front. os.makedirs also handles missing
    # parents (os.mkdir would raise FileNotFoundError there) and exist_ok makes
    # repeated runs of the cell safe.
    os.makedirs(save_folder, exist_ok=True)

    for name, frame in df_dict.items():
        frame.to_csv(os.path.join(save_folder, name + ".csv"), index=False)

In [None]:
# Persist the three splits as train.csv / validation.csv / test.csv in the
# split folder. `processed_root` is not defined in this notebook; presumably
# it comes from ../header.py — TODO confirm.
save_datasets(
    {'train': train_data, 'validation': val_data, 'test': test_data},
    save_folder = processed_root("02-train-validation-test-split"),
)