# Header

In [79]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor
from sklearn.linear_model import LinearRegression
import os

# Execute the shared project header in this notebook's global namespace.
# NOTE(review): header.py is presumably what defines the `raw_root` and
# `processed_root` helpers used further down — confirm against ../header.py.
# The file is opened in a `with` block so the handle is closed as soon as the
# source has been read, instead of being left open until garbage collection.
with open("../header.py") as header_file:
    exec(header_file.read())

Header initialized


# Import

In [80]:
# Load the raw Poetry Foundation CSV. `raw_root` is not defined in this
# notebook (presumably provided by ../header.py — confirm).
poetry_csv_path = raw_root("01-poetryfoundation/poetry_foundation.csv")
raw_data = pd.read_csv(poetry_csv_path)

In [81]:
raw_data.head()

Unnamed: 0.1,Unnamed: 0,Author,Title,Poetry Foundation ID,Content
0,0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [82]:
raw_data.columns

Index(['Unnamed: 0', 'Author', 'Title', 'Poetry Foundation ID', 'Content'], dtype='object')

# Clean

In [83]:
# Drop the leftover 'Unnamed: 0' column, then normalise the column names with
# pyjanitor's clean_names() (e.g. 'Poetry Foundation ID' becomes
# 'poetry_foundation_id', as the head() output of the next cell shows).
data = (
    raw_data
    .drop(columns=['Unnamed: 0'])
    .clean_names()
)

In [84]:
data.head()

Unnamed: 0,author,title,poetry_foundation_id,content
0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ..."
1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\..."
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...
4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...


In [85]:
data.shape

(15652, 4)

## Clean duplicate authors

In [86]:
# TODO: fuzzy-match all pairs of author names to catch typos in the author
# field, e.g. hypothetically, "Authur Spellamn" and "Author Spellman"
# should be combined into a single author. No matching is performed in this
# cell yet.

## Threshold

In [87]:
# Number of poems (non-null `content` values) per author.
poems_per_author = (
    data
    .groupby('author', as_index=False)
    .agg(author_poem_count=('content', 'count'))
)

# Keep only authors with at least 30 poems, and exclude the "Anonymous" label.
has_enough_poems = poems_per_author['author_poem_count'] >= 30
is_not_anonymous = poems_per_author['author'] != "Anonymous"
count_by_authors = poems_per_author.loc[has_enough_poems & is_not_anonymous, :]

count_by_authors.shape

(39, 2)

In [88]:
# Keep only poems by authors that passed the threshold; the inner merge also
# attaches each author's `author_poem_count` to every poem row.
# The rows are then shuffled with pyjanitor's shuffle(). A fixed
# `random_state` makes the resulting row order reproducible across kernel
# restarts; without it every run produced a different ordering.
thresholded_data = data\
    .merge(count_by_authors, how = "inner", on = ["author"])\
    .shuffle(random_state = 42)

thresholded_data.head()

Unnamed: 0,author,title,poetry_foundation_id,content,author_poem_count
0,Percy sshe Shelley,from\n \n Queen Mab: Part VI,45137,"(excerpt)\n""Throughout these infinite orbs of ...",43
1,Thomas Hardy,'According to the Mighty Working',57342,I\n\nWhen moiling seems at cease\nIn the vague...,38
2,Rae Armantrout,Our Nature,54881,The very flatness\nof portraits\nmakes for nos...,62
3,Walt Whitman,For You O Democracy,51567,"Come, I will make the continent indissoluble,\...",41
4,William Butler Yeats,The Magi,12892,Now as at all times I can see in the mind's ey...,47


In [89]:
# 0-based running position of each poem within its author's group, following
# the current row order of `thresholded_data`.
poem_position_in_author = thresholded_data.groupby('author').cumcount()
thresholded_data['author_poem_index'] = poem_position_in_author

# Position expressed as a fraction of the author's poem count.
thresholded_data['author_poem_pct'] = thresholded_data['author_poem_index'].div(
    thresholded_data['author_poem_count']
)

## Train-validation-test split

In [90]:
# Assign each poem to a split by its within-author fraction:
#   pct <= 0.7        -> train
#   0.7 < pct <= 0.9  -> validation
#   pct > 0.9         -> test
poem_pct = thresholded_data['author_poem_pct']
in_train = poem_pct <= 0.7
in_val = (poem_pct > 0.7) & (poem_pct <= 0.9)
in_test = poem_pct > 0.9

train_data = thresholded_data.loc[in_train]
val_data = thresholded_data.loc[in_val]
test_data = thresholded_data.loc[in_test]

# Save datasets

In [93]:
def save_datasets(df_dict, save_folder):
    """Write every DataFrame in ``df_dict`` to ``save_folder`` as a CSV file.

    Parameters
    ----------
    df_dict : dict
        Maps an output file name (e.g. ``'train_data.csv'``) to the object to
        save. Each value must provide ``to_csv(path, index=False)``.
    save_folder : str
        Destination directory. It is created, together with any missing
        parent directories, if it does not exist yet.

    Notes
    -----
    Files are written without the index column. Existing files with the same
    name are overwritten.
    """
    # Create the folder once, up front. makedirs also creates missing parents,
    # which a single os.mkdir() in an error handler cannot do.
    os.makedirs(save_folder, exist_ok = True)
    for file_name, df in df_dict.items():
        df.to_csv(os.path.join(save_folder, file_name), index = False)

In [94]:
# Write the three splits as CSV files into the folder returned by
# processed_root(...) (a helper not defined in this notebook).
split_frames = {
    'train_data.csv': train_data,
    'val_data.csv': val_data,
    'test_data.csv': test_data,
}
save_datasets(split_frames,
              save_folder = processed_root("02-train-validation-test-split"))