# Header

In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math as math
import janitor
import os
import string
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import enchant
import re

# Execute the shared project header in this notebook's namespace.
# A context manager is used so the file handle is closed deterministically
# (the bare open(...).read() left it to the garbage collector).
# NOTE(review): exec runs arbitrary code from ../header.py; it presumably
# defines raw_root / processed_root, which are used below but not defined in
# this file — confirm the header is a trusted, project-local file.
with open("../header.py", encoding="utf-8") as header_file:
    exec(header_file.read())

Header initialized


# Import

In [18]:
# Load the raw Poetry Foundation CSV. raw_root is not defined in this file
# (presumably provided by header.py) and is used here to build the path.
raw_data = pd.read_csv(
    filepath_or_buffer=raw_root("01-poetryfoundation/poetry_foundation.csv")
)

# Clean

## Initial clean

In [25]:
# Drop the leftover 'Unnamed: 0' column from the CSV, then normalise the
# remaining column names with pyjanitor's clean_names().
init_data = (
    raw_data
    .drop(columns='Unnamed: 0')
    .clean_names()
)

## Word processing

In [7]:
# initializing checkers: spell-check dictionary, lemmatizer and stemmer
# US-English enchant dictionary (can be passed as `dictionary` to str_cleanup)
d = enchant.Dict("en_US")
# WordNet lemmatizer, used by str_cleanup
lemmatizer = WordNetLemmatizer()
# English Snowball stemmer, used by str_cleanup
stemmer = SnowballStemmer("english")

In [47]:
def word_suggestions(word, dictionary):
    '''
    Return `word` if it is spelled correctly, otherwise the dictionary's
    first (most likely) suggestion.

    Parameters
    ----------
    word : str
        Token to check.
    dictionary : object
        Spell checker exposing `check(word) -> bool` and
        `suggest(word) -> list[str]` (e.g. an `enchant.Dict`).

    Returns
    -------
    str
        `word` unchanged when it is empty, correctly spelled, or has no
        suggestions; otherwise the first suggestion.
    '''
    # enchant raises ValueError when asked to check an empty string, so an
    # empty token is returned as-is instead of being sent to the dictionary.
    if not word or dictionary.check(word):
        return word

    suggestions = dictionary.suggest(word)
    return suggestions[0] if suggestions else word

In [84]:
def str_cleanup(line, remove_stop_words = True, dictionary = None):
    """Normalise poem text into a space-separated string of stemmed tokens.

    The text is lower-cased, split on single spaces, stripped of punctuation
    and digits, optionally filtered for English stop words, lemmatized,
    stemmed and (optionally) spell-corrected.

    Parameters
    ----------
    line : str
        Text to clean.
    remove_stop_words : bool, default True
        If True, drop tokens found in NLTK's English stop-word list.
    dictionary : object or None, default None
        Optional spell checker passed to `word_suggestions`; when None no
        spell correction is applied.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    # Lower-case, then tokenise on single spaces only.
    # NOTE(review): words joined by a newline (e.g. "emptiness,\na") stay one
    # token, so embedded line breaks survive in the output but such tokens are
    # not stop-word filtered or stemmed word by word. Confirm whether callers
    # passing whole poems should split into lines first.
    tokens = line.lower().split(" ")

    # Remove punctuation. The hyphen is escaped so the class matches exactly
    # . , - / " ' ( ) ; : { } = ! ? % ^ & * without relying on a ",-/" range.
    tokens = [re.sub(r"[.,\-/\"'();:{}=!?%^&*]", "", tok) for tok in tokens]

    # Remove digits, then surrounding whitespace (including newlines).
    tokens = [re.sub(r"\d+", "", tok).strip() for tok in tokens]

    # Drop empty tokens only after every removal step, so tokens that were
    # purely digits or whitespace do not survive as "" (which would leave
    # stray spaces in the result and make enchant's check() raise).
    tokens = [tok for tok in tokens if tok]

    if remove_stop_words:
        # Loaded lazily so the corpus is only needed when filtering is on.
        stop_words = set(stopwords.words('english'))
        tokens = [tok for tok in tokens if tok not in stop_words]

    # Lemmatize first, then stem the lemma.
    tokens = [stemmer.stem(lemmatizer.lemmatize(tok)) for tok in tokens]

    # NOTE(review): spell correction runs on the stemmed tokens, as in the
    # original ordering — confirm this is intended.
    if dictionary is not None:
        tokens = [word_suggestions(tok, dictionary) for tok in tokens]

    return ' '.join(tokens)

In [85]:
# Add a cleaned copy of each poem's text next to the original `content`.
processed_data = init_data.assign(
    clean_content=lambda frame: frame["content"].apply(str_cleanup)
)

In [86]:
processed_data

Unnamed: 0,author,title,poetry_foundation_id,content,clean_content
0,Wendy Videlock,!,55489,"Dear Writers, I’m compiling the first in what ...",dear writer i'm compil first hope seri public ...
1,Hailey Leithauser,0,41729,"Philosophic\nin its complex, ovoid emptiness,\...",philosophic\nin complex ovoid emptiness\na ski...
2,Jody Gladding,1-800-FEAR,57135,We'd like to talk with you about fear t...,wed like talk fear said so\nmani peopl live fe...
3,Joseph Brodsky,1 January 1965,56736,The Wise Men will unlearn your name.\nAbove yo...,wise men unlearn name\nabov head star flame\no...
4,Ted Berrigan,3 Pages,51624,For Jack Collom\n10 Things I do Every Day\n\np...,jack collom thing everi day\n\nplay poker\ndri...
...,...,...,...,...,...
15647,Hannah Gamble,Your Invitation to a Modest Breakfast,56059,"It’s too cold to smoke outside, but if you com...",it cold smoke outsid come over\ni'l keep hand ...
15648,Eleni Sikelianos,Your Kingdom\n \n \n \n Launch Audio in a N...,145220,if you like let the body feel\nall its own evo...,like let bodi feel\nal evolution\ninsid open f...
15649,Susan Elizabeth Howe,“Your Luck Is About To Change”,41696,(A fortune cookie)\nOminous inscrutable Chines...,fortun cookie\nomin inscrut chines news\nto ge...
15650,Andrew Shields,Your Mileage May Vary,90177,1\nOur last night in the house was not our las...,last night hous last\nwith two cat yard mover ...


## Add counts by author and ranking for data-split

In [94]:
def add_authors_count(data, count=20, exclude_authors=("Anonymous",), random_state=10):
    """Keep prolific authors, shuffle the poems and rank them within author.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns `author` and `content`.
    count : int, default 20
        Minimum number of poems (non-null `content`) an author needs to be
        kept.
    exclude_authors : str or iterable of str, default ("Anonymous",)
        Author names that are dropped regardless of their poem count.
    random_state : int, default 10
        Seed for the row shuffle, so the per-author ranking is reproducible.

    Returns
    -------
    pandas.DataFrame
        The rows of `data` for the retained authors, in shuffled order with a
        fresh 0..n-1 index, plus three columns:
        `author_poem_count` (poems by that author),
        `author_poem_index` (0-based position of the poem within its author
        after the shuffle) and
        `author_poem_pct` (`author_poem_index / author_poem_count`, in [0, 1)).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(exclude_authors, str):
        excluded = [exclude_authors]
    else:
        excluded = list(exclude_authors)

    # Poems per author, restricted to authors that pass both filters.
    count_by_authors = (
        data
        .groupby('author', as_index=False)
        .agg({'content': 'count'})
        .rename(columns={'content': 'author_poem_count'})
        .loc[lambda x: (x.author_poem_count >= count) & ~x.author.isin(excluded), :]
    )

    # The inner merge both attaches the count and drops filtered-out authors;
    # sample(frac=1) then shuffles every remaining row.
    expanded_data = (
        data
        .merge(count_by_authors, how="inner", on=["author"])
        .sample(frac=1, replace=False, random_state=random_state)
        .reset_index(drop=True)
    )

    # Rank poems within each author in their shuffled order.
    expanded_data['author_poem_index'] = expanded_data.groupby('author').cumcount()

    expanded_data['author_poem_pct'] = (
        expanded_data['author_poem_index'] / expanded_data['author_poem_count']
    )

    return expanded_data

In [98]:
# Filter to prolific authors and attach per-author counts and rankings,
# using add_authors_count's default settings.
data_expanded = add_authors_count(data=processed_data)

In [99]:
data_expanded

Unnamed: 0,author,title,poetry_foundation_id,content,clean_content,author_poem_count,author_poem_index,author_poem_pct
0,William Wordsworth,from\n \n The Prelude: Book 1: Childhood and...,45542,"—Was it for this\nThat one, the fairest of all...",—was this\nthat one fairest river lovd\nto ble...,59,0,0.000000
1,William Blake,The Ecchoing Green,56592,"The sun does arise,\nAnd make happy the skies....",sun arise\nand make happi skies\nth merri bell...,38,0,0.000000
2,Robert Browning,The Lost Mistress,50499,"All’s over, then: does truth sound bitter\nAs ...",all truth sound bitter\na one first believes\n...,39,0,0.000000
3,Sir Thomas Wyatt,"Whoso List to Hunt, I Know where is an Hind",45593,"Whoso list to hunt, I know where is an hind,\n...",whoso list hunt know hind\nbut me\nhéla may mo...,29,0,0.000000
4,Thomas Hardy,No Buyers,44331,A Load of brushes and baskets and cradles and ...,load brush basket cradl chairs\nlabour along s...,38,0,0.000000
...,...,...,...,...,...,...,...,...
3045,Billy Collins,The Parade,41254,How exhilarating it was to march\nalong the gr...,exhilar march\nalong great boulevards\nin sunf...,32,31,0.968750
3046,Franz Wright,Imago,54281,"From my cell I was staring at a cloud, a dog d...",cell stare cloud dog decay wood etc took longa...,22,21,0.954545
3047,Randall Mann,Bernal Hill,42307,Something has to give.\nWe stand above it all....,someth give\nw stand all\nbelow build tall\nbu...,24,23,0.958333
3048,William Butler Yeats,Byzantium,43296,The unpurged images of day recede;\nThe Empero...,unpurg imag day recede\nth emperor drunken sol...,47,46,0.978723


# Save dataset

In [96]:
# Write the expanded dataset to the processed-data location. processed_root is
# not defined in this file (presumably provided by header.py).
# NOTE(review): the DataFrame index is written as an unnamed first column
# because index=False is not passed — confirm downstream readers expect it.
data_expanded.to_csv(
    path_or_buf=processed_root("01-poetryfoundation/poetry_foundation.csv")
)