In [22]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

import re
from unicodedata import normalize

import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver


import time
import pickle

from functions import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
pd.set_option('max_colwidth', 150)

##### Manually create dictionary with url codes for each genre.

In [142]:
genre_codes = {
    'augustan': 149,
    'beat': 150,
    'black_arts_movement': 304,
    'black_mountain': 151,
    'confessional': 152,
    'fugitive': 153,
    'georgian': 154,
    'harlem_renaissance': 155,
    'imagist': 156,
    'language_poetry': 157,
    'middle_english': 158,
    'modern': 159,
    'new_york_school': 160,
    'new_york_school_2nd_generation': 161,
    'objectivist': 162,
    'renaissance': 163,
    'romantic': 164,
    'victorian': 165
}

##### Run function in a loop to create dictionary of poet urls.

In [193]:
poet_urls = {genre:poet_urls_by_genre(genre_code, 3) for genre,genre_code in genre_codes.items()}
poet_urls['augustan']

['https://www.poetryfoundation.org/poets/mary-barber',
 'https://www.poetryfoundation.org/poets/susanna-blamire',
 'https://www.poetryfoundation.org/poets/henry-carey',
 'https://www.poetryfoundation.org/poets/thomas-chatterton',
 'https://www.poetryfoundation.org/poets/william-collins',
 'https://www.poetryfoundation.org/poets/william-cowper',
 'https://www.poetryfoundation.org/poets/daniel-defoe',
 'https://www.poetryfoundation.org/poets/anne-finch',
 'https://www.poetryfoundation.org/poets/john-gay',
 'https://www.poetryfoundation.org/poets/oliver-goldsmith',
 'https://www.poetryfoundation.org/poets/thomas-gray',
 'https://www.poetryfoundation.org/poets/matthew-green',
 'https://www.poetryfoundation.org/poets/warren-hastings',
 'https://www.poetryfoundation.org/poets/samuel-johnson',
 'https://www.poetryfoundation.org/poets/mary-jones',
 'https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu',
 'https://www.poetryfoundation.org/poets/alexander-pope',
 'https://www.poetryf

##### Loop only partially worked, so let's re-run sections in which some urls are missing.

In [196]:
poet_urls['black_arts_movement'] = poet_urls_by_genre(genre_codes['black_arts_movement'])

In [198]:
poet_urls['modern'] = poet_urls_by_genre(genre_codes['modern'])

In [200]:
poet_urls['renaissance'] = poet_urls_by_genre(genre_codes['renaissance'])

In [203]:
poet_urls['romantic'] = poet_urls_by_genre(genre_codes['romantic'])

In [206]:
poet_urls['victorian'] = poet_urls_by_genre(genre_codes['victorian'])

In [207]:
# confirm all urls have been grabbed
url_lens = {k:len(v) for k,v in poet_urls.items()}
url_lens

{'augustan': 23,
 'beat': 13,
 'black_arts_movement': 23,
 'black_mountain': 10,
 'confessional': 7,
 'fugitive': 7,
 'georgian': 22,
 'harlem_renaissance': 17,
 'imagist': 6,
 'language_poetry': 18,
 'middle_english': 3,
 'modern': 54,
 'new_york_school': 9,
 'new_york_school_2nd_generation': 16,
 'objectivist': 5,
 'renaissance': 41,
 'romantic': 51,
 'victorian': 55}

##### Pickle it! uncomment to save/load

In [5]:
# with open('poet_urls_dict.pickle', 'wb') as w:
#     pickle.dump(poet_urls, w, protocol=pickle.HIGHEST_PROTOCOL)

with open('poet_urls_dict.pickle', 'rb') as r:
    poet_urls_dict = pickle.load(r)

##### Check for duplicate values

In [6]:
poet_df = pd.DataFrame([(genre,v) for genre in poet_urls_dict.keys() for v in poet_urls_dict[genre]])
pd.concat(g for _, g in poet_df.groupby(1) if len(g) > 1)

Unnamed: 0,0,1
126,imagist,https://www.poetryfoundation.org/poets/ezra-pound
186,modern,https://www.poetryfoundation.org/poets/ezra-pound
122,imagist,https://www.poetryfoundation.org/poets/richard-aldington
150,modern,https://www.poetryfoundation.org/poets/richard-aldington


##### We'll give those poets to the imagist genre, since it has so few already

In [7]:
dups = [value for value in poet_df[poet_df.duplicated(1)][1]]
dups

['https://www.poetryfoundation.org/poets/richard-aldington',
 'https://www.poetryfoundation.org/poets/ezra-pound']

In [8]:
len(poet_urls_dict['modern'])

54

In [9]:
poet_urls_dict['modern'] = [url for url in poet_urls_dict['modern'] if url not in dups]
len(poet_urls_dict['modern'])

52

##### Instantiate an empty dataframe, then loop over each genre in our poet urls dictionary, create a dataframe for each genre and add that to the original dataframe, saving it after each concatenation

In [64]:
# instantiate an empty dataframe
df = pd.DataFrame()

# loop over each genre, create dataframe with desired information,
# concat to original dataframe, then save it before looping again
for genre in list(poet_urls_dict.keys()):
    genre_df = pf_scraper(poet_urls_dict, genre)
    df = pd.concat([df, genre_df])
    df.to_csv('data/poetry_foundation_raw.csv')

In [65]:
df.shape

(5442, 8)

In [69]:
df.columns = ['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_lines', 'poem_string']
df.columns

Index(['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_lines',
       'poem_string'],
      dtype='object')

In [70]:
df.head()

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48578/at-the-california-institute-of-technology,Richard Brautigan,At the California Institute of Technology,,"[I don’t care how God-damn smart, \r these guys are: I’m bored., <br/>, \r It’s been raining like hell all day long, \r and there’s nothing to do....",I don’t care how God-damn smart\n these guys are: I’m bored.\n\n It’s been raining like hell all day long\n and there’s nothing to do.\n
1,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48576/a-boat,Richard Brautigan,A Boat,1968.0,"[O beautiful , was the werewolf , in his evil forest. , We took him , to the carnival , and he started , crying , when he saw , the Fer...",O beautiful was the werewolf in his evil forest. We took him to the carnival and he started crying when he saw the Ferris wheel. Elec...
2,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48580/december-30,Richard Brautigan,December 30,1968.0,"[At 1:03 in the morning a fart, \r smells like a marriage between, \r an avocado and a fish head., <br/>, \r I have to get out of bed, \r to write...",At 1:03 in the morning a fart\n smells like a marriage between\n an avocado and a fish head.\n\n I have to get out of bed\n to write this down wit...
3,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48579/the-double-bed-dream-gallows,Richard Brautigan,The Double-Bed Dream Gallows,1968.0,"[Driving through , \r hot brushy country, \r in the late autumn, , \r I saw a hawk, \r crucified on a, \r barbed-wire fence., <br/>, \r I gues...","Driving through \n hot brushy country\n in the late autumn, \n I saw a hawk\n crucified on a\n barbed-wire fence.\n\n I guess as a kind \n o..."
4,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48581/haiku-ambulance,Richard Brautigan,Haiku Ambulance,1968.0,"[A piece of green pepper, \r fell, \r off the wooden salad bowl:, \r so what?, <br/>]",A piece of green pepper\n fell\n off the wooden salad bowl:\n so what?\n


In [73]:
type(df)

pandas.core.frame.DataFrame

In [72]:
df.duplicated(subset=None, keep='first').sum()

TypeError: unhashable type: 'list'

# SCRAP HEAP

In [526]:
ultra_dict = {genre: [] for genre in poet_urls_dict.keys()}
ultra_dict

{'augustan': [],
 'beat': [],
 'black_arts_movement': [],
 'black_mountain': [],
 'confessional': [],
 'fugitive': [],
 'georgian': [],
 'harlem_renaissance': [],
 'imagist': [],
 'language_poetry': [],
 'middle_english': [],
 'modern': [],
 'new_york_school': [],
 'new_york_school_2nd_generation': [],
 'objectivist': [],
 'renaissance': [],
 'romantic': [],
 'victorian': []}

In [46]:
s = rq.Session()
s.get(genre_urls[0])

<Response [200]>