In [None]:
import re
import time
import warnings
from unicodedata import normalize

import numpy as np
import pandas as pd
import pytesseract
import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver

def poet_urls_by_genre(genre_code, max_page_num=None):
    '''Scraper for PoetryFoundation.org--scrapes urls for poets by genre.
       Input:
           genre_code: value appended to the `school-period=` filter of the browse url.
           max_page_num: highest browse page to visit (pages start at 1, the bound is inclusive).
               When omitted (None), pages are visited until one yields no new poet urls.
       Outputs a list of unique poet urls, in the order they were found.
       NOTE: scraping is best-effort. If Selenium raises while loading or reading a page, a warning is
             issued and the urls collected so far are returned, so the output may be incomplete.
             Try re-running if output is not as expected.'''

    # url requirements
    base_url = 'https://www.poetryfoundation.org/poets/browse#page='
    genre_addon = '&sort_by=last_name&school-period='

    # compiled once for all pages; matches poet pages but not the browse page itself
    pattern = re.compile(r'^.*/poets/(?!browse)[a-z\-]*$')

    poet_urls = []
    seen = set()
    page_num = 1
    while max_page_num is None or page_num <= max_page_num:
        driver = None
        try:
            # instantiate a selenium browser and load the webpage
            driver = webdriver.Chrome()
            driver.get(f'{base_url}{page_num}{genre_addon}{genre_code}')
            # read the attributes while the browser is still open;
            # find_elements('xpath', ...) works in both Selenium 3 and Selenium 4
            hrefs = [element.get_attribute('href')
                     for element in driver.find_elements('xpath', "//*[@href]")]
        except Exception as exc:
            # keep what has been collected so far, but say why scraping stopped
            warnings.warn(f'Stopped scraping genre {genre_code} at page {page_num}: {exc!r}')
            break
        finally:
            # always close the browser so Chrome processes do not pile up
            if driver is not None:
                driver.quit()

        # keep only links that match the poet url pattern and have not been seen yet
        new_urls = []
        for href in hrefs:
            if href and href not in seen and pattern.match(href):
                seen.add(href)
                new_urls.append(href)

        # a page that adds nothing new means we have run past the last page
        if not new_urls:
            break

        poet_urls.extend(new_urls)
        # manually create some time between selenium browsers, to decrease chance of errors or IP block
        time.sleep(2.5)
        page_num += 1

    return poet_urls

def poem_urls_scraper(poet_url):
    '''Scraper for PoetryFoundation.org--collects the poem urls linked from a poet's page.
       Input the url for a poet's page on PoetryFoundation.org.
       Output a pair of lists: urls of text poems first, then urls of poems that are scans of the
       original magazine page. Either list is empty when no matching link is found.'''

    # fetch the poet page and parse it
    response = rq.get(poet_url)
    parsed = bs(response.content, 'html.parser')

    def collect_hrefs(href_regex):
        # class-less anchors whose href matches the given pattern
        anchors = parsed.find_all('a',
                                  href=re.compile(href_regex),
                                  attrs={'class': None})
        return [anchor.get('href') for anchor in anchors]

    # poems available as text, which are easily scraped
    text_poem_urls = collect_hrefs('https://www.poetryfoundation.org/poems/[0-9]+/.*')
    # poems available only as images of the magazine page
    scanned_poem_urls = collect_hrefs('https://www.poetryfoundation.org/poetrymagazine/poems/[0-9]+/.*')

    return text_poem_urls, scanned_poem_urls

def _scrape_poet(soup):
    '''Return the content of the first link whose href contains "/poets/", or NaN if there is none.'''
    try:
        # convert the bs4 node to a plain str so the result does not hold on to the parsed page
        return str(soup.find('a', href=re.compile('.*/poets/.*')).contents[0])
    except (AttributeError, IndexError):
        return np.nan


def _scrape_title(soup, poem_url):
    '''Return the title from the first <h1>; fall back to the url slug, then to NaN.'''
    try:
        return soup.find('h1').contents[-1].strip()
    except (AttributeError, IndexError, TypeError):
        pass
    try:
        # rebuild a title from the trailing letters/dashes of the url
        return re.search(r'[a-z\-]*$', poem_url, re.I).group().replace('-', ' ').title()
    except (AttributeError, TypeError):
        return np.nan


def _scrape_text_lines(soup):
    '''Return the poem's lines from the per-line <div> elements (empty list when there are none).'''
    lines = []
    for div in soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'}):
        if not div.contents:
            continue
        line = normalize('NFKD', str(div.contents[0])).replace('<br/>', '')
        if '<' in line:
            # pull the text out of a wrapping tag; if nothing tag-wrapped is found keep the line
            # as-is instead of failing (which used to discard the whole poem)
            inner = re.search('>(.*?)<', line, re.I)
            if inner:
                line = inner.group(1)
        lines.append(line)
    return lines


def _ocr_scanned_lines(soup, title):
    '''Download the scanned page image, OCR it, and return the lines that follow the title.'''
    import os

    img_link = soup.find('img', src=re.compile('.*/jstor/.*'))['src']
    img_data = rq.get(img_link).content
    # make sure the scratch directory exists, otherwise every scanned poem fails silently
    os.makedirs('poem_imgs', exist_ok=True)
    with open('poem_imgs/temp.png', 'wb') as handle:
        handle.write(img_data)
    text = pytesseract.image_to_string('poem_imgs/temp.png')
    # escape the title so punctuation in it is matched literally, not as regex syntax
    heading = re.search(re.escape(title.upper()), text, re.I)
    return text[heading.end():].lstrip().splitlines()


def _scrape_year(soup):
    '''Return the first 4-digit year found in the first, else the last, note span; NaN if none.'''
    notes = soup.find_all('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})
    if not notes:
        return np.nan
    for note in (notes[0], notes[-1]):
        try:
            return int(re.search(r'[12]\d{3}', note.contents[2]).group())
        except (AttributeError, IndexError, TypeError):
            continue
    return np.nan


def poem_scraper(poem_url):
    '''Scraper for PoetryFoundation.org--scrapes poet name, poem title, poem year, list of poem's lines,
       and the poem as a string.
       Input the url for a poem's page on PoetryFoundation.org.
       Output is a list: [poet, title, year, lines, poem_string]. Any item that cannot be scraped
       is returned as NaN. When the page has no text lines, the scanned page image is OCR'd instead.'''

    # load a page and soupify it
    page = rq.get(poem_url)
    soup = bs(page.content, 'html.parser')

    poet = _scrape_poet(soup)
    title = _scrape_title(soup, poem_url)

    # best-effort: any failure while extracting the lines yields NaN rather than aborting the scrape;
    # `except Exception` (unlike a bare except) still lets Ctrl-C interrupt a long run
    try:
        lines = _scrape_text_lines(soup)
        if not lines:
            lines = _ocr_scanned_lines(soup, title)
    except Exception:
        lines = np.nan

    poem_string = '\n'.join(lines) if isinstance(lines, list) else np.nan
    year = _scrape_year(soup)

    return [poet, title, year, lines, poem_string]

def pf_scraper(poet_urls_dict, genre):
    '''Scraper for PoetryFoundation.org--builds one row per poem for every poet in a genre.
       Input is a dictionary with genres as keys and lists of poets' page urls as values, plus the
           genre to scrape. Meant to be called once per genre in a loop, so that an error part-way
           through still leaves the earlier genres' results available.
       Output is a Pandas DataFrame whose rows are
           [poet_url, genre, poem_url, poet, title, year, lines, poem_string].'''

    rows = []

    for poet_url in poet_urls_dict[genre]:
        # urls for both kinds of pages: text poems and scanned poems
        text_urls, scan_urls = poem_urls_scraper(poet_url)

        # text poems are scraped first, then the scanned ones
        for poem_url in [*text_urls, *scan_urls]:
            rows.append([poet_url, genre, poem_url, *poem_scraper(poem_url)])

        # brief pause between poets to try to prevent being blocked
        time.sleep(1)

    return pd.DataFrame(rows)

In [11]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

import re
from unicodedata import normalize

import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver


import time
import pickle

from functions import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
pd.set_option('max_colwidth', 150)

##### Manually create dictionary with url codes for each genre.

In [3]:
# url code for each genre, keyed by genre name (passed to poet_urls_by_genre as genre_code)
genre_codes = dict(
    augustan=149,
    beat=150,
    black_arts_movement=304,
    black_mountain=151,
    confessional=152,
    fugitive=153,
    georgian=154,
    harlem_renaissance=155,
    imagist=156,
    language_poetry=157,
    middle_english=158,
    modern=159,
    new_york_school=160,
    new_york_school_2nd_generation=161,
    objectivist=162,
    renaissance=163,
    romantic=164,
    victorian=165,
)

##### Run function in a loop to create dictionary of poet urls.

In [193]:
# scrape each genre in turn and store its poet urls under the genre name
poet_urls = {}
for genre, genre_code in genre_codes.items():
    poet_urls[genre] = poet_urls_by_genre(genre_code, 3)
poet_urls['augustan']

['https://www.poetryfoundation.org/poets/mary-barber',
 'https://www.poetryfoundation.org/poets/susanna-blamire',
 'https://www.poetryfoundation.org/poets/henry-carey',
 'https://www.poetryfoundation.org/poets/thomas-chatterton',
 'https://www.poetryfoundation.org/poets/william-collins',
 'https://www.poetryfoundation.org/poets/william-cowper',
 'https://www.poetryfoundation.org/poets/daniel-defoe',
 'https://www.poetryfoundation.org/poets/anne-finch',
 'https://www.poetryfoundation.org/poets/john-gay',
 'https://www.poetryfoundation.org/poets/oliver-goldsmith',
 'https://www.poetryfoundation.org/poets/thomas-gray',
 'https://www.poetryfoundation.org/poets/matthew-green',
 'https://www.poetryfoundation.org/poets/warren-hastings',
 'https://www.poetryfoundation.org/poets/samuel-johnson',
 'https://www.poetryfoundation.org/poets/mary-jones',
 'https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu',
 'https://www.poetryfoundation.org/poets/alexander-pope',
 'https://www.poetryf

##### The loop only partially worked, so let's re-run the genres for which some urls are missing.

In [196]:
poet_urls['black_arts_movement'] = poet_urls_by_genre(genre_codes['black_arts_movement'])

In [198]:
poet_urls['modern'] = poet_urls_by_genre(genre_codes['modern'])

In [200]:
poet_urls['renaissance'] = poet_urls_by_genre(genre_codes['renaissance'])

In [203]:
poet_urls['romantic'] = poet_urls_by_genre(genre_codes['romantic'])

In [206]:
poet_urls['victorian'] = poet_urls_by_genre(genre_codes['victorian'])

In [207]:
# confirm all urls have been grabbed: number of poet urls collected per genre
url_lens = dict(zip(poet_urls, map(len, poet_urls.values())))
url_lens

{'augustan': 23,
 'beat': 13,
 'black_arts_movement': 23,
 'black_mountain': 10,
 'confessional': 7,
 'fugitive': 7,
 'georgian': 22,
 'harlem_renaissance': 17,
 'imagist': 6,
 'language_poetry': 18,
 'middle_english': 3,
 'modern': 54,
 'new_york_school': 9,
 'new_york_school_2nd_generation': 16,
 'objectivist': 5,
 'renaissance': 41,
 'romantic': 51,
 'victorian': 55}

##### Pickle it! uncomment to save/load

In [4]:
# with open('poet_urls_dict.pickle', 'wb') as w:
#     pickle.dump(poet_urls, w, protocol=pickle.HIGHEST_PROTOCOL)

with open('poet_urls_dict.pickle', 'rb') as r:
    poet_urls_dict = pickle.load(r)

##### Check for duplicate values

In [5]:
# one row per (genre, poet url) pair; column 0 is the genre, column 1 is the url
poet_df = pd.DataFrame([(genre, url) for genre, urls in poet_urls_dict.items() for url in urls])
# show every row whose url appears more than once, grouped by url;
# unlike pd.concat over the groups, this returns an empty frame when there are no duplicates
poet_df[poet_df.duplicated(1, keep=False)].sort_values(1, kind='mergesort')

Unnamed: 0,0,1
126,imagist,https://www.poetryfoundation.org/poets/ezra-pound
186,modern,https://www.poetryfoundation.org/poets/ezra-pound
122,imagist,https://www.poetryfoundation.org/poets/richard-aldington
150,modern,https://www.poetryfoundation.org/poets/richard-aldington


##### We'll assign those poets to the imagist genre, since it has so few poets already, and remove them from the modern genre

In [6]:
# urls that already appeared in an earlier row (second and later occurrences only)
dups = poet_df.loc[poet_df.duplicated(1), 1].tolist()
dups

['https://www.poetryfoundation.org/poets/richard-aldington',
 'https://www.poetryfoundation.org/poets/ezra-pound']

In [7]:
len(poet_urls_dict['modern'])

54

In [8]:
poet_urls_dict['modern'] = [url for url in poet_urls_dict['modern'] if url not in dups]
len(poet_urls_dict['modern'])

52

##### Instantiate an empty dataframe, then loop over each genre in our poet urls dictionary: scrape a dataframe for that genre, concatenate it onto the running dataframe, and save the result after each concatenation

In [None]:
%%time

# instantiate an empty dataframe
df = pd.DataFrame()

# loop over each genre, create dataframe with desired information,
# concat to original dataframe, then save it before looping again
for genre in list(poet_urls_dict.keys()):
    genre_df = pf_scraper(poet_urls_dict, genre)
    df = pd.concat([df, genre_df])
    df.to_csv('data/poetry_foundation_raw.csv')

##### Uncomment to save/load dataframe

In [142]:
# df.to_csv('data/poetry_foundation_raw.csv')
df = pd.read_csv('data/poetry_foundation_raw.csv', index_col=0)

In [143]:
df.shape

(5442, 8)

In [144]:
df.columns = ['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_lines', 'poem_string']
df.columns

Index(['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_lines',
       'poem_string'],
      dtype='object')

In [145]:
df.head()

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48578/at-the-california-institute-of-technology,Richard Brautigan,At the California Institute of Technology,,"['I don’t care how God-damn smart', '\r these guys are: I’m bored.', '<br/>', '\r It’s been raining like hell all day long', '\r and there’s nothi...",I don’t care how God-damn smart\n these guys are: I’m bored.\n\n It’s been raining like hell all day long\n and there’s nothing to do.\n
1,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48576/a-boat,Richard Brautigan,A Boat,1968.0,"['O beautiful ', 'was the werewolf ', 'in his evil forest. ', 'We took him ', 'to the carnival ', 'and he started ', ' crying ', 'when h...",O beautiful was the werewolf in his evil forest. We took him to the carnival and he started crying when he saw the Ferris wheel. Elec...
2,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48580/december-30,Richard Brautigan,December 30,1968.0,"['At 1:03 in the morning a fart', '\r smells like a marriage between', '\r an avocado and a fish head.', '<br/>', '\r I have to get out of bed', '...",At 1:03 in the morning a fart\n smells like a marriage between\n an avocado and a fish head.\n\n I have to get out of bed\n to write this down wit...
3,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48579/the-double-bed-dream-gallows,Richard Brautigan,The Double-Bed Dream Gallows,1968.0,"['Driving through ', '\r hot brushy country', '\r in the late autumn, ', '\r I saw a hawk', '\r crucified on a', '\r barbed-wire fence.', '<br...","Driving through \n hot brushy country\n in the late autumn, \n I saw a hawk\n crucified on a\n barbed-wire fence.\n\n I guess as a kind \n o..."
4,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poems/48581/haiku-ambulance,Richard Brautigan,Haiku Ambulance,1968.0,"['A piece of green pepper', '\r fell', '\r off the wooden salad bowl:', '\r so what?', '<br/>']",A piece of green pepper\n fell\n off the wooden salad bowl:\n so what?\n


In [146]:
df.genre.unique()

array(['beat', 'augustan', 'black_arts_movement', 'black_mountain',
       'confessional', 'fugitive', 'georgian', 'harlem_renaissance',
       'imagist', 'language_poetry', 'middle_english', 'modern',
       'new_york_school', 'new_york_school_2nd_generation', 'objectivist',
       'renaissance', 'romantic', 'victorian'], dtype=object)

In [147]:
df.genre.value_counts()

modern                            1324
victorian                          674
renaissance                        430
romantic                           407
imagist                            370
beat                               294
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     193
language_poetry                    192
confessional                       176
georgian                           167
black_arts_movement                165
objectivist                        159
harlem_renaissance                 148
augustan                           121
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

In [148]:
df.duplicated(subset=['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_string'], keep='last').sum()

245

In [149]:
df.drop_duplicates(subset=['poet_url', 'genre', 'poem_url', 'poet', 'title', 'year', 'poem_string'],
                   keep='last',
                   inplace=True)

In [150]:
df.reset_index(drop=True, inplace=True)

In [151]:
df.shape

(5197, 8)

In [152]:
df.genre.value_counts()

modern                            1284
victorian                          643
renaissance                        427
romantic                           398
imagist                            370
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     192
language_poetry                    192
confessional                       176
black_arts_movement                165
georgian                           160
objectivist                        159
harlem_renaissance                 148
beat                               147
augustan                           114
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

In [153]:
df.isna().sum()

poet_url          0
genre             0
poem_url          0
poet           1858
title          1933
year           3736
poem_lines       30
poem_string    1937
dtype: int64

In [184]:
df[df.poem_lines.isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
65,https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu,augustan,https://www.poetryfoundation.org/poems/44765/town-eclogues-thursday-the-bassette-table,Lady Mary Wortley Montagu,Town Eclogues: Thursday; the Bassette-Table,,,
306,https://www.poetryfoundation.org/poets/gwendolyn-brooks,black_arts_movement,https://www.poetryfoundation.org/poems/58377/riot-56d23cb395a01,Gwendolyn Brooks,,,,
408,https://www.poetryfoundation.org/poets/jay-wright,black_arts_movement,https://www.poetryfoundation.org/poems/42736/benjamin-banneker-helps-to-build-a-city,Jay Wright,Benjamin Banneker Helps to Build a City,2000.0,,
500,https://www.poetryfoundation.org/poets/robert-creeley,black_mountain,https://www.poetryfoundation.org/poetrymagazine/poems/55314/a-prayer-56d236c6bb760,Robert Creeley,A Prayer,1982.0,,
799,https://www.poetryfoundation.org/poets/frederick-seidel,confessional,https://www.poetryfoundation.org/poetrymagazine/poems/55728/snow-56d23797074a2,Frederick Seidel,Snow,2012.0,,
949,https://www.poetryfoundation.org/poets/hilaire-belloc,georgian,https://www.poetryfoundation.org/poems/46684/ballade-of-modest-confession,Hilaire Belloc,Ballade of Modest Confession,1970.0,,
1087,https://www.poetryfoundation.org/poets/siegfried-sassoon,georgian,https://www.poetryfoundation.org/poems/57215/blighters,Siegfried Sassoon,'Blighters',1917.0,,
1165,https://www.poetryfoundation.org/poets/langston-hughes,harlem_renaissance,https://www.poetryfoundation.org/poetrymagazine/poems/55313/god-56d236c65624c,Langston Hughes,God,1994.0,,
1211,https://www.poetryfoundation.org/poets/claude-mckay,harlem_renaissance,https://www.poetryfoundation.org/poems/56983/the-lynching,Claude McKay,The Lynching,1922.0,,
1430,https://www.poetryfoundation.org/poets/ezra-pound,imagist,https://www.poetryfoundation.org/poems/44915/hugh-selwyn-mauberley-part-i,Ezra Pound,Hugh Selwyn Mauberley [Part I],,,


In [226]:
df[df.poem_string.isna()]

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
65,https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu,augustan,https://www.poetryfoundation.org/poems/44765/town-eclogues-thursday-the-bassette-table,Lady Mary Wortley Montagu,Town Eclogues: Thursday; the Bassette-Table,,,
126,https://www.poetryfoundation.org/poets/richard-brautigan,beat,https://www.poetryfoundation.org/poetrymagazine/poems/31338/wood,,,,[],
140,https://www.poetryfoundation.org/poets/william-everson,beat,https://www.poetryfoundation.org/poetrymagazine/poems/21676/dust-and-the-glory,,,,[],
141,https://www.poetryfoundation.org/poets/william-everson,beat,https://www.poetryfoundation.org/poetrymagazine/poems/21675/we-in-the-fields,,,,[],
158,https://www.poetryfoundation.org/poets/lawrence-ferlinghetti,beat,https://www.poetryfoundation.org/poetrymagazine/poems/58150/beatitudes-visuales-mexicanas,Lawrence Ferlinghetti,Beatitudes Visuales Mexicanas,2015.0,[],
...,...,...,...,...,...,...,...,...
4438,https://www.poetryfoundation.org/poets/percy-bysshe-shelley,romantic,https://www.poetryfoundation.org/poems/56665/laon-and-cythna-or-the-revolution-of-the-golden-city,Percy Bysshe Shelley,Laon and Cythna; or The Revolution of the Golden City,2002.0,,
4777,https://www.poetryfoundation.org/poets/gerard-manley-hopkins,victorian,https://www.poetryfoundation.org/poems/44403/the-wreck-of-the-deutschland,Gerard Manley Hopkins,The Wreck of the Deutschland,1950.0,,
4812,https://www.poetryfoundation.org/poets/rudyard-kipling,victorian,https://www.poetryfoundation.org/poems/57409/epitaphs-of-the-war,Rudyard Kipling,Epitaphs of the War,1919.0,,
4831,https://www.poetryfoundation.org/poets/emma-lazarus,victorian,https://www.poetryfoundation.org/poems/46791/by-the-waters-of-babylon,Emma Lazarus,By the Waters of Babylon,2002.0,[],


In [189]:
df.head()

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/mary-barber,augustan,https://www.poetryfoundation.org/poems/50523/advice-to-her-son-on-marriage,Mary Barber,Advice to Her Son on Marriage,,"['When you gain her Affection, take care to preserve it;\r', 'Lest others persuade her, you do not deserve it.\r', 'Still study to heighten the Jo...","When you gain her Affection, take care to preserve it;\nLest others persuade her, you do not deserve it.\nStill study to heighten the Joys of her ..."
1,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50534/auld-robin-forbes,Susanna Blamire,Auld Robin Forbes,,"['And auld Robin Forbes hes gien tem a dance,\r', 'I pat on my speckets to see them aw prance;\r', 'I thout o’ the days when I was but fifteen,\r'...","And auld Robin Forbes hes gien tem a dance,\nI pat on my speckets to see them aw prance;\nI thout o’ the days when I was but fifteen,\nAnd skipp’d..."
2,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50533/o-donald-ye-are-just-the-man,Susanna Blamire,O Donald! Ye Are Just the Man,,"['O Donald! ye are just the man\r', ' Who, when he’s got a wife,\r', 'Begins to fratch— nae notice ta’en—\r', ' They’re strangers a’ their life....","O Donald! ye are just the man\n Who, when he’s got a wife,\nBegins to fratch— nae notice ta’en—\n They’re strangers a’ their life.\n\nThe fan ma..."
3,https://www.poetryfoundation.org/poets/susanna-blamire,augustan,https://www.poetryfoundation.org/poems/50532/the-siller-croun,Susanna Blamire,The Siller Croun,,"['And ye shall walk in silk attire,\r', ' And siller hae to spare,\r', 'Gin ye’ll consent to be his bride,\r', ' Nor think o’ Donald mair.\r'...","And ye shall walk in silk attire,\n And siller hae to spare,\nGin ye’ll consent to be his bride,\n Nor think o’ Donald mair.\nO wha wad buy a..."
4,https://www.poetryfoundation.org/poets/henry-carey,augustan,https://www.poetryfoundation.org/poems/43884/the-ballad-of-sally-in-our-alley,Henry Carey,The Ballad of Sally in our Alley,,"['Of all the Girls that are so smart\r', ' There’s none like pretty SALLY,\r', 'She is the Darling of my Heart,\r', ' And she lives in our...","Of all the Girls that are so smart\n There’s none like pretty SALLY,\nShe is the Darling of my Heart,\n And she lives in our Alley.\nThere..."


In [262]:
import urllib

In [274]:
def _scrape_poet(soup):
    '''Return the content of the first link whose href contains "/poets/", or NaN if there is none.'''
    try:
        # convert the bs4 node to a plain str so the result does not hold on to the parsed page
        return str(soup.find('a', href=re.compile('.*/poets/.*')).contents[0])
    except (AttributeError, IndexError):
        return np.nan


def _scrape_title(soup, poem_url):
    '''Return the title from the first <h1>; fall back to the url slug, then to NaN.'''
    try:
        return soup.find('h1').contents[-1].strip()
    except (AttributeError, IndexError, TypeError):
        pass
    try:
        # rebuild a title from the trailing letters/dashes of the url
        return re.search(r'[a-z\-]*$', poem_url, re.I).group().replace('-', ' ').title()
    except (AttributeError, TypeError):
        return np.nan


def _scrape_text_lines(soup):
    '''Return the poem's lines from the per-line <div> elements (empty list when there are none).'''
    lines = []
    for div in soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'}):
        if not div.contents:
            continue
        line = normalize('NFKD', str(div.contents[0])).replace('<br/>', '')
        if '<' in line:
            # pull the text out of a wrapping tag; if nothing tag-wrapped is found keep the line
            # as-is instead of failing (which used to discard the whole poem)
            inner = re.search('>(.*?)<', line, re.I)
            if inner:
                line = inner.group(1)
        lines.append(line)
    return lines


def _ocr_scanned_lines(soup, title):
    '''Download the scanned page image, OCR it, and return the lines that follow the title.'''
    import os

    img_link = soup.find('img', src=re.compile('.*/jstor/.*'))['src']
    img_data = rq.get(img_link).content
    # make sure the scratch directory exists, otherwise every scanned poem fails silently
    os.makedirs('poem_imgs', exist_ok=True)
    with open('poem_imgs/temp.png', 'wb') as handle:
        handle.write(img_data)
    text = pytesseract.image_to_string('poem_imgs/temp.png')
    # escape the title so punctuation in it is matched literally, not as regex syntax
    heading = re.search(re.escape(title.upper()), text, re.I)
    return text[heading.end():].lstrip().splitlines()


def _scrape_year(soup):
    '''Return the first 4-digit year found in the first, else the last, note span; NaN if none.'''
    notes = soup.find_all('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})
    if not notes:
        return np.nan
    for note in (notes[0], notes[-1]):
        try:
            return int(re.search(r'[12]\d{3}', note.contents[2]).group())
        except (AttributeError, IndexError, TypeError):
            continue
    return np.nan


def poem_scraper(poem_url):
    '''Scraper for PoetryFoundation.org--scrapes poet name, poem title, poem year, list of poem's lines,
       and the poem as a string.
       Input the url for a poem's page on PoetryFoundation.org.
       Output is a list: [poet, title, year, lines, poem_string]. Any item that cannot be scraped
       is returned as NaN. When the page has no text lines, the scanned page image is OCR'd instead.'''

    # load a page and soupify it
    page = rq.get(poem_url)
    soup = bs(page.content, 'html.parser')

    poet = _scrape_poet(soup)
    title = _scrape_title(soup, poem_url)

    # best-effort: any failure while extracting the lines yields NaN rather than aborting the scrape;
    # `except Exception` (unlike a bare except) still lets Ctrl-C interrupt a long run
    try:
        lines = _scrape_text_lines(soup)
        if not lines:
            lines = _ocr_scanned_lines(soup, title)
    except Exception:
        lines = np.nan

    poem_string = '\n'.join(lines) if isinstance(lines, list) else np.nan
    year = _scrape_year(soup)

    return [poet, title, year, lines, poem_string]

In [275]:
poem_scraper('https://www.poetryfoundation.org/poetrymagazine/poems/31338/wood')

['Richard Brautigan',
 'Wood',
 1969,
 ['We age in darkness like wood',
  'and watch our phantoms change',
  'eir clothes',
  'of shingles and boards',
  'for a purpose that can only be',
  'described as wood.'],
 'We age in darkness like wood\nand watch our phantoms change\neir clothes\nof shingles and boards\nfor a purpose that can only be\ndescribed as wood.']

In [266]:
img_link

'https://static.poetryfoundation.org/jstor/i20599092/pages/36.png'

In [269]:
page = rq.get('https://www.poetryfoundation.org/poetrymagazine/poems/31338/wood')
soup = bs(page.content, 'html.parser')
img_link = soup.find('img', src=re.compile('.*/jstor/.*'))['src']

img_data = rq.get(img_link).content
with open('poem_imgs/temp.png', 'wb') as handle:
    handle.write(img_data)
# with open('poem_imgs/temp.png', 'wb') as handle:
#     response = rq.get(img_link, stream=True)
# text = pytesseract.image_to_string('poem_imgs/temp.png')
# scan_pattern = fr'{title.upper()}\s*((.*\s.*)*)'
# lines = re.search(scan_pattern, text, re.I).group(1).splitlines()

In [272]:
img_data = rq.get(img_link).content
with open('poem_imgs/temp.png', 'wb') as handle:
    handle.write(img_data)
text = pytesseract.image_to_string('poem_imgs/temp.png')
scan_pattern = fr'{title.upper()}\s*((.*\s.*)*)'
lines = re.search(scan_pattern, text, re.I).group(1).splitlines()

In [273]:
lines

['We age in darkness like wood',
 'and watch our phantoms change',
 'eir clothes',
 'of shingles and boards',
 'for a purpose that can only be',
 'described as wood.']

In [271]:
text

'POETRY\n\nRICHARD BRAUTIGAN\n\n \n\nWOOD\n\nWe age in darkness like wood\nand watch our phantoms change\neir clothes\nof shingles and boards\nfor a purpose that can only be\ndescribed as wood.'

In [253]:
soup.find('img', src=re.compile('.*/jstor/.*'))['src']

'https://static.poetryfoundation.org/jstor/i20599092/pages/36.png'

In [236]:
import pytesseract

text = pytesseract.image_to_string('poem_imgs/36.png')
text

'POETRY\n\nRICHARD BRAUTIGAN\n\n \n\nWOOD\n\nWe age in darkness like wood\nand watch our phantoms change\neir clothes\nof shingles and boards\nfor a purpose that can only be\ndescribed as wood.'

In [237]:
title_test = 'Wood'.upper()
title_test

'WOOD'

In [241]:
scan_pattern = fr'{title_test}\s*((.*\s.*)*)'
re.search(scan_pattern, text, re.I).group(1).splitlines()

['We age in darkness like wood',
 'and watch our phantoms change',
 'eir clothes',
 'of shingles and boards',
 'for a purpose that can only be',
 'described as wood.']

In [215]:
print(text)

POETRY

RICHARD BRAUTIGAN

 

WOOD

We age in darkness like wood
and watch our phantoms change
eir clothes
of shingles and boards
for a purpose that can only be
described as wood.


In [196]:
page = rq.get('https://www.poetryfoundation.org/poetrymagazine/poems/21676/dust-and-the-glory')
soup = bs(page.content, 'html.parser')
poet = soup.find('a', href=re.compile('.*/poets/.*')).contents[0]
title = soup.find('span', attrs={'class':'c-hdgSans c-hdgSans_7'}).contents[-1].strip()
print(poet)
print(title)

William Everson
We in the Fields


In [207]:
url = 'https://www.poetryfoundation.org/poetrymagazine/poems/21676/dust-and-the-glory'
title_pattern = '[a-z\-]*$'
title = re.search(title_pattern, url, re.I).group().replace('-', ' ').title()
year_blurb = soup.find_all('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})[-1].contents[2]
year_pattern = r'[12]\d{3}'
year = int(re.search(year_pattern, year_blurb, re.I).group())
year

1937

In [159]:
# map row index -> poem url for every row whose poem_lines is missing
nan_lines = df.loc[df.poem_lines.isna(), 'poem_url'].to_dict()
nan_lines

{65: 'https://www.poetryfoundation.org/poems/44765/town-eclogues-thursday-the-bassette-table',
 306: 'https://www.poetryfoundation.org/poems/58377/riot-56d23cb395a01',
 408: 'https://www.poetryfoundation.org/poems/42736/benjamin-banneker-helps-to-build-a-city',
 500: 'https://www.poetryfoundation.org/poetrymagazine/poems/55314/a-prayer-56d236c6bb760',
 799: 'https://www.poetryfoundation.org/poetrymagazine/poems/55728/snow-56d23797074a2',
 949: 'https://www.poetryfoundation.org/poems/46684/ballade-of-modest-confession',
 1087: 'https://www.poetryfoundation.org/poems/57215/blighters',
 1165: 'https://www.poetryfoundation.org/poetrymagazine/poems/55313/god-56d236c65624c',
 1211: 'https://www.poetryfoundation.org/poems/56983/the-lynching',
 1430: 'https://www.poetryfoundation.org/poems/44915/hugh-selwyn-mauberley-part-i',
 1431: 'https://www.poetryfoundation.org/poems/57353/hugh-selwyn-mauberley-part-ii',
 1646: 'https://www.poetryfoundation.org/poetrymagazine/poems/52593/advent-56d231303d

In [None]:
# re-scrape every poem whose lines are missing and write the result back into its row
for i, url in nan_lines.items():
    # poem_scraper returns [poet, title, year, lines, poem_string]
    info = poem_scraper(url)
    # .at addresses a single cell by (row label, column name); df[i] would select a column instead
    df.at[i, 'poem_lines'] = info[3]
    df.at[i, 'poem_string'] = info[4]

In [155]:
df.iloc[308]

poet_url                                                                                                     https://www.poetryfoundation.org/poets/gwendolyn-brooks
genre                                                                                                                                            black_arts_movement
poem_url                                                                                                 https://www.poetryfoundation.org/poems/43311/sadie-and-maud
poet                                                                                                                                                Gwendolyn Brooks
title                                                                                                                                                 Sadie and Maud
year                                                                                                                                                             NaN
poem_lines

In [178]:
def poem_scraper(poem_url, timeout=30):
    '''Scraper for PoetryFoundation.org--scrapes poet name, poem title, poem year, list of poem's lines,
       and the poem as a string.
       Input the url for a poem's page on PoetryFoundation.org. Optional `timeout` is the number of
       seconds to wait on the page request; a stalled connection then raises instead of hanging
       a long scraping run.
       Output is a list: [poet, title, year, lines, poem_string]. Each field is scraped independently;
       a field whose scrape fails is returned as NaN.'''
    
    # load a page and soupify it
    page = rq.get(poem_url, timeout=timeout)
    soup = bs(page.content, 'html.parser')
    
    # Each field is best-effort: a page lacking the expected markup yields NaN for that field.
    # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate, so a
    # long scraping loop can still be interrupted.
    try:
        # first link to a poet page that carries no class attribute
        poet = soup.find('a', href=re.compile(r'.*/poets/.*'), attrs={'class': None}).contents[0]
        if isinstance(poet, str):
            # convert bs4's NavigableString to a plain str so the returned value does not keep
            # the whole parse tree referenced
            poet = str(poet)
    except Exception:
        poet = np.nan
        
    try:
        # the title is the last child of the first <h1>
        title = soup.find('h1').contents[-1].strip()
    except Exception:
        title = np.nan
        
    try:
        # every poem line sits in its own div, identified by this exact inline style
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        # get_text() collects the text of the whole div, so a line containing inline markup is kept
        # in full (taking only contents[0] cut such lines off at the first tag); a div holding just
        # <br/> becomes '' (stanza break). Divs with no children are skipped.
        lines = [normalize('NFKD', line.get_text()) for line in lines_raw if line.contents]
    except Exception:
        lines = np.nan
        
    # lines is NaN (a float) when the scrape above failed; only a list of lines can be joined
    poem_string = '\n'.join(lines) if isinstance(lines, list) else np.nan
        
    try:
        # the note looks like "Source: <em>Poetry</em> (December 2011)"; the text following the
        # <em> element is the third child of the span
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        # first four-digit number starting with 1 or 2
        year = int(re.search(r'[12]\d{3}', year_blurb).group())
    except Exception:
        year = np.nan
    
    return [poet, title, year, lines, poem_string]

In [181]:
# run the scraper on a single poem page and display the returned list
infor = poem_scraper('https://www.poetryfoundation.org/poetrymagazine/poems/55209/the-cenotaph')
infor

['Fanny Howe',
 'The Cenotaph',
 2011,
 ['I want to leave this place',
  'unremembered.',
  'The gas stove is leaking',
  'and the door of the refrigerator',
  'stained with rust.',
  'The mugs are ugly',
  'and there are only two forks.',
  'The walls are black',
  'and soft, the bed a balloon',
  'of night-clothing.',
  'The stairwell sloped',
  'to a dragger’s pace.',
  '',
  'There are big windows',
  'with blind-slats dusty',
  'and gray. Street life ',
  'goes all night and at dawn',
  'freedmen shout and ',
  'laugh outside the kitchen.',
  '',
  'Where does life begin and end?',
  'In the lamb or the cotton?',
  'My pillow is my friend.',
  ''],
 'I want to leave this place\nunremembered.\nThe gas stove is leaking\nand the door of the refrigerator\nstained with rust.\nThe mugs are ugly\nand there are only two forks.\nThe walls are black\nand soft, the bed a balloon\nof night-clothing.\nThe stairwell sloped\nto a dragger’s pace.\n\nThere are big windows\nwith blind-slats dusty\n

In [182]:
# Repeat the scraper's steps by hand on one poem page to inspect intermediate values.
page = rq.get('https://www.poetryfoundation.org/poetrymagazine/poems/55209/the-cenotaph')
soup = bs(page.content, 'html.parser')
# poem lines are selected by their exact inline style attribute
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
# first child of each non-empty div, stringified and NFKD-normalized
# (assumes `normalize` is unicodedata.normalize -- TODO confirm against the notebook's imports)
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
# remove the '<br/>' markup left by divs whose first child is a <br/> tag
lines = [line.replace('<br/>', '') for line in lines]
# the span that poem_scraper parses the year from; displayed whole here
year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})
year_blurb

<span class="c-txt c-txt_note c-txt_note_mini">
                        Source:
                        <em>Poetry</em>
                                                                                                                                                                    (December 2011)
                                            </span>

In [175]:
# print the poem with one line per row; any remaining '<br/>' marker becomes a newline
print('\n'.join(lines).replace('<br/>', '\n'))

I want to leave this place
unremembered.
The gas stove is leaking
and the door of the refrigerator
stained with rust.
The mugs are ugly
and there are only two forks.
The walls are black
and soft, the bed a balloon
of night-clothing.
The stairwell sloped
to a dragger’s pace.

There are big windows
with blind-slats dusty
and gray. Street life 
goes all night and at dawn
freedmen shout and 
laugh outside the kitchen.

Where does life begin and end?
In the lamb or the cotton?
My pillow is my friend.



In [166]:
# Experiment: ''.join glues the lines together with no separator, so only '<br/>' markers and
# line breaks already inside the text end up as newlines (see the run-together output below).
poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n')
print(poem_string)

I want to leave this placeunremembered.The gas stove is leakingand the door of the refrigeratorstained with rust.The mugs are uglyand there are only two forks.The walls are blackand soft, the bed a balloonof night-clothing.The stairwell slopedto a dragger’s pace.
There are big windowswith blind-slats dustyand gray. Street life goes all night and at dawnfreedmen shout and laugh outside the kitchen.
Where does life begin and end?In the lamb or the cotton?My pillow is my friend.



In [121]:
# poet name: first child of the first class-less link whose href contains '/poets/'
poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
poet

'Fanny Howe'

In [122]:
# poem title: last child of the first <h1>, with surrounding whitespace stripped
title = soup.find('h1').contents[-1].strip()
title

'The Cenotaph'

In [124]:
# all divs carrying the inline style used for poem lines
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
lines_raw

[<div style="text-indent: -1em; padding-left: 1em;">I want to leave this place<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">unremembered.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The gas stove is leaking<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">and the door of the refrigerator<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">stained with rust.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The mugs are ugly<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">and there are only two forks.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The walls are black<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">and soft, the bed a balloon<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">of night-clothing.<br/></div>,
 <div style="text-indent: -1em; padding-left: 1em;">The stairwell sloped<br/></div>,
 <div style="text-indent: -1em; padding-le

In [131]:
# second-to-last selected div; for this page it is an empty div
lines_raw[-2]

<div style="text-indent: -1em; padding-left: 1em;"></div>

In [129]:
# an empty div has no children, so indexing .contents[0] raises IndexError
lines_raw[-2].contents[0]

IndexError: list index out of range

In [132]:
# same extraction, skipping divs with no children; a div whose first child is a <br/> tag
# still comes through as the string '<br/>'
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw if line.contents]
lines

['I want to leave this place',
 'unremembered.',
 'The gas stove is leaking',
 'and the door of the refrigerator',
 'stained with rust.',
 'The mugs are ugly',
 'and there are only two forks.',
 'The walls are black',
 'and soft, the bed a balloon',
 'of night-clothing.',
 'The stairwell sloped',
 'to a dragger’s pace.',
 '<br/>',
 'There are big windows',
 'with blind-slats dusty',
 'and gray. Street life ',
 'goes all night and at dawn',
 'freedmen shout and ',
 'laugh outside the kitchen.',
 '<br/>',
 'Where does life begin and end?',
 'In the lamb or the cotton?',
 'My pillow is my friend.',
 '<br/>']

In [None]:

    
    # NOTE(review): this cell has no `def` line, so the indented body (and its `return`) cannot run
    # as written; it looks like a leftover copy of poem_scraper's body -- confirm before reuse.
    # series of try/except statements to scrape info or return NaN value if desired info cannot be scraped
    try:
        # poet name: first child of the first class-less link whose href contains '/poets/'
        poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
    except:
        poet = np.nan
        
    try:
        # poem title: last child of the first <h1>, stripped
        title = soup.find('h1').contents[-1].strip()
    except:
        title = np.nan
        
    try:
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        # no `if line.contents` guard here: a single empty div raises IndexError, which the
        # except below turns into lines = NaN for the whole poem
        lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw]
    except:
        lines = np.nan
        
    try:
        # ''.join glues lines together with no separator; only '<br/>' markers and line breaks
        # already inside the text become newlines
        poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n') 
    except:
        poem_string = np.nan
        
    try:
        # third child of the first span with this class; the first four-digit number
        # starting with 1 or 2 in it is taken as the year
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        year_pattern = r'[12]\d{3}'
        year = int(re.search(year_pattern, year_blurb, re.I).group())
    except:
        year = np.nan
    
    info = [poet, title, year, lines, poem_string]
    
    return info

In [116]:
# per-genre sums; the displayed result contains only the year column
df.groupby('genre').sum()

Unnamed: 0_level_0,year
genre,Unnamed: 1_level_1
augustan,30930.0
beat,183660.0
black_arts_movement,271883.0
black_mountain,194322.0
confessional,150867.0
fugitive,62157.0
georgian,56322.0
harlem_renaissance,152419.0
imagist,122877.0
language_poetry,342601.0


In [87]:
# how often each title occurs among rows with genre 'modern' (counts above 1 are repeated titles)
df[df.genre == 'modern'].title.value_counts()

The Waste Land                         14
The Love Song of J. Alfred Prufrock    10
Gerontion                               6
Rhapsody on a Windy Night               4
Portrait of a Lady                      4
                                       ..
No Second Troy                          1
Under Ben Bulben                        1
The People, Yes                         1
Leave-Taking                            1
Fixed Ideas                             1
Name: title, Length: 459, dtype: int64

In [88]:
# show every row titled 'The Waste Land' to inspect the repeated entries
df[df.title == 'The Waste Land']

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
245,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
246,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
247,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
256,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
257,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
261,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
262,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
265,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
266,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."
267,https://www.poetryfoundation.org/poets/t-s-eliot,modern,https://www.poetryfoundation.org/poems/47311/the-waste-land,T. S. Eliot,The Waste Land,,"[ , <br/>, \r April is the cruellest month, breeding, \r Lilacs out of the dead land, mixing, \r Memory and desire, stirring, \r Dul...","\n\n April is the cruellest month, breeding\n Lilacs out of the dead land, mixing\n Memory and desire, stirring\n Dull roots with s..."


In [82]:
# number of rows per genre
df.genre.value_counts()

modern                            1324
victorian                          674
renaissance                        430
romantic                           407
imagist                            370
beat                               294
new_york_school                    265
black_mountain                     257
new_york_school_2nd_generation     193
language_poetry                    192
confessional                       176
georgian                           167
black_arts_movement                165
objectivist                        159
harlem_renaissance                 148
augustan                           121
fugitive                            90
middle_english                      10
Name: genre, dtype: int64

In [86]:
# show the rows with genre 'middle_english'
df[df.genre == 'middle_english']

Unnamed: 0,poet_url,genre,poem_url,poet,title,year,poem_lines,poem_string
0,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43926/the-canterbury-tales-general-prologue,Geoffrey Chaucer,The Canterbury Tales: General Prologue,,"[Whan that Aprille with his shour, The droghte of March hath perc, And bath, Of which vertú engendr, Whan Zephirus eek with his swet, Inspir, The...",Whan that Aprille with his shourThe droghte of March hath percAnd bathOf which vertú engendrWhan Zephirus eek with his swetInspirThe tendrHath in...
1,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43936/the-parlement-of-fowls,Geoffrey Chaucer,The Parlement of Fowls,,"[Now welcome, somer, with thy sonne softe,, \r That hast this wintres wedres overshake,, \r And driven away the longe nyghtes blake!, <br/>, \r Sa...","Now welcome, somer, with thy sonne softe,\n That hast this wintres wedres overshake,\n And driven away the longe nyghtes blake!\n\n Saynt Valentyn..."
2,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/45694/to-rosemounde-a-balade,Geoffrey Chaucer,To Rosemounde: A Balade,1891.0,"[<br/>, \r Madame, ye ben of al beaute shryne, \r As fer as cercled is the mapamounde,, \r For as the cristal glorious ye shyne,, \r And lyke ruby...","\n\n Madame, ye ben of al beaute shryne\n As fer as cercled is the mapamounde,\n For as the cristal glorious ye shyne,\n And lyke ruby ben your ch..."
3,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43937/troilus-and-criseyde-book-i,Geoffrey Chaucer,Troilus and Criseyde: Book I,,"[And so bifel, whan comen was the tym, Of Aperil, whan clothed is the med, With new, And swot, In sondry wises shew, The folk of Troie hir observa...","And so bifel, whan comen was the tymOf Aperil, whan clothed is the medWith newAnd swotIn sondry wises shewThe folk of Troie hir observaunces oldPa..."
4,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43938/troilus-and-criseyde-book-ii,Geoffrey Chaucer,Troilus and Criseyde: Book II,,"[With this he took his leve, and hom he wente; , And lord, so he was glad and wel bygon! , Criseyde aroos, no lenger she ne stente, , But streght ...","With this he took his leve, and hom he wente; And lord, so he was glad and wel bygon! Criseyde aroos, no lenger she ne stente, But streght in-to h..."
5,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43939/troilus-and-criseyde-book-v,Geoffrey Chaucer,Troilus and Criseyde: Book V,,"[The morwen com, and gostly for to speke, , This Diomede is come un-to Criseyde; , And shortly, lest that ye my tale breke, , So wel he for hym-se...","The morwen com, and gostly for to speke, This Diomede is come un-to Criseyde; And shortly, lest that ye my tale breke, So wel he for hym-selven sp..."
6,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/43940/truth-56d222d5bf80c,Geoffrey Chaucer,Truth,,"[Fle fro the pres, and dwelle with sothefastnesse,, \r Suffise thin owen thing, thei it be smal;, \r For hord hath hate, and clymbyng tykelnesse,,...","Fle fro the pres, and dwelle with sothefastnesse,\n Suffise thin owen thing, thei it be smal;\n For hord hath hate, and clymbyng tykelnesse,\n Pre..."
7,https://www.poetryfoundation.org/poets/geoffrey-chaucer,middle_english,https://www.poetryfoundation.org/poems/50383/if-no-love-is-o-god-what-fele-i-so,Petrarch,"“If no love is, O God, what fele I so?”",,"[If no love is, O God, what fele I so?, \r And if love is, what thing and which is he?, \r If love be good, from whennes cometh my woo?, \r If it ...","If no love is, O God, what fele I so?\n And if love is, what thing and which is he?\n If love be good, from whennes cometh my woo?\n If it be wikk..."
8,https://www.poetryfoundation.org/poets/william-langland,middle_english,https://www.poetryfoundation.org/poems/47350/piers-plowman-the-prologue,William Langland,Piers Plowman: The Prologue,,"[<br/>, \r In a somer sesun, whon softe was the sonne,, \r I schop me into a shroud, as I a scheep were;, \r In habite as an hermite unholy of wer...","\n\n In a somer sesun, whon softe was the sonne,\n I schop me into a shroud, as I a scheep were;\n In habite as an hermite unholy of werkes\n Went..."
9,https://www.poetryfoundation.org/poets/john-lydgate,middle_english,https://www.poetryfoundation.org/poems/44660/the-testament-of-john-lydgate,John Lydgate,The Testament of John Lydgate,,"[Beholde, o man! lyft up thyn eye and see , What mortall peyne I suffre for thi trespace. , With pietous voys I crye and sey to the: , ...","Beholde, o man! lyft up thyn eye and see What mortall peyne I suffre for thi trespace. With pietous voys I crye and sey to the: Behold..."


# SCRAP HEAP

In [526]:
# one separate empty list per genre key of poet_urls_dict
ultra_dict = {genre: [] for genre in poet_urls_dict.keys()}
ultra_dict

{'augustan': [],
 'beat': [],
 'black_arts_movement': [],
 'black_mountain': [],
 'confessional': [],
 'fugitive': [],
 'georgian': [],
 'harlem_renaissance': [],
 'imagist': [],
 'language_poetry': [],
 'middle_english': [],
 'modern': [],
 'new_york_school': [],
 'new_york_school_2nd_generation': [],
 'objectivist': [],
 'renaissance': [],
 'romantic': [],
 'victorian': []}

In [46]:
# open a requests session and fetch the first genre url; the cell displays the response
s = rq.Session()
s.get(genre_urls[0])

<Response [200]>

### SCRAPER ATTEMPT: READ POEM TEXT FROM IMAGE (OCR)

In [None]:
def poem_scraper(poem_url, timeout=30):
    '''Scraper for PoetryFoundation.org--scrapes poet name, poem title, poem year, list of poem's lines,
       and the poem as a string. When the page has no text lines, tries to OCR the poem from the
       page's scanned image instead.
       Input the url for a poem's page on PoetryFoundation.org. Optional `timeout` is the number of
       seconds to wait on each network request; a stalled connection then raises (page request)
       or gives NaN lines (image download) instead of hanging.
       Output is a list: [poet, title, year, lines, poem_string]. Each field is scraped independently;
       a field whose scrape fails is returned as NaN.'''
    
    # load a page and soupify it
    page = rq.get(poem_url, timeout=timeout)
    soup = bs(page.content, 'html.parser')
    
    # Each field is best-effort: a page lacking the expected markup yields a fallback or NaN.
    # `except Exception` (not a bare `except`) lets KeyboardInterrupt/SystemExit propagate, so a
    # long scraping loop can still be interrupted.
    try:
        # first child of the first link whose href contains '/poets/'
        poet = soup.find('a', href=re.compile(r'.*/poets/.*')).contents[0]
        if isinstance(poet, str):
            # convert bs4's NavigableString to a plain str so the returned value does not keep
            # the whole parse tree referenced
            poet = str(poet)
    except Exception:
        poet = np.nan
        
    try:
        # the title is the last child of the first <h1>
        title = soup.find('h1').contents[-1].strip()
    except Exception:
        try:
            # fall back to the trailing letters-and-hyphens part of the url, e.g.
            # '.../the-cenotaph' -> 'The Cenotaph' (raw string avoids the invalid '\-' escape)
            title_pattern = r'[a-z\-]*$'
            title = re.search(title_pattern, poem_url, re.I).group().replace('-', ' ').title()
        except Exception:
            title = np.nan
        
    try:
        # every poem line sits in its own div, identified by this exact inline style
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        # get_text() collects the text of the whole div, so a line containing inline markup is kept
        # in full and no regex tag-stripping is needed; a div holding just <br/> becomes ''
        # (stanza break). Divs with no children are skipped.
        lines = [normalize('NFKD', line.get_text()) for line in lines_raw if line.contents]
    except Exception:
        lines = np.nan
        
    # no text lines on the page: try to read the poem from its scanned image
    if isinstance(lines, list) and not lines:
        try:
            img_link = soup.find('img', src=re.compile(r'.*/jstor/.*'))['src']
            # close the response as soon as the bytes are read
            with urlopen(img_link, timeout=timeout) as resource:
                img_bytes = resource.read()
            # the file must be closed (flushed) before pytesseract opens it by path
            with open('poem_imgs/temp.png', 'wb') as output:
                output.write(img_bytes)
            text = pytesseract.image_to_string('poem_imgs/temp.png')
            # keep what follows the title in the OCR text; escape the title so characters
            # such as '?' or '(' are matched literally rather than as regex syntax
            scan_pattern = fr'{re.escape(title.upper())}\s*((.*\s.*)*)'
            lines = re.search(scan_pattern, text, re.I).group(1).splitlines()
        except Exception:
            lines = np.nan
        
    # lines is NaN (a float) when scraping failed; only a list of lines can be joined
    poem_string = '\n'.join(lines) if isinstance(lines, list) else np.nan
        
    year_pattern = r'[12]\d{3}'
    try:
        # third child of the first note span; first four-digit number starting with 1 or 2
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        year = int(re.search(year_pattern, year_blurb).group())
    except Exception:
        try:
            # otherwise try the last note span on the page
            year_blurb = soup.find_all('span', {'class': 'c-txt c-txt_note c-txt_note_mini'})[-1].contents[2]
            year = int(re.search(year_pattern, year_blurb).group())
        except Exception:
            year = np.nan
    
    return [poet, title, year, lines, poem_string]