In [12]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_style('ticks')

import re

import requests as rq
from bs4 import BeautifulSoup as bs
from selenium import webdriver

import time

from functions import *

%load_ext autoreload
%autoreload 2

In [227]:
pd.set_option('max_colwidth', 150)

##### Manually create dictionary with url codes for each genre.

In [142]:
genre_codes = {
    'augustan': 149,
    'beat': 150,
    'black_arts_movement': 304,
    'black_mountain': 151,
    'confessional': 152,
    'fugitive': 153,
    'georgian': 154,
    'harlem_renaissance': 155,
    'imagist': 156,
    'language_poetry': 157,
    'middle_english': 158,
    'modern': 159,
    'new_york_school': 160,
    'new_york_school_2nd_generation': 161,
    'objectivist': 162,
    'renaissance': 163,
    'romantic': 164,
    'victorian': 165
}

##### Run function in a loop to create dictionary of poet urls.

In [193]:
poet_urls = {genre:poet_urls_by_genre(genre_code, 3) for genre,genre_code in genre_codes.items()}
poet_urls['augustan']

['https://www.poetryfoundation.org/poets/mary-barber',
 'https://www.poetryfoundation.org/poets/susanna-blamire',
 'https://www.poetryfoundation.org/poets/henry-carey',
 'https://www.poetryfoundation.org/poets/thomas-chatterton',
 'https://www.poetryfoundation.org/poets/william-collins',
 'https://www.poetryfoundation.org/poets/william-cowper',
 'https://www.poetryfoundation.org/poets/daniel-defoe',
 'https://www.poetryfoundation.org/poets/anne-finch',
 'https://www.poetryfoundation.org/poets/john-gay',
 'https://www.poetryfoundation.org/poets/oliver-goldsmith',
 'https://www.poetryfoundation.org/poets/thomas-gray',
 'https://www.poetryfoundation.org/poets/matthew-green',
 'https://www.poetryfoundation.org/poets/warren-hastings',
 'https://www.poetryfoundation.org/poets/samuel-johnson',
 'https://www.poetryfoundation.org/poets/mary-jones',
 'https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu',
 'https://www.poetryfoundation.org/poets/alexander-pope',
 'https://www.poetryf

##### Loop only partially worked, so let's re-run sections in which some urls are missing.

In [196]:
poet_urls['black_arts_movement'] = poet_urls_by_genre(genre_codes['black_arts_movement'])

In [198]:
poet_urls['modern'] = poet_urls_by_genre(genre_codes['modern'])

In [200]:
poet_urls['renaissance'] = poet_urls_by_genre(genre_codes['renaissance'])

In [203]:
poet_urls['romantic'] = poet_urls_by_genre(genre_codes['romantic'])

In [206]:
poet_urls['victorian'] = poet_urls_by_genre(genre_codes['victorian'])

In [207]:
# confirm all urls have been grabbed
url_lens = {k:len(v) for k,v in poet_urls.items()}
url_lens

{'augustan': 23,
 'beat': 13,
 'black_arts_movement': 23,
 'black_mountain': 10,
 'confessional': 7,
 'fugitive': 7,
 'georgian': 22,
 'harlem_renaissance': 17,
 'imagist': 6,
 'language_poetry': 18,
 'middle_english': 3,
 'modern': 54,
 'new_york_school': 9,
 'new_york_school_2nd_generation': 16,
 'objectivist': 5,
 'renaissance': 41,
 'romantic': 51,
 'victorian': 55}

##### Pickle it! uncomment to save/load

In [208]:
# with open('poet_urls_dict.pickle', 'wb') as w:
#     pickle.dump(poet_urls, w, protocol=pickle.HIGHEST_PROTOCOL)

# with open('poet_urls_dict.pickle', 'rb') as r:
#     poet_urls_dict = pickle.load(r)

##### Check for duplicate values

In [234]:
poet_df = pd.DataFrame([(genre,v) for genre in poet_urls_dict.keys() for v in poet_urls_dict[genre]])
pd.concat(g for _, g in poet_df.groupby(1) if len(g) > 1)

Unnamed: 0,0,1
126,imagist,https://www.poetryfoundation.org/poets/ezra-pound
186,modern,https://www.poetryfoundation.org/poets/ezra-pound
122,imagist,https://www.poetryfoundation.org/poets/richard-aldington
150,modern,https://www.poetryfoundation.org/poets/richard-aldington


##### We'll give those poets to the imagist genre, since it has so few already

In [241]:
dups = [value for value in poet_df[poet_df.duplicated(1)][1]]
dups

['https://www.poetryfoundation.org/poets/richard-aldington',
 'https://www.poetryfoundation.org/poets/ezra-pound']

In [242]:
len(poet_urls_dict['modern'])

54

In [243]:
poet_urls_dict['modern'] = [url for url in poet_urls_dict['modern'] if url not in dups]
len(poet_urls_dict['modern'])

52

In [None]:
def poem_scraper(poet_urls_dict):
    for genre in poet_urls_dict.keys():
        for url in genre:
            page = rq.get(poet_urls_dict['modern'][0])
            soup = bs(page.content, 'html.parser')
            poems = soup.find_all('a',
                                  href=re.compile('https://www.poetryfoundation.org/poems/[0-9]+/.*'),
                                  attrs={'class': None})
            poems2 = soup.find_all('a',
                                   href=re.compile('https://www.poetryfoundation.org/poetrymagazine/poems/[0-9]+/.*'),
                                   attrs={'class': None})

In [554]:
def poem_urls_scraper(poet_url):
    page = rq.get(poet_url)
    soup = bs(page.content, 'html.parser')
    
    poems_text = soup.find_all('a',
                               href=re.compile('https://www.poetryfoundation.org/poems/[0-9]+/.*'),
                               attrs={'class': None})
    poems_scan = soup.find_all('a',
                               href=re.compile('https://www.poetryfoundation.org/poetrymagazine/poems/[0-9]+/.*'),
                               attrs={'class': None})
    
    if poems_text:
        poems_text_urls = [poem.get('href') for poem in poems_text]
    else:
        poems_text_urls = []
        
    if poems_scan:
        poems_scan_urls = [poem.get('href') for poem in poems_scan]
    else:
        poems_scan_urls = []
    
    return poems_text_urls, poems_scan_urls

In [570]:
def poem_scraper(poem_url):
    page = rq.get(poem_url)
    soup = bs(page.content, 'html.parser')
    try:
        poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
    except:
        poet = np.nan
    title = soup.find('h1').contents[-1]
    try:
        lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
        lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw]
    except:
        lines = np.nan
    try:
        poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n') 
    except:
        poem_string = np.nan
    try:
        year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
        year_pattern = r'[12]\d{3}'
        year = int(re.search(year_pattern, year_blurb, re.I).group())
    except:
        year = np.nan
        pass
    
    info = [poet, title, year, lines, poem_string]
    
    return info

In [573]:
def pf_scraper(poet_urls_dict, genre):
    ultra_list = []
    for poet_url in poet_urls_dict[genre]:
        poet_text_urls, poet_scan_urls = poem_urls_scraper(url)

        for poem_url in poet_text_urls:
            info = [poet_url, genre, poem_url]
            info.extend(poem_scraper(url))
            ultra_list.append(info)

        for poem_url in poet_scan_urls:
            info = [poet_url, genre, poem_url]
            info.extend(poem_scraper(url))
            ultra_list.append(info)

        time.sleep(1)
        
    return ultra_list

In [571]:
pf_list = pf_scraper(poet_urls_dict, 'augustan')
pf_list[100]

['https://www.poetryfoundation.org/poets/jonathan-swift',
 'augustan',
 'William Shakespeare',
 'Jonathan Swift',
 nan,
 [],
 '']

In [548]:
poet_urls_dict['augustan'][5]

'https://www.poetryfoundation.org/poets/william-cowper'

In [556]:
poet_text_urls, poet_scan_urls = poem_urls_scraper(poet_urls_dict['augustan'][5])
poet_text_urls

['https://www.poetryfoundation.org/poems/44035/the-task-book-i-the-sofa',
 'https://www.poetryfoundation.org/poems/44027/the-castaway',
 'https://www.poetryfoundation.org/poems/50599/epitaph-on-a-hare',
 'https://www.poetryfoundation.org/poems/50600/hatred-and-vengeance-my-eternal-portion',
 'https://www.poetryfoundation.org/poems/44028/light-shining-out-of-darkness',
 'https://www.poetryfoundation.org/poems/44031/on-receipt-of-my-mothers-picture',
 'https://www.poetryfoundation.org/poems/44029/on-the-loss-of-the-royal-george',
 'https://www.poetryfoundation.org/poems/44033/the-shrubbery',
 'https://www.poetryfoundation.org/poems/44034/sonnet-to-william-wilberforce-esq',
 'https://www.poetryfoundation.org/poems/44035/the-task-book-i-the-sofa',
 'https://www.poetryfoundation.org/poems/44036/the-task-book-ii-the-time-piece',
 'https://www.poetryfoundation.org/poems/44037/the-task-book-iv-the-winter-evening',
 'https://www.poetryfoundation.org/poems/44038/the-task-book-v-the-winter-mornin

In [557]:
poet_scan_urls

[]

In [559]:
info = ['augustan']
info.extend(poem_scraper('https://www.poetryfoundation.org/poems/44035/the-task-book-i-the-sofa'))
info

['augustan',
 'William Cowper',
 ' The Task, Book I: The Sofa\n                ',
 1794,
 ['Thou know’st my praise of nature most sincere, ',
  'And that my raptures are not conjur’d up ',
  'To serve occasions of poetic pomp, ',
  'But genuine, and art partner of them all. ',
  'How oft upon yon eminence our pace ',
  'Has slacken’d to a pause, and we have borne ',
  'The ruffling wind, scarce conscious that it blew, ',
  'While admiration, feeding at the eye, ',
  'And still unsated, dwelt upon the scene. ',
  'Thence with what pleasure have we just discern’d ',
  'The distant plough slow-moving, and beside ',
  'His lab’ring team, that swerv’d not from the track, ',
  'The sturdy swain diminish’d to a boy! ',
  'Here Ouse, slow winding through a level plain ',
  'Of spacious meads with cattle sprinkled o’er, ',
  'Conducts the eye along its sinuous course ',
  'Delighted.  There, fast rooted in his bank, ',
  'Stand, never overlook’d, our fav’rite elms, ',
  'That screen the herdsma

In [560]:
ultra_list = []

poet_text_urls, poet_scan_urls = poem_urls_scraper(poet_urls_dict['augustan'][5])

for url in poet_text_urls:
    info = ['augustan']
    info.extend(poem_scraper(url))
    ultra_list.append(info)
    
for url in poet_scan_urls:
    info = ['augustan']
    info.extend(poem_scraper(url))
    ultra_list.append(info)

ultra_list

[['augustan',
  'William Cowper',
  ' The Task, Book I: The Sofa\n                ',
  1794,
  ['Thou know’st my praise of nature most sincere, ',
   'And that my raptures are not conjur’d up ',
   'To serve occasions of poetic pomp, ',
   'But genuine, and art partner of them all. ',
   'How oft upon yon eminence our pace ',
   'Has slacken’d to a pause, and we have borne ',
   'The ruffling wind, scarce conscious that it blew, ',
   'While admiration, feeding at the eye, ',
   'And still unsated, dwelt upon the scene. ',
   'Thence with what pleasure have we just discern’d ',
   'The distant plough slow-moving, and beside ',
   'His lab’ring team, that swerv’d not from the track, ',
   'The sturdy swain diminish’d to a boy! ',
   'Here Ouse, slow winding through a level plain ',
   'Of spacious meads with cattle sprinkled o’er, ',
   'Conducts the eye along its sinuous course ',
   'Delighted.  There, fast rooted in his bank, ',
   'Stand, never overlook’d, our fav’rite elms, ',
   '

In [563]:
pd.DataFrame(ultra_list)

Unnamed: 0,0,1,2,3,4,5
0,augustan,William Cowper,"The Task, Book I: The Sofa\n",1794.0,"[Thou know’st my praise of nature most sincere, , And that my raptures are not conjur’d up , To serve occasions of poetic pomp, , But genuine, and...","Thou know’st my praise of nature most sincere, And that my raptures are not conjur’d up To serve occasions of poetic pomp, But genuine, and art pa..."
1,augustan,William Cowper,\n The Castaway\n,,"[Obscurest night involv'd the sky, , Th' Atlantic billows roar'd, , When such a destin'd wretch as I, , Wash'd headlong from on ...","Obscurest night involv'd the sky, Th' Atlantic billows roar'd, When such a destin'd wretch as I, Wash'd headlong from on board, ..."
2,augustan,William Cowper,\n Epitaph on a Hare\n,,"[Here lies, whom hound did ne’er pursue, , Nor swifter greyhound follow, , Whose foot ne’er tainted morning dew, , Nor ear heard huntsman’...","Here lies, whom hound did ne’er pursue, Nor swifter greyhound follow, Whose foot ne’er tainted morning dew, Nor ear heard huntsman’s hallo..."
3,augustan,William Cowper,"\n Hatred and Vengeance, My Eternal Portion\n",,"[Hatred and vengeance, my eternal portion, , Scarce can endure delay of execution, , Wait, with impatient readiness, to seize my , ...","Hatred and vengeance, my eternal portion, Scarce can endure delay of execution, Wait, with impatient readiness, to seize my ..."
4,augustan,William Cowper,\n Light Shining out of Darkness\n,,"[1\r, God moves in a mysterious way,\r, His wonders to perform;\r, He plants his footsteps in the sea,\r, And rides upon the storm.\r, <br...","1\nGod moves in a mysterious way,\n His wonders to perform;\nHe plants his footsteps in the sea,\n And rides upon the storm.\n\n2\nDeep in u..."
5,augustan,William Cowper,\n On Receipt Of My Mother's Picture\n,,"[Oh that those lips had language! Life has pass'd\r, With me but roughly since I heard thee last.\r, Those lips are thine—thy own sweet smiles I s...","Oh that those lips had language! Life has pass'd\nWith me but roughly since I heard thee last.\nThose lips are thine—thy own sweet smiles I see,\n..."
6,augustan,William Cowper,\n On the Loss of the Royal George\n,,"[Toll for the brave—\r, The brave! that are no more:\r, All sunk beneath the wave,\r, Fast by their native shore.\r, Eight hundred of the ...","Toll for the brave—\nThe brave! that are no more:\n All sunk beneath the wave,\nFast by their native shore.\n Eight hundred of the brave,\nW..."
7,augustan,William Cowper,\n The Shrubbery\n,,"[Oh happy shades—to me unblest!\r, Friendly to peace, but not to me!\r, How ill the scene that offers rest,\r, And heart that cannot rest,...","Oh happy shades—to me unblest!\n Friendly to peace, but not to me!\nHow ill the scene that offers rest,\n And heart that cannot rest, agree!..."
8,augustan,William Cowper,"\n Sonnet to William Wilberforce, Esq.\n",,"[Thy country, Wilberforce, with just disdain,\r, Hears thee, by cruel men and impious, call'd\r, Fanatic, for thy zeal to loose th' enthrall'd\r, ...","Thy country, Wilberforce, with just disdain,\nHears thee, by cruel men and impious, call'd\nFanatic, for thy zeal to loose th' enthrall'd\nFrom ex..."
9,augustan,William Cowper,"The Task, Book I: The Sofa\n",1794.0,"[Thou know’st my praise of nature most sincere, , And that my raptures are not conjur’d up , To serve occasions of poetic pomp, , But genuine, and...","Thou know’st my praise of nature most sincere, And that my raptures are not conjur’d up To serve occasions of poetic pomp, But genuine, and art pa..."


In [None]:
poem_scraper('')

In [553]:
poem_scraper('https://www.poetryfoundation.org/poems/44035/the-task-book-i-the-sofa')

['William Cowper',
 ' The Task, Book I: The Sofa\n                ',
 1794,
 ['Thou know’st my praise of nature most sincere, ',
  'And that my raptures are not conjur’d up ',
  'To serve occasions of poetic pomp, ',
  'But genuine, and art partner of them all. ',
  'How oft upon yon eminence our pace ',
  'Has slacken’d to a pause, and we have borne ',
  'The ruffling wind, scarce conscious that it blew, ',
  'While admiration, feeding at the eye, ',
  'And still unsated, dwelt upon the scene. ',
  'Thence with what pleasure have we just discern’d ',
  'The distant plough slow-moving, and beside ',
  'His lab’ring team, that swerv’d not from the track, ',
  'The sturdy swain diminish’d to a boy! ',
  'Here Ouse, slow winding through a level plain ',
  'Of spacious meads with cattle sprinkled o’er, ',
  'Conducts the eye along its sinuous course ',
  'Delighted.  There, fast rooted in his bank, ',
  'Stand, never overlook’d, our fav’rite elms, ',
  'That screen the herdsman’s solitary 

In [546]:
ultra_list

[['augustan', 'poet', 'title', 'year', 'poem_lines', 'poem_string'],
 ['augustan',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string'],
 ['augustan', 'poet', 'title', 'year', 'poem_lines', 'poem_string'],
 ['augustan',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string'],
 ['augustan',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string'],
 ['augustan',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'poem_lines',
  'poem_string',
  'poet',
  'title',
  'year',
  'p

In [543]:
pd.DataFrame.from_dict(ultra_dict, orient='index')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,120
augustan,"{'poet': 'Mary Barber', 'title': 'Advice to Her Son on Marriage', 'year': nan, 'poem_lines': ['When you gain her Affection, take care to preserve ...","{'poet': 'Susanna Blamire', 'title': 'Auld Robin Forbes', 'year': nan, 'poem_lines': ['And auld Robin Forbes hes gien tem a dance, ', 'I pat on my...","{'poet': 'Susanna Blamire', 'title': 'O Donald! Ye Are Just the Man', 'year': nan, 'poem_lines': ['O Donald! ye are just the man ', ' Who, when h...","{'poet': 'Susanna Blamire', 'title': 'The Siller Croun', 'year': nan, 'poem_lines': ['And ye shall walk in silk attire, ', ' And siller hae to s...","{'poet': 'Henry Carey', 'title': 'The Ballad of Sally in our Alley', 'year': nan, 'poem_lines': ['Of all the Girls that are so smart ', ' Ther...","{'poet': 'Thomas Chatterton', 'title': 'Ælla, a Tragical Interlude', 'year': nan, 'poem_lines': ['<b>FYRSTE MYNSTRELLE</b>', '<br/>', ' ... ',...","{'poet': 'Thomas Chatterton', 'title': 'An Excelente Balade of Charitie', 'year': nan, 'poem_lines': ['In Virgynë the sweltrie sun gan sheene, ',...","{'poet': 'William Collins', 'title': 'Eclogue the Second: HASSAN; or, the Camel-driver.', 'year': 2006, 'poem_lines': ['In silent horror o’er the ...","{'poet': 'William Collins', 'title': 'Ode on the Poetical Character', 'year': 2006, 'poem_lines': ['<strong>I</strong>', 'As once, if not with lig...","{'poet': 'William Collins', 'title': 'An Ode on the Popular Superstitions of the Highlands of Scotland, Considered as the Subject of Poetry', 'yea...",...,"{'poet': 'Jonathan Swift', 'title': 'Verses on the Death of Dr. Swift, D.S.P.D.', 'year': nan, 'poem_lines': ['As Rochefoucauld his maxims drew ',...","{'poet': 'James Thomson', 'title': 'Rule Britannia', 'year': 2006, 'poem_lines': ['When Britain first, at heaven's command, ', ' Arose from out...","{'poet': 'James Thomson', 'title': ['from'], 'year': nan, 'poem_lines': ['As rising from the vegetable World ', 'My Theme ascends, with equal Wing...","{'poet': 'James Thomson', 'title': ['from'], 'year': nan, 'poem_lines': [' See, Winter comes to rule the varied year,', 'Sullen and sad, with a...","{'poet': 'Isaac Watts', 'title': 'The Day of Judgment', 'year': nan, 'poem_lines': ['When the fierce north wind with his airy forces ', 'Rears up ...","{'poet': 'Isaac Watts', 'title': 'Our God, Our Help', 'year': nan, 'poem_lines': ['Our God, our help in ages past, ', ' Our hope for years to co...","{'poet': 'Isaac Watts', 'title': 'A Prospect of Heaven Makes Death Easy', 'year': nan, 'poem_lines': ['There is a land of pure delight ', ' Wher...","{'poet': 'Isaac Watts', 'title': 'Psalm 114', 'year': nan, 'poem_lines': ['When Isr’el, freed from Pharaoh’s hand, ', 'Left the proud tyrant and h...","{'poet': 'Isaac Watts', 'title': 'Psalm 58', 'year': nan, 'poem_lines': ['Judges, who rule the world by laws, ', 'Will ye despise the righteous ca...","{'poet': 'Gerhard Tersteegen', 'title': 'Hymn: Thou Hidden Love of God', 'year': nan, 'poem_lines': ['Thou hidden love of God, whose height, ', ' ..."
beat,,,,,,,,,,,...,,,,,,,,,,
black_arts_movement,,,,,,,,,,,...,,,,,,,,,,
black_mountain,,,,,,,,,,,...,,,,,,,,,,
confessional,,,,,,,,,,,...,,,,,,,,,,
fugitive,,,,,,,,,,,...,,,,,,,,,,
georgian,,,,,,,,,,,...,,,,,,,,,,
harlem_renaissance,,,,,,,,,,,...,,,,,,,,,,
imagist,,,,,,,,,,,...,,,,,,,,,,
language_poetry,,,,,,,,,,,...,,,,,,,,,,


In [539]:
%time ultra_dict = pf_scraper(poet_urls_dict)
ultra_dict['fugitive']

AttributeError: ResultSet object has no attribute 'contents'. You're probably treating a list of items like a single item. Did you call find_all() when you meant to call find()?

[]

In [507]:
poems_text_titles, poems_text_urls, poems_scan_titles, poems_scan_urls = poem_urls_scraper(poet_urls_dict['augustan'][3])

In [508]:
poems_text_titles

['Ælla, a Tragical Interlude', 'An Excelente Balade of Charitie']

In [509]:
poems_text_urls

['https://www.poetryfoundation.org/poems/43924/aella-a-tragical-interlude',
 'https://www.poetryfoundation.org/poems/43925/an-excelente-balade-of-charitie']

In [None]:
page = rq.get('https://www.poetryfoundation.org/poems/43924/aella-a-tragical-interlude')


In [510]:
poems_scan_titles

[]

In [511]:
poems_scan_urls

[]

In [520]:
ps = list(zip(poems_text_titles, poems_text_urls))
ps[0][0]

'Ælla, a Tragical Interlude'

In [523]:
test = {}
test['augustan'] = []
for poem in zip(poems_text_titles, poems_text_urls):
    test['augustan'].append(poem_scraper(poem[0], poem[1]))

In [524]:
test

{'augustan': [{'poet': 'Thomas Chatterton',
   'title': 'Ælla, a Tragical Interlude',
   'year': nan,
   'poem_lines': ['<b>FYRSTE MYNSTRELLE</b>',
    '<br/>',
    '     ...\r',
    '<br/>',
    '      The boddynge flourettes bloshes atte the lyghte;\r',
    '    The mees be sprenged wyth the yellowe hue;\r',
    '    Ynn daiseyd mantels ys the mountayne dyghte;\r',
    '    The nesh yonge coweslepe bendethe wyth the dewe;\r',
    '    The trees enlefed, yntoe Heavenne straughte,\r',
    'Whenn gentle wyndes doe blowe to whestlyng dynne ys broughte.\r',
    '<br/>',
    '      The evenynge commes, and brynges the dewe alonge;\r',
    '    The roddie welkynne sheeneth to the eyne;\r',
    '    Arounde the alestake Mynstrells synge the songe;\r',
    '    Yonge ivie rounde the doore poste do entwyne;\r',
    '    I laie mee onn the grasse; yette, to mie wylle,\r',
    'Albeytte alle ys fayre, there lackethe somethynge stylle.\r',
    '<br/>',
    '<b>SECONDE MYNSTRELLE</b>',
    '<br/>'

In [496]:
poet_urls_dict

{'augustan': ['https://www.poetryfoundation.org/poets/mary-barber',
  'https://www.poetryfoundation.org/poets/susanna-blamire',
  'https://www.poetryfoundation.org/poets/henry-carey',
  'https://www.poetryfoundation.org/poets/thomas-chatterton',
  'https://www.poetryfoundation.org/poets/william-collins',
  'https://www.poetryfoundation.org/poets/william-cowper',
  'https://www.poetryfoundation.org/poets/daniel-defoe',
  'https://www.poetryfoundation.org/poets/anne-finch',
  'https://www.poetryfoundation.org/poets/john-gay',
  'https://www.poetryfoundation.org/poets/oliver-goldsmith',
  'https://www.poetryfoundation.org/poets/thomas-gray',
  'https://www.poetryfoundation.org/poets/matthew-green',
  'https://www.poetryfoundation.org/poets/warren-hastings',
  'https://www.poetryfoundation.org/poets/samuel-johnson',
  'https://www.poetryfoundation.org/poets/mary-jones',
  'https://www.poetryfoundation.org/poets/lady-mary-wortley-montagu',
  'https://www.poetryfoundation.org/poets/alexander

In [244]:
poet_urls_dict['modern'][0]

'https://www.poetryfoundation.org/poets/conrad-aiken'

In [245]:
page = rq.get(poet_urls_dict['modern'][0])
soup = bs(page.content, 'html.parser')
soup

<!DOCTYPE doctype html>

<html class="no-js" lang="en-us">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!--[if IE]><meta http-equiv="cleartype" content="on" /><![endif]-->
<meta content="https://www.poetryfoundation.org/poets/conrad-aiken" name="dcterms.Identifier"/>
<meta content="Conrad Aiken" name="dcterms.Title"/>
<meta content="text/html" name="dcterms.Format"/>
<meta content="Poetry Foundation" name="dcterms.Relation"/>
<meta content="en" name="dcterms.Language"/>
<meta content="Poetry Foundation" name="dcterms.Publisher"/>
<meta content="text/html" name="dcterms.Type"/>
<meta content="https://www.poetryfoundation.org/" name="dcterms.Coverage"/>
<meta content="Copyright 2020 Poetry Foundation" name="dcterms.Rights"/>
<meta content="Poetry Foundation" name="dcterms.Contributor"/>
<meta content="2020-06-22" name="dcterms.Date"/>
<meta content="Although he received the most prestigious of literary awards, including a Pulitze

In [266]:
poems = soup.find_all('a', href=re.compile('https://www.poetryfoundation.org/poems/[0-9]+/.*'),
                      attrs={'class': None})

poems2 = soup.find_all('a', href=re.compile('https://www.poetryfoundation.org/poetrymagazine/poems/[0-9]+/.*'),
                       attrs={'class': None})

In [267]:
poems[0].contents[0]

'Exile'

In [268]:
poems

[<a href="https://www.poetryfoundation.org/poems/43210/exile-56d221e9ee593">Exile</a>,
 <a href="https://www.poetryfoundation.org/poems/43213/goya-56d221eb139d0">Goya</a>,
 <a href="https://www.poetryfoundation.org/poems/43219/summer-56d221ecea6eb">Summer</a>,
 <a href="https://www.poetryfoundation.org/poems/43215/the-things">The Things</a>,
 <a href="https://www.poetryfoundation.org/poems/43221/when-you-are-not-surprised">When You Are Not Surprised</a>]

In [269]:
poems2

[<a href="https://www.poetryfoundation.org/poetrymagazine/poems/18959/ballad-into-the-wood">Ballad</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/20687/blues-for-ruby-matrix">Blues for Ruby Matrix</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/17625/broad-on-the-sunburnt-hill">Six Sonnets</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/14494/counterpoint-two-rooms">Counterpoint: Two Rooms</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/26039/the-cyclads">The Cyclads</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/13202/from-discordants-iv">Discordants</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/26127/the-ego">The Ego</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/14493/haunted-chambers">Haunted Chambers</a>,
 <a href="https://www.poetryfoundation.org/poetrymagazine/poems/14492/illicit-56d2083b63520">Illicit</a>,
 <a href="https://www.

In [494]:
info = []
for poem in poems:
    title = poem.contents[0]
    url = poem.get('href')
    page = rq.get(url)
    soup = bs(page.content, 'html.parser')
    poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
    lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
    lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw]
    poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n') 
    year_blurb = soup.find('span', {'class': 'c-txt c-txt_note c-txt_note_mini'}).contents[2]
    year_pattern = r'[12]\d{3}'
    year = int(re.search(year_pattern, year_blurb, re.I).group())
    info.append((poet, title, year, lines, poem_string))
    
   

#     titles.append(title)
    
#     if poemTitle:
#         print(parser.unescape(poemTitle.text).encode('utf8'),file=output)
        
#         poemContent = poemSoup.find('div',{'class':'o-poem'})
#         poemLines = poemContent.findAll('div')
#         for line in poemLines:
#             text = parser.unescape(line.text)
#             out = text.encode('utf8')
#             print(out,file=output)

In [498]:
len(lines)

54

In [500]:
lines

['When you are not surprised, not surprised,',
 '\r nor leap in imagination from sunlight into shadow   ',
 '\r or from shadow into sunlight   ',
 '\r suiting the color of fright or delight   ',
 '\r to the bewildering circumstance   ',
 '\r when you are no longer surprised   ',
 '\r by the quiet or fury of daybreak   ',
 '\r the stormy uprush of the sun’s rage   ',
 '\r over the edges of torn trees',
 '\r torrents of living and dying flung',
 '\r upward and outward inward and downward to space',
 '\r or else',
 '\r peace peace peace peace',
 '\r the wood-thrush speaking his holy holy',
 '\r far hidden in the forest of the mind   ',
 '\r while slowly',
 '\r the limbs of light unwind',
 '\r and the world’s surface dreams again of night',
 '\r as the center dreams of light   ',
 '\r when you are not surprised',
 '\r by breath and breath and breath',
 '\r the first unconscious morning breath',
 '\r the tap of the bird’s beak on the pane',
 '\r and do not cry out come again   ',
 '\r blest

In [499]:
len([line for line in lines if line != '<br/>'])

52

In [491]:
year_blurb = 'Copyright 1953 by Conrad Aiken. Used by permission of Brandt & Hochman Literary Agents, Inc. Any electronic copying or distribution of this text'
year_pattern = r'[12]\d{3}'
year = int(re.search(year_pattern, year_blurb, re.I).group())

In [492]:
year

1953

In [483]:
year_pattern

re.compile(r'1953', re.UNICODE)

In [495]:
pd.DataFrame(info)

Unnamed: 0,0,1,2,3,4
0,Conrad Aiken,Exile,1953,"[These hills are sandy. Trees are dwarfed here. Crows, \r Caw dismally in skies of an arid brilliance,, \r Complain in dusty pine-trees. Yellow da...","These hills are sandy. Trees are dwarfed here. Crows\n Caw dismally in skies of an arid brilliance,\n Complain in dusty pine-trees. Yellow daybrea..."
1,Conrad Aiken,Goya,1953,"[Goya drew a pig on a wall., \r The five-year-old hairdresser’s son, \r Saw, graved on a silver tray,, \r The lion; and sunsets were begun., <br/>...","Goya drew a pig on a wall.\n The five-year-old hairdresser’s son\n Saw, graved on a silver tray,\n The lion; and sunsets were begun.\n\n Goya smel..."
2,Conrad Aiken,Summer,1953,"[Absolute zero: the locust sings:, \r summer’s caught in eternity’s rings:, \r the rock explodes, the planet dies,, \r we shovel up our verities.,...","Absolute zero: the locust sings:\n summer’s caught in eternity’s rings:\n the rock explodes, the planet dies,\n we shovel up our verities.\n\n The..."
3,Conrad Aiken,The Things,1953,"[The house in Broad Street, red brick, with nine rooms, \r the weedgrown graveyard with its rows of tombs, \r the jail from which imprisoned faces...","The house in Broad Street, red brick, with nine rooms\n the weedgrown graveyard with its rows of tombs\n the jail from which imprisoned faces grin..."
4,Conrad Aiken,When You Are Not Surprised,1953,"[When you are not surprised, not surprised,, \r nor leap in imagination from sunlight into shadow , \r or from shadow into sunlight , \r suiti...","When you are not surprised, not surprised,\n nor leap in imagination from sunlight into shadow \n or from shadow into sunlight \n suiting the ..."


In [454]:
print(pd.DataFrame(info).iloc[0,3])

['These hills are sandy. Trees are dwarfed here. Crows', '\r Caw dismally in skies of an arid brilliance,', '\r Complain in dusty pine-trees. Yellow daybreak', '\r Lights on the long brown slopes a frost-like dew,', '\r Dew as heavy as rain; the rabbit tracks', '\r Show sharply in it, as they might in snow.', '\r But it’s soon gone in the sun—what good does it do?', '\r The houses, on the slope, or among brown trees,', '\r Are grey and shrivelled. And the men who live here', '\r Are small and withered, spider-like, with large eyes.', '<br/>', '\r Bring water with you if you come to live here—', '\r Cold tinkling cisterns, or else wells so deep', '\r That one looks down to Ganges or Himalayas.', '\r Yes, and bring mountains with you, white, moon-bearing,', '\r Mountains of ice. You will have need of these', '\r Profundities and peaks of wet and cold.', '<br/>', '\r Bring also, in a cage of wire or osier,', '\r Birds of a golden colour, who will sing', '\r Of leaves that do not wither, w

In [319]:
title = poems[0].contents[0]
url = poems[0].get('href')
page = rq.get(url)
soup = bs(page.content, 'html.parser')
poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
lines = [line.get_text(' ', strip=True) for line in lines_raw]

In [320]:
lines_raw[3]

<div style="text-indent: -1em; padding-left: 1em;">
 Lights on the long brown slopes a frost-like dew,<br/></div>

In [321]:
lines[3]

'Lights on the long brown slopes a frost-like dew,'

In [322]:
print('\n'.join(lines))

These hills are sandy. Trees are dwarfed here. Crows
Caw dismally in skies of an arid brilliance,
Complain in dusty pine-trees. Yellow daybreak
Lights on the long brown slopes a frost-like dew,
Dew as heavy as rain; the rabbit tracks
Show sharply in it, as they might in snow.
But it’s soon gone in the sun—what good does it do?
The houses, on the slope, or among brown trees,
Are grey and shrivelled. And the men who live here
Are small and withered, spider-like, with large eyes.

Bring water with you if you come to live here—
Cold tinkling cisterns, or else wells so deep
That one looks down to Ganges or Himalayas.
Yes, and bring mountains with you, white, moon-bearing,
Mountains of ice. You will have need of these
Profundities and peaks of wet and cold.

Bring also, in a cage of wire or osier,
Birds of a golden colour, who will sing
Of leaves that do not wither, watery fruits
That heavily hang on long melodious boughs
In the blue-silver forests of deep valleys.

I have now been here—how 

In [347]:
from unicodedata import normalize

In [355]:
normalize("NFKD", lines[1])

'\r                      tossed in'

In [439]:
page = rq.get(poems[0].get('href'))
soup = bs(page.content, 'html.parser')
poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw]
poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n')

In [441]:
print(poem_string)

These hills are sandy. Trees are dwarfed here. Crows
 Caw dismally in skies of an arid brilliance,
 Complain in dusty pine-trees. Yellow daybreak
 Lights on the long brown slopes a frost-like dew,
 Dew as heavy as rain; the rabbit tracks
 Show sharply in it, as they might in snow.
 But it’s soon gone in the sun—what good does it do?
 The houses, on the slope, or among brown trees,
 Are grey and shrivelled. And the men who live here
 Are small and withered, spider-like, with large eyes.

 Bring water with you if you come to live here—
 Cold tinkling cisterns, or else wells so deep
 That one looks down to Ganges or Himalayas.
 Yes, and bring mountains with you, white, moon-bearing,
 Mountains of ice. You will have need of these
 Profundities and peaks of wet and cold.

 Bring also, in a cage of wire or osier,
 Birds of a golden colour, who will sing
 Of leaves that do not wither, watery fruits
 That heavily hang on long melodious boughs
 In the blue-silver forests of deep valleys.

 I ha

In [434]:
page = rq.get(poems[0].get('href'))
soup = bs(page.content, 'html.parser')
poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
lines_raw = soup.find_all('div', {'style': 'text-indent: -1em; padding-left: 1em;'})
lines = [normalize('NFKD', str(line.contents[0])) for line in lines_raw]
poem_string = "\n".join(''.join(lines).splitlines()).replace('<br/>', '\n')
# lines = [line.replace(r'\\r', r'\\n') for line in lines]
poem_string = '\n'.join(lines)
# lines = [re.sub(r'\r', '\n', line) for line in lines]
# lines = [re.sub(r'\xa0', ' ', line) for line in lines]

In [435]:
''.join(lines)

'These hills are sandy. Trees are dwarfed here. Crows\r Caw dismally in skies of an arid brilliance,\r Complain in dusty pine-trees. Yellow daybreak\r Lights on the long brown slopes a frost-like dew,\r Dew as heavy as rain; the rabbit tracks\r Show sharply in it, as they might in snow.\r But it’s soon gone in the sun—what good does it do?\r The houses, on the slope, or among brown trees,\r Are grey and shrivelled. And the men who live here\r Are small and withered, spider-like, with large eyes.<br/>\r Bring water with you if you come to live here—\r Cold tinkling cisterns, or else wells so deep\r That one looks down to Ganges or Himalayas.\r Yes, and bring mountains with you, white, moon-bearing,\r Mountains of ice. You will have need of these\r Profundities and peaks of wet and cold.<br/>\r Bring also, in a cage of wire or osier,\r Birds of a golden colour, who will sing\r Of leaves that do not wither, watery fruits\r That heavily hang on long melodious boughs\r In the blue-silver fo

In [436]:
poem_string

'These hills are sandy. Trees are dwarfed here. Crows\n\r Caw dismally in skies of an arid brilliance,\n\r Complain in dusty pine-trees. Yellow daybreak\n\r Lights on the long brown slopes a frost-like dew,\n\r Dew as heavy as rain; the rabbit tracks\n\r Show sharply in it, as they might in snow.\n\r But it’s soon gone in the sun—what good does it do?\n\r The houses, on the slope, or among brown trees,\n\r Are grey and shrivelled. And the men who live here\n\r Are small and withered, spider-like, with large eyes.\n<br/>\n\r Bring water with you if you come to live here—\n\r Cold tinkling cisterns, or else wells so deep\n\r That one looks down to Ganges or Himalayas.\n\r Yes, and bring mountains with you, white, moon-bearing,\n\r Mountains of ice. You will have need of these\n\r Profundities and peaks of wet and cold.\n<br/>\n\r Bring also, in a cage of wire or osier,\n\r Birds of a golden colour, who will sing\n\r Of leaves that do not wither, watery fruits\n\r That heavily hang on lon

In [430]:
lines_raw[-1]

<div style="text-indent: -1em; padding-left: 1em;"><div style="text-align: right;"><span class="sm-caps">new york, 1982</span></div><br/></div>

In [438]:
print("\n".join(''.join(lines).splitlines()).replace('<br/>', '\n'))

These hills are sandy. Trees are dwarfed here. Crows
 Caw dismally in skies of an arid brilliance,
 Complain in dusty pine-trees. Yellow daybreak
 Lights on the long brown slopes a frost-like dew,
 Dew as heavy as rain; the rabbit tracks
 Show sharply in it, as they might in snow.
 But it’s soon gone in the sun—what good does it do?
 The houses, on the slope, or among brown trees,
 Are grey and shrivelled. And the men who live here
 Are small and withered, spider-like, with large eyes.

 Bring water with you if you come to live here—
 Cold tinkling cisterns, or else wells so deep
 That one looks down to Ganges or Himalayas.
 Yes, and bring mountains with you, white, moon-bearing,
 Mountains of ice. You will have need of these
 Profundities and peaks of wet and cold.

 Bring also, in a cage of wire or osier,
 Birds of a golden colour, who will sing
 Of leaves that do not wither, watery fruits
 That heavily hang on long melodious boughs
 In the blue-silver forests of deep valleys.

 I ha

In [413]:
print(poem_string)

Last night
                      tossed in
 my bed
                      the sound of the rain turned me
 around,
                  a leaf
 in a dried gully
                             from side to
 side,
             the sound of the rain took me
 apart,        opened to                 what is it?
 breath caught in memory of
 a deep sweetness
                                      that sound
                                      unceasing
 delicate,                the wetness running
 through my body
                                     It might be nighttime
                                     in a forest hut,
 the rain constant
                                in little rivulets
 splashing,
                           at times uncertain—
 safe in each other's arms,
                                                the rain sheltering
 us       a depth opening
 bottomless to a terrible sweetness,
                                                     the small r

In [396]:
big_string = '\n'.join(lines)

In [397]:
big_string

"Last night\n\r                      tossed in\n\r my bed\n\r                      the sound of the rain turned me\n\r around,\n\r                  a leaf\n\r in a dried gully\n\r                             from side to\n\r side,\n\r             the sound of the rain took me\n\r apart,        opened to                 what is it?\n\r breath caught in memory of\n\r a deep sweetness\n\r                                      that sound\n\r                                      unceasing\n\r delicate,                the wetness running\n\r through my body\n\r                                     It might be nighttime\n\r                                     in a forest hut,\n\r the rain constant\n\r                                in little rivulets\n\r splashing,\n\r                           at times uncertain—\n\r safe in each other's arms,\n\r                                                the rain sheltering\n\r us       a depth opening\n\r bottomless to a terrible sweetness,\n\r         

In [398]:
print(big_string)

Last night
                      tossed in
 my bed
                      the sound of the rain turned me
 around,
                  a leaf
 in a dried gully
                             from side to
 side,
             the sound of the rain took me
 apart,        opened to                 what is it?
 breath caught in memory of
 a deep sweetness
                                      that sound
                                      unceasing
 delicate,                the wetness running
 through my body
                                     It might be nighttime
                                     in a forest hut,
 the rain constant
                                in little rivulets
 splashing,
                           at times uncertain—
 safe in each other's arms,
                                                the rain sheltering
 us       a depth opening
 bottomless to a terrible sweetness,
                                                     the small r

In [399]:
print(''.join(lines))

Last night                      tossed in my bed                      the sound of the rain turned me around,                  a leaf in a dried gully                             from side to side,             the sound of the rain took me apart,        opened to                 what is it? breath caught in memory of a deep sweetness                                      that sound                                      unceasing delicate,                the wetness running through my body                                     It might be nighttime                                     in a forest hut, the rain constant                                in little rivulets splashing,                           at times uncertain— safe in each other's arms,                                                the rain sheltering us       a depth opening bottomless to a terrible sweetness,                                                     the small rain shaking us in our bed

In [351]:
lines_raw[1].contents[0]

'\r \xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 tossed in'

In [407]:
print(' '.join(lines))

Last night                       tossed in  my bed                       the sound of the rain turned me  around,                   a leaf  in a dried gully                              from side to  side,              the sound of the rain took me  apart,        opened to                 what is it?  breath caught in memory of  a deep sweetness                                       that sound                                       unceasing  delicate,                the wetness running  through my body                                      It might be nighttime                                      in a forest hut,  the rain constant                                 in little rivulets  splashing,                            at times uncertain—  safe in each other's arms,                                                 the rain sheltering  us       a depth opening  bottomless to a terrible sweetness,                                                      the small r

In [408]:
lines

['Last night',
 '\r                      tossed in',
 '\r my bed',
 '\r                      the sound of the rain turned me',
 '\r around,',
 '\r                  a leaf',
 '\r in a dried gully',
 '\r                             from side to',
 '\r side,',
 '\r             the sound of the rain took me',
 '\r apart,        opened to                 what is it?',
 '\r breath caught in memory of',
 '\r a deep sweetness',
 '\r                                      that sound',
 '\r                                      unceasing',
 '\r delicate,                the wetness running',
 '\r through my body',
 '\r                                     It might be nighttime',
 '\r                                     in a forest hut,',
 '\r the rain constant',
 '\r                                in little rivulets',
 '\r splashing,',
 '\r                           at times uncertain—',
 "\r safe in each other's arms,",
 '\r                                                the rain sheltering',
 '\r u

In [406]:
print('\n'.join(lines))

Last night
                      tossed in
 my bed
                      the sound of the rain turned me
 around,
                  a leaf
 in a dried gully
                             from side to
 side,
             the sound of the rain took me
 apart,        opened to                 what is it?
 breath caught in memory of
 a deep sweetness
                                      that sound
                                      unceasing
 delicate,                the wetness running
 through my body
                                     It might be nighttime
                                     in a forest hut,
 the rain constant
                                in little rivulets
 splashing,
                           at times uncertain—
 safe in each other's arms,
                                                the rain sheltering
 us       a depth opening
 bottomless to a terrible sweetness,
                                                     the small r

In [290]:
poet = soup.find('a', href=re.compile('.*/poets/.*'), attrs={'class': None}).contents[0]
poet

'Conrad Aiken'

In [278]:
titles[0]

'Exile'

# SCRAP HEAP

In [526]:
ultra_dict = {genre: [] for genre in poet_urls_dict.keys()}
ultra_dict

{'augustan': [],
 'beat': [],
 'black_arts_movement': [],
 'black_mountain': [],
 'confessional': [],
 'fugitive': [],
 'georgian': [],
 'harlem_renaissance': [],
 'imagist': [],
 'language_poetry': [],
 'middle_english': [],
 'modern': [],
 'new_york_school': [],
 'new_york_school_2nd_generation': [],
 'objectivist': [],
 'renaissance': [],
 'romantic': [],
 'victorian': []}

In [46]:
s = rq.Session()
s.get(genre_urls[0])

<Response [200]>

In [29]:
genre_urls[0]

'https://www.poetryfoundation.org/poets/browse#school-period=149'

In [54]:
ses = rq.Session()
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
    "Accept": "*/*",
    "Referer": "https://www.poetryfoundation.org/poems/browse",
    "Accept-Encoding": "gzip, deflate, br",
}

page = ses.get(genre_urls[0], headers=headers)
soup = bs(page.content, 'html.parser')
soup

<!DOCTYPE doctype html>

<html class="no-js" lang="en-us">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!--[if IE]><meta http-equiv="cleartype" content="on" /><![endif]-->
<meta content="https://www.poetryfoundation.org/poets/browse" name="dcterms.Identifier"/>
<meta content="Browse Poets" name="dcterms.Title"/>
<meta content="text/html" name="dcterms.Format"/>
<meta content="Poetry Foundation" name="dcterms.Relation"/>
<meta content="en" name="dcterms.Language"/>
<meta content="Poetry Foundation" name="dcterms.Publisher"/>
<meta content="text/html" name="dcterms.Type"/>
<meta content="https://www.poetryfoundation.org/" name="dcterms.Coverage"/>
<meta content="Copyright 2020 Poetry Foundation" name="dcterms.Rights"/>
<meta content="Poetry Foundation" name="dcterms.Contributor"/>
<meta content="2020-06-20" name="dcterms.Date"/>
<meta content="Poems, readings, poetry news and the entire 100-year archive of POETRY magazine. " name

In [53]:
soup

<!DOCTYPE doctype html>

<html class="no-js" lang="en-us">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!--[if IE]><meta http-equiv="cleartype" content="on" /><![endif]-->
<meta content="https://www.poetryfoundation.org/poets/browse" name="dcterms.Identifier"/>
<meta content="Browse Poets" name="dcterms.Title"/>
<meta content="text/html" name="dcterms.Format"/>
<meta content="Poetry Foundation" name="dcterms.Relation"/>
<meta content="en" name="dcterms.Language"/>
<meta content="Poetry Foundation" name="dcterms.Publisher"/>
<meta content="text/html" name="dcterms.Type"/>
<meta content="https://www.poetryfoundation.org/" name="dcterms.Coverage"/>
<meta content="Copyright 2020 Poetry Foundation" name="dcterms.Rights"/>
<meta content="Poetry Foundation" name="dcterms.Contributor"/>
<meta content="2020-06-20" name="dcterms.Date"/>
<meta content="Poems, readings, poetry news and the entire 100-year archive of POETRY magazine. " name

In [49]:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
           "Accept": "*/*",
           "Referer": "https://www.poetryfoundation.org/poems/browse",
           "Accept-Encoding": "gzip, deflate, br"}
page = rq.get(genre_urls[0], headers=headers)
# sauce = urllib.request.urlopen(req).read()
soup = bs(page.content, 'html.parser')
soup

<!DOCTYPE doctype html>

<html class="no-js" lang="en-us">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!--[if IE]><meta http-equiv="cleartype" content="on" /><![endif]-->
<meta content="https://www.poetryfoundation.org/poets/browse" name="dcterms.Identifier"/>
<meta content="Browse Poets" name="dcterms.Title"/>
<meta content="text/html" name="dcterms.Format"/>
<meta content="Poetry Foundation" name="dcterms.Relation"/>
<meta content="en" name="dcterms.Language"/>
<meta content="Poetry Foundation" name="dcterms.Publisher"/>
<meta content="text/html" name="dcterms.Type"/>
<meta content="https://www.poetryfoundation.org/" name="dcterms.Coverage"/>
<meta content="Copyright 2020 Poetry Foundation" name="dcterms.Rights"/>
<meta content="Poetry Foundation" name="dcterms.Contributor"/>
<meta content="2020-06-20" name="dcterms.Date"/>
<meta content="Poems, readings, poetry news and the entire 100-year archive of POETRY magazine. " name

In [50]:
poets = soup.find_all('a', href=re.compile('.*/poets/.*'))
poets

[<a href="https://www.poetryfoundation.org/poets/aazhidegiizhig">Aazhidegiizhig</a>,
 <a href="https://www.poetryfoundation.org/poets/chris-abani">Chris Abani</a>,
 <a href="https://www.poetryfoundation.org/poets/chris-abani"> <div class="c-feature-auxMedia">
 

In [34]:
soup = bs(page.content, 'html.parser')
soup

<!DOCTYPE doctype html>

<html class="no-js" lang="en-us">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<!--[if IE]><meta http-equiv="cleartype" content="on" /><![endif]-->
<meta content="https://www.poetryfoundation.org/poets/browse" name="dcterms.Identifier"/>
<meta content="Browse Poets" name="dcterms.Title"/>
<meta content="text/html" name="dcterms.Format"/>
<meta content="Poetry Foundation" name="dcterms.Relation"/>
<meta content="en" name="dcterms.Language"/>
<meta content="Poetry Foundation" name="dcterms.Publisher"/>
<meta content="text/html" name="dcterms.Type"/>
<meta content="https://www.poetryfoundation.org/" name="dcterms.Coverage"/>
<meta content="Copyright 2020 Poetry Foundation" name="dcterms.Rights"/>
<meta content="Poetry Foundation" name="dcterms.Contributor"/>
<meta content="2020-06-20" name="dcterms.Date"/>
<meta content="Poems, readings, poetry news and the entire 100-year archive of POETRY magazine. " name

In [38]:
soup.find_all('h2')

[<h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/aazhidegiizhig">Aazhidegiizhig</a>
 </h2>,
 <h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/chris-abani">Chris Abani</a>
 </h2>,
 <h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/francesca-abbate">Francesca Abbate</a>
 </h2>,
 <h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/george-abbe">George Abbe</a>
 </h2>,
 <h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/garous-abdolmalekian">Garous Abdolmalekian</a>
 </h2>,
 <h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/hanif-abdurraqib">Hanif Abdurraqib</a>
 </h2>,
 <h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/lascelles-abercrombie">Lascelles Abercrombie</a>
 </h2>,
 <h2 class="c-hdgSans c-hdgSans_2"><a href="https://www.poetryfoundation.org/poets/marguerite-ab

In [20]:
poets = soup.find_all('a', href=re.compile('.*/poets/.*'))
poets

[<a href="https://www.poetryfoundation.org/poets/aazhidegiizhig">Aazhidegiizhig</a>,
 <a href="https://www.poetryfoundation.org/poets/chris-abani">Chris Abani</a>,
 <a href="https://www.poetryfoundation.org/poets/chris-abani"> <div class="c-feature-auxMedia">
 , <selenium.webdriver.remote.webelement.WebElement (session="c979655de4516cc0fa660f9d399ee45d", element="df903b64-ae8f-4544-b21f-e2a3adfe0d87")>, <selenium.webdriver.remote.webelement.WebElement (session="c979655de4516cc0fa660f9d399ee45d", element="eeb9350d-d0ae-49a3-8d1d-3ec938d4c312")>, <selenium.webdriver.remote.webelement.WebElement (session="c979655de4516cc0fa660f9d399ee45d", element="ed900a76-f190-4748-a89d-ae0e7b2efdc1")>, <selenium.webdriver.remote.webelement.WebElement (session="c979655de4516cc0fa660f9d399ee45d", element="523d7117-074a-4867-9372-ff4dd80db62d")>, <selenium.webdriver.remote.webelement.WebElement (session="c979655de4516cc0fa660f9d399ee45d", element="45400c89-27ce-42f7-99dd-2f8bceb44a8a")>, <selenium.webdriver.remote.webelement.WebElement (session="c979655de4516cc0fa660f9d399ee45d", element="89cf2b48-cc5e-43b0-9ff9-4f7e6878

In [93]:
for e in elem:
#     if e.get_attribute('href').match()
    print(e.get_attribute('href'))

https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-57x57.png?=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-60x60.png?=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-72x72.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-76x76.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-114x114.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-120x120.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-144x144.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-152x152.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-180x180.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/apple-touch-icon-196x196.png?v=1.2.9
https://www.poetryfoundation.org/assets/media/images/favicon-196x196.png?v=1.2.9
https://www.poetryfoundation.

https://www.poetryfoundation.org/poems/50586/songs-from-the-beggars-opera-air-x-thomas-i-cannot
https://www.poetryfoundation.org/poems/50587/songs-from-the-beggars-opera-air-xvi-over-the-hills-and-far-away
https://www.poetryfoundation.org/poems/50588/songs-from-the-beggars-opera-air-iv-cotillion
https://www.poetryfoundation.org/poems/50589/songs-from-the-beggars-opera-air-xxvii-green-sleeves
https://www.poetryfoundation.org/poets/samuel-johnson
https://www.poetryfoundation.org/poems/44447/drury-lane-prologue-spoken-by-mr-garrick-at-the-opening-of-the-theatre-in-drury-lane-1747
https://www.poetryfoundation.org/poems/44448/the-vanity-of-human-wishes
https://www.poetryfoundation.org/poems/50596/on-the-death-of-dr-robert-levet
https://www.poetryfoundation.org/poets/susanna-blamire
https://www.poetryfoundation.org/poems/50532/the-siller-croun
https://www.poetryfoundation.org/poems/50533/o-donald-ye-are-just-the-man
https://www.poetryfoundation.org/poems/50534/auld-robin-forbes
https://www.p

In [128]:
pattern.match(links[1])

<_sre.SRE_Match object; span=(0, 57), match='https://www.poetryfoundation.org/poets/browse#mai>

['https://www.poetryfoundation.org/poets/henry-carey',
 'https://www.poetryfoundation.org/poets/anne-finch',
 'https://www.poetryfoundation.org/poets/james-thomson',
 'https://www.poetryfoundation.org/poets/thomas-gray',
 'https://www.poetryfoundation.org/poets/alexander-pope',
 'https://www.poetryfoundation.org/poets/william-cowper',
 'https://www.poetryfoundation.org/poets/daniel-defoe',
 'https://www.poetryfoundation.org/poets/john-gay',
 'https://www.poetryfoundation.org/poets/samuel-johnson',
 'https://www.poetryfoundation.org/poets/susanna-blamire',
 'https://www.poetryfoundation.org/poets/thomas-chatterton',
 'https://www.poetryfoundation.org/poets/oliver-goldsmith',
 'https://www.poetryfoundation.org/poets/jonathan-swift',
 'https://www.poetryfoundation.org/poets/mary-jones',
 'https://www.poetryfoundation.org/poets/isaac-watts',
 'https://www.poetryfoundation.org/poets/matthew-prior',
 'https://www.poetryfoundation.org/poets/matthew-green',
 'https://www.poetryfoundation.org/p

In [123]:
pattern.match('hello')

In [124]:
test = [e.get_attribute('href') for e in elem if pattern.match(e.get_attribute('href'))]
test

['https://www.poetryfoundation.org/poets/browse',
 'https://www.poetryfoundation.org/poets/browse#mainContent',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'mailto:?subject=Poetry%20Foundation&body=https://www.poetryfoundation.org/poets/browse',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.org/poets/browse#',
 'https://www.poetryfoundation.o

In [122]:
test = list(filter(pattern.match, elem))
test

TypeError: expected string or bytes-like object

In [119]:
for e in elem:
    pattern = re.compile('(?!#|browse)^(.*/poets/.*)')
    if pattern.match(e.get_attribute('href')):
        print(e.get_attribute('href'))

https://www.poetryfoundation.org/poets/browse
https://www.poetryfoundation.org/poets/browse#mainContent
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
mailto:?subject=Poetry%20Foundation&body=https://www.poetryfoundation.org/poets/browse
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://www.poetryfoundation.org/poets/browse#
https://ww

In [89]:
driver.get_attribute( '//a[regx:match(@href, ".*/poets/.*")]' )

AttributeError: 'WebDriver' object has no attribute 'get_attribute'

In [72]:
driver.find_element_by_link_text(r'.*/poets/.*')

NoSuchElementException: Message: no such element: Unable to locate element: {"method":"link text","selector":".*/poets/.*"}
  (Session info: chrome=83.0.4103.106)


In [64]:
poets

[]

In [65]:
len(poets)

0