# Scrape Public Domain Poetry

This is a notebook for scraping poems from http://public-domain-poetry.com

In [1]:
from bs4 import BeautifulSoup
import requests
import re

## Get all links to author pages

In [12]:
response = requests.get(f"http://public-domain-poetry.com/alfred-edward-housman/-20705")
html_string = response.text
document = BeautifulSoup(html_string, "html.parser")

In [4]:
author_links = []

for page_number in range(0, 12):
    response = requests.get(f"http://public-domain-poetry.com/name.php?l=All&s=author&page={page_number}")
    html_string = response.text
    document = BeautifulSoup(html_string, "html.parser")
    for link in document.find_all('a'):
        #if ',' in link.text and 'Read' not in link.text:
        #if link.text.istitle():
        if '-' in link['href'] and link.text.istitle() and 'poetry' not in link['href']:
            author_links.append(f"http://public-domain-poetry.com/{link['href']}")

In [5]:
author_links

['http://public-domain-poetry.com/a-h-laidlaw',
 'http://public-domain-poetry.com/a-hoatson',
 'http://public-domain-poetry.com/a-r-ammons',
 'http://public-domain-poetry.com/abijah-m-ide',
 'http://public-domain-poetry.com/abraham-cowley',
 'http://public-domain-poetry.com/abram-joseph-ryan',
 'http://public-domain-poetry.com/ada-langworthy-collier',
 'http://public-domain-poetry.com/adam-bernard-mickiewicz',
 'http://public-domain-poetry.com/adam-lindsay-gordon',
 'http://public-domain-poetry.com/alan-alexander-milne',
 'http://public-domain-poetry.com/alan-l-strang',
 'http://public-domain-poetry.com/alan-seeger',
 'http://public-domain-poetry.com/albert-bigelow-paine',
 'http://public-domain-poetry.com/aldous-leonard-huxley',
 'http://public-domain-poetry.com/alexander-pope',
 'http://public-domain-poetry.com/alexander-rodger',
 'http://public-domain-poetry.com/alfred-castner-king ',
 'http://public-domain-poetry.com/alfred-edward-housman',
 'http://public-domain-poetry.com/alfred-

## Loop through author links and get all poem data for each author

In [26]:
poem_dictionaries = []

for author_link in author_links:
    
    try:
        print(f"Scraping... {author_link}")

        response = requests.get(author_link)
        html_string = response.text
        document = BeautifulSoup(html_string, "html.parser")
        
        if "sorry" in document.text.lower():
            print("Sorry error found on page...")
            continue

        author_years = str(document.find('font', attrs={'class': 't1'})).split('<br/>')[2].split('<')[0].replace(' � ', '-')
        birth_year = author_years.split('-')[0]

        if len(author_years.split('-')) >= 2:
            death_year = author_years.split('-')[1]
        else:
            death_year = '?'

       # print(birth_year)

        author_link_ending = author_link.replace('http://public-domain-poetry.com/', '') 
        poem_links = [link['href'] for link in document.find_all('a') if author_link_ending in link['href']]
        poem_links = list(set(poem_links))

        #print(poem_links)

        for poem_link in poem_links:

            print(f"Scraping..{poem_link}")
            response = requests.get(f"http://public-domain-poetry.com/{poem_link}")
            html_string = response.text
            document = BeautifulSoup(html_string, "html.parser")
        
            if "sorry" in document.text.lower():
                print("Sorry error found on page...")
                continue
            
            if document.find('font', attrs={'class': 't1'}):

                poem_author = document.find('font', attrs={'class': 't1'}).text

                regex_search = re.search('(?<=By ).*', poem_author)

                if regex_search != None:
                    poem_author = re.search('(?<=By ).*', poem_author).group(0).strip()

                else:
                    poem_author = poem_author
            else:
                poem_author = 'Unknown'

            poem_title = [link.text for link in document.find_all('font', attrs={'class': 't0'}) if "Public Domain" not in link.text][0].strip('.')

            poem_text = document.find('font', attrs={'class': 't3a'}).text.replace('\xa0', '')

            #print(poem_author, '//', poem_title)
            poem_dictionaries.append({'author': poem_author,
                                     'title': poem_title,
                                      'text': poem_text,
                                       'period': author_years,
                                        'birth_year': birth_year,
                                        'death_year': death_year,            
                                      'link': f"http://public-domain-poetry.com/{poem_link}"})
            print(f"Succesfully scraped..{poem_title} by {poem_author}")
    
    except IndexError:
        print("IndexError")
        pass

Scraping... http://public-domain-poetry.com/a-h-laidlaw
Scraping..a-h-laidlaw/american-consummation-22327
Succesfully scraped..The American Consummation by A. H. Laidlaw
Scraping..a-h-laidlaw/lily-land-of-france-22332
Succesfully scraped..The Lily Land Of France by A. H. Laidlaw
Scraping..a-h-laidlaw/with-a-ho-ho-ho-and-a-hi-hi-hi-22316
Succesfully scraped..With A Ho-Ho-Ho! And A Hi-Hi-Hi! by A. H. Laidlaw
Scraping..a-h-laidlaw/three-ps-the-pratie-the-pig-and-poteen-22333
Succesfully scraped..The Three P's. - The Pratie, The Pig And Poteen by A. H. Laidlaw
Scraping..a-h-laidlaw/black-eyes-22304
Succesfully scraped..Black Eyes by A. H. Laidlaw
Scraping..a-h-laidlaw/american-girls-22300
Succesfully scraped..The American Girls by A. H. Laidlaw
Scraping..a-h-laidlaw/american-a-ira-22305
Succesfully scraped..The American �a Ira by A. H. Laidlaw
Scraping..a-h-laidlaw/sherry-in-the-saddle-22321
Succesfully scraped..Sherry In The Saddle by A. H. Laidlaw
Scraping..a-h-laidlaw/bird-of-the-summer

## Add Dates and Ages

In [2]:
import pandas as pd
pd.options.display.max_colwidth = 400

In [29]:
df = pd.DataFrame(poem_dictionaries)

In [9]:
def get_year(date):
    year = re.search('\d{4}', date)
    if year != None:
        return re.search('\d{4}', date).group(0)

In [34]:
df['death_year'] = df['death_year'].apply(get_year)
df['death_year'] = pd.to_numeric(df['death_year'])

In [35]:
df['birth_year'] = df['birth_year'].apply(get_year)
df['birth_year'] = pd.to_numeric(df['birth_year'])

In [61]:
df['age'] =  df['death_year'] - df['birth_year'] 

In [41]:
df['dates'] = df['birth_year'] + '-' + df['death_year']

## Output to CSV

In [66]:
df.to_csv('public-domain-poetry.csv', index=False, encoding='utf=8')

## Clean/Examine Data

In [200]:
df = pd.read_csv('public-domain-poetry.csv')

In [201]:
df.sample(5)

Unnamed: 0,author,title,text,period,birth_year,death_year,link,dates,age
30414,William Wordsworth,Ecclesiastical Sonnets - Part II. - XXVII - Imaginative Regrets,"Deep is the lamentation! Not alone\r\n From Sages justly honoured by mankind;\r\n But from the ghostly tenants of the wind,\r\n Demons and Spirits, many a dolorous groan\r\n Issues for that dominion overthrown:\r\n Proud Tiber grieves, and far-off Ganges, blind\r\n As his own worshipers: and Nile, reclined\r\n Upon his monstrous urn, the farewell moan\r\n Renews. Through every forest, cave, a...","April 7, 1770-April 23, 1850",1770.0,1850.0,http://public-domain-poetry.com/william-wordsworth/ecclesiastical-sonnets-part-ii-xxvii-imaginative-regrets-4402,1770-1850,80.0
6727,Emily Elizabeth Dickinson,The Rat,"The rat is the concisest tenant.\r\nHe pays no rent, --\r\nRepudiates the obligation,\r\nOn schemes intent.\n\r\nBalking our wit\r\nTo sound or circumvent,\r\nHate cannot harm\r\nA foe so reticent.\n\r\nNeither decree\r\nProhibits him,\r\nLawful as\r\nEquilibrium.","December 10, 1830-May 15, 1886",1830.0,1886.0,http://public-domain-poetry.com/emily-elizabeth-dickinson/rat-13728,1830-1886,56.0
25597,Thomas Moore,St. Senanus And The Lady,"ST. SENANUS.[1]\n\r\n""Oh! haste and leave this sacred isle,\r\nUnholy bark, ere morning smile;\r\nFor on thy deck, though dark it be,\r\nA female form I see;\r\nAnd I have sworn this sainted sod\r\nShall ne'er by woman's feet be trod.""\n\r\nTHE LADY.\n\r\n""Oh! Father, send not hence my bark,\r\nThro' wintry winds and billows dark:\r\nI come with humble heart to share\r\nThy morn and evening pr...",28 May 1779-25 February 1852,1779.0,1852.0,http://public-domain-poetry.com/thomas-moore/st-senanus-and-the-lady-26829,1779-1852,73.0
7331,Frances Ellen Watkins Harper,The Pure In Heart Shall See God,"They shall see Him in the crimson flush\r\n Of morning's early light,\r\n In the drapery of sunset,\r\n Around the couch of night.\n\n\r\n When the clouds drop down their fatness,\r\n In late and early rain,\r\n They shall see His glorious footprints\r\n On valley, hill and plain.\n\r\n They shall see Him when the cyclone\r\n Breathes terror through the land;\r\n They shall see Him 'mid the m...",24 September 1825 - 22 February 1911,1825.0,1911.0,http://public-domain-poetry.com/frances-ellen-watkins-harper/pure-in-heart-shall-see-god-17803,1825-1911,86.0
21810,Robert Lee Frost,Into My Own,"One of my wishes is that those dark trees,\r\nSo old and firm they scarcely show the breeze,\r\nWere not, as 'twere, the merest mask of gloom,\r\nBut stretched away unto th eedge of doom.\n\r\nI should not be withheld but that some day\r\ninto their vastness I should steal away,\r\nFearless of ever finding open land,\r\nor highway where the slow wheel pours the sand.\n\r\nI do not see why I sh...","March 26, 1874-January 29, 1963",1874.0,1963.0,http://public-domain-poetry.com/robert-lee-frost/into-my-own-1190,1874-1963,89.0


Remove rows with blank text

In [202]:
df = df[df['text'].isna() == False]

Most of the birth and death years are correct, but some are off, so we're going to re-create these columns

In [203]:
def get_birth_year(date):
    if type(date) == str:
        year = re.search('\d{4}', date)
        if year != None:
            return re.search('\d{4}', date).group(0)

In [204]:
def get_death_year(date):
    if type(date) == str:
        year = re.search('\d{4}', date)
        if year != None and len(re.findall('\d{4}', date)) > 1:
            return re.findall('\d{4}', date)[1]

In [205]:
df['birth_year'] = df['period'].apply(get_birth_year)
df['birth_year'] = pd.to_numeric(df['birth_year'])

In [206]:
df['death_year'] = df['period'].apply(get_death_year)
df['death_year'] = pd.to_numeric(df['death_year'])

Convert Nones and ?s to NaN values

In [207]:
import numpy as np
df['period'] = df['period'].replace('??', np.nan)
df['period'] = df['period'].replace('?', np.nan)
df = df.fillna(value=np.nan)

Rename "period" as "lifespan" because we're going to create a new column called "period"

In [208]:
df = df.rename(columns={'period': 'lifespan'})

In [209]:
df.describe(include='all')

Unnamed: 0,author,title,text,lifespan,birth_year,death_year,link,dates,age
count,31080,31080,31080,29631,29617.0,29069.0,31080,27449,27449.0
unique,450,27725,31006,367,,,31080,329,
top,Robert Herrick,Song,"(To a Country Gentleman.)\n\n\r\nMan, with integrity of heart,\r\nDisdains to play a double part:\r\nHe bears a moral coat of mail,\r\nWhen envy snarls and slanders rail.\r\nFrom virtue's shield the shafts resound,\r\nAnd his light shines in freedom round.\n\r\nIf in his country's cause he rise,\r\nUnbribed, unawed, he will advise;\r\nWill fear no ministerial frown,\r\nNeither will clamour put...","Baptized - August 24, 1591- October 1674",,,http://public-domain-poetry.com/william-butler-yeats/dolls-52,1865-1914,
freq,1464,102,6,1464,,,1,1345,
mean,,,,,1798.324982,1862.041694,,,63.824912
std,,,,,103.402509,103.280217,,,17.008976
min,,,,,1265.0,1321.0,,,21.0
25%,,,,,1779.0,1850.0,,,49.0
50%,,,,,1830.0,1892.0,,,69.0
75%,,,,,1865.0,1921.0,,,77.0


In [210]:
df['author'].value_counts()[:50]

Robert Herrick                             1464
Madison Julius Cawein                      1345
William Wordsworth                          963
Thomas Moore                                853
Thomas Hardy                                655
Rudyard Kipling                             638
Robert Burns                                499
John Greenleaf Whittier                     481
Algernon Charles Swinburne                  461
Emily Elizabeth Dickinson                   447
Paul Laurence Dunbar                        417
John Clare                                  382
William Butler Yeats                        378
Francesco Petrarca (Petrarch)               375
Paul Cameron Brown                          341
Walt Whitman                                338
Edgar Lee Masters                           331
Percy Bysshe Shelley                        330
Walter De La Mare                           329
Oliver Wendell Holmes                       329
Alfred Lord Tennyson                    

How many poems contain the � encoding character?

In [211]:
len(df[df['text'].str.contains('�') == True])

4862

In [212]:
df[df['text'].str.contains('�') == True][:5]

Unnamed: 0,author,title,text,lifespan,birth_year,death_year,link,dates,age
24,A. H. Laidlaw,Babylon,"Thou art mighty,\r\nBabylon!\r\nThou art haughty,\r\nBabylon!\r\nHaughty, mighty,\r\nBabylon!\r\nThrough thy streets the bats shall fly,\r\nO'er thy ruins owls shall cry,\r\nAll thy chivalry shall die,\r\nBabylon!\n\r\nGolden-godded\r\nBabylon!\r\nIdol-curs�d\r\nBabylon!\r\nIdol-curs�d, golden-godded,\r\nBabylon!\r\nAll thy gods shall bite the dust,\r\nAll thy golden godlets must\r\nSink to ro...",,,,http://public-domain-poetry.com/a-h-laidlaw/babylon-22309,,
48,A. R. Ammons,Greeting Verses,"What do I find right at the center of my interpersonal\r\nrelationships: a slightly dispersed but indisputably\r\ntinctured core of brutality: go to the hospital\n\n\r\nthe question is not whether your life is at stake\r\nbut whether you can pay the bill, guaranteeing it on\r\nadmission (or no admission) and proving it (or not getting\n\n\r\nout) on release (if any): this bit of realism\r\nclu...","February 18, 1926-February 25, 2001",1926.0,2001.0,http://public-domain-poetry.com/a-r-ammons/greeting-verses-2702,1926-2001,75.0
53,A. R. Ammons,"Shit List; Or, Omnium-gatherum Of Diversity Into Unity","You'll rejoice at how many kinds of shit there are:\r\ngosling shit (which J. Williams said something\r\nwas as green as), fish shit (the generality), trout\n\r\nshit, rainbow trout shit (for the nice), mullet shit,\r\nsand dab shit, casual sloth shit, elephant shit\r\n(awesome as process or payload), wildebeest shit,\n\r\nhorse shit (a favorite), caterpillar shit (so many dark\r\nkinds, neatl...","February 18, 1926-February 25, 2001",1926.0,2001.0,http://public-domain-poetry.com/a-r-ammons/shit-list-or-omnium-gatherum-of-diversity-into-unity-2698,1926-2001,75.0
65,A. R. Ammons,Corsons Inlet,"I went for a walk over the dunes again this morning\r\nto the sea,\r\nthen turned right along\r\nthe surf\r\nrounded a naked headland\r\nand returned\n\n\r\nalong the inlet shore:\n\n\r\nit was muggy sunny, the wind from the sea steady and high,\r\ncrisp in the running sand,\r\nsome breakthroughs of sun\r\nbut after a bit\n\n\r\ncontinuous overcast:\n\n\r\nthe walk liberating, I was released f...","February 18, 1926-February 25, 2001",1926.0,2001.0,http://public-domain-poetry.com/a-r-ammons/corsons-inlet-2705,1926-2001,75.0
70,Abijah M. Ide,To Isadore,"I\n\r\nBeneath the vine-clad eaves,\r\nWhose shadows fall before\r\nThy lowly cottage door\r\nUnder the lilac�s tremulous leaves,\r\nWithin thy snowy claspe�d hand\r\nThe purple flowers it bore.\r\nLast eve in dreams, I saw thee stand,\r\nLike queenly nymphs from Fairy-land,\r\nEnchantress of the flowery wand,\r\nMost beauteous Isadore!\n\r\nII\n\r\nAnd when I bade the dream\r\nUpon thy spirit...",1825-1873,1825.0,1873.0,http://public-domain-poetry.com/abijah-m-ide/to-isadore-6552,1825-1873,48.0


In [213]:
df['text'] = df['text'].str.replace("�", "'")
df['title'] = df['title'].str.replace("�", "'")

In [214]:
len(df[df['text'].str.contains('�') == True])
len(df[df['title'].str.contains('�') == True])

0

## Estimate Time Period

In [215]:
def get_poetry_period(birth_year): 
    career_start = birth_year + 20
    if career_start <= 2000 and career_start > 1890:
        return "20th Century"
    elif career_start <= 1890 and career_start > 1790:
        return "19th Century"
    elif career_start <= 1790 and career_start > 1700:
        return "18th Century"
    elif career_start <= 1700 and career_start > 1500:
        return "16th-17th Centuries (Early Modern)"
    elif career_start <= 1500 and career_start > 1100:
        return "12th-15th Centuries (Middle English)"
    elif career_start < 1100:
        return "11th Century and Earlier"

In [216]:
df['period'] = df['birth_year'].apply(get_poetry_period)

In [217]:
df['period'].value_counts()

19th Century                            19025
20th Century                             4190
18th Century                             3132
16th-17th Centuries (Early Modern)       2786
12th-15th Centuries (Middle English)      484
Name: period, dtype: int64

In [218]:
df[df['period'] == '19th Century']['author'].value_counts()[:15]

Madison Julius Cawein         1345
Thomas Moore                   853
Thomas Hardy                   655
Rudyard Kipling                638
John Greenleaf Whittier        481
Algernon Charles Swinburne     461
Emily Elizabeth Dickinson      447
John Clare                     382
William Butler Yeats           378
Walt Whitman                   338
Edgar Lee Masters              331
Percy Bysshe Shelley           330
Oliver Wendell Holmes          329
Alfred Lord Tennyson           325
Henry Wadsworth Longfellow     295
Name: author, dtype: int64

In [219]:
df[df['period'] == '20th Century']['author'].value_counts()[:15]

Paul Laurence Dunbar                       417
Walter De La Mare                          329
Robert William Service                     212
John Frederick Freeman                     202
D. H. Lawrence (David Herbert Richards)    189
Vachel Lindsay                             179
Robert Lee Frost                           164
Edward Powys Mathers (As Translator)       120
Gilbert Keith Chesterton                   105
Conrad Potter Aiken                        100
James Stephens                              99
Edna St. Vincent Millay                     94
Rupert Brooke                               94
John Collings Squire, Sir                   90
Alfred Lichtenstein                         86
Name: author, dtype: int64

In [222]:
df[df['period'] == '18th Century']['author'].value_counts()[:15]

William Wordsworth      963
Robert Burns            499
William Cowper          308
Friedrich Schiller      256
William Lisle Bowles    232
Alexander Pope          208
William Blake           141
Anna Seward             126
John Gay                 76
Mark Akenside            73
Oliver Goldsmith         40
William Hayley           36
Samuel Rogers            34
Edward Young             33
Helen Maria Williams     26
Name: author, dtype: int64

In [223]:
df[df['period'] == '16th-17th Centuries (Early Modern)']['author'].value_counts()[:15]

Robert Herrick         1464
Michael Drayton         225
Matthew Prior           187
William Shakespeare     156
Philip Sidney (Sir)     142
John Milton             129
John Dryden             125
Ben Jonson               88
Anne Bradstreet          39
John Wilmot              39
Thomas Carew             37
William Browne           33
Edmund Spenser           22
Joseph Addison           19
Jan Kochanowski          19
Name: author, dtype: int64

In [224]:
df[df['period'] == '12th-15th Centuries (Middle English)']['author'].value_counts()[:15]

Francesco Petrarca (Petrarch)    375
Dante Alighieri                  100
John Gower                         9
Name: author, dtype: int64

## Write to CSV

In [225]:
df['age'] =  df['death_year'] - df['birth_year'] 

In [229]:
df.to_csv('public-domain-poetry-with-ages.csv', index=False, encoding='utf-8')

Drop extra columns for slim version

In [None]:
df = df.drop('age', axis='columns')
df = df.drop('dates', axis='columns')

In [239]:
df.to_csv('public-domain-poetry.csv', index=False, encoding='utf-8')