This notebook scrapes the UC-Santa Barbara presidency site to grab the time line of Obama's presidency, grabs the links, then grabs the speeches. There are about 135 speeches hosted by this site. They also link to some other speeches which are not saved.

In [1]:
# The usual suspects...
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import time # avoid swamping the americanrhetoric website
import os # to check if directory exists and create it if it doesn't
from datetime import date, datetime # to parse speech date

<A HREF="https://www.presidency.ucsb.edu/documents/barack-obama-event-timeline">UCSB timeline and speech links</A>

<A HREF="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas Cheat Sheet</A>

In [2]:
ucsburl = "https://www.presidency.ucsb.edu/documents/barack-obama-event-timeline"
ucsbPage = requests.get(ucsburl, headers={'user-agent': 'Mozilla/5.0'})
soup = BeautifulSoup(ucsbPage.content, 'html.parser')

In [3]:
# <div class="WordSection1">
timeline = soup.find('div', {'class':'WordSection1'})

In [4]:
# Example event
#<tr>
#<td style="border:none; border-bottom:solid #43b4e0 1.0pt; width:125.05pt; padding:0in 5.4pt 0in 5.4pt" valign="top" width="167">
#  <p align="center" style="margin-top:3.0pt; margin-right:0in; margin-bottom:3.0pt; margin-left:0in; text-align:center">
#  <span lang="EN-GB" style="font-size:12.0pt" xml:lang="EN-GB">
#    <span style="color:black">1/22/2009</span>
#  </span>
#  </p>
#</td>
#<td style="border:none; border-bottom:solid #43b4e0 1.0pt; width:299.45pt; padding:0in 5.4pt 0in 5.4pt" valign="top" width="399">
#  <p style="margin-top:3.0pt; margin-right:0in; margin-bottom:3.0pt; margin-left:0in">
#    <span style="font-size:12.0pt">
#      <a href="https://www.presidency.ucsb.edu/documents/executive-order-13491-ensuring-lawful-interrogations">Executive Order 13491, Ensuring Lawful Interrogations</a>.  Directs that detainees in armed conflict shall be treated humanely and not be subject “to violence to life and person” or “outrages to personal dignity.”
#    </span>
#  </p>
#</td>
#</tr>


In [5]:
# Date: <span lang="EN-GB" style="font-size:12.0pt" xml:lang="EN-GB"><span style="color:black">1/22/2009</span></span>
# Event: 

dates = [x.find('span', {'style': 'color:black'}).text for x in timeline.find_all('td', {'width': '167'}) if len(x.find('span', {'style': 'color:black'}).text)>4]
len(dates)

142

<A HREF="https://docs.python.org/3/library/datetime.html#datetime.date">Python datetime documentation</A><BR>
<A HREF="https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes">Format codes</A>

In [6]:
# type(dates[2]) = str
dates[2]
# mm/dd/YYYY

'01/20/2009'

In [7]:
cleanDates = []
for x in dates:
    try:
        cleanDates.append(datetime.strptime(x, '%m/%d/%Y').strftime("%d/%m/%Y"))
    except:
        try:
            cleanDates.append(datetime.strptime(x, '0%m/%d/%Y').strftime("%d/%m/%Y"))
        except:
            try:
                cleanDates.append(datetime.strptime(x[0:10], '%m/%d/%Y').strftime("%d/%m/%Y"))
            except:
                cleanDates.append(datetime.strptime(x[0:10], '%m/%d-%Y').strftime("%d/%m/%Y"))
len(cleanDates)

142

In [8]:
# Event: 

links = [x.find('a',href=True)['href'] for x in timeline.find_all('td', {'width': '399'}) if x.find('a') is not None]
len(links)

142

In [9]:
#font-size:12.0pt
descriptions = [x.find('span', {'style': 'font-size:12.0pt'}).text for x in timeline.find_all('td', {'width': '399'}) if x.find('a') is not None]
len(descriptions)

142

In [10]:
metadata=list(zip(cleanDates, links, descriptions))

In [11]:
df = pd.DataFrame(metadata)
df.to_csv('ucsb_metadata.csv', encoding='utf-8', header=False, index=False)

<A HREF="https://beautiful-soup-4.readthedocs.io/en/latest/">Beautiful Soup Documentation</A>

In [12]:
# Check if Data and associated folders exist for saving speech CSVs to.
# https://djangocentral.com/check-if-a-directory-exists-if-not-create-it/

MYDIRS = ["DataUCSB"]

# If folder doesn't exist, then create it.
for MYDIR in MYDIRS:
    CHECK_FOLDER = os.path.isdir(MYDIR)
    if CHECK_FOLDER:
        print(MYDIR, "folder already exists.")
    else:
        os.makedirs(MYDIR)
        print("created folder : ", MYDIR)

DataUCSB folder already exists.


In [13]:
for speechlink in links:
    # check if this is a ucsb presidency link, there are a couple other sites, don't feel like parsing them now.
    if 'www.presidency.ucsb.edu' in speechlink:
        try:
            speechPage = requests.get(speechlink, headers={'user-agent': 'Mozilla/5.0'})
        except Exception as e:
            print(e, speechlink)
            continue # skip this page and go back to beginning of for loop
        speechsoup = BeautifulSoup(speechPage.content, 'html.parser')

    # speech text is in <div class="field-docs-content">
    try:
        speechHTML = speechsoup.find('div', {'class':'field-docs-content'})
        # Each paragraph is in <p> ... </p>, then remove all html tags, including <em>...</em>
        speechCSV = [re.sub('\<.*?\>', '', str(x)) for x in speechHTML.find_all('p') if speechHTML is not None]
        df = pd.DataFrame(speechCSV)
        # filename is everything to left of all / in speechlink
        filename = 'DataUCSB/' + re.sub("((.*)[\/])", "", speechlink) + '.csv'
        df.to_csv(filename, encoding='utf-8', header=False, index=False)
    except Exception as e:
        print(e, speechlink)
    time.sleep(3) # trying to be nice here!

'NoneType' object has no attribute 'find_all' https://www.presidency.ucsb.edu/statistics/elections/2008
'NoneType' object has no attribute 'find_all' https://www.presidency.ucsb.edu/statistics/elections/2012
HTTPSConnectionPool(host='www.presidency.ucsb.edu', port=443): Max retries exceeded with url: /documents/remarks-georgetown-university-4 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001B10537FFA0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond')) https://www.presidency.ucsb.edu/documents/remarks-georgetown-university-4
[Errno 22] Invalid argument: 'DataUCSB/12-307?_escaped_fragment_=&_escaped_fragment_=.csv' https://www.oyez.org/cases/2012/12-307?_escaped_fragment_=&_escaped_fragment_=
'NoneType' object has no attribute 'find_all' https://www.presidency.ucs