This notebook scrapes the UC-Santa Barbara presidency site to grab the time line of Obama's presidency, grabs the links, then grabs the speeches. There are about 135 speeches hosted by this site. They also link to some other speeches which are not saved.

14 March 2023: Repurposed to get George W Bush speeches from UCSB, the HTML code of which was slightly different than the Obama page

In [1]:
# The usual suspects...
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import time # avoid swamping the americanrhetoric website
import os # to check if directory exists and create it if it doesn't
from datetime import date, datetime # to parse speech date

<A HREF="https://www.presidency.ucsb.edu/documents/barack-obama-event-timeline">UCSB timeline and speech links</A>

<A HREF="https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf">Pandas Cheat Sheet</A>

In [2]:
#ucsburl = "https://www.presidency.ucsb.edu/documents/barack-obama-event-timeline"
ucsburl = "https://www.presidency.ucsb.edu/documents/george-w-bush-event-timeline"
ucsbPage = requests.get(ucsburl, headers={'user-agent': 'Mozilla/5.0'})
soup = BeautifulSoup(ucsbPage.content, 'html.parser')

In [62]:
# Obama : <div class="WordSection1">
#timeline = soup.find('div', {'class':'WordSection1'})
# gwb : <div class="field-docs-content">
timeline = soup.find('div', {'class':'field-docs-content'})
#timeline = [x for x in timeline.find_all('tr') if str(x).find('https://www.presidency.ucsb.edu/')>0]

In [60]:
# Example event
#<tr>
#<td style="border:none; border-bottom:solid #43b4e0 1.0pt; width:125.05pt; padding:0in 5.4pt 0in 5.4pt" valign="top" width="167">
#  <p align="center" style="margin-top:3.0pt; margin-right:0in; margin-bottom:3.0pt; margin-left:0in; text-align:center">
#  <span lang="EN-GB" style="font-size:12.0pt" xml:lang="EN-GB">
#    <span style="color:black">1/22/2009</span>
#  </span>
#  </p>
#</td>
#<td style="border:none; border-bottom:solid #43b4e0 1.0pt; width:299.45pt; padding:0in 5.4pt 0in 5.4pt" valign="top" width="399">
#  <p style="margin-top:3.0pt; margin-right:0in; margin-bottom:3.0pt; margin-left:0in">
#    <span style="font-size:12.0pt">
#      <a href="https://www.presidency.ucsb.edu/documents/executive-order-13491-ensuring-lawful-interrogations">Executive Order 13491, Ensuring Lawful Interrogations</a>.  Directs that detainees in armed conflict shall be treated humanely and not be subject “to violence to life and person” or “outrages to personal dignity.”
#    </span>
#  </p>
#</td>
#</tr>


In [68]:
# for gwb page
dates = []
links = []
descriptions = []
for x in timeline.find_all('tr'):
    if str(x).find('https://www.presidency.ucsb.edu/')>0:
        date = [da.find('span', {'style': 'color:black'}).text for da in x.find_all('td', {'width': '192'}) \
         if da.find('span', {'style': 'color:black'}) != None]
        link = [l.find('a',href=True)['href'] for l in x.find_all('td', {'width': '375'}) if l.find('a') is not None]
        desc = [de.text for de in x.find_all('td', {'width': '375'}) if de.find('a') is not None]
        dates.append(date)
        links.append(link)
        descriptions.append(desc)
        
print(len(dates), len(links), len(descriptions))

99 99 99


In [63]:
# Date: <span lang="EN-GB" style="font-size:12.0pt" xml:lang="EN-GB"><span style="color:black">1/22/2009</span></span>
# Event: 
# Width: Obama: 167, gwb: 192
dates = [x.find('span', {'style': 'color:black'}).text for x in timeline.find_all('td', {'width': '192'}) \
         if x.find('span', {'style': 'color:black'}) != None and \
         len(x.find('span', {'style': 'color:black'}).text)>4]
len(dates)

0

<A HREF="https://docs.python.org/3/library/datetime.html#datetime.date">Python datetime documentation</A><BR>
<A HREF="https://docs.python.org/3/library/datetime.html#strftime-and-strptime-format-codes">Format codes</A>

In [69]:
# type(dates[2]) = str
dates[2]
# mm/dd/YYYY

['11/09/2000']

In [87]:
# manually fix some dates in gwb timeline
[i for i, x in enumerate(dates) if len(x)>0 and len(x[0])<10]

[4, 19, 48, 58]

In [89]:
dates[4] = ['01/22/2001']
dates[19] = ['01/29/2002']
dates[48] = ['02/02/2005']
dates[58] = ['01/31/2006']

In [108]:
# Clean out empty entries
import copy # for deep copy
newdates = []
newlinks = []
newdescr = []
for i, x in enumerate(dates):
    if len(x) > 0:
        newdates.append(x[0])
        newlinks.append(links[i][0])
        newdescr.append(descriptions[i][0])

# OK, that looks good, copy back to match the rest of the code
dates = copy.deepcopy(newdates)
links = copy.deepcopy(newlinks)
descriptions = copy.deepcopy(newdescr)

In [127]:
cleanDates = []
for x in dates:
    try:
        cleanDates.append(datetime.strptime(x, '%m/%d/%Y').strftime("%Y/%m/%d"))
    except:
        try:
            cleanDates.append(datetime.strptime(x, '0%m/%d/%Y').strftime("%Y/%m/%d"))
        except:
            try:
                cleanDates.append(datetime.strptime(x[0:10], '%m/%d/%Y').strftime("%Y/%m/%d"))
            except:
                cleanDates.append(datetime.strptime(x[0:10], '%m/%d-%Y').strftime("%Y/%m/%d"))
len(cleanDates)

95

In [110]:
# Event: 
# Width Obama: 399, gwb: 375
#links = [x.find('a',href=True)['href'] for x in timeline.find_all('td', {'width': '375'}) if x.find('a') is not None]
len(links)

95

In [117]:
#font-size:12.0pt
# Width Obama: 399, gwb: 375
# for the Obama page
#descriptions = [x.find('span', {'style': 'font-size:12.0pt'}).text for x in timeline.find_all('td', {'width': '337'}) if x.find('a') is not None]
# for the gwb page
descriptions = [d.replace('\xa0', '') for d in descriptions]
descriptions = [d.replace('\n', '') for d in descriptions]
descriptions
len(descriptions)                  
#

95

In [128]:
metadata=list(zip(cleanDates, links, descriptions))

In [129]:
columns = ['date', 'link', 'description']
mddf = pd.DataFrame(metadata, columns = columns)
#df.to_csv('./Data/genData/ucsb_metadata.csv', encoding='utf-8', header=False, index=False)

mddf.to_csv('./Data/genData/ucsb_metadata_gwb.csv', encoding='utf-8', index=False)

<A HREF="https://beautiful-soup-4.readthedocs.io/en/latest/">Beautiful Soup Documentation</A>

In [120]:
# Check if Data and associated folders exist for saving speech CSVs to.
# https://djangocentral.com/check-if-a-directory-exists-if-not-create-it/

#MYDIRS = ["DataUCSB"]
MYDIRS = ["GWB"]

# If folder doesn't exist, then create it.
for MYDIR in MYDIRS:
    CHECK_FOLDER = os.path.isdir(MYDIR)
    if CHECK_FOLDER:
        print(MYDIR, "folder already exists.")
    else:
        os.makedirs(MYDIR)
        print("created folder : ", MYDIR)

created folder :  GWB


In [122]:
textinfo = []
i=0
for j, speechlink in enumerate(links):
    # check if this is a ucsb presidency link, there are a couple other sites, don't feel like parsing them now.
    if 'www.presidency.ucsb.edu' in speechlink:
        try:
            speechPage = requests.get(speechlink, headers={'user-agent': 'Mozilla/5.0'})
        except Exception as e:
            print(e, speechlink)
            continue # skip this page and go back to beginning of for loop
        speechsoup = BeautifulSoup(speechPage.content, 'html.parser')

    # speech text is in <div class="field-docs-content">
    try:
        speechHTML = speechsoup.find('div', {'class':'field-docs-content'})
        text = speechHTML.text.replace('\n', ' ').lstrip().rstrip()
        # Each paragraph is in <p> ... </p>, then remove all html tags, including <em>...</em>
        speechCSV = [re.sub('\<.*?\>', '', str(x)) for x in speechHTML.find_all('p') if speechHTML is not None]
        df = pd.DataFrame(speechCSV)
        # filename is everything to left of all / in speechlink
        #filename = 'DataUCSB/' + re.sub("((.*)[\/])", "", speechlink) + '.csv'
        #textfilename = 'DataUCSB/' + re.sub("((.*)[\/])", "", speechlink) + '.txt'
        filename = 'GWB/' + re.sub("((.*)[\/])", "", speechlink) + '.csv'
        textfilename = 'GWB/' + re.sub("((.*)[\/])", "", speechlink) + '.txt'
        df.to_csv(filename, encoding='utf-8', header=False, index=False)
        # Write a text file
        text_file = open(textfilename, 'w')
        text_file.write(text)
        text_file.close()
        textinfo.append([metadata[i][0], textfilename])
    except Exception as e:
        print(e, speechlink, dates[j])
    time.sleep(5) # trying to be nice here!
    i+=1


'NoneType' object has no attribute 'text' https://www.presidency.ucsb.edu/statistics/elections/2000 11/07/2000
'NoneType' object has no attribute 'text' https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/documents-related-the-2000-election-dispute/1109 11/09/2000
'NoneType' object has no attribute 'text' https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/documents-related-the-2000-election-dispute 11/09/2000
'NoneType' object has no attribute 'text' https://www.presidency.ucsb.edu/node/332343/ 11/05/2002
'NoneType' object has no attribute 'text' https://www.presidency.ucsb.edu/node/332343/ 11/07/2006
'NoneType' object has no attribute 'text' https://www.presidency.ucsb.edu/statistics/elections/2008 11/04/2008


In [141]:
remove_dates=['2000/11/07','2000/11/09','2002/11/05','2006/11/07','2008/11/04']
mddf=mddf.drop(mddf[mddf['date'].isin(remove_dates)].index).reset_index(drop=True)
mddf.to_csv('./Data/genData/ucsb_metadata_gwb.csv', encoding='utf-8', index=False)

In [142]:
textdf = pd.DataFrame(textinfo)
textdf.to_csv('./Data/genData/speech_and_date_gwb.csv', encoding='utf-8', index=False)