## Scraping Presidential Speeches

- Import Libraries

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.common.by import By
import re
import dateparser
import requests

- Initialize the driver and collect the HTML content of the page using Beautiful Soup

In [2]:
# Open the Miller Center's presidential speeches page in a fresh Firefox session
url = "https://millercenter.org/the-presidency/presidential-speeches"
driver = webdriver.Firefox()
driver.get(url)

# Keep jumping to the bottom of the page so that more content can load; stop once
# the document height is the same before and after a scroll (nothing new appeared).
scroll_pause_time = 4  # seconds to wait after each scroll before re-measuring the height
initialcoord = driver.execute_script("return document.body.scrollHeight")
height_unchanged = False
while not height_unchanged:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pause_time)
    new_coord = driver.execute_script("return document.body.scrollHeight")
    height_unchanged = new_coord == initialcoord
    initialcoord = new_coord

In [3]:
# Capture the HTML of the fully scrolled page, then shut the browser down.
# The driver is only needed to obtain this snapshot (the individual speech pages
# are fetched with `requests`), so it is quit here to avoid leaking a Firefox
# process. NOTE: re-running this cell requires re-running the driver cell first.
try:
    page_source = driver.page_source
finally:
    driver.quit()

# Parse the captured HTML so the speech links can be extracted from it
bsobject_linkpage = bs(page_source, 'html.parser')

In [4]:
# Gather the href of every anchor whose link contains 'presidential-speeches/'
speech_link_pattern = re.compile('presidential-speeches/')
speech_url_list = [
    anchor["href"]
    for anchor in bsobject_linkpage.find_all("a", href=speech_link_pattern)
]

In [5]:
# Sanity check: show the first collected speech URL
speech_url_list[0:1]

['https://millercenter.org/the-presidency/presidential-speeches/february-21-2023-remarks-one-year-anniversary-ukraine-war']

### For each URL scraped from the Miller Center speeches page, fetch the page and extract the name, title, date, about text, and speech.

In [6]:
# Seconds to wait for a speech page before giving up on it, so that a single
# unresponsive request cannot stall the whole scrape.
REQUEST_TIMEOUT = 30


def _text_or_default(soup, tag, css_class, default):
    """Return the right-stripped text of the first `tag` with class `css_class`, or `default` if absent."""
    element = soup.find(tag, class_=css_class)
    return default if element is None else element.text.rstrip()


def _transcript_text(soup):
    """Return the transcript text of a speech page, or a placeholder if none is found.

    The 'transcript-inner' div is preferred (its text is returned as-is); if it is
    missing, the right-stripped text of the 'view-transcript' div is used instead.
    """
    transcript = soup.find('div', class_="transcript-inner")
    if transcript is not None:
        return transcript.text
    return _text_or_default(soup, 'div', "view-transcript", "No speech available")


# Parallel lists: position i of each list describes the same speech page.
title, speech, name, date, about = ([] for _ in range(5))

for link in speech_url_list:
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as error:
        # Network errors and timeouts skip this page instead of aborting the loop.
        print(f"Failed to retrieve content from {link}: {error}")
        continue

    # Only successful responses (status code 200) are parsed.
    if response.status_code != 200:
        print(f"Failed to retrieve content. Status code: {response.status_code}")
        continue

    soup = bs(response.text, 'html.parser')

    # Each field falls back to a placeholder so that all five lists stay aligned.
    name.append(_text_or_default(soup, 'p', "president-name", "No name available"))
    title.append(_text_or_default(soup, 'h2', "presidential-speeches--title", "No title available"))
    date.append(_text_or_default(soup, 'p', "episode-date", "No date available"))
    about.append(_text_or_default(soup, 'div', "about-sidebar--intro", "No about available"))

    # Replace the "Transcript" label and newlines with spaces in the speech text.
    speech.append(re.sub("Transcript|\\n", " ", _transcript_text(soup)))

### Convert the collected data into a DataFrame

In [None]:
# All five lists must have exactly one entry per scraped speech. Fail loudly on a
# mismatch instead of silently skipping the DataFrame, which would leave
# `speeches_presidents` undefined (or stale from an earlier run) for the next cell.
column_lengths = {
    'name': len(name),
    'title': len(title),
    'date': len(date),
    'info': len(about),
    'speech': len(speech),
}
if len(set(column_lengths.values())) > 1:
    raise ValueError(f"Scraped columns have mismatched lengths: {column_lengths}")

# One row per speech, with the columns in a fixed order.
speeches_presidents = pd.DataFrame(
    {'name': name, 'title': title, 'date': date, 'info': about, 'speech': speech},
    columns=['name', 'title', 'date', 'info', 'speech'],
)

### Save the collected data to a CSV file

In [None]:
# Write the speeches to a UTF-8 CSV file without the index column.
# Fields are quoted with a single quote (') instead of pandas' default double quote.
speeches_presidents.to_csv(
    "presidential_speeches.csv",
    index=False,
    encoding="utf-8",
    quotechar="'",
)