### Scarping President data

- Import libraries

In [77]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from webdriver_manager.firefox import GeckoDriverManager
import re
import dateparser 

- Initializing the driver and collect HTML content of the page using Beautiful soup

In [78]:
driver = webdriver.Firefox()

url = "https://www.britannica.com/topic/Presidents-of-the-United-States-1846696"
driver.get(url)
driver.implicitly_wait(10)

page_source = driver.page_source

presidents = pd.read_html(page_source)[0]

In [79]:
presidents.head()

Unnamed: 0.1,Unnamed: 0,no.,president,birthplace,political party,term
0,,1,George Washington,Va.,Federalist,1789–97
1,,2,John Adams,Mass.,Federalist,1797–1801
2,,3,Thomas Jefferson,Va.,Democratic-Republican,1801–09
3,,4,James Madison,Va.,Democratic-Republican,1809–17
4,,5,James Monroe,Va.,Democratic-Republican,1817–25


- Creating Function to convert term into start and end year

In [80]:
presidents['term'] = presidents.apply(lambda row: re.sub(r"[^\d–]", "", row['term']), axis=1) # replace any non-digit before dash with blank

# first, split the string in the term column using dash as delimiter - store this in new column called 'from'
presidents['start_year'] = presidents['term'].apply(lambda x: x.split("–")[0]) 

# if there is no end year, then the end year is the same as the start year, otherwise get the second element of the split string, this is the end year of the term
presidents['end_year'] = presidents['term'].apply(lambda x: x.split("–")[0] if len(x.split("–"))==1 else x.split("–")[1]) 

# if the end year is only two digits, then add the first two digits of the start year to the end year
presidents['end_year'] = presidents.apply(lambda row:  row['start_year'][:2]+row['end_year'] if len(row['end_year'])==2 else row['end_year'], axis=1) 

presidents

Unnamed: 0.1,Unnamed: 0,no.,president,birthplace,political party,term,start_year,end_year
0,,1,George Washington,Va.,Federalist,1789–97,1789.0,1797.0
1,,2,John Adams,Mass.,Federalist,1797–1801,1797.0,1801.0
2,,3,Thomas Jefferson,Va.,Democratic-Republican,1801–09,1801.0,1809.0
3,,4,James Madison,Va.,Democratic-Republican,1809–17,1809.0,1817.0
4,,5,James Monroe,Va.,Democratic-Republican,1817–25,1817.0,1825.0
5,,6,John Quincy Adams,Mass.,National Republican,1825–29,1825.0,1829.0
6,,7,Andrew Jackson,S.C.,Democratic,1829–37,1829.0,1837.0
7,,8,Martin Van Buren,N.Y.,Democratic,1837–41,1837.0,1841.0
8,,9,William Henry Harrison,Va.,Whig,1841,1841.0,1841.0
9,,10,John Tyler,Va.,Whig,1841–45,1841.0,1845.0


- Appending the start year and End year to presidents dataframe

### Converting it to a csv and saving file

In [81]:
presidents.to_csv("presidents.csv", encoding="utf-8",quotechar="'",index=False)

In [82]:
presidents.rename(columns={'president':'name'}, inplace=True)

In [83]:
presidents.head()

Unnamed: 0.1,Unnamed: 0,no.,name,birthplace,political party,term,start_year,end_year
0,,1,George Washington,Va.,Federalist,1789–97,1789,1797
1,,2,John Adams,Mass.,Federalist,1797–1801,1797,1801
2,,3,Thomas Jefferson,Va.,Democratic-Republican,1801–09,1801,1809
3,,4,James Madison,Va.,Democratic-Republican,1809–17,1809,1817
4,,5,James Monroe,Va.,Democratic-Republican,1817–25,1817,1825


### Loading presidental speeches from the csv file saved

In [84]:
speechs = pd.read_csv("presidential_speeches.csv", encoding="utf-8",quotechar="'")
speechs.shape

(928, 5)

### Matching the names of both the csv files

In [85]:
import difflib 

In [86]:
name_match ={}
for name in presidents['name']:
        closest_match = difflib.get_close_matches(name, speechs['name'], n=1)
        if closest_match:
                name_match[name] = closest_match[0]

presidents['name_match'] = presidents['name'].map(name_match)


### Merging both the csv's.

In [87]:

merged_df = presidents.merge(speechs, left_on='name_match', right_on='name', how='inner')

In [88]:
merged_df[:3]

Unnamed: 0.1,Unnamed: 0,no.,name_x,birthplace,political party,term,start_year,end_year,name_match,name_y,title,date,info,speech
0,,1,George Washington,Va.,Federalist,1789–97,1789,1797,George W. Bush,George W. Bush,"January 15, 2009: Farewell Address to the Nation","January 15, 2009",President George W. Bush gives his farewell ad...,"Fellow citizens:\r For eight years, it has ..."
1,,1,George Washington,Va.,Federalist,1789–97,1789,1797,George W. Bush,George W. Bush,"January 12, 2009: Final Press Conference","January 12, 2009","In his last press conference as president, Pre...",THE PRESIDENT: Thank you. Tapper. We have b...
2,,1,George Washington,Va.,Federalist,1789–97,1789,1797,George W. Bush,George W. Bush,"December 19, 2008: Remarks on Plan to Assist A...","December 19, 2008","From the White House's Roosevelt Room, Preside...","Good morning. For years, America's automake..."


### Collecting all the required information from the merged dataframe and converting it into a new csv for analysis

In [89]:
president_speech_df = pd.DataFrame()

In [90]:
president_speech_df['Name'] = merged_df['name_x']
president_speech_df['Political Party'] = merged_df['political party']
president_speech_df['Speech'] = merged_df['speech']
president_speech_df['Date'] = merged_df['date']
president_speech_df['Start_year'] = merged_df['start_year']
president_speech_df['End_year'] = merged_df['end_year']
president_speech_df['Title'] = merged_df['title']

In [91]:
president_speech_df.to_csv("A1_data_presidents.csv", encoding="utf-8",quotechar="'",index=False)

In [92]:
president_speech_df

Unnamed: 0,Name,Political Party,Speech,Date,Start_year,End_year,Title
0,George Washington,Federalist,"Fellow citizens:\r For eight years, it has ...","January 15, 2009",1789,1797,"January 15, 2009: Farewell Address to the Nation"
1,George Washington,Federalist,THE PRESIDENT: Thank you. Tapper. We have b...,"January 12, 2009",1789,1797,"January 12, 2009: Final Press Conference"
2,George Washington,Federalist,"Good morning. For years, America's automake...","December 19, 2008",1789,1797,"December 19, 2008: Remarks on Plan to Assist A..."
3,George Washington,Federalist,Thank you very much. Please be seated. Than...,"November 13, 2008",1789,1797,"November 13, 2008: Speech on Financial Markets..."
4,George Washington,Federalist,"A short time ago, the House of Representati...","October 03, 2008",1789,1797,"October 3, 2008: Remarks on Emergency Economic..."
...,...,...,...,...,...,...,...
1008,Joe Biden,Democratic,\r THE PRESIDENT: Thank you. (Applause.) Th...,"April 28, 2021",2021,,"April 28, 2021: Address to Joint Session of Co..."
1009,Joe Biden,Democratic,THE PRESIDENT: Thank you. (Applause.) (Inau...,"March 31, 2021",2021,,"March 31, 2021: Announcing the American Jobs Plan"
1010,Joe Biden,Democratic,"THE PRESIDENT: Please, please sit down. Tha...","March 25, 2021",2021,,"March 25, 2021: First Press Conference"
1011,Joe Biden,Democratic,"THE PRESIDENT: Good evening, my fellow Amer...","March 11, 2021",2021,,"March 11, 2021: Remarks on the Anniversary of ..."
