# US presidential statistics
## Simon Repko, Lukas Petrasek
### IES FSS CU
### 31.5.2019

This notebook serves as a demonstration of a school project whose goal is to achieve the following:
* scrape web pages to get historical data on US presidents
* manipulate the data into a form suitable for being visualized
* make visualizations based on the data

In [None]:
# TODO: import packages

In [None]:
from typing import Any, Dict, Optional, Type

from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import plotly
import requests



# initialize plotly's offline notebook mode before any of the iplot calls used for the charts
plotly.offline.init_notebook_mode(connected=True)

In [None]:
# TODO: initialize the scraping class and apply the methods necessary to get the data here

In [None]:
def get_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    '''
    Downloads the page at the given url and returns it parsed as soup.

    Args:
        url: Address of the page to download.
        timeout: Number of seconds to wait for the server before requests gives up 
            and raises a timeout error. Without a timeout a single stalled 
            connection would block the whole scraping run indefinitely.

    Returns:
        The body of the response parsed with the built-in 'html.parser'.
    '''
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')



class MillerScraper:
    '''
    Class for scraping the Miller Center webpage to get data about US presidents. 

    Particularly, it serves for scraping data on facts and characteristics (self.fast_facts), 
    brief descriptions (self.descriptions), famous quotes (self.famous_quotes), and counts of 
    notable events that happened during each president's time in office 
    (self.key_events_counts).
    '''

    def __init__(self):
        ''' Initializes the scraping class; all the data attributes start out as None. '''
        # address of the home page; the presidents' subdirectories are read from its menu
        self.origin: str = 'https://millercenter.org/'

        # president's name -> subdirectory of the president's page, see get_subdirectories
        self.subdirectories: Optional[Dict[str, Any]] = None

        self.fast_facts: Optional[Dict[str, Any]] = None
        self.descriptions: Optional[Dict[str, Any]] = None
        self.famous_quotes: Optional[Dict[str, Any]] = None
        self.key_events_counts: Optional[Dict[str, Any]] = None

        # annotated with the DataFrame type itself (not an instance of it)
        self.all_presidents_data: Optional[pd.DataFrame] = None


    def get_subdirectories(self) -> None:
        '''
        Creates a dictionary with the presidents' names as keys and their respective 
        subdirectories as values, then saves the dictionary to self.subdirectories.
        '''
        # parse the given origin (web address) utilizing BeautifulSoup 
        soup = get_soup(self.origin)
        # enter the main navigation panel and find the submenu that contains the links 
        # (subdirectories) to individual pages of US presidents; it is the second 
        # 'submenu' list of the panel
        navigation_menu = soup.find('nav', {'aria-labelledby': 'block-mainnavigation-3-menu'})
        submenu = navigation_menu.find_all('ul', {'class': 'submenu'})[1]

        # each a_block represents one president: its text is the president's name and 
        # its href is the subdirectory of the president's page
        self.subdirectories = {
            a_block.text: a_block['href'] for a_block in tqdm(submenu.find_all('a'))
        }



def getDataMiller(html):
    '''
    Scrapes the Miller Center website and collects data on every US president.

    For each president linked from the main navigation menu the function gathers the 
    'fast facts', the brief description, the famous quote and the number of major 
    events listed for the time of the president's office.

    Args:
        html: Address of the Miller Center home page (e.g. 'https://millercenter.org/'). 
            The links found on the page are resolved relative to this address.

    Returns:
        Dictionary with the presidents' names as keys. Each value is a dictionary 
        holding the fast facts (under their labels) and the keys 'Description', 
        'Quote' and 'Number of major events'.
    '''
    # imported here because the notebook's import cell does not provide urljoin
    from urllib.parse import urljoin

    soup = get_soup(html)

    # the main navigation panel holds several similar lists; the second 'submenu' is 
    # the one with the presidents' names and the subdirectories of their pages
    name_list = soup.find(
        'nav', {'aria-labelledby': 'block-mainnavigation-3-menu'}
    ).find_all('ul', {'class': 'submenu'})[1]
    # names (keys) and subdirectories (values) of all the presidents
    pres_dict = {pres.text: pres['href'] for pres in name_list.find_all('a')}

    data_presidents = {}
    for name, href in pres_dict.items():
        # page of the given president
        soup = get_soup(urljoin(html, href))

        #1 FAST FACTS dashboard of the president
        details = soup.find(
            'div', {'class': 'president-main-wrapper'}
        ).find('div', {'class': 'fast-facts-wrapper'})
        # drop the newline text nodes, then skip the first div as it holds the description
        relevant_details = [x for x in details.children if x != '\n'][1:]
        fast_facts = {det.label.text: det.div.text for det in relevant_details}

        #2 brief description of the president
        description = soup.find('div', {'class': 'copy-wrapper'}).p.text

        #3 famous quote of the president
        quote = soup.find('blockquote', {'class': 'president-quote'}).contents[0]

        #4 number of KEY EVENTS that happened during the office; the second link of the 
        #  sub-navigation leads to the page listing them
        key_events = soup.find('div', {'class': 'sub-nav-region'}).find_all('a')[1]
        key_events_soup = get_soup(urljoin(html, key_events['href']))
        events_body = key_events_soup.find('div', {'class': 'article-wysiwyg-body'})
        if events_body is None:
            # some pages (D. Trump at the time of writing) list no major events
            events_count = 0
        else:
            # event titles are highlighted in bold; the notation changed over time from 
            # <strong> to <b>, hence both are counted
            events_count = (
                len(events_body.find_all('strong')) + len(events_body.find_all('b'))
            )

        # merge everything gathered for the president into one dictionary
        data_presidents[name] = {
            **fast_facts,
            'Description': description,
            'Quote': quote,
            'Number of major events': events_count,
        }

    return data_presidents

In [None]:
def getjoinSoupPotus(link):
    ''' Returns soup for the potus.com page whose address ends with the given link. '''
    page = requests.get('https://www.potus.com/' + link)
    return BeautifulSoup(page.text, 'html.parser')

def getDataPotus(html, typeofreturn):
    '''
    Scrapes potus.com for either the presidential salaries or the election results.

    Args:
        html: Address of the potus.com page listing all the presidents.
        typeofreturn: Either 'salary' or 'election'; selects what is scraped and returned.

    Returns:
        For 'salary': dictionary with the presidents' names as keys and the text of 
        the salary paragraph as values (strings, as they include additional information).
        For 'election': dictionary with election years as keys; each value maps a 
        candidate's name to a dictionary with the keys 'popular_votes' and 
        'electoral_votes' ('popular_votes' is '' for tables without that column).

    Raises:
        ValueError: If typeofreturn is neither 'salary' nor 'election'. This is checked 
            before any page is downloaded.
    '''
    if typeofreturn not in ('salary', 'election'):
        raise ValueError(
            "typeofreturn must be 'salary' or 'election', got {!r}".format(typeofreturn)
        )

    soup = get_soup(html)

    # names (keys) and link 'endings' (values) of all the presidents
    pres_list = {
        pres.find('img')['alt']: pres['href']
        for pres in soup.find_all('a', {'target': '_self'})
    }
    pres_list.pop('Facts About the Presidents')  # the first entry is a title, not a president

    elec_results = {}  # election results grouped by years
    salary = {}        # salary text of each president

    for name, href in pres_list.items():
        soup = getjoinSoupPotus(href)

        if typeofreturn == 'salary':
            # Benjamin Harrison's page has a trailing space in the label, hence the 
            # second variant is tried when the first one is not found
            label = (
                soup.find(string="Presidential Salary:")
                or soup.find(string="Presidential Salary: ")
            )
            salary[name] = label.find_parent('p').text
            continue

        # typeofreturn == 'election': tables of presidential election results
        elections = soup.find(string="Presidential Election Results:").find_parent('div')

        for tbl in elections.find_all('table'):  # one table per election year
            year = tbl.find('tr', {'class': 'row-2'}).find('a').text

            # tables for elections further in the past have only three header columns 
            # as they do not include 'popular votes'; checked once per table, on the 
            # header of this very table
            has_popular_votes = len(tbl.find('tr').find_all('th')) != 3

            electee_list = {}
            for electee in tbl.find_all('tr'):  # one row per candidate
                try:
                    electee_name = electee.find('td', {'class': 'column-2'}).a.text
                    if has_popular_votes:
                        electee_votes = {
                            'popular_votes': electee.find('td', {'class': 'column-3'}).text,
                            'electoral_votes': electee.find('td', {'class': 'column-4'}).text,
                        }
                    else:
                        electee_votes = {
                            'popular_votes': "",
                            'electoral_votes': electee.find('td', {'class': 'column-3'}).text,
                        }
                except AttributeError:
                    # the header row has no candidate cells, skip it
                    continue

                electee_list[electee_name] = electee_votes

            elec_results[year] = electee_list  # results for the given year

    return elec_results if typeofreturn == 'election' else salary
        # dictionary_3 -> contains information about salaries of each president (strings as it includes additional information)

In [None]:
# scrape the Miller Center data for all presidents (execution time over 1 minute)
data_presidents = getDataMiller('https://millercenter.org/')

In [None]:
# scrape the election results from potus.com (execution time over 1 minute 35 sec);
# every president's page is downloaded during the call
data_elections = getDataPotus('https://www.potus.com/','election')

In [None]:
# scrape the salaries from potus.com (execution time over 1 minute 35 sec);
# every president's page is downloaded again during this call
data_salary = getDataPotus('https://www.potus.com/','salary')

In [None]:
# TODO: manipulate the data here

In [None]:
# Grover Cleveland was in office 2 non-consecutive times, so he gets a second entry:
# a copy of his data without the two date fields, which are filled in per term by
# correct_Grover_Cleveland_dates below
data_presidents['Grover Cleveland 2'] = {
    key: value for key, value in data_presidents['Grover Cleveland'].items() if key not in ['Inauguration Date', 'Date Ended']
}

def correct_Grover_Cleveland_dates(data: Dict[str, Any]) -> Dict[str, Any]:
    '''
    Splits Grover Cleveland's two-term date fields between his two entries.

    The 'Inauguration Date' and 'Date Ended' values of 'Grover Cleveland' are split on 
    newlines; the piece at index 1 is kept for 'Grover Cleveland' and the piece at 
    index 3 is written to 'Grover Cleveland 2'. The given dictionary is modified in 
    place and also returned.
    '''
    both_terms = data['Grover Cleveland']

    start_pieces = both_terms['Inauguration Date'].split('\n')
    first_start, second_start = start_pieces[1], start_pieces[3]
    end_pieces = both_terms['Date Ended'].split('\n')
    first_end, second_end = end_pieces[1], end_pieces[3]

    # all the values are extracted above before anything is overwritten
    updates = (
        ('Grover Cleveland', 'Inauguration Date', first_start),
        ('Grover Cleveland 2', 'Inauguration Date', second_start),
        ('Grover Cleveland', 'Date Ended', first_end),
        ('Grover Cleveland 2', 'Date Ended', second_end),
    )
    for entry, field, value in updates:
        data[entry][field] = value

    return data

# run only once: the function expects the date fields to still hold both terms separated
# by newlines and indexes into the split pieces, so a second run on the already
# corrected data would presumably fail with an IndexError
data_presidents = correct_Grover_Cleveland_dates(data_presidents)

In [None]:
# Final table with data extracted from https://millercenter.org:
# newlines are removed from all string values and the table is transposed so that
# the fields (e.g. 'Birth Date') can be accessed as columns

data = pd.DataFrame(data_presidents).applymap(lambda x: x.replace('\n', '') if isinstance(x, str) else x).T

In [None]:
# columns of the table grouped by the type their values are converted to
str_variables = {
    'Birth Place', 
    'Burial Place', 
    'Career', 
    'Children',
    'Description', 
    'Education', 
    'Full Name',
    'Marriage',
    'Nickname',
    'Political Party', 
    'Quote',
    'Religion'
}

int_variables = {
    'Number of major events',
    'President Number'
}

timestamp_variables = {
    'Birth Date',
    'Date Ended', 
    'Death Date',
    'Inauguration Date'
}

# convert each group of columns value by value with the converter of the group
for columns, converter in (
    (str_variables, str),
    (int_variables, int),
    (timestamp_variables, pd.Timestamp),
):
    for column in columns:
        data[column] = data[column].map(converter)

In [None]:
# age at inauguration in years, computed from the number of days between birth and inauguration
# NOTE(review): dividing by 365 ignores leap days, so the ages come out slightly overstated
data['Years at Inauguration'] = (data['Inauguration Date'] - data['Birth Date']).map(lambda x: x.days / 365)

In [None]:
# TODO: make the visualizations here

In [None]:
def plot_years_at_inauguration():
    '''
    Plots the presidents' age at inauguration (in years) against the inauguration date.

    Reads the global table `data` and displays the chart with plotly's offline iplot.
    '''
    # the 'lines+markers' trace connects the points in row order and the table is not 
    # chronological ('Grover Cleveland 2' is appended as the last row), so the rows 
    # are sorted by date first to keep the line from jumping back in time
    ordered = data.sort_values('Inauguration Date')

    trace = plotly.graph_objs.Scatter(
        x = ordered['Inauguration Date'],
        y = ordered['Years at Inauguration'],
        mode = 'lines+markers',
        name = 'chart_a',
        marker = {'size': 7},
    )

    layout = plotly.graph_objs.Layout(
        title = 'Years at Inauguration',
        yaxis = {'title': 'Years at Inauguration'},
        xaxis = {'title': 'Inauguration Date'},
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = [trace],
        layout = layout
    ))

plot_years_at_inauguration()

In [None]:
def plot_number_of_major_events():
    '''
    Plots the number of major events of each presidency against the inauguration date.

    Reads the global table `data` and displays the chart with plotly's offline iplot.
    '''
    # the 'lines+markers' trace connects the points in row order and the table is not 
    # chronological ('Grover Cleveland 2' is appended as the last row), so the rows 
    # are sorted by date first to keep the line from jumping back in time
    ordered = data.sort_values('Inauguration Date')

    trace = plotly.graph_objs.Scatter(
        x = ordered['Inauguration Date'],
        y = ordered['Number of major events'],
        mode = 'lines+markers',
        name = 'chart_a',
        marker = {'size': 7},
    )

    layout = plotly.graph_objs.Layout(
        title = 'Number of major events',
        yaxis = {'title': 'Number of major events'},
        xaxis = {'title': 'Inauguration Date'},
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = [trace],
        layout = layout
    ))

plot_number_of_major_events()

In [None]:
# TODO: conclude here?