# US presidential statistics
## Simon Repko, Lukas Petrasek
### IES FSS CU
### 31.5.2019

This notebook serves as a demonstration of a school project whose goal is to achieve the following:
* scrape web pages to get historical data on US presidents
* manipulate the data into a form suitable for being visualized
* make visualizations based on the data

In [None]:
# TODO: import packages

In [None]:
from typing import Any, Dict, Optional, Type

from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import plotly
import requests



# Set up plotly for offline rendering of figures inside the notebook.
# connected=True makes plotly.js load from the online CDN instead of being embedded in the notebook.
plotly.offline.init_notebook_mode(connected=True)

In [None]:
# TODO: initialize the scraping class and apply the methods necessary to get the data here

In [None]:
def get_soup(url: str) -> BeautifulSoup:
    '''
    Downloads the page at the given url and returns it parsed as soup.

    The page is fetched with a plain GET request and its text is parsed with
    BeautifulSoup using the 'html.parser' backend.
    '''
    response = requests.get(url)
    page_source = response.text
    return BeautifulSoup(page_source, 'html.parser')



class MillerScraper:
    '''
    Class for scraping the Miller Center webpage to get data about US presidents.

    Particularly, it serves for scraping data on facts and characteristics (self.fast_facts),
    brief descriptions (self.descriptions), famous quotes (self.famous_quotes), and counts of
    notable events that happened during each president's time in office
    (self.key_events_counts).

    get_subdirectories() has to be called first: every other get_* method iterates over
    self.subdirectories and downloads one page per president.
    '''

    def __init__(self):
        ''' Initializes the scraping class. '''
        self.origin: str = 'https://millercenter.org/'

        # president's name -> subdirectory (href) of the president's page
        self.subdirectories: Optional[Dict[str, Any]] = None

        # results of the individual get_* methods, all keyed by the president's name
        self.fast_facts: Optional[Dict[str, Any]] = None
        self.descriptions: Optional[Dict[str, Any]] = None
        self.famous_quotes: Optional[Dict[str, Any]] = None
        self.key_events_counts: Optional[Dict[str, Any]] = None

        # annotated with the DataFrame type itself (not an instance created by calling it)
        self.all_presidents_data: Optional[pd.DataFrame] = None


    def _iter_president_soups(self, suffix: str = ''):
        '''
        Yields a (president, soup) pair for every entry of self.subdirectories.

        The soup is parsed from the address self.origin + subdirectory + suffix, so one
        page is downloaded per president. The progress is reported through tqdm.
        '''
        for president, subdirectory in tqdm(self.subdirectories.items()):
            # parse the given path (web address) utilizing BeautifulSoup
            yield president, get_soup(self.origin + subdirectory + suffix)


    def get_subdirectories(self) -> None:
        '''
        Creates a dictionary with the presidents' names as keys and their respective
        subdirectories as values, then saves the dictionary to self.subdirectories.
        '''
        # parse the given origin (web address) utilizing BeautifulSoup
        soup = get_soup(self.origin)
        # enter the main navigation panel and find submenu that contains the links
        # (subdirectories) to individual pages of US presidents
        navigation_menu = soup.find('nav', {'aria-labelledby':'block-mainnavigation-3-menu'})
        submenu = navigation_menu.find_all('ul', {'class':'submenu'})[1]
        a_blocks = submenu.find_all('a')

        # each a_block represents one president, extract the subdirectory and save
        # it under the president's name
        self.subdirectories = {a_block.text: a_block['href'] for a_block in tqdm(a_blocks)}


    def get_fast_facts(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the relevant fast facts into self.fast_facts.

        The result maps each president's name to a dictionary of label -> value.
        '''
        fast_facts = {}
        for president, soup in self._iter_president_soups():
            # navigate through the soup to get to the part relevant for fast facts
            president_page = soup.find('div', {'class':'president-main-wrapper'})
            fast_facts_dashboard = president_page.find('div', {'class':'fast-facts-wrapper'})

            # avoiding redundant elements (containing '\n')
            relevant_fast_facts = [x for x in fast_facts_dashboard.children if x != '\n']
            # popping the first element which contains just the 'Fast Facts' heading
            relevant_fast_facts.pop(0)

            # save each fast fact under its label into the dict of the given president
            fast_facts[president] = {
                fast_fact.label.text: fast_fact.div.text for fast_fact in relevant_fast_facts
            }

        self.fast_facts = fast_facts


    def get_descriptions(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the relevant description into self.descriptions.
        '''
        descriptions = {}
        for president, soup in self._iter_president_soups():
            # navigate through the soup to get to the part relevant for the description
            description_paragraph = soup.find('div', {'class':'copy-wrapper'})

            # save the description into the dict with descriptions
            descriptions[president] = description_paragraph.p.text

        self.descriptions = descriptions


    def get_famous_quotes(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the relevant famous quote into self.famous_quotes.
        '''
        famous_quotes = {}
        for president, soup in self._iter_president_soups():
            # navigate through the soup to get to the part relevant for the famous quote
            famous_quote_paragraph = soup.find('blockquote', {'class':'president-quote'})

            # save the famous quote (the first child of the blockquote) into the dict
            famous_quotes[president] = famous_quote_paragraph.contents[0]

        self.famous_quotes = famous_quotes


    def get_key_events_counts(self) -> None:
        '''
        Iterates over the subdirectories to get on the '/key-events' page of the respective
        president and save the relevant key events count into self.key_events_counts.

        A president whose page has no key events overview gets a count of 0.
        '''
        key_events_counts = {}
        for president, soup in self._iter_president_soups('/key-events'):
            # navigate through the soup to get to the part relevant for key events
            key_events_overview = soup.find('div', {'class':'article-wysiwyg-body'})

            if key_events_overview is None:
                # the page has no information about major events (e.g. D. Trump's page)
                key_events_counts[president] = 0
                continue

            # count of all events ('titles' highlighted in bold)
            key_events_count_bold = len(key_events_overview.find_all('strong'))
            # count of all events ('titles' no longer highlighted in bold)
            key_events_count_not_bold = len(key_events_overview.find_all('b'))

            # save the sum of both counts into the dict with key events counts
            key_events_counts[president] = key_events_count_bold + key_events_count_not_bold

        self.key_events_counts = key_events_counts

In [None]:
class PotusScraper:
    '''
    Scraper for the POTUS webpage, a source of data about US presidents and elections.

    The class holds placeholders for presidential salaries (self.salaries) and election
    results (self.election_results); get_subdirectories() collects the links to the
    individual presidents' pages into self.subdirectories.
    '''

    def __init__(self):
        ''' Sets the origin address and the empty placeholders for the scraped data. '''
        self.origin: str = 'https://www.potus.com/'

        self.subdirectories: Optional[Dict[str, Any]] = None

        self.salaries: Optional[Dict[str, Any]] = None
        self.election_results: Optional[Dict[str, Any]] = None


    def get_subdirectories(self) -> None:
        '''
        Builds a dictionary mapping the presidents' names to the subdirectories of their
        individual pages and stores it in self.subdirectories.
        '''
        # the links to the individual pages are the 'a' tags with target '_self'
        # found on the origin page
        front_page = get_soup(self.origin)
        president_links = front_page.find_all('a', {'target':'_self'})

        links_by_name = {}
        for president_link in tqdm(president_links):
            # the alt text of the link's image holds the title and the name before its
            # first comma; keep that part and drop the 'President ' title from it
            alt_text = president_link.find('img')['alt']
            title_and_name = alt_text.partition(',')[0]
            president_name = title_and_name.replace('President ', '')

            links_by_name[president_name] = president_link['href']

        # the 'Facts About the Presidents' section is not a president, remove it
        del links_by_name['Facts About the Presidents']

        self.subdirectories = links_by_name


def getjoinSoupPotus(link):
    ''' Returns soup for the page found under the given link 'ending' on https://www.potus.com/. '''
    page = requests.get('https://www.potus.com/' + link)
    return BeautifulSoup(page.text, 'html.parser')

def getDataPotus(html, typeofreturn):
    '''
    Scrapes the POTUS webpage and returns either the salaries or the election results.

    The page at html is searched for the links to the individual presidents' pages, then
    each of those pages is downloaded (one request per president) and parsed.

    html: address of the page listing all presidents
    typeofreturn: 'salary' or 'election', selects what is extracted and returned

    Returns for 'salary' a dict mapping the alt text of each president's image to the text
    of the paragraph with his salary (a string, as it includes additional information).
    Returns for 'election' a dict mapping each election year to a dict of candidates, each
    holding a dict with 'popular_votes' and 'electoral_votes' (strings).

    Raises ValueError for any other typeofreturn, before anything is downloaded.
    '''
    if typeofreturn not in ('salary', 'election'):
        raise ValueError(
            f"typeofreturn must be 'salary' or 'election', got {typeofreturn!r}"
        )

    pres_list = _potus_president_links(html)

    if typeofreturn == 'salary':
        salary = {}
        for name, href in pres_list.items():
            salary[name] = _potus_salary(getjoinSoupPotus(href))
        return salary

    elec_results = {}
    for href in pres_list.values():
        # a year found on several presidents' pages is overwritten by the later page
        elec_results.update(_potus_elections(getjoinSoupPotus(href)))
    return elec_results


def _potus_president_links(html):
    ''' Returns a dict mapping the alt text of each president's image to his link 'ending'. '''
    soup = get_soup(html)  # parsing of the given html utilizing BeautifulSoup

    pres_list = {}
    for pres in soup.find_all('a', {'target': '_self'}):
        pres_list[pres.find('img')['alt']] = pres['href']

    # the 'Facts About the Presidents' entry is a title, not a president
    pres_list.pop('Facts About the Presidents')
    return pres_list


def _potus_salary(soup):
    ''' Returns the text of the paragraph that contains the 'Presidential Salary:' label. '''
    # Benjamin Harrison's page has a trailing space in the label, hence the second variant
    for label_text in ("Presidential Salary:", "Presidential Salary: "):
        label = soup.find(string=label_text)
        paragraph = label.find_parent('p') if label is not None else None
        if paragraph is not None:
            return paragraph.text

    # same exception type as a failed lookup raised before, with a readable message
    raise AttributeError("no paragraph with the 'Presidential Salary:' label found")


def _potus_elections(soup):
    ''' Returns the election results found on one president's page, grouped by year. '''
    elections = soup.find(string="Presidential Election Results:").find_parent('div')

    results = {}
    for tbl in elections.find_all('table'):  # one table per election year
        year = tbl.find('tr', {'class': 'row-2'}).find('a').text

        # tables for elections further in the past have only three header columns, i.e.
        # no 'popular votes' column; checked once per table on the table's own header
        has_popular_votes = len(tbl.find('tr').find_all('th')) != 3

        electee_list = {}
        for electee in tbl.find_all('tr'):
            name_cell = electee.find('td', {'class': 'column-2'})
            if name_cell is None or name_cell.a is None:
                continue  # the header row holds no candidate

            third_cell = electee.find('td', {'class': 'column-3'})
            if third_cell is None:
                continue  # incomplete row, skipped as before

            if has_popular_votes:
                fourth_cell = electee.find('td', {'class': 'column-4'})
                if fourth_cell is None:
                    continue  # incomplete row, skipped as before
                electee_votes = {
                    'popular_votes': third_cell.text,
                    'electoral_votes': fourth_cell.text,
                }
            else:
                electee_votes = {
                    'popular_votes': "",
                    'electoral_votes': third_cell.text,
                }

            electee_list[name_cell.a.text] = electee_votes

        results[year] = electee_list  # writing in the results for the given year

    return results

In [None]:
# initialize the scraping class for Miller Center
miller_scrape = MillerScraper()

# get the page subdirectories for each president; this has to run first because the
# get_* calls below iterate over miller_scrape.subdirectories
miller_scrape.get_subdirectories()

# get data on facts and characteristics (fast_facts), brief descriptions (descriptions),
# famous quotes (famous_quotes), and counts of notable events that happened during the
# president's time in office (key_events_counts)
miller_scrape.get_fast_facts()
miller_scrape.get_descriptions()
miller_scrape.get_famous_quotes()
miller_scrape.get_key_events_counts()

In [None]:
# scrape the election results from potus.com; every president's page is requested in turn
data_elections = getDataPotus('https://www.potus.com/','election') # execution time over 1 minute 35 sec

In [None]:
# scrape the presidential salaries from potus.com; every president's page is requested again
data_salary = getDataPotus('https://www.potus.com/','salary') # execution time over 1 minute 35 sec

In [None]:
# TODO: manipulate the data here

In [None]:
# Grover Cleveland was in office 2 non-consecutive times, so his second term gets its own
# entry: a copy of all his values except the two term dates, which differ per term
# NOTE(review): data_presidents is not defined in any visible cell - presumably it is
# assembled from the MillerScraper results; confirm before running
data_presidents['Grover Cleveland 2'] = {
    key: value for key, value in data_presidents['Grover Cleveland'].items() if key not in ['Inauguration Date', 'Date Ended']
}

def correct_Grover_Cleveland_dates(data: Dict[str, Any]) -> Dict[str, Any]:
    '''
    Distributes Grover Cleveland's combined term dates between his two entries.

    The 'Inauguration Date' and 'Date Ended' values of the 'Grover Cleveland' entry are
    split on '\\n'; the piece at index 1 stays with 'Grover Cleveland' and the piece at
    index 3 is saved under 'Grover Cleveland 2'. The entry 'Grover Cleveland 2' has to
    exist already. The given dictionary is modified in place and returned.
    '''
    first_term = 'Grover Cleveland'
    second_term = 'Grover Cleveland 2'

    # read both values of both fields before anything is overwritten
    dates_per_field = {}
    for field in ('Inauguration Date', 'Date Ended'):
        pieces = data[first_term][field].split('\n')
        dates_per_field[field] = (pieces[1], pieces[3])

    for field, (first_term_date, second_term_date) in dates_per_field.items():
        data[first_term][field] = first_term_date
        data[second_term][field] = second_term_date

    return data

# run only once: the call overwrites the combined date strings with a single date each,
# so a second run would no longer find both dates to split
data_presidents = correct_Grover_Cleveland_dates(data_presidents)

In [None]:
# Final table with data extracted from https://millercenter.org
# pd.DataFrame(data_presidents)

# build the table from data_presidents, remove the '\n' characters from every string cell
# (other values are kept as they are), and transpose it so that each key of
# data_presidents becomes a row
data = pd.DataFrame(data_presidents).applymap(lambda x: x.replace('\n', '') if isinstance(x, str) else x).T

In [None]:
# columns converted to str
str_variables = {
    'Birth Place', 
    'Burial Place', 
    'Career', 
    'Children',
    'Description', 
    'Education', 
    'Full Name',
    'Marriage',
    'Nickname',
    'Political Party', 
    'Quote',
    'Religion'
}

# columns converted to int
int_variables = {
    'Number of major events',
    'President Number'
}

# columns converted to pd.Timestamp
timestamp_variables = {
    'Birth Date',
    'Date Ended', 
    'Death Date',
    'Inauguration Date'
}

# each conversion is applied value by value through Series.map
for str_variable in str_variables:
    data[str_variable] = data[str_variable].map(str)

# NOTE(review): int() fails on a missing value - assumes both columns are complete; confirm
for int_variable in int_variables:
    data[int_variable] = data[int_variable].map(int)

for timestamp_variable in timestamp_variables:
    data[timestamp_variable] = data[timestamp_variable].map(pd.Timestamp)

In [None]:
# age at inauguration in years: days between birth and inauguration divided by 365
# (an approximation, leap days are not accounted for)
data['Years at Inauguration'] = (data['Inauguration Date'] - data['Birth Date']).map(lambda x: x.days / 365)

In [None]:
# TODO: make the visualizations here

In [None]:
def plot_years_at_inauguration():
    '''
    Plots the 'Years at Inauguration' column of the global data table against the
    'Inauguration Date' column as a line chart with markers, shown inside the notebook.
    '''
    graph_objs = plotly.graph_objs

    age_trace = graph_objs.Scatter(
        x=data['Inauguration Date'],
        y=data['Years at Inauguration'],
        mode='lines+markers',
        name='chart_a',
        marker={'size': 7},
    )
    chart_layout = graph_objs.Layout(
        title='Years at Inauguration',
        yaxis={'title': 'Years at Inauguration'},
        xaxis={'title': 'Inauguration Date'},
    )
    figure = graph_objs.Figure(data=[age_trace], layout=chart_layout)

    plotly.offline.iplot(figure)

# draw the chart as the output of this cell
plot_years_at_inauguration()

In [None]:
def plot_number_of_major_events():
    '''
    Plots the 'Number of major events' column of the global data table against the
    'Inauguration Date' column as a line chart with markers, shown inside the notebook.
    '''
    graph_objs = plotly.graph_objs

    events_trace = graph_objs.Scatter(
        x=data['Inauguration Date'],
        y=data['Number of major events'],
        mode='lines+markers',
        name='chart_a',
        marker={'size': 7},
    )
    chart_layout = graph_objs.Layout(
        title='Number of major events',
        yaxis={'title': 'Number of major events'},
        xaxis={'title': 'Inauguration Date'},
    )
    figure = graph_objs.Figure(data=[events_trace], layout=chart_layout)

    plotly.offline.iplot(figure)

# draw the chart as the output of this cell
plot_number_of_major_events()

In [None]:
# TODO: conclude here?