# US presidential statistics
## Simon Repko, Lukas Petrasek
### IES FSS CU
### 31.5.2019

This notebook serves as a demonstration of a school project whose goal is to achieve the following:
* scrape web pages to get historical data on US presidents
* manipulate the data into a form suitable for being visualized
* make visualizations based on the data

In [None]:
# TODO: import packages

In [None]:
from typing import Any, Dict, Optional, Type
import itertools

from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import pandas as pd
import plotly
import requests



# initialize plotly's offline notebook mode before any plotly.offline.iplot call below
plotly.offline.init_notebook_mode(connected=True)

In [None]:
# TODO: initialize the scraping class and apply the methods necessary to get the data here

In [None]:
def get_soup(url: str, timeout: float = 30.0) -> BeautifulSoup:
    '''
    Returns soup for the given url.

    url: address of the page to download.
    timeout: seconds to wait for the server before requests raises a timeout error;
        without it a single stalled connection would block the whole scraping run.

    The HTTP status code is intentionally not checked, error pages are parsed like any
    other page and the callers deal with the expected elements being absent.
    '''
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')



class MillerScraper:
    '''
    Class for scraping the Miller Center webpage to get data about US presidents.

    Particularly, it serves for scraping data on facts and characteristics (self.fast_facts),
    brief descriptions (self.descriptions), famous quotes (self.famous_quotes), and counts of
    notable events that happened during each president's time in office
    (self.key_events_counts).

    All the get_* methods other than get_subdirectories iterate over self.subdirectories,
    therefore get_subdirectories has to be called first.
    '''

    def __init__(self):
        ''' Initializes the scraping class. '''
        self.origin: str = 'https://millercenter.org/'

        # president's name -> subdirectory (href) of the president's page
        self.subdirectories: Optional[Dict[str, Any]] = None

        # every dictionary below is keyed by the president's name
        self.fast_facts: Optional[Dict[str, Any]] = None
        self.descriptions: Optional[Dict[str, Any]] = None
        self.famous_quotes: Optional[Dict[str, Any]] = None
        self.key_events_counts: Optional[Dict[str, Any]] = None

        self.all_presidents_data: Optional[pd.DataFrame] = None


    def get_subdirectories(self) -> None:
        '''
        Creates a dictionary with the presidents' names as keys and their respective
        subdirectories as values, then saves the dictionary to self.subdirectories.
        '''
        # parse the given origin (web address) utilizing BeautifulSoup
        soup = get_soup(self.origin)
        # enter the main navigation panel and find the submenu that contains the links
        # (subdirectories) to individual pages of US presidents
        navigation_menu = soup.find('nav', {'aria-labelledby': 'block-mainnavigation-3-menu'})
        submenu = navigation_menu.find_all('ul', {'class': 'submenu'})[1]
        a_blocks = submenu.find_all('a')

        subdirectories = {}
        for a_block in tqdm(a_blocks):
            # each a_block represents one president, extract the subdirectory and save
            # it under the president's name
            subdirectories[a_block.text] = a_block['href']

        self.subdirectories = subdirectories


    def get_fast_facts(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the relevant fast facts into self.fast_facts.

        The result maps each president's name to a dictionary of label -> raw text.
        '''
        fast_facts = {}
        for president, subdirectory in tqdm(self.subdirectories.items()):
            # parse the given path (web address) utilizing BeautifulSoup
            soup = get_soup(self.origin + subdirectory)
            # navigate through the soup to get to the part relevant for fast facts
            president_page = soup.find('div', {'class': 'president-main-wrapper'})
            fast_facts_dashboard = president_page.find('div', {'class': 'fast-facts-wrapper'})

            # avoiding redundant elements (containing '\n')
            relevant_fast_facts = [x for x in fast_facts_dashboard.children if x != '\n']
            # popping the first element which contains just the 'Fast Facts' heading
            relevant_fast_facts.pop(0)

            fast_facts[president] = {}
            for fast_fact in relevant_fast_facts:
                # save the fast fact under its label into the dict of the given president
                fast_facts[president][fast_fact.label.text] = fast_fact.div.text

        self.fast_facts = fast_facts


    def get_descriptions(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the relevant description into self.descriptions.
        '''
        descriptions = {}
        for president, subdirectory in tqdm(self.subdirectories.items()):
            # parse the given path (web address) utilizing BeautifulSoup
            soup = get_soup(self.origin + subdirectory)
            # navigate through the soup to get to the part relevant for the description
            description_paragraph = soup.find('div', {'class': 'copy-wrapper'})

            # save the text of the first paragraph into the dict with descriptions
            descriptions[president] = description_paragraph.p.text

        self.descriptions = descriptions


    def get_famous_quotes(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the relevant famous quote into self.famous_quotes.
        '''
        famous_quotes = {}
        for president, subdirectory in tqdm(self.subdirectories.items()):
            # parse the given path (web address) utilizing BeautifulSoup
            soup = get_soup(self.origin + subdirectory)
            # navigate through the soup to get to the part relevant for the famous quote
            famous_quote_paragraph = soup.find('blockquote', {'class': 'president-quote'})

            # the quote is the first child of the blockquote, save it into the dict with
            # famous quotes
            famous_quotes[president] = str(famous_quote_paragraph.contents[0])

        self.famous_quotes = famous_quotes


    def get_key_events_counts(self) -> None:
        '''
        Iterates over the subdirectories to get on the '/key-events' page of the respective
        president and save the relevant key events count into self.key_events_counts.

        A president whose page has no key events overview gets the count 0.
        '''
        key_events_counts = {}
        for president, subdirectory in tqdm(self.subdirectories.items()):
            # parse the given path (web address) utilizing BeautifulSoup
            soup = get_soup(self.origin + subdirectory + '/key-events')
            # navigate through the soup to get to the part relevant for key events
            key_events_overview = soup.find('div', {'class': 'article-wysiwyg-body'})

            if key_events_overview is None:
                # D. Trump page has no information about major events
                key_events_count = 0
            else:
                # event titles are marked up either as <strong> or as <b>, sum both counts
                key_events_count = (
                    len(key_events_overview.find_all('strong'))
                    + len(key_events_overview.find_all('b'))
                )

            # save the key events count into the dict with key events counts
            key_events_counts[president] = key_events_count

        self.key_events_counts = key_events_counts


    def correct_Grover_Cleveland_data(self) -> None:
        '''
        Corrects Grover Cleveland's data.

        Grover Cleveland was in office 2 non-consecutive times, so the 'Inauguration Date',
        'Date Ended' and 'President Number' facts each hold two values in one
        newline-separated string. The method splits them between 'Grover Cleveland' (first
        term) and a new 'Grover Cleveland 2' entry (second term).

        Also, duplicates Grover Cleveland's other data (remaining fast facts, description,
        famous quote, key events count) so that it is entered for each of his terms.

        Raises AssertionError if the 'Grover Cleveland 2' entry already exists, i.e. if the
        correction has been applied before.
        '''
        # assert that 'Grover Cleveland 2' entry doesn't exist already; the check has to be
        # done on this instance, not on whatever object a global variable points to
        assert 'Grover Cleveland 2' not in self.fast_facts.keys()

        # create entries for Cleveland's second term (the fast facts are copied so that the
        # two terms can be corrected independently)
        self.fast_facts['Grover Cleveland 2'] = dict(self.fast_facts['Grover Cleveland'])
        for attribute in [self.descriptions, self.famous_quotes, self.key_events_counts]:
            attribute['Grover Cleveland 2'] = attribute['Grover Cleveland']

        # input corrected entries
        for entry_name in ['Inauguration Date', 'Date Ended', 'President Number']:
            entry_parts = self.fast_facts['Grover Cleveland'][entry_name].split('\n')
            # the two president numbers are separated by one newline, the dates by two
            second_entry_index = 2 if entry_name == 'President Number' else 3

            self.fast_facts['Grover Cleveland'][entry_name] = entry_parts[1]
            self.fast_facts['Grover Cleveland 2'][entry_name] = entry_parts[second_entry_index]

In [None]:
class PotusScraper:
    '''
    Class for scraping the POTUS webpage to get data about US presidents and elections.

    Particularly, it serves for scraping data on presidential salaries (self.salaries),
    and election results (self.election_results).

    get_salaries and get_election_results iterate over self.subdirectories, therefore
    get_subdirectories has to be called first.
    '''

    def __init__(self):
        ''' Initializes the scraping class. '''
        self.origin: str = 'https://www.potus.com/'

        # president's name -> subdirectory (href) of the president's page
        self.subdirectories: Optional[Dict[str, Any]] = None

        # president's name -> raw salary text
        self.salaries: Optional[Dict[str, Any]] = None
        # election year -> candidate name -> {'Popular Votes': ..., 'Electoral Votes': ...}
        self.election_results: Optional[Dict[str, Any]] = None


    def get_subdirectories(self) -> None:
        '''
        Creates a dictionary with the presidents' names as keys and their respective
        subdirectories as values, then saves the dictionary to self.subdirectories.
        '''
        # parse the given origin (web address) utilizing BeautifulSoup
        soup = get_soup(self.origin)
        # navigate through the soup to get to the part that contains the links
        # (subdirectories) to individual pages of US presidents
        a_blocks = soup.find_all('a', {'target': '_self'})

        subdirectories = {}
        for a_block in tqdm(a_blocks):
            # each a_block represents one president, the name is taken from the alt text of
            # the image inside the link: everything before the first comma, without the
            # 'President ' prefix
            president_and_name_and_years = a_block.find('img')['alt']
            president_and_name = president_and_name_and_years.split(',')[0]
            name = president_and_name.replace('President ', '')

            subdirectories[name] = a_block['href']

        # removing the entry which contains the 'Facts About the Presidents' section
        subdirectories.pop('Facts About the Presidents')

        self.subdirectories = subdirectories


    def get_salaries(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the relevant salary into self.salaries.

        The salary is stored as the raw text of the paragraph containing the
        'Presidential Salary:' title.
        '''
        salaries = {}
        for president, subdirectory in tqdm(self.subdirectories.items()):
            # parse the given path (web address) utilizing BeautifulSoup
            soup = get_soup(self.origin + subdirectory)
            # navigate through the soup to get to the part relevant for the salary
            try:
                presidential_salary_title = soup.find(string="Presidential Salary:")
                presidential_salary = presidential_salary_title.find_parent('p').text
            # Benjamin Harrison has space in the string, hence the exception
            except AttributeError:
                presidential_salary_title = soup.find(string="Presidential Salary: ")
                presidential_salary = presidential_salary_title.find_parent('p').text

            # save the salary into the dict with salaries
            salaries[president] = presidential_salary

        self.salaries = salaries


    def get_election_results(self) -> None:
        '''
        Iterates over the subdirectories to get on the individual page of the respective
        president and save the election results found there into self.election_results.

        The result is keyed by election year; every year maps candidate names to a
        dictionary with the 'Popular Votes' and 'Electoral Votes' texts ('Popular Votes' is
        None for tables without that column). A year present on several presidents' pages
        is overwritten by the page processed last.
        '''
        election_results = {}
        for president, subdirectory in tqdm(self.subdirectories.items()):
            # parse the given path (web address) utilizing BeautifulSoup
            soup = get_soup(self.origin + subdirectory)

            # navigate through the soup to get to the part relevant for election results
            presidential_elections_title = soup.find(string="Presidential Election Results:")
            presidential_elections = presidential_elections_title.find_parent('div')
            presidential_elections_tables = presidential_elections.find_all('table')

            # extract election results for individual years
            for table in presidential_elections_tables:
                # extract the year (attrs has to be a dict mapping 'class' to 'row-2')
                year = table.find('tr', {'class': 'row-2'}).find('a').text
                election_results[year] = {}

                # data contained in tables of early election results do not include the
                # 'popular votes' column (their header has only 3 cells); the header of
                # this very table is inspected so that every table is read by its own layout
                has_popular_votes = len(table.find('tr').find_all('th')) != 3

                # extract the respective electee results
                for electee in table.find_all('tr'):
                    try:
                        # extract the name of the candidate
                        electee_name = electee.find('td', {'class': 'column-2'}).a.text
                        election_results[year][electee_name] = {}

                        if has_popular_votes:
                            popular_votes = electee.find('td', {'class': 'column-3'}).text
                            electoral_votes = electee.find('td', {'class': 'column-4'}).text
                        else:
                            popular_votes = None
                            electoral_votes = electee.find('td', {'class': 'column-3'}).text

                        # save election results into the dict with election results
                        election_results[year][electee_name]['Popular Votes'] = popular_votes
                        election_results[year][electee_name]['Electoral Votes'] = electoral_votes
                    # first row is the header (no name cell), hence the exception
                    except AttributeError:
                        continue

        self.election_results = election_results


    def duplicate_Grover_Cleveland_salary(self) -> None:
        '''
        Duplicates Grover Cleveland's salary, the second entry being recorded for Grover
        Cleveland's second term in office. The second entry was not recorded because the two
        terms were not consecutive.

        Raises AssertionError if the 'Grover Cleveland 2' entry already exists.
        '''
        # assert that 'Grover Cleveland 2' entry doesn't exist already
        assert 'Grover Cleveland 2' not in self.salaries.keys()

        # create an entry for Cleveland's second term
        self.salaries['Grover Cleveland 2'] = self.salaries['Grover Cleveland']

In [None]:
# initialiaze the scraping class for Miller Center
miller_scrape = MillerScraper()

# get the page subdirectories for each president
miller_scrape.get_subdirectories()

# get data on facts and characteristics (fast_facts), brief descriptions (descriptions), 
# famous quotes (famous_quotes), and counts of notable events happened during the 
# president's office (key_events_counts)
miller_scrape.get_fast_facts()
miller_scrape.get_descriptions()
miller_scrape.get_famous_quotes()
miller_scrape.get_key_events_counts()

# correct Grover Cleveland's data which are flawed due to him serving two non-consecutive
# terms
miller_scrape.correct_Grover_Cleveland_data()

In [None]:
# initialiaze the scraping class for POTUS
potus_scrape = PotusScraper()

# get the page subdirectories for each president
potus_scrape.get_subdirectories()

# POTUS uses different formats for names of presidents and also has got some names wrong,
# correct the names in subdirectories now so that salaries and election results are saved
# under correct names
potus_scrape.correct_subdirectories(miller_scrape)

# get data on presidential salaries (salaries), and election results (election_results)
potus_scrape.get_salaries()
potus_scrape.get_election_results()

# duplicate Grover Cleveland's salary for his second term, the salary was not recorded twice
# because his two terms were not consecutive
potus_scrape.duplicate_Grover_Cleveland_salary()

In [None]:
# TODO: manipulate the data here

In [None]:
def get_all_presidents_data_df(
    *,
    miller_scrape: MillerScraper, 
    potus_scrape: PotusScraper
) -> pd.DataFrame:
    ''' 
    Combines everything scraped about US presidents into a single DataFrame with one row per
    president and one column per fact.

    The fast facts, description, famous quote and key events count are taken from
    miller_scrape, the salary from potus_scrape. Presidents are those present in
    miller_scrape.fast_facts; a president missing in any other source raises KeyError.

    The arguments are keyword-only so that the two scrapers cannot be swapped by accident.
    '''
    def _president_record(name):
        # start from a copy of the fast facts and append the remaining facts to it
        record = dict(miller_scrape.fast_facts[name])
        record['Description'] = miller_scrape.descriptions[name]
        record['Famous Quote'] = miller_scrape.famous_quotes[name]
        record['Key Events Count'] = miller_scrape.key_events_counts[name]
        record['Salary'] = potus_scrape.salaries[name]
        return record

    records = {name: _president_record(name) for name in miller_scrape.fast_facts}

    # the DataFrame is built with presidents as columns, transpose it to have them as rows
    return pd.DataFrame(records).transpose()


def get_election_results_df(potus_scrape: PotusScraper) -> pd.DataFrame:
    ''' 
    Combines the election results held in potus_scrape.election_results into a single
    DataFrame.

    One DataFrame is built per election year, the yearly DataFrames are concatenated under
    their years as keys and the result is transposed, which gives multiindex columns
    ('Electoral Votes' and 'Popular Votes' for each 'Year').
    '''
    results_by_year = potus_scrape.election_results

    yearly_frames = [pd.DataFrame(candidates) for candidates in results_by_year.values()]
    combined = pd.concat(yearly_frames, keys=list(results_by_year), sort=True)

    return combined.T


def clean_presidents_data(presidents_data: pd.DataFrame) -> pd.DataFrame:
    ''' 
    Removes problematic characters (newline, tab, carriage return, non-breaking space) from
    every string in the given DataFrame with data about presidents.

    Values that are not strings are left untouched. Returns a new DataFrame, the given one
    is not modified.
    '''
    PROBLEMATIC_STRINGS = {'\n', '\t', '\r', '\xa0'}
    # translation table deleting all the problematic characters in a single pass
    deletion_table = str.maketrans(dict.fromkeys(PROBLEMATIC_STRINGS))

    def _remove_problematic(value):
        ''' Strips the problematic characters from strings, passes other values through. '''
        return value.translate(deletion_table) if isinstance(value, str) else value

    # DataFrame.applymap is deprecated in favour of DataFrame.map since pandas 2.1, use
    # whichever the installed pandas provides (checked on the class so that a column named
    # 'map' cannot interfere)
    elementwise_method = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'

    return getattr(presidents_data, elementwise_method)(_remove_problematic)


def convert_presidents_data(presidents_data: pd.DataFrame) -> pd.DataFrame:
    ''' 
    Converts the columns of the given DataFrame with data about presidents to appropriate
    types: text columns to str (with missing values as None), 'Key Events Count' and
    'President Number' to int, 'Salary' to a number of dollars and date columns to
    pd.Timestamp.

    The columns are overwritten in the given DataFrame, which is also returned.
    '''
    TEXT_COLUMNS = {
        'Birth Place', 
        'Burial Place', 
        'Career', 
        'Children',
        'Description', 
        'Education', 
        'Famous Quote',
        'Full Name',
        'Marriage',
        'Nickname',
        'Political Party', 
        'Religion'
    }
    PLAIN_INT_COLUMNS = {
        'Key Events Count',
        'President Number'
    }
    DATE_COLUMNS = {
        'Birth Date',
        'Date Ended', 
        'Death Date',
        'Inauguration Date'
    }
    # what missing values look like after being converted to str
    MISSING_MARKERS = {'None', 'nan'}


    def _thousands_of_dollars(amount: str) -> int:
        ''' Reads the digits before the first comma as thousands of dollars. '''
        return int(amount.split(',')[0]) * 1_000


    def _parse_salary(raw):
        ''' 
        Extracts the salary from the raw text; values that are not strings are returned
        unchanged.
        '''
        if not isinstance(raw, str):
            return raw

        dollar_parts = raw.split('$')
        total = _thousands_of_dollars(dollar_parts[1])

        if 'expense' in raw:
            # the expense account is the amount after the third '$' if there is one,
            # after the second '$' otherwise
            expense_index = 3 if len(dollar_parts) > 3 else 2
            total += _thousands_of_dollars(dollar_parts[expense_index])

        return total


    # text columns: everything to str, then missing values back to None
    for column in TEXT_COLUMNS:
        as_text = presidents_data[column].map(str)
        presidents_data[column] = as_text.map(
            lambda value: None if value in MISSING_MARKERS else value
        )

    # numeric columns: the salary needs parsing, the rest is converted directly
    presidents_data['Salary'] = presidents_data['Salary'].map(_parse_salary)
    for column in PLAIN_INT_COLUMNS:
        presidents_data[column] = presidents_data[column].map(int)

    # date columns
    for column in DATE_COLUMNS:
        presidents_data[column] = presidents_data[column].map(pd.Timestamp)

    return presidents_data


def convert_elections_data(elections_data: pd.DataFrame) -> pd.DataFrame:
    ''' 
    Converts every value in the given DataFrame with data about elections to a number of
    votes.

    Every cell is converted to str first and then parsed: plain integers directly, values
    in the 'millions,thousands,units' format group by group, and missing values ('None',
    'nan' or an empty string) become np.nan. Returns a new DataFrame.
    '''
    MISSING_MARKERS = {'None', 'nan', ''}

    def _extract_votes(value: Any):
        ''' Extracts the number of votes (int, or np.nan if missing) from the cell value. '''
        string = str(value)

        try:
            return int(string)
        # votes higher than 999 have the 'millions,thousands,units' format and some votes are 
        # missing, hence the exception
        except ValueError:
            if string in MISSING_MARKERS:
                return np.nan

            # convert one Popular Votes entry from 2012 containing '.' instead of ','
            # between thousands and units
            if '.' in string and ',' in string:
                units = int(string.split('.')[-1])
                thousands = int(string.split('.')[-2].split(',')[-1])
                millions = int(string.split(',')[-2])

                return millions * 1_000_000 + thousands * 1_000 + units

            groups = string.split(',')
            units = int(groups[-1])
            thousands = int(groups[-2])
            # the millions group is present only in values with three groups
            millions = int(groups[-3]) if len(groups) == 3 else 0

            return millions * 1_000_000 + thousands * 1_000 + units


    # DataFrame.applymap is deprecated in favour of DataFrame.map since pandas 2.1, use
    # whichever the installed pandas provides (checked on the class so that a column named
    # 'map' cannot interfere)
    elementwise_method = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'

    # convert data in every cell
    return getattr(elections_data, elementwise_method)(_extract_votes)


def order_presidents_data(presidents_data: pd.DataFrame) -> pd.DataFrame:
    ''' 
    Returns the data about presidents sorted by the 'Inauguration Date' column in ascending
    order. The given DataFrame itself is left in its original order.
    '''
    ordered = presidents_data.sort_values(by='Inauguration Date')

    return ordered


def compute_presidents_features(presidents_data: pd.DataFrame) -> pd.DataFrame:
    ''' 
    Adds features computed from the available data about presidents as new columns to the
    given DataFrame and returns it.

    Currently a single feature is added: 'Years at Inauguration', the time between
    'Birth Date' and 'Inauguration Date' expressed as whole days divided by 365.
    '''
    def _days_to_years(time_span):
        ''' Expresses the whole days of the time span in years of 365 days. '''
        return time_span.days / 365

    time_until_inauguration = (
        presidents_data['Inauguration Date'] - presidents_data['Birth Date']
    )
    presidents_data['Years at Inauguration'] = time_until_inauguration.map(_days_to_years)

    return presidents_data

In [None]:
# build the raw DataFrames for presidents and elections from the scraped data
all_presidents_data_df = get_all_presidents_data_df(
    miller_scrape=miller_scrape,
    potus_scrape=potus_scrape,
)
election_results_df = get_election_results_df(potus_scrape)

# presidents data go through these steps in order: cleaning of problematic strings,
# conversion of values to appropriate types, ordering, and computation of new features
for _presidents_step in (
    clean_presidents_data,
    convert_presidents_data,
    order_presidents_data,
    compute_presidents_features,
):
    all_presidents_data_df = _presidents_step(all_presidents_data_df)

# election results do not contain problematic values, so they are only converted to
# appropriate types without being cleaned first
election_results_df = convert_elections_data(election_results_df)

In [None]:
# recompute 'Years at Inauguration' on all_presidents_data_df, the only DataFrame with data
# about presidents that this notebook defines
# NOTE(review): compute_presidents_features already adds the very same column, so this cell
# looks like a leftover and can probably be removed
all_presidents_data_df['Years at Inauguration'] = (
    all_presidents_data_df['Inauguration Date'] - all_presidents_data_df['Birth Date']
).map(lambda x: x.days / 365)

In [None]:
# TODO: make the visualizations here

In [None]:
def plot_years_at_inauguration(presidents_data: Optional[pd.DataFrame] = None) -> None:
    '''
    Plots 'Years at Inauguration' against 'Inauguration Date' as a line chart with markers.

    presidents_data: DataFrame holding both columns; when omitted, the notebook-level
        all_presidents_data_df is used.
    '''
    if presidents_data is None:
        presidents_data = all_presidents_data_df

    trace = plotly.graph_objs.Scatter(
        x = presidents_data['Inauguration Date'],
        y = presidents_data['Years at Inauguration'],
        mode = 'lines+markers',
        name = 'chart_a',
        marker = {'size': 7},
    )

    layout = plotly.graph_objs.Layout(
        title = 'Years at Inauguration',
        yaxis = {'title': 'Years at Inauguration'},
        xaxis = {'title': 'Inauguration Date'},
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = [trace],
        layout = layout
    ))

# pass the DataFrame explicitly instead of relying on a global variable
plot_years_at_inauguration(all_presidents_data_df)

In [None]:
def plot_number_of_major_events(presidents_data: Optional[pd.DataFrame] = None) -> None:
    '''
    Plots the number of major events against 'Inauguration Date' as a line chart with
    markers.

    presidents_data: DataFrame holding the 'Inauguration Date' and 'Key Events Count'
        columns; when omitted, the notebook-level all_presidents_data_df is used.
    '''
    if presidents_data is None:
        presidents_data = all_presidents_data_df

    trace = plotly.graph_objs.Scatter(
        x = presidents_data['Inauguration Date'],
        # the scraped number of major events is stored under 'Key Events Count'
        y = presidents_data['Key Events Count'],
        mode = 'lines+markers',
        name = 'chart_a',
        marker = {'size': 7},
    )

    layout = plotly.graph_objs.Layout(
        title = 'Number of major events',
        yaxis = {'title': 'Number of major events'},
        xaxis = {'title': 'Inauguration Date'},
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = [trace],
        layout = layout
    ))

# pass the DataFrame explicitly instead of relying on a global variable
plot_number_of_major_events(all_presidents_data_df)

In [None]:
# TODO: conclude here?