# US presidential statistics
## Simon Repko, Lukas Petrasek
### IES FSS CU
### 31.5.2019

This notebook serves as a demonstration of a school project whose goal is to achieve the following:
* scrape web pages to get historical data on US presidents
* manipulate the data into a form suitable for being visualized
* make visualizations based on the data

In [None]:
# TODO: import packages

In [None]:
from typing import Any, Dict, Optional, Type
import functools
import itertools
import sys

from bs4 import BeautifulSoup
from tqdm import tqdm
import geopy.geocoders
import geopy.extra.rate_limiter
import numpy as np
import pandas as pd
import plotly
import requests

sys.path.append('../')

import scraping



plotly.offline.init_notebook_mode(connected = True)

In [None]:
# TODO: initialize the scraping class and apply the methods necessary to get the data here

In [None]:
# initialize the scraping class for Miller Center
miller_scrape = scraping.MillerScraper()

# get the page subdirectories for each president
# (called first; presumably the getters below rely on them - verify in scraping.py)
miller_scrape.get_subdirectories()

# get data on facts and characteristics (fast_facts), brief descriptions (descriptions),
# famous quotes (famous_quotes), and counts of notable events that happened during each
# president's time in office (key_events_counts)
miller_scrape.get_fast_facts()
miller_scrape.get_descriptions()
miller_scrape.get_famous_quotes()
miller_scrape.get_key_events_counts()

# correct Grover Cleveland's data, which are flawed because he served two non-consecutive
# terms
miller_scrape.correct_Grover_Cleveland_data()

In [None]:
# initialize the scraping class for POTUS
potus_scrape = scraping.PotusScraper()

# get the page subdirectories for each president
potus_scrape.get_subdirectories()

# POTUS uses different formats for the names of presidents and also has some names wrong;
# correct the names in the subdirectories now (using the Miller Center scraper as the
# reference) so that salaries and election results are saved under the correct names
potus_scrape.correct_subdirectories(miller_scrape)

# get data on presidential salaries (salaries) and election results (election_results)
potus_scrape.get_salaries()
potus_scrape.get_election_results()

# duplicate Grover Cleveland's salary for his second term; the salary was not recorded twice
# because his two terms were not consecutive
potus_scrape.duplicate_Grover_Cleveland_salary()

In [None]:
# TODO: manipulate the data here

In [None]:
def get_all_presidents_data_df(
    *,
    miller_scrape: 'scraping.MillerScraper',
    potus_scrape: 'scraping.PotusScraper'
) -> pd.DataFrame:
    '''
    Merges available data about US presidents from miller_scrape and potus_scrape into one
    DataFrame with one row per president and one column per variable.

    Forces keyword arguments to avoid messing-up the order (Miller, POTUS).

    The scraper types are written as string annotations because only the `scraping` module
    is imported in this notebook; the bare class names are not defined here and would raise
    a NameError as soon as the function is defined.

    Reads `fast_facts`, `descriptions`, `famous_quotes` and `key_events_counts` from
    miller_scrape and `salaries` from potus_scrape. All of them are indexed by the keys of
    `miller_scrape.fast_facts`; a key missing from any of the other mappings raises KeyError.
    '''
    all_presidents_data = {
        president: {
            **fast_facts,
            'Description': miller_scrape.descriptions[president],
            'Famous Quote': miller_scrape.famous_quotes[president],
            'Key Events Count': miller_scrape.key_events_counts[president],
            'Salary': potus_scrape.salaries[president],
        }
        for president, fast_facts in miller_scrape.fast_facts.items()
    }

    # the dict is keyed by president, so transpose to get presidents as rows
    return pd.DataFrame(all_presidents_data).T


def get_election_results_df(potus_scrape: 'scraping.PotusScraper') -> pd.DataFrame:
    '''
    Merges available data on election results from potus_scrape into one DataFrame.

    The resulting DataFrame has multiindex columns ('Electoral Votes' and 'Popular Votes'
    for each 'Year').

    The scraper type is a string annotation because only the `scraping` module is imported
    in this notebook; the bare class name is not defined here and would raise a NameError
    when the function is defined.

    NOTE(review): assumes `potus_scrape.election_results` maps year -> candidate -> vote
    kind -> value, which is what yields candidates as rows - confirm against scraping.py.
    '''
    results_by_year = potus_scrape.election_results
    yearly_frames = [pd.DataFrame(year_results) for year_results in results_by_year.values()]

    # explicit keys keep the years in scraping order; sort = True orders the candidates
    return pd.concat(yearly_frames, keys = list(results_by_year), sort = True).T


def clean_presidents_data(presidents_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Removes problematic characters (newline, tab, carriage return, non-breaking space) from
    every string cell of the given DataFrame with data about presidents.

    Non-string cells are left untouched. Returns a new DataFrame; the input is not modified.
    '''
    PROBLEMATIC_STRINGS = {'\n', '\t', '\r', '\xa0'}

    # every problematic string is a single character, so one translation table removes all
    # of them in a single pass instead of one pass over the whole DataFrame per character
    removal_table = str.maketrans('', '', ''.join(PROBLEMATIC_STRINGS))

    def _clean(value):
        ''' Strips the problematic characters from strings, passes other values through. '''
        return value.translate(removal_table) if isinstance(value, str) else value

    # column-wise Series.map is used for the element-wise pass because DataFrame.applymap
    # is deprecated in recent pandas and DataFrame.map does not exist in older versions
    return presidents_data.apply(lambda column: column.map(_clean))


def convert_presidents_data(presidents_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Casts the columns of the given DataFrame with data about presidents to suitable types:
    text columns to str (with 'None' / 'nan' turned into None), count columns to int, the
    salary column to a number of dollars and date columns to pd.Timestamp.

    The columns are overwritten on the DataFrame that was passed in, which is also returned.
    '''
    TEXT_COLUMNS = (
        'Birth Place',
        'Burial Place',
        'Career',
        'Children',
        'Description',
        'Education',
        'Famous Quote',
        'Full Name',
        'Marriage',
        'Nickname',
        'Political Party',
        'Religion',
    )
    COUNT_COLUMNS = ('Key Events Count', 'President Number')
    SALARY_COLUMN = 'Salary'
    DATE_COLUMNS = ('Birth Date', 'Date Ended', 'Death Date', 'Inauguration Date')
    MISSING_TEXT = {'None', 'nan'}

    def _thousands(fragment: str) -> int:
        ''' Reads the digits before the first ',' of a '$' fragment as thousands. '''
        return int(fragment.split(',')[0]) * 1_000

    def _extract_salary(string: str) -> int:
        ''' Extracts salary (base pay plus expense account, if any) from string. '''
        if not isinstance(string, str):
            return string

        dollar_fragments = string.split('$')
        salary = _thousands(dollar_fragments[1])

        if 'expense' in string:
            # with more than two amounts present, the expense account is the third one
            expense_position = 3 if len(dollar_fragments) > 3 else 2
            salary += _thousands(dollar_fragments[expense_position])

        return salary

    # the columns are independent of each other, so the order of conversion does not matter
    for column in TEXT_COLUMNS:
        as_text = presidents_data[column].map(str)
        presidents_data[column] = as_text.map(
            lambda text: None if text in MISSING_TEXT else text
        )

    presidents_data[SALARY_COLUMN] = presidents_data[SALARY_COLUMN].map(_extract_salary)

    for column in COUNT_COLUMNS:
        presidents_data[column] = presidents_data[column].map(int)

    for column in DATE_COLUMNS:
        presidents_data[column] = presidents_data[column].map(pd.Timestamp)

    return presidents_data


def convert_elections_data(elections_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Converts values in the given DataFrame with data about elections to appropriate types.

    Every cell is cast to str and then parsed as a number of votes. Cells whose text is
    'None', 'nan' or '' become np.nan. Returns a new DataFrame; a cell that cannot be parsed
    raises ValueError.
    '''
    MISSING_MARKERS = {'None', 'nan', ''}

    def _extract_votes(string: str) -> Optional[int]:
        ''' Extracts number of votes from string, np.nan if the votes are missing. '''
        try:
            return int(string)
        # votes higher than 999 have the 'millions,thousands,units' format and some votes are
        # missing, hence the exception
        except ValueError:
            pass

        if string in MISSING_MARKERS:
            return np.nan

        # one Popular Votes entry from 2012 contains '.' instead of ',' as a separator;
        # '.' is only treated as a separator when a ',' is present as well, so decimal
        # numbers such as '5.0' are still rejected
        normalized = string.replace('.', ',') if ',' in string else string

        # fold the groups from left to right so that any number of groups is supported
        votes = 0
        for group in normalized.split(','):
            votes = votes * 1_000 + int(group)

        return votes

    # convert data in every cell; column-wise Series.map is used because DataFrame.applymap
    # is deprecated in recent pandas and DataFrame.map does not exist in older versions
    return elections_data.apply(
        lambda column: column.map(lambda cell: _extract_votes(str(cell)))
    )


def order_presidents_data(presidents_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Returns a copy of the data about presidents with rows sorted by 'Inauguration Date'
    in ascending order.
    '''
    sort_column = 'Inauguration Date'
    ordered = presidents_data.sort_values(by = sort_column)

    return ordered


def correct_elections_data_indices(
    elections_data: pd.DataFrame,
    presidents_data: pd.DataFrame
) -> pd.DataFrame:
    '''
    Converts elections_data indices which correspond to actual presidents to the respective
    indices in presidents_data.

    A candidate corresponds to a president when their first name and surname (the first and
    the last space-separated word of the name) are equal. Rules applied per candidate:
    * a candidate whose name already equals an index of presidents_data is kept as it is, so
      it cannot be relabelled to a namesake sharing the first name and surname
    * otherwise the candidate gets the name of the first matching president in the order of
      presidents_data
    * candidates without a matching president are kept unchanged

    Only the two DataFrames passed in are used (no notebook-level variables). The index of
    elections_data is replaced in place and the same DataFrame is returned.
    '''
    def _name_key(full_name: str) -> tuple:
        ''' Returns the (first name, surname) pair used to match names. '''
        name_parts = full_name.split(' ')
        return name_parts[0], name_parts[-1]

    president_names = list(presidents_data.index)
    exact_names = set(president_names)

    # setdefault keeps the first president for every (first name, surname) pair
    president_by_key = {}
    for president in president_names:
        president_by_key.setdefault(_name_key(president), president)

    # rows are renamed by position, so earlier renames cannot redirect later ones
    elections_data.index = [
        candidate if candidate in exact_names
        else president_by_key.get(_name_key(candidate), candidate)
        for candidate in elections_data.index
    ]

    return elections_data


def compute_years_at_inauguration(presidents_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Computes the age in years at inauguration for all presidents and adds it as a new column
    ('Years at Inauguration') to the DataFrame, which is modified in place and returned.

    A year is counted as 365 days, i.e. leap days are not accounted for.
    '''
    DAYS_PER_YEAR = 365

    age_at_inauguration = presidents_data['Inauguration Date'] - presidents_data['Birth Date']
    presidents_data['Years at Inauguration'] = age_at_inauguration.map(
        lambda age: age.days / DAYS_PER_YEAR
    )

    return presidents_data


def compute_locations(presidents_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Computes the latitude and longitude of birth places of all presidents and adds them as new
    columns ('Birth Place Latitude', 'Birth Place Longitude') to the DataFrame, which is
    modified in place and returned.

    Birth places that cannot be located get missing coordinates and are printed so that they
    can be inspected. Every row is looked up once and both coordinates are taken from the
    same result.
    '''
    geolocator = geopy.geocoders.Nominatim(user_agent = "my-application")
    # using the recommended RateLimiter to distribute requests with some small delay; the
    # delay is passed by keyword, which works whether or not the argument is keyword-only
    geocode = geopy.extra.rate_limiter.RateLimiter(geolocator.geocode, min_delay_seconds = 1)

    # caching so that a birth place shared by several presidents is requested only once,
    # which helps to avoid HTTPError for sending too many requests; the cache lives only for
    # the duration of one call of compute_locations
    @functools.lru_cache(64)
    def _get_location(place: str) -> 'Optional[geopy.location.Location]':
        ''' Find location containing latitude and longitude of the given place. '''
        # missing birth places (None / NaN) cannot be geocoded
        if not isinstance(place, str):
            return None

        # adjust places which cannot be found on map directly so that they can
        if '(now' in place:
            query = place.split('(now')[1].replace(')', '')
        elif 'near' in place:
            query = place.split('near')[1].replace(')', '')
        elif 'Shadwell plantation' in place:
            query = 'Shadwell, Virginia'
        elif 'Waxhaw area' in place:
            query = 'Waxhaw, North Carolina'
        else:
            query = place

        return geocode(query + ',' + 'USA')


    birth_places = list(presidents_data['Birth Place'])
    locations = [_get_location(place) for place in birth_places]

    # report the places that could not be located
    for place, location in zip(birth_places, locations):
        if location is None:
            print(place)

    presidents_data['Birth Place Latitude'] = [
        None if location is None else location.latitude for location in locations
    ]
    presidents_data['Birth Place Longitude'] = [
        None if location is None else location.longitude for location in locations
    ]

    return presidents_data


def compute_first_electoral_vote_share(
    presidents_data: pd.DataFrame,
    elections_data: pd.DataFrame
) -> pd.DataFrame:
    '''
    Computes, for every president, the share of electoral votes received in the election
    preceding the inauguration and adds it as a new column ('Electoral Votes Share') to the
    DataFrame, which is modified in place and returned.

    The share is the president's electoral votes divided by the sum of electoral votes of all
    candidates in the same election.
    '''
    votes_totals = elections_data.sum()

    for president in presidents_data.index:
        inauguration_year = presidents_data.loc[president, 'Inauguration Date'].year

        # presidents who were not voted in (succession from the position of vice president
        # after the death / resignation of the previous president) keep a missing share
        share = np.nan

        # the election usually takes place in the year before the inauguration, but not
        # always, hence the inauguration year itself is tried as the second option
        for election_year in (inauguration_year - 1, inauguration_year):
            votes_column = (str(election_year), 'Electoral Votes')
            try:
                share = elections_data.loc[president, votes_column] / votes_totals.loc[votes_column]
            except KeyError:
                continue
            break

        presidents_data.loc[president, 'Electoral Votes Share'] = share

    return presidents_data


def compute_number_of_children(presidents_data: pd.DataFrame) -> pd.DataFrame:
    '''
    Computes the number of children for all presidents and adds it as a new column
    ('Number of Children') to the DataFrame, which is modified in place and returned.

    The 'Children' text is treated as a list of entries separated by ',' or ';'. Missing
    values (None, NaN or any other non-string) count as 0 children and empty entries, e.g.
    those produced by a trailing separator, are not counted.
    '''
    def _count_children(children) -> int:
        ''' Counts the non-empty entries of the children text. '''
        if not isinstance(children, str):
            return 0

        entries = children.replace(';', ',').split(',')

        return sum(1 for entry in entries if entry.strip())


    presidents_data['Number of Children'] = presidents_data['Children'].map(_count_children)

    return presidents_data

In [None]:
# merge the available data for presidents and elections
all_presidents_data_df = get_all_presidents_data_df(
    miller_scrape = miller_scrape, 
    potus_scrape = potus_scrape
)
election_results_df = get_election_results_df(potus_scrape)

# clean presidents data (remove newlines, tabs and non-breaking spaces from strings)
all_presidents_data_df = clean_presidents_data(all_presidents_data_df)
# election_results_df does not need to be cleaned, it does not contain problematic values

# convert values to appropriate types
all_presidents_data_df = convert_presidents_data(all_presidents_data_df)
election_results_df = convert_elections_data(election_results_df)

# order presidents data by inauguration date
all_presidents_data_df = order_presidents_data(all_presidents_data_df)

# correct indices in elections data so that presidents carry the same names in both tables
election_results_df = correct_elections_data_indices(election_results_df, all_presidents_data_df)

# compute new features; the vote share needs the corrected elections indices from above
all_presidents_data_df = compute_years_at_inauguration(all_presidents_data_df)
all_presidents_data_df = compute_locations(all_presidents_data_df)
all_presidents_data_df = compute_first_electoral_vote_share(
    all_presidents_data_df, 
    election_results_df
)
all_presidents_data_df = compute_number_of_children(all_presidents_data_df)

In [None]:
# TODO: make the visualizations here

In [None]:
def plot_years_at_inauguration():
    '''
    Plots the age of presidents at inauguration against the inauguration date as a line
    chart with markers; hovering over a marker shows the name of the president.

    Reads the notebook-level all_presidents_data_df.
    '''
    graph_objs = plotly.graph_objs
    date_column = 'Inauguration Date'
    years_column = 'Years at Inauguration'

    years_line = graph_objs.Scatter(
        x = all_presidents_data_df[date_column],
        y = all_presidents_data_df[years_column],
        text = all_presidents_data_df.index,
        name = 'chart_a',
        mode = 'lines+markers',
        marker = dict(size = 7)
    )

    figure = graph_objs.Figure(
        data = [years_line],
        layout = graph_objs.Layout(
            title = years_column,
            xaxis = dict(title = date_column),
            yaxis = dict(title = years_column)
        )
    )

    plotly.offline.iplot(figure)

# draw the chart in the notebook (the function ends with plotly.offline.iplot)
plot_years_at_inauguration()

In [None]:
def plot_key_events_count():
    '''
    Plots the count of key events of each presidency against the inauguration date as a
    line chart with markers; hovering over a marker shows the name of the president.

    Reads the notebook-level all_presidents_data_df.
    '''
    graph_objs = plotly.graph_objs
    date_column = 'Inauguration Date'
    events_column = 'Key Events Count'

    events_line = graph_objs.Scatter(
        x = all_presidents_data_df[date_column],
        y = all_presidents_data_df[events_column],
        text = all_presidents_data_df.index,
        name = 'chart_a',
        mode = 'lines+markers',
        marker = dict(size = 7)
    )

    figure = graph_objs.Figure(
        data = [events_line],
        layout = graph_objs.Layout(
            title = events_column,
            xaxis = dict(title = date_column),
            yaxis = dict(title = events_column)
        )
    )

    plotly.offline.iplot(figure)

# draw the chart in the notebook (the function ends with plotly.offline.iplot)
plot_key_events_count()

In [None]:
def plot_years_at_inauguration_overlapping_histograms():
    '''
    Plots overlapping histograms of the age at inauguration, one histogram per political
    party.

    A president belongs to a party when the party abbreviation is contained in the
    president's 'Political Party' text. Presidents whose party is missing (None) belong to
    no party instead of raising a TypeError in the membership test.

    Reads the notebook-level all_presidents_data_df.
    '''
    parties = {
        'Democrats': 'Dem',
        'Federalists': 'Fed',
        'Republicans': 'Rep',
        'Unionists': 'Uni',
        'Whigs': 'Whi'
    }

    def _party_filter(abbreviation: str) -> pd.Series:
        ''' Returns a boolean mask of presidents whose party contains the abbreviation. '''
        return all_presidents_data_df['Political Party'].map(
            lambda value: isinstance(value, str) and abbreviation in value
        )

    traces = [
        plotly.graph_objs.Histogram(
            x = all_presidents_data_df.loc[_party_filter(abbreviation), 'Years at Inauguration'],
            name = party,
            opacity = 0.5
        )
        for party, abbreviation in parties.items()
    ]

    layout = plotly.graph_objs.Layout(
        barmode = 'overlay',
        title = 'Histogram of Years at Inauguration',
        yaxis = {'title': 'Number of Presidents'},
        xaxis = {'title': 'Years at Inauguration'},
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = traces,
        layout = layout
    ))

# draw the histograms in the notebook (the function ends with plotly.offline.iplot)
plot_years_at_inauguration_overlapping_histograms()

In [None]:
def plot_birth_places_and_paths_map():
    '''
    Plots the birth places of presidents on a map of the USA and connects the birth places
    of consecutive rows of all_presidents_data_df with lines.

    Reads the notebook-level all_presidents_data_df; the lines follow its row order.
    '''
    # plain lists give purely positional access; integer indexing of a Series whose index
    # consists of names relies on a positional fallback that pandas has deprecated
    longitudes = all_presidents_data_df['Birth Place Longitude'].tolist()
    latitudes = all_presidents_data_df['Birth Place Latitude'].tolist()

    birth_places = [plotly.graph_objs.Scattergeo(
        locationmode = 'USA-states',
        lon = longitudes,
        lat = latitudes,
        hoverinfo = 'text',
        text = all_presidents_data_df['Birth Place'],
        mode = 'markers',
        marker = plotly.graph_objs.scattergeo.Marker(
            size = 2,
            color = 'rgb(255, 0, 0)',
            line = plotly.graph_objs.scattergeo.marker.Line(
                width = 3,
                color = 'rgba(68, 68, 68, 0)'
            )
        )
    )]

    # one line segment between every pair of neighbouring rows
    coordinates = list(zip(longitudes, latitudes))
    paths = [
        plotly.graph_objs.Scattergeo(
            locationmode = 'USA-states',
            lon = [lon_from, lon_to],
            lat = [lat_from, lat_to],
            mode = 'lines',
            line = plotly.graph_objs.scattergeo.Line(
                width = 1,
                color = 'red',
            )
        )
        for (lon_from, lat_from), (lon_to, lat_to) in zip(coordinates, coordinates[1:])
    ]

    layout = plotly.graph_objs.Layout(
        title = plotly.graph_objs.layout.Title(
            text = 'US Presidents Birth Places Path'
        ),
        showlegend = False,
        geo = plotly.graph_objs.layout.Geo(
            scope = 'usa',
            projection = dict(type = 'albers usa'),
            showland = True,
            landcolor = 'rgb(243, 243, 243)',
            countrycolor = 'rgb(204, 204, 204)',
        ),
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = birth_places + paths,
        layout = layout
    ))

# draw the map in the notebook (the function ends with plotly.offline.iplot)
plot_birth_places_and_paths_map()

In [None]:
def plot_vote_share_heatmap():
    '''
    Plots a heatmap of the mean 'Electoral Votes Share' for every combination of number of
    children (x axis) and political party (y axis).

    A president belongs to a party when the party abbreviation is contained in the
    president's 'Political Party' text; presidents whose party is missing (None) belong to
    no party. Combinations without any president (or without a known share) are left empty.

    Reads the notebook-level all_presidents_data_df.
    '''
    parties = {
        'Democrats': 'Dem',
        'Federalists': 'Fed',
        'Republicans': 'Rep',
        'Unionists': 'Uni',
        'Whigs': 'Whi'
    }

    def _party_filter(abbreviation: str) -> pd.Series:
        ''' Returns a boolean mask of presidents whose party contains the abbreviation. '''
        return all_presidents_data_df['Political Party'].map(
            lambda value: isinstance(value, str) and abbreviation in value
        )

    # sorted so that the x axis is ordered; a bare set has no defined order
    number_of_children_list = sorted(set(all_presidents_data_df['Number of Children']))
    political_party_list = list(parties)

    # plotly reads z row by row along the y axis, so the outer list must follow the parties
    # (y) and each inner list the numbers of children (x)
    vote_share_list = []
    for party in political_party_list:
        party_filter = _party_filter(parties[party])
        vote_share_per_party_list = []

        for number in number_of_children_list:
            number_filter = all_presidents_data_df['Number of Children'] == number
            vote_share = all_presidents_data_df.loc[
                number_filter & party_filter, 'Electoral Votes Share'
            ].mean()
            # None leaves the cell of the heatmap empty
            vote_share_per_party_list.append(None if np.isnan(vote_share) else vote_share)

        vote_share_list.append(vote_share_per_party_list)

    trace = plotly.graph_objs.Heatmap(
        x = number_of_children_list,
        y = political_party_list,
        z = vote_share_list
    )

    plotly.offline.iplot([trace])

# draw the heatmap in the notebook (the function ends with plotly.offline.iplot)
plot_vote_share_heatmap()

In [None]:
# TODO: conclude here?