# US presidential statistics
## Simon Repko, Lukas Petrasek
### IES FSS CU
### 31.5.2019

This notebook serves as a demonstration of a school project whose goal is to achieve the following:
* scrape web pages to get historical data on US presidents
* manipulate the data into a form suitable for being visualized
* make visualizations based on the data

In [None]:
# TODO: import packages

In [None]:
from typing import Any, Dict, Optional, Type
import functools
import itertools
import sys

from bs4 import BeautifulSoup
from tqdm import tqdm
import geopy.geocoders
import geopy.extra.rate_limiter
import numpy as np
import pandas as pd
import plotly
import requests

# Extend the module search path with the parent directory before the imports below;
# presumably that is where the project-local `manipulation` and `scraping` modules live.
sys.path.append('../')

import manipulation
import scraping



# Initialize plotly's notebook mode so that figures passed to plotly.offline.iplot are
# rendered inline in the notebook.
plotly.offline.init_notebook_mode(connected = True)

In [None]:
# TODO: initialize the scraping class and apply the methods necessary to get the data here

In [None]:
# initialize the scraping class for Miller Center
miller_scrape = scraping.MillerScraper()

# get the page subdirectories for each president; this is done first, before any of the
# data-collecting calls below
miller_scrape.get_subdirectories()

# get data on facts and characteristics (fast_facts), brief descriptions (descriptions),
# famous quotes (famous_quotes), and counts of notable events that happened during the
# president's time in office (key_events_counts)
miller_scrape.get_fast_facts()
miller_scrape.get_descriptions()
miller_scrape.get_famous_quotes()
miller_scrape.get_key_events_counts()

# correct Grover Cleveland's data, which are flawed because he served two non-consecutive
# terms
miller_scrape.correct_Grover_Cleveland_data()

In [None]:
# initialize the scraping class for POTUS
potus_scrape = scraping.PotusScraper()

# get the page subdirectories for each president
potus_scrape.get_subdirectories()

# POTUS uses different formats for the names of presidents and also gets some names wrong;
# correct the names in subdirectories now (using the Miller Center scraper as the
# reference) so that salaries and election results are saved under the correct names
potus_scrape.correct_subdirectories(miller_scrape)

# get data on presidential salaries (salaries) and election results (election_results)
potus_scrape.get_salaries()
potus_scrape.get_election_results()

# duplicate Grover Cleveland's salary for his second term; the salary was not recorded
# twice because his two terms were not consecutive
potus_scrape.duplicate_Grover_Cleveland_salary()

In [None]:
# TODO: manipulate the data here

In [None]:
# Build the two data frames used by every chart below: `all_presidents_data_df` and
# `election_results_df`. The steps are applied in sequence and each one reassigns the
# frame it transforms, so the order of the statements matters.

# merge the available data for presidents and elections
all_presidents_data_df = manipulation.get_all_presidents_data_df(
    miller_scrape = miller_scrape, 
    potus_scrape = potus_scrape
)
election_results_df = manipulation.get_election_results_df(potus_scrape)

# clean presidents data
all_presidents_data_df = manipulation.clean_presidents_data(all_presidents_data_df)
# election_results_df does not need to be cleaned; it does not contain problematic values

# convert values to appropriate types
all_presidents_data_df = manipulation.convert_presidents_data(all_presidents_data_df)
election_results_df = manipulation.convert_elections_data(election_results_df)

# order presidents data
all_presidents_data_df = manipulation.order_presidents_data(all_presidents_data_df)

# correct indices in elections data (takes the presidents data as a second input)
election_results_df = manipulation.correct_elections_data_indices(
    election_results_df, 
    all_presidents_data_df
)

# compute new features; presumably these add the derived columns that the plotting cells
# read (e.g. 'Years at Inauguration', 'Number of Children') -- verify in `manipulation`
all_presidents_data_df = manipulation.compute_years_at_inauguration(all_presidents_data_df)
all_presidents_data_df = manipulation.compute_locations(all_presidents_data_df)
all_presidents_data_df = manipulation.compute_first_electoral_vote_share(
    all_presidents_data_df, 
    election_results_df
)
all_presidents_data_df = manipulation.compute_number_of_children(all_presidents_data_df)

In [None]:
# TODO: make the visualizations here

In [None]:
def plot_years_at_inauguration():
    """Render a line-and-marker chart of 'Years at Inauguration' over 'Inauguration Date'.

    Reads the notebook-level ``all_presidents_data_df``; the frame's index is used as
    the hover text of each point. The figure is displayed inline and nothing is returned.
    """
    graph_objs = plotly.graph_objs
    presidents = all_presidents_data_df

    figure = graph_objs.Figure(
        data=[
            graph_objs.Scatter(
                x=presidents['Inauguration Date'],
                y=presidents['Years at Inauguration'],
                mode='lines+markers',
                name='chart_a',
                marker={'size': 7},
                text=presidents.index,
            )
        ],
        layout=graph_objs.Layout(
            title='Years at Inauguration',
            xaxis={'title': 'Inauguration Date'},
            yaxis={'title': 'Years at Inauguration'},
        ),
    )
    plotly.offline.iplot(figure)

plot_years_at_inauguration()

In [None]:
def plot_key_events_count():
    """Render a line-and-marker chart of 'Key Events Count' over 'Inauguration Date'.

    Reads the notebook-level ``all_presidents_data_df``; the frame's index is used as
    the hover text of each point. The figure is displayed inline and nothing is returned.
    """
    value_column = 'Key Events Count'
    date_column = 'Inauguration Date'

    scatter_options = {
        'x': all_presidents_data_df[date_column],
        'y': all_presidents_data_df[value_column],
        'mode': 'lines+markers',
        'name': 'chart_a',
        'marker': {'size': 7},
        'text': all_presidents_data_df.index,
    }
    layout_options = {
        'title': value_column,
        'xaxis': {'title': date_column},
        'yaxis': {'title': value_column},
    }

    series_trace = plotly.graph_objs.Scatter(**scatter_options)
    chart_layout = plotly.graph_objs.Layout(**layout_options)
    plotly.offline.iplot(
        plotly.graph_objs.Figure(data=[series_trace], layout=chart_layout)
    )

plot_key_events_count()

In [None]:
def plot_years_at_inauguration_overlapping_histograms():
    """Overlay one semi-transparent histogram of 'Years at Inauguration' per party.

    A president is counted for a party when the party's three-letter abbreviation
    occurs in the president's 'Political Party' value, so a single president can
    contribute to more than one histogram. Reads the notebook-level
    ``all_presidents_data_df``; the figure is displayed inline and nothing is returned.
    """
    # legend label -> substring looked for in the 'Political Party' column
    abbreviation_by_party = {
        'Democrats': 'Dem',
        'Federalists': 'Fed', 
        'Republicans': 'Rep',
        'Unionists': 'Uni',
        'Whigs': 'Whi'
    }

    def histogram_for(label, abbreviation):
        # Boolean mask selecting the presidents whose party value mentions the abbreviation.
        belongs_to_party = all_presidents_data_df['Political Party'].map(
            lambda value: abbreviation in value
        )
        return plotly.graph_objs.Histogram(
            x=all_presidents_data_df.loc[belongs_to_party, 'Years at Inauguration'],
            name=label,
            opacity=0.5,
        )

    histograms = [
        histogram_for(label, abbreviation)
        for label, abbreviation in abbreviation_by_party.items()
    ]

    overlay_layout = plotly.graph_objs.Layout(
        barmode='overlay',
        title='Histogram of Years at Inauguration',
        yaxis={'title': 'Number of Presidents'},
        xaxis={'title': 'Years at Inauguration'},
    )

    figure = plotly.graph_objs.Figure(data=histograms, layout=overlay_layout)
    plotly.offline.iplot(figure)

plot_years_at_inauguration_overlapping_histograms()

In [None]:
def plot_birth_places_and_paths_map():
    """Plot presidents' birth places on a US map and connect consecutive rows with lines.

    Reads the notebook-level ``all_presidents_data_df``. Every row contributes one
    marker at ('Birth Place Longitude', 'Birth Place Latitude') with 'Birth Place'
    as hover text, and every pair of neighbouring rows (in the frame's current row
    order) contributes one line segment. The figure is displayed inline and nothing
    is returned.
    """
    birth_places = [plotly.graph_objs.Scattergeo(
        locationmode = 'USA-states',
        lon = all_presidents_data_df['Birth Place Longitude'],
        lat = all_presidents_data_df['Birth Place Latitude'],
        hoverinfo = 'text',
        text = all_presidents_data_df['Birth Place'],
        mode = 'markers',
        marker = plotly.graph_objs.scattergeo.Marker(
            size = 2,
            color = 'rgb(255, 0, 0)',
            line = plotly.graph_objs.scattergeo.marker.Line(
                width = 3,
                color = 'rgba(68, 68, 68, 0)'
            )
        )
    )]

    # Take the coordinates as plain lists so that neighbouring rows are paired strictly
    # by position. Indexing the Series with an integer (`series[i]`) is label-based on an
    # integer index and only a deprecated positional fallback on any other index, so it
    # cannot be relied on to mean "the i-th row".
    longitudes = all_presidents_data_df['Birth Place Longitude'].tolist()
    latitudes = all_presidents_data_df['Birth Place Latitude'].tolist()

    # One (start, end) pair per segment; yields nothing for fewer than two rows.
    longitude_pairs = zip(longitudes[:-1], longitudes[1:])
    latitude_pairs = zip(latitudes[:-1], latitudes[1:])

    paths = [
        plotly.graph_objs.Scattergeo(
            locationmode = 'USA-states',
            lon = list(longitude_pair),
            lat = list(latitude_pair),
            mode = 'lines',
            line = plotly.graph_objs.scattergeo.Line(
                width = 1,
                color = 'red',
            )
        )
        for longitude_pair, latitude_pair in zip(longitude_pairs, latitude_pairs)
    ]

    layout = plotly.graph_objs.Layout(
        title = plotly.graph_objs.layout.Title(
            text = 'US Presidents Birth Places Path'
        ),
        showlegend = False,
        geo = plotly.graph_objs.layout.Geo(
            scope = 'usa',
            projection = dict(type = 'albers usa'),
            showland = True,
            landcolor = 'rgb(243, 243, 243)',
            countrycolor = 'rgb(204, 204, 204)',
        ),
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = birth_places + paths,
        layout = layout
    ))

plot_birth_places_and_paths_map()

In [None]:
def plot_vote_share_heatmap():
    """Plot the mean 'Electoral Votes Share' per political party and number of children.

    Reads the notebook-level ``all_presidents_data_df``. The x axis holds the distinct
    'Number of Children' values in ascending order, the y axis holds the parties, and
    each cell is the mean 'Electoral Votes Share' of the presidents matching both; cells
    with no matching president are left empty. A president is counted for a party when
    the party's three-letter abbreviation occurs in the 'Political Party' value. The
    figure is displayed inline and nothing is returned.
    """
    # legend label -> substring looked for in the 'Political Party' column
    parties = {
        'Democrats': 'Dem',
        'Federalists': 'Fed', 
        'Republicans': 'Rep',
        'Unionists': 'Uni',
        'Whigs': 'Whi'
    }

    # Sorted so the column order is deterministic; missing values are dropped because a
    # NaN never compares equal to itself and would only yield an always-empty column.
    # Assumes the column holds mutually comparable (numeric) values.
    number_of_children_list = sorted(
        all_presidents_data_df['Number of Children'].dropna().unique().tolist()
    )
    political_party_list = list(parties)

    # Heatmap reads z row by row along y and column by column along x, so the outer
    # loop has to run over the parties (y) and the inner one over the counts (x).
    vote_share_list = []
    for party in political_party_list:
        abbreviation = parties[party]
        # The party mask does not depend on the number of children; compute it once.
        party_filter = all_presidents_data_df['Political Party'].map(
            lambda x: abbreviation in x
        )
        vote_share_per_party_list = []

        for number in number_of_children_list:
            number_filter = all_presidents_data_df['Number of Children'] == number
            vote_share = all_presidents_data_df.loc[
                number_filter & party_filter, 'Electoral Votes Share'
            ].mean()
            # The mean of an empty selection is NaN; pass None so the cell stays blank.
            vote_share = None if np.isnan(vote_share) else vote_share
            vote_share_per_party_list.append(vote_share)

        vote_share_list.append(vote_share_per_party_list)

    trace = plotly.graph_objs.Heatmap(
        x = number_of_children_list,
        y = political_party_list,
        z = vote_share_list
    )

    layout = plotly.graph_objs.Layout(
        title = 'Mean Electoral Votes Share',
        xaxis = {'title': 'Number of Children'},
        yaxis = {'title': 'Political Party'},
    )

    plotly.offline.iplot(plotly.graph_objs.Figure(
        data = [trace],
        layout = layout
    ))

plot_vote_share_heatmap()

In [None]:
# TODO: conclude here?