# US presidential statistics
## Simon Repko, Lukas Petrasek
### IES FSS CU
### 31.5.2019

This notebook serves as a demonstration of a school project whose goal is to achieve the following:
* scrape web pages to get historical data on US presidents
* manipulate the data into a form suitable for being visualized
* make visualizations based on the data

In [None]:
import sys

import plotly

sys.path.append('../')

import manipulation
import scraping
import visualizations



# Enable plotly's offline notebook mode so the figures produced below render inline.
# connected=True makes plotly load plotly.js from the online CDN instead of embedding
# the library in the notebook (see the plotly.offline documentation).
plotly.offline.init_notebook_mode(connected=True)

In [None]:
# Initialize the scraper for the Miller Center website.
miller_scrape = scraping.MillerScraper()

# The per-president page subdirectories are collected first, before any of the
# data-scraping steps below.
miller_scrape.get_subdirectories()

# Scrape, in this order:
#   * facts and characteristics        (fast_facts)
#   * brief descriptions               (descriptions)
#   * famous quotes                    (famous_quotes)
#   * counts of notable events that happened during each president's time
#     in office                        (key_events_counts)
for scrape_step in (
    miller_scrape.get_fast_facts,
    miller_scrape.get_descriptions,
    miller_scrape.get_famous_quotes,
    miller_scrape.get_key_events_counts,
):
    scrape_step()

# Grover Cleveland served two non-consecutive terms, which leaves his scraped
# data flawed; patch it up.
miller_scrape.correct_Grover_Cleveland_data()

In [None]:
# Initialize the scraper for the POTUS website.
potus_scrape = scraping.PotusScraper()

# Collect the page subdirectory of each president.
potus_scrape.get_subdirectories()

# POTUS formats presidents' names differently and gets some of them wrong.
# Fix the names in the subdirectories right away, using the Miller Center
# scraper, so that the salaries and election results scraped next are stored
# under the correct names.
potus_scrape.correct_subdirectories(miller_scrape)

# Scrape presidential salaries (salaries) and then election results
# (election_results).
for scrape_step in (
    potus_scrape.get_salaries,
    potus_scrape.get_election_results,
):
    scrape_step()

# Grover Cleveland's two terms were not consecutive, so his salary was recorded
# only once; duplicate it for his second term.
potus_scrape.duplicate_Grover_Cleveland_salary()

In [None]:
# Merge everything scraped so far into the presidents data, and build the
# election results data from the POTUS scraper.
all_presidents_data_df = manipulation.get_all_presidents_data_df(
    miller_scrape=miller_scrape,
    potus_scrape=potus_scrape,
)
election_results_df = manipulation.get_election_results_df(potus_scrape)

# Presidents data: clean it, then convert the values to appropriate types.
# The election results skip the cleaning step because they contain no
# problematic values; they only get the type conversion.
for prepare_step in (
    manipulation.clean_presidents_data,
    manipulation.convert_presidents_data,
):
    all_presidents_data_df = prepare_step(all_presidents_data_df)
election_results_df = manipulation.convert_elections_data(election_results_df)

# Put the presidents data in order.
all_presidents_data_df = manipulation.order_presidents_data(all_presidents_data_df)

# Correct the indices of the election results; this step takes the ordered
# presidents data as its second input.
election_results_df = manipulation.correct_elections_data_indices(
    election_results_df, all_presidents_data_df
)

# Compute new features on the presidents data: years at inauguration and
# locations first, then the first electoral vote share (which also needs the
# election results), and finally the number of children.
for add_feature in (
    manipulation.compute_years_at_inauguration,
    manipulation.compute_locations,
):
    all_presidents_data_df = add_feature(all_presidents_data_df)
all_presidents_data_df = manipulation.compute_first_electoral_vote_share(
    all_presidents_data_df, election_results_df
)
all_presidents_data_df = manipulation.compute_number_of_children(all_presidents_data_df)

In [None]:
# Visualize the presidents' years at inauguration from the prepared presidents data
# (presumably the feature added by compute_years_at_inauguration - TODO confirm).
visualizations.plot_years_at_inauguration(all_presidents_data_df)

In [None]:
# Visualize the key events count from the prepared presidents data
# (presumably the key_events_counts scraped from Miller Center - TODO confirm).
visualizations.plot_key_events_count(all_presidents_data_df)

In [None]:
# Visualize years at inauguration as overlapping histograms; how the presidents are
# split into the compared groups is decided inside the visualizations module.
visualizations.plot_years_at_inauguration_overlapping_histograms(all_presidents_data_df)

In [None]:
# Draw the map of birth places and paths from the prepared presidents data
# (presumably relies on the locations added by compute_locations - TODO confirm).
visualizations.plot_birth_places_and_paths_map(all_presidents_data_df)

In [None]:
# Draw the vote share heatmap from the prepared presidents data (presumably relies on
# the feature added by compute_first_electoral_vote_share - TODO confirm).
visualizations.plot_vote_share_heatmap(all_presidents_data_df)