# US presidential statistics
## Simon Repko, Lukas Petrasek
### IES FSS CU
### 31.5.2019

This notebook serves as a demonstration of a school project whose goal is to achieve the following:
* scrape web pages to get historical data on US presidents
* manipulate the data into a form suitable for being visualized
* make visualizations based on the data

In [None]:
# TODO: import packages

In [None]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [None]:
# TODO: initialize the scraping class and apply the methods necessary to get the data here

In [None]:
def getSoup(link, timeout=30):
    """Download the page at `link` and return it parsed as a BeautifulSoup tree.

    link    -- absolute URL of the page to fetch
    timeout -- seconds to wait for the server, passed on to `requests.get`

    Returns the response text parsed with the built-in 'html.parser'.
    The HTTP status code is not checked, so an error page is parsed like
    any other page. Network failures (including an expired timeout) surface
    as the exceptions raised by `requests.get`.
    """
    # requests applies no timeout by default; without one a stalled
    # connection would block the whole notebook indefinitely
    response = requests.get(link, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def getjoinSoup(link, timeout=30, base_url='https://millercenter.org'):
    """Download a page given by a site-relative link and return it parsed.

    link     -- link 'ending' as found in an href (e.g. '/president/...')
    timeout  -- seconds to wait for the server, passed on to `requests.get`
    base_url -- site the link is resolved against

    Returns the response text parsed with the built-in 'html.parser'.
    The HTTP status code is not checked, so an error page is parsed like
    any other page. Network failures (including an expired timeout) surface
    as the exceptions raised by `requests.get`.
    """
    # urljoin gives the same result as plain concatenation for hrefs that
    # start with '/', and additionally copes with hrefs that lack the
    # leading slash or that are already absolute URLs
    url = urljoin(base_url, link)
    # requests applies no timeout by default; without one a stalled
    # connection would block the whole notebook indefinitely
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

def getDataMiller(dictionary):
    """Scrape the page of every president listed in `dictionary`.

    dictionary -- maps a president's name to the link 'ending' of that
                  president's page (as accepted by `getjoinSoup`)

    Returns a dict keyed by president name. Each value is a dict holding
    the label/value pairs of the page's fast-facts panel plus the keys
    'Description', 'Quote' and 'Number of major events'.
    """
    scraped = {}
    for name, href in dictionary.items():
        # open the page of this particular president
        page = getjoinSoup(href)

        # 1) fast-facts panel: one child element per fact
        main_wrapper = page.find('div', {'class': 'president-main-wrapper'})
        facts_panel = main_wrapper.find('div', {'class': 'fast-facts-wrapper'})
        # drop the bare newline nodes that sit between the elements
        fact_nodes = [node for node in facts_panel.children if node != '\n']
        # the first remaining node is the descriptive header div, not a fact
        del fact_nodes[0]
        # the <label> text names the fact, the nested <div> text is its value
        record = {node.label.text: node.div.text for node in fact_nodes}

        # 2) short description: first paragraph of the copy wrapper
        copy_wrapper = page.find('div', {'class': 'copy-wrapper'})
        record['Description'] = copy_wrapper.p.text

        # 3) famous quote: first child node of the quote block
        quote_block = page.find('blockquote', {'class': 'president-quote'})
        record['Quote'] = quote_block.contents[0]

        # 4) number of key events during the time in office;
        #    the link at index 1 of the sub-navigation is followed to reach
        #    the list of key events
        nav_links = page.find('div', {'class': 'sub-nav-region'}).find_all('a')
        events_page = getjoinSoup(nav_links[1]['href'])

        # Event titles are set in bold. The site switched notation at some
        # point, so both <strong> and <b> tags are counted and summed.
        # A page without the article body (D. Trump's, at the time of
        # writing) makes `find` return None; that case counts as 0 events.
        try:
            article = events_page.find('div', {'class': 'article-wysiwyg-body'})
            event_count = len(article.find_all('strong')) + len(article.find_all('b'))
        except AttributeError:
            event_count = 0
        record['Number of major events'] = event_count

        scraped[name] = record

    return scraped

In [None]:
# Landing page of the Miller Center site, fetched and parsed with BeautifulSoup
html = 'https://millercenter.org/'
soup = getSoup(html)

# The main navigation panel holds several 'submenu' lists, some of them
# similar or identical. The one at index 1 is taken as the list of US
# presidents together with the link 'endings' of their pages.
main_nav = soup.find('nav', {'aria-labelledby': 'block-mainnavigation-3-menu'})
submenus = main_nav.find_all('ul', {'class': 'submenu'})
name_list = submenus[1]

# president name (link text) -> link 'ending' (href)
pres_dict = {}
for pres in name_list.find_all('a'):
    pres_dict[pres.text] = pres['href']

In [None]:
# Scrape the page of every president collected in pres_dict (name -> link 'ending')
data_presidents = getDataMiller(pres_dict)

# Final table with data extracted from https://millercenter.org
# Built from a dict of dicts: one column per president, one row per scraped field.
# Left as the last bare expression so the notebook cell shows the table as its output.
pd.DataFrame(data_presidents)

In [None]:
# TODO: manipulate the data here

In [None]:
# TODO: make the visualizations here

In [None]:
# TODO: conclude here?