In [1]:
# Import preliminaries
import pandas as pd
from sqlalchemy import create_engine, inspect
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import func

# EXTRACT 1: Create dataframes for health measurement data

In [2]:
# Import the annual_report csv file and inspect the columns
# The path is relative, so the file must be reachable from the kernel's
# working directory.
health_csv = "annual_report.csv"
health_df = pd.read_csv(health_csv)
# Display the first rows to see which columns the report provides.
health_df.head()

Unnamed: 0,Edition,Report Type,Measure Name,State Name,Rank,Value,Score,Lower CI,Upper CI,Source,Source Year
0,2019,2019 Annual,Adverse Childhood Experiences,Alaska,34.0,24.1,0.96,27.7,20.5,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
1,2019,2019 Annual,Adverse Childhood Experiences,Alabama,46.0,26.3,1.56,30.1,22.5,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
2,2019,2019 Annual,Adverse Childhood Experiences,United States,,20.5,,21.2,19.8,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
3,2019,2019 Annual,Adverse Childhood Experiences,Arkansas,47.0,27.1,1.76,30.8,23.3,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
4,2019,2019 Annual,Adverse Childhood Experiences,Arizona,48.0,27.3,1.82,31.1,23.4,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017


## TRANSFORM: Only keep columns of interest

In [3]:
# Narrow the report down to the six columns used later on:
# state name, rank, measure name, score, source, source year.
# .copy() makes the result an independent frame rather than a view of the
# full report.
health_df = health_df.loc[
    :,
    ['State Name', 'Rank', 'Measure Name', 'Score', 'Source', 'Source Year'],
].copy()
health_df.head()

Unnamed: 0,State Name,Rank,Measure Name,Score,Source,Source Year
0,Alaska,34.0,Adverse Childhood Experiences,0.96,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
1,Alabama,46.0,Adverse Childhood Experiences,1.56,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
2,United States,,Adverse Childhood Experiences,,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
3,Arkansas,47.0,Adverse Childhood Experiences,1.76,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
4,Arizona,48.0,Adverse Childhood Experiences,1.82,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017


## TRANSFORM: Isolate each Health Measure Name of interest
* mental illness
* insufficient sleep
* air pollution

In [4]:
# Mental illness dataframe: keep only the rows whose measure is
# 'Mental illness' (all columns retained).
# Note: the displayed head shows that the national 'United States' row
# (no Rank/Score) is kept alongside the individual states.
health_mental_df = health_df.loc[health_df['Measure Name'].eq('Mental illness')]
health_mental_df.head()

Unnamed: 0,State Name,Rank,Measure Name,Score,Source,Source Year
23598,Alaska,45.0,Mental illness,1.94,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
23599,Alabama,14.0,Mental illness,0.05,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
23600,United States,,Mental illness,,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
23601,Arkansas,41.0,Mental illness,1.59,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
23602,Arizona,28.0,Mental illness,0.45,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017


In [5]:
# Insufficient sleep dataframe: keep only the rows whose measure is
# 'Insufficient Sleep' (all columns retained).
health_sleep_df = health_df.loc[health_df['Measure Name'].eq('Insufficient Sleep')]
health_sleep_df.head()

Unnamed: 0,State Name,Rank,Measure Name,Score,Source,Source Year
21310,Alabama,43.0,Insufficient Sleep,1.21,"CDC, Behavioral Risk Factor Surveillance System",2018
21311,Alaska,20.0,Insufficient Sleep,-0.3,"CDC, Behavioral Risk Factor Surveillance System",2018
21312,Arizona,22.0,Insufficient Sleep,-0.15,"CDC, Behavioral Risk Factor Surveillance System",2018
21313,Arkansas,29.0,Insufficient Sleep,0.47,"CDC, Behavioral Risk Factor Surveillance System",2018
21314,California,25.0,Insufficient Sleep,-0.03,"CDC, Behavioral Risk Factor Surveillance System",2018


In [6]:
# Air pollution dataframe: keep only the rows whose measure is
# 'Air Pollution' (all columns retained).
health_airpollution_df = health_df.loc[health_df['Measure Name'].eq('Air Pollution')]
health_airpollution_df.head()

Unnamed: 0,State Name,Rank,Measure Name,Score,Source,Source Year
52,Alaska,10.0,Air Pollution,-1.4,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
53,Alabama,36.0,Air Pollution,-0.21,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
54,Arkansas,19.0,Air Pollution,-0.91,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
55,Arizona,49.0,Air Pollution,0.91,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
56,California,50.0,Air Pollution,2.0,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018


## TRANSFORM: Clean each dataframe
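Define a function that performs the following steps:
* Sort each dataframe by state name (in descending order)
* Rename columns to match the SQL table column names
* Reset the index of each dataframe
* Prefix every column name with a short label for the measure (for example, `pollution_`)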

In [7]:
# Function to clean dataframe
def clean_df(df, col_append):
    """Return a sorted, renamed and prefixed copy of a single-measure dataframe.

    Steps, in order:
      1. Sort the rows by 'State Name' in descending order.
      2. Rename 'State Name', 'Measure Name' and 'Source Year' to
         'state', 'measure_name' and 'source_year'.
      3. Replace the index with a fresh 0..n-1 range.
      4. Put '<col_append>_' in front of every column name.

    Columns that are not part of the rename mapping keep their original
    spelling and only receive the prefix (e.g. 'Rank' -> 'pollution_Rank').

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'State Name' column.
    col_append : str
        Label placed, followed by an underscore, before each column name.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unchanged.
    """
    prefix = '{}_'.format(col_append)
    cleaned = (
        df.sort_values(by='State Name', ascending=False)
        .rename(columns={
            "State Name": "state",
            'Measure Name': 'measure_name',
            'Source Year': 'source_year',
        })
        .reset_index(drop=True)
    )
    # Prefix is applied last so that it also covers the renamed columns.
    cleaned.columns = [prefix + label for label in cleaned.columns]
    return cleaned

In [8]:
# Clean air pollution dataframe: clean_df sorts by state, renames the columns
# and adds the "pollution_" prefix.
# NOTE: the result is bound to the same name as the input. clean_df sorts on
# 'State Name', which no longer exists once this cell has run, so re-running
# this cell on its own raises a KeyError; re-run the cell that builds
# health_airpollution_df first.
health_airpollution_df = clean_df(health_airpollution_df, "pollution")
health_airpollution_df.head()

Unnamed: 0,pollution_state,pollution_Rank,pollution_measure_name,pollution_Score,pollution_Source,pollution_source_year
0,Wyoming,3.0,Air Pollution,-2.0,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
1,Wisconsin,15.0,Air Pollution,-1.12,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
2,West Virginia,29.0,Air Pollution,-0.56,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
3,Washington,34.0,Air Pollution,-0.28,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018
4,Virginia,17.0,Air Pollution,-1.05,U.S. Environmental Protection Agency; U.S. Cen...,2016-2018


In [9]:
# Clean mental illness dataframe: clean_df sorts by state, renames the columns
# and adds the "mentalill_" prefix.
# NOTE: the result is bound to the same name as the input. clean_df sorts on
# 'State Name', which no longer exists once this cell has run, so re-running
# this cell on its own raises a KeyError; re-run the cell that builds
# health_mental_df first.
health_mental_df = clean_df(health_mental_df, "mentalill")
health_mental_df.head()

Unnamed: 0,mentalill_state,mentalill_Rank,mentalill_measure_name,mentalill_Score,mentalill_Source,mentalill_source_year
0,Wyoming,42.0,Mental illness,1.74,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
1,Wisconsin,38.0,Mental illness,1.19,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
2,West Virginia,43.0,Mental illness,1.89,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
3,Washington,34.0,Mental illness,0.85,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017
4,Virginia,13.0,Mental illness,-0.05,"U.S. HHS, HRSA, Maternal and Child Health Bure...",2016-2017


In [10]:
# Clean insufficient sleep dataframe: clean_df sorts by state, renames the
# columns and adds the "sleep_" prefix.
# NOTE: the result is bound to the same name as the input. clean_df sorts on
# 'State Name', which no longer exists once this cell has run, so re-running
# this cell on its own raises a KeyError; re-run the cell that builds
# health_sleep_df first.
health_sleep_df = clean_df(health_sleep_df, "sleep")
health_sleep_df.head()

Unnamed: 0,sleep_state,sleep_Rank,sleep_measure_name,sleep_Score,sleep_Source,sleep_source_year
0,Wyoming,15.0,Insufficient Sleep,-0.47,"CDC, Behavioral Risk Factor Surveillance System",2018
1,Wisconsin,10.0,Insufficient Sleep,-0.74,"CDC, Behavioral Risk Factor Surveillance System",2018
2,West Virginia,49.0,Insufficient Sleep,2.0,"CDC, Behavioral Risk Factor Surveillance System",2018
3,Washington,7.0,Insufficient Sleep,-0.86,"CDC, Behavioral Risk Factor Surveillance System",2018
4,Virginia,41.0,Insufficient Sleep,1.03,"CDC, Behavioral Risk Factor Surveillance System",2018


# EXTRACT 2: Store State Happiness Ranking Data

In [11]:
# Read the state happiness rankings, give four columns SQL-friendly names,
# and order the rows alphabetically by state with a fresh 0..n-1 index.
happiness_csv = "happiness.csv"
happiness_df = (
    pd.read_csv(happiness_csv)
    # Only these four columns are renamed; the others keep their CSV names.
    .rename(columns={
        "State": "state",
        "overall": "overall_rank",
        "emotAndPhysRank": "emotional_rank",
        "Pop": "population",
    })
    .sort_values(by='state', ascending=True)
    .reset_index(drop=True)
)
happiness_df.head()

Unnamed: 0,overall_rank,state,totalScore,emotional_rank,workEnvironRank,communityAndEnvironRank,population
0,45,Alabama,39.35,46,39,43,4898246
1,47,Alaska,38.21,33,49,50,735720
2,21,Arizona,52.92,27,12,34,7275070
3,49,Arkansas,36.61,50,29,23,3026412
4,5,California,63.14,4,24,12,39747267


In [None]:
# LOAD: Load the 4 dataframes into the SQL database