# Load States Data
* States
* States Population 

In [199]:
# Dependencies
import pandas as pd
import os
import datetime

# Import SQL Alchemy
import sqlalchemy as db
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# Database - Postgres
import psycopg2

In [231]:
# Configuration constants used by the cells below
# Earliest year column that get_yearcolumns() will pick up
START_YEAR = 2015
# Value written to state_population.source_id for rows loaded by this notebook
DATA_SRC_STATE_POPULATION = 2

In [207]:
# Database connection
def get_dbconnection():
    """Open a new psycopg2 connection to the ETL project database.

    Each setting is read from the matching PostgreSQL environment variable
    (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE) when it is set, so real
    credentials do not have to be stored in the notebook. When a variable is
    not set, the previous local-development value is used, so behaviour is
    unchanged on an unconfigured machine.

    Returns:
        A new psycopg2 connection. The caller is responsible for committing
        or rolling back and for closing it.

    Raises:
        psycopg2.OperationalError: if the server cannot be reached or the
            credentials are rejected.
    """
    connection = psycopg2.connect(
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD", "postgres"),
        host=os.environ.get("PGHOST", "localhost"),
        port=os.environ.get("PGPORT", "5432"),
        database=os.environ.get("PGDATABASE", "ETLproject"),
    )
    return connection

# Get States database. returns dataframe
def get_states():
    """Load every row of the ``state`` table into a pandas DataFrame.

    Returns:
        DataFrame with one row per record and one column per column of the
        ``state`` table. An empty table yields an empty DataFrame that still
        carries the column names.
    """
    # Let the engine obtain its DBAPI connections from get_dbconnection() so
    # the connection settings live in one place; the URL only selects the
    # dialect and driver.
    engine = create_engine('postgresql+psycopg2://', creator=get_dbconnection)
    try:
        metadata = db.MetaData()
        states = db.Table('state', metadata, autoload=True, autoload_with=engine)
        # The context manager returns the connection to the pool even if the
        # query raises.
        with engine.connect() as connection:
            result = connection.execute(db.select([states]))
            # Column names come from the result itself rather than from the
            # first row, so an empty table no longer fails on results[0].
            columns = list(result.keys())
            rows = [tuple(row) for row in result.fetchall()]
    finally:
        # Close the pooled connections created for this one-off query.
        engine.dispose()
    return pd.DataFrame(rows, columns=columns)

# Read columns in spreadsheet and get index for years
def get_yearcolumns(df, start_year=None):
    """Map the year-labelled columns of ``df`` to their positional index.

    A column counts as a year column when its label is an integer (Python or
    numpy) or a string made only of digits, and the resulting year lies
    between ``start_year`` and the current calendar year, both inclusive.

    Args:
        df: DataFrame whose column labels are inspected.
        start_year: earliest year to accept. Defaults to the module-level
            START_YEAR when omitted.

    Returns:
        dict mapping year (int) to the zero-based position of that column.
        Iterating the dict yields the years in column order.
    """
    import numbers  # local import keeps this cell self-contained

    if start_year is None:
        start_year = START_YEAR
    current_year = datetime.datetime.now().year

    year_columns = {}
    for idx, col in enumerate(df.columns.values):
        # numbers.Integral also matches numpy integers, which is what pandas
        # hands back when every column label is an integer.
        if isinstance(col, numbers.Integral):
            year = int(col)
        elif isinstance(col, str) and col.isdigit():
            year = int(col)
        else:
            continue
        if start_year <= year <= current_year:
            year_columns[year] = idx
    return year_columns

### States Basics
* Input data in Excel - state_codes.xls
* Expects the state name, its two-letter (A2) abbreviation, and its FIPS code in the first 3 columns

In [195]:
# Read states data with FIPS codes; the values are in the sheet named 'Sheet2'
states = os.path.join("..", "InputData", "state_codes.xls")
# Only the first three columns are used. Restricting the read keeps the
# column rename below valid even if the sheet carries extra columns.
statesdata_df = pd.read_excel(states, sheet_name='Sheet2', usecols=[0, 1, 2])
statesdata_df.columns = ["name", "name_a2", "id"]
statesdata_df.head()

Unnamed: 0,name,name_a2,id
0,Alabama,AL,1
1,Alaska,AK,2
2,Arizona,AZ,4
3,Arkansas,AR,5
4,California,CA,6


In [196]:
# Insert data into database
connection = get_dbconnection()

# ON CONFLICT DO NOTHING leaves rows that already exist untouched, so the cell
# can be re-run. The alternative noted by the original author was
# DO UPDATE SET (name, name_a2) = (EXCLUDED.name, EXCLUDED.name_a2).
insert_state_sql = "INSERT INTO state (id, name, name_a2) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING"

try:
    # "with connection" commits when the block succeeds and rolls back when it
    # raises; "with connection.cursor()" closes the cursor in both cases.
    with connection:
        with connection.cursor() as cursor:
            for index, row in statesdata_df.iterrows():
                cursor.execute(insert_state_sql,
                               (row['id'], row['name'], row['name_a2']))
    # Report success only after the transaction has actually been committed.
    print("States import finished.")
except psycopg2.DatabaseError as e:
    print(f"States import failed: {e}")
finally:
    # The connection context manager does not close the connection itself.
    connection.close()


Import finished.


### States Population
* Input data in nst-est2018-01.xlsx
* Sheet contains data for multiple years

In [238]:
# States population from file
# Code reflects layout of data in the file
population_file = os.path.join("..", "InputData", "nst-est2018-01.xlsx")

# Skip the first 3 rows, they do not contain data used here
population_df = pd.read_excel(population_file, sheet_name='NST01', skiprows=3)

# State name is in the first column. Rename it through the pandas API:
# writing into columns.values mutates the (immutable) Index in place, which
# pandas does not support.
population_df = population_df.rename(columns={population_df.columns[0]: "state"})

# States from database
states_df = get_states()

# Merge data - be sure to clean up the data in the merging columns, i.e. state names.
# The default inner join silently drops states whose names do not match exactly.
states_population_df = pd.merge(states_df, population_df, left_on='name', right_on='state')

# Save data in database
connection = get_dbconnection()

# ON CONFLICT DO NOTHING leaves rows that already exist untouched, so the cell can be re-run.
insert_population_sql = ("INSERT INTO state_population (state_id, population, year, source_id) "
                         "VALUES (%s, %s, %s, %s) ON CONFLICT DO NOTHING")

try:
    # "with connection" commits when the block succeeds and rolls back when it
    # raises; "with connection.cursor()" closes the cursor in both cases.
    with connection:
        with connection.cursor() as cursor:
            for year in get_yearcolumns(population_df):
                state_pop_year_df = states_population_df[['state', 'id', year]]
                # NOTE(review): every estimate is stored with the date June 6 of
                # its year - confirm this is the intended reference date.
                yeardate = datetime.datetime(year, 6, 6)
                for index, row in state_pop_year_df.iterrows():
                    cursor.execute(insert_population_sql,
                                   (row['id'], row[year], yeardate, DATA_SRC_STATE_POPULATION))
                print(f"Year: {year}, states sum population: {int(states_population_df[year].sum())}")
    # Report success only after the transaction has actually been committed.
    print("Population import finished.")
except psycopg2.DatabaseError as e:
    print(f"Population import failed: {e}")
finally:
    # The connection context manager does not close the connection itself.
    connection.close()

Year: 2015, states sum population: 320742673
Year: 2016, states sum population: 323071342
Year: 2017, states sum population: 325147121
Year: 2018, states sum population: 327167434
Population import finished.


Unnamed: 0,state,id,2018
0,Alabama,1,4887871.0
1,Alaska,2,737438.0
2,Arizona,4,7171646.0
3,Arkansas,5,3013825.0
4,California,6,39557045.0
5,Colorado,8,5695564.0
6,Connecticut,9,3572665.0
7,Delaware,10,967171.0
8,District of Columbia,11,702455.0
9,Florida,12,21299325.0


2019
