# Load States Data
* States
* States Population 

In [159]:
# Dependencies
import pandas as pd
import os

# Import SQL Alchemy
import sqlalchemy as db
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session

# Database - Postgres
import psycopg2

In [180]:
# Database connection
def get_dbconnection():
    """Open a new psycopg2 connection to the project's Postgres database.

    Each setting is taken from the matching libpq-style environment variable
    (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE) so real credentials do
    not have to be written into the notebook. When a variable is not set,
    the local development value is used instead.

    Returns:
        A new psycopg2 connection. The caller is responsible for closing it.
    """
    connection = psycopg2.connect(user = os.environ.get("PGUSER", "postgres"),
                                  password = os.environ.get("PGPASSWORD", "postgres"),
                                  host = os.environ.get("PGHOST", "localhost"),
                                  port = os.environ.get("PGPORT", "5432"),
                                  database = os.environ.get("PGDATABASE", "ETLproject"))
    return connection

# Get States database. returns dataframe
def get_states():
    """Load every row of the ``state`` table into a pandas DataFrame.

    The table definition is reflected from the database, so the DataFrame
    columns follow whatever columns the table currently has.

    Returns:
        pandas.DataFrame with one row per record in ``state``. When the table
        is empty the frame has no rows but still carries the column names.
    """
    # Let the engine obtain its DBAPI connections from get_dbconnection() so
    # the connection settings are defined in a single place.
    engine = create_engine('postgresql+psycopg2://', creator=get_dbconnection)
    try:
        metadata = db.MetaData()
        states = db.Table('state', metadata, autoload=True, autoload_with=engine)
        # The context manager returns the connection to the pool even if the
        # query raises.
        with engine.connect() as connection:
            result = connection.execute(db.select([states]))
            # Column names come from the result itself rather than from the
            # first row, so an empty table does not raise IndexError.
            column_names = list(result.keys())
            rows = [tuple(row) for row in result.fetchall()]
    finally:
        # The engine is created per call; release its pooled connections.
        engine.dispose()
    return pd.DataFrame(rows, columns=column_names)

### States Basics
* Input data in Excel - state_codes.xls
* Expects name, A2 name, FIPS code in first 3 columns 

In [148]:
# Workbook with state names, two-letter codes and FIPS codes
states = os.path.join("..", "InputData", "state_codes.xls")

# The values are on the second sheet; name its three columns after the
# fields used when inserting into the `state` table
statesdata_df = pd.read_excel(io=states, sheet_name="Sheet2")
statesdata_df.columns = ["name", "name_a2", "id"]

# Preview the first rows
statesdata_df.head()

Unnamed: 0,name,name_a2,id
0,Alabama,AL,1
1,Alaska,AK,2
2,Arizona,AZ,4
3,Arkansas,AR,5
4,California,CA,6


In [150]:
# Insert data into database
# One INSERT per row of statesdata_df, all inside a single transaction:
# either every row is committed or, on a database error, none is.
connection = get_dbconnection()

try:
    # The context manager closes the cursor on both the success and the
    # failure path.
    with connection.cursor() as cursor:
        # DO NOTHING / UPDATE SET (name, name_a2) = (EXCLUDED.name, EXCLUDED.name_a2)
        for index, row in statesdata_df.iterrows():
            cursor.execute("INSERT INTO state (id, name, name_a2) VALUES (%s, %s, %s) ON CONFLICT DO NOTHING",
                           (row['id'], row['name'], row['name_a2']))
    # commit the changes to the database
    connection.commit()
    # Report success only after the commit actually went through
    print("Import finished.")
except psycopg2.DatabaseError as e:
    # Discard the partially applied inserts before reporting the error
    connection.rollback()
    print("Import failed:", e)
finally:
    connection.close()


Import finished.


### States Population
* Input data in nst-est2018-01.xlsx
* Sheet contains data for multiple years

In [183]:
# Constants
START_YEAR = 2015

# Column index for columns with years data
def getColLocation(df, start_year=None):
    """Map each year-labelled column of ``df`` to its positional index.

    A column counts as a year column when its label is an integer (Python or
    numpy) or a string made up only of decimal digits, and the resulting
    number is at least ``start_year``.

    Args:
        df: DataFrame whose column labels are inspected.
        start_year: Earliest year to include. Defaults to START_YEAR.

    Returns:
        dict mapping year (int) to the zero-based position of that column.
        If two labels resolve to the same year, the later column wins.
    """
    import numbers  # local import keeps this cell self-contained

    if start_year is None:
        start_year = START_YEAR

    yearColumns = {}
    for idx, col in enumerate(df.columns.values):
        year = None
        if isinstance(col, numbers.Integral) and not isinstance(col, bool):
            # numbers.Integral also matches numpy integer labels, which an
            # all-integer header produces and which are not `int` instances.
            year = int(col)
        elif isinstance(col, str) and col.isdecimal():
            # isdecimal() only accepts characters int() can parse; isdigit()
            # also accepts e.g. superscript digits, on which int() raises.
            year = int(col)
        if year is not None and year >= start_year:
            yearColumns[year] = idx
    return yearColumns

        
# States population from file
# Code reflects layout of data in the file
population_file = os.path.join("..", "InputData", "nst-est2018-01.xlsx")

# Skip first 3 rows, they do not contain data that is used
population_df = pd.read_excel(population_file, sheet_name='NST01', skiprows=3)

# State name in first column. Assign a fresh list of labels instead of
# writing into `columns.values`: an Index is meant to be immutable, and
# editing its backing array in place is not a supported way to rename.
population_df.columns = ["state"] + list(population_df.columns[1:])

# Clean up the merge key: trim surrounding whitespace and any leading "."
# NOTE(review): Census NST-EST sheets appear to prefix state rows with "."
# (e.g. ".Alabama"), which would prevent a match - confirm against the workbook.
population_df["state"] = population_df["state"].map(
    lambda value: value.strip().lstrip(".").strip() if isinstance(value, str) else value
)

# States from database
states_df = get_states()

# Merge data - inner join, so only names present on both sides are kept
merged_df = pd.merge(states_df, population_df, left_on='name', right_on='state')

# Positions of the year columns in the population sheet (the cell's output)
getColLocation(population_df)


{2015: 8, 2016: 9, 2017: 10, 2018: 11}