# Project Two
## Group 5: Brian, Josh, Jeff, Yuliya, Natalie
2016 Election and Police/Judicial Employment

In [172]:
# Import packages
import pandas as pd
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
import os

# (E) EXTRACTION

### Load in police data

In [173]:
# Define directory to police files
police_dir = "Resources/ucr-police-employee-data-2016-by-state-by-counties/"

# Find paths to all police files
police_paths = os.listdir(police_dir)

# Initialize df list
police_df_list = []

# Iterate through each path
for path in police_paths:
    # Identify state of origin
    state = pd.read_excel(police_dir + path, header = 0).iloc[0,0]
    
    # Load in data
    police_df = pd.read_excel(police_dir + path, header = 4)

    # Add state column
    police_df['State'] = state
    
    # Remove '\n' from column names
    police_df = police_df.rename(columns={'Metropolitan/Nonmetropolitan': 'Metropolitan',
                                          'Total law\nenforcement\nemployees': 'Total Law Enforcement Employees',
                                          'Total\nofficers': 'Total Officers',
                                          'Total\ncivilians': 'Total Civilians'})
    
    # Append to list
    police_df_list.append(police_df)

# Merge dataframes together
police_df = pd.concat(police_df_list, axis = 0)
police_df.head()

Unnamed: 0,Metropolitan,County,Total Law Enforcement Employees,Total Officers,Total Civilians,State
0,Metropolitan Counties,Anoka,256.0,129.0,127.0,MINNESOTA
1,,Benton,70.0,24.0,46.0,MINNESOTA
2,,Blue Earth,69.0,31.0,38.0,MINNESOTA
3,,Carlton,52.0,22.0,30.0,MINNESOTA
4,,Carver,148.0,77.0,71.0,MINNESOTA


### Load in voting and county data

In [174]:
# Create engine using the `demographics.sqlite` database file
engine = create_engine("sqlite:///Resources/archive/database.sqlite")

# Declare a Base using `automap_base()`
Base = automap_base()

# Use the Base class to reflect the database tables
Base.prepare(engine, reflect=True)

# Create a session
session = Session(engine)

# Create a connection
conn = engine.connect()

# Generate dataframe
voting_df = pd.read_sql_query('select * from primary_results', con = engine)
county_df = pd.read_sql_query('select * from county_facts', con = engine)
county_dict_df = pd.read_sql_query('select * from county_facts_dictionary', con = engine)

# (T) TRANSFORM

### Format Police Data

In [175]:
# Modify values in Metropolitan column to be more readable
police_df = police_df.drop('Metropolitan', axis = 1)

# Modify State to be first capitalized only
police_df['State'] = police_df['State'].str.capitalize()

# Change column names
police_df = police_df.rename({"State":"state_name", "County":"county",
                              "Total Law Enforcement Employees": "total_law_enforcement_employees",
                              "Total Officers": "total_officers",
                              "Total Civilians": "total_civilians"}, axis = 1)
police_df.head()

Unnamed: 0,county,total_law_enforcement_employees,total_officers,total_civilians,state_name
0,Anoka,256.0,129.0,127.0,Minnesota
1,Benton,70.0,24.0,46.0,Minnesota
2,Blue Earth,69.0,31.0,38.0,Minnesota
3,Carlton,52.0,22.0,30.0,Minnesota
4,Carver,148.0,77.0,71.0,Minnesota


### Format Voting Data

In [176]:
# Make column names consistent
voting_df = voting_df.rename(columns={'state': 'state_name'})

voting_df.head()

Unnamed: 0,state_name,state_abbreviation,county,fips,party,candidate,votes,fraction_votes
0,Alabama,AL,Autauga,1001,Democrat,Bernie Sanders,544,0.182
1,Alabama,AL,Autauga,1001,Democrat,Hillary Clinton,2387,0.8
2,Alabama,AL,Baldwin,1003,Democrat,Bernie Sanders,2694,0.329
3,Alabama,AL,Baldwin,1003,Democrat,Hillary Clinton,5290,0.647
4,Alabama,AL,Barbour,1005,Democrat,Bernie Sanders,222,0.078


### Format County Data

In [177]:
# Remove state and country rows
county_df = county_df[ county_df['state_abbreviation'] != "" ]

# Subset shared states with voting_df for mapping state names
shared_states = list(set(county_df['state_abbreviation']) & set(voting_df['state_abbreviation']))
county_df = county_df[county_df['state_abbreviation'].isin(shared_states)]

# Add full state name column
state_dict = dict(zip(voting_df['state_abbreviation'],voting_df['state_name']))
county_df['state_name'] = county_df['state_abbreviation'].map(state_dict)
county_df.head()

# Subset for columns of interest
columns = ['PST045214', 'POP010210', 'POP060210', 'LND110210', 'PST120214', 
           'EDU635213', 'EDU685213', 'INC910213', 'INC110213']
county_dict_df = county_dict_df[county_dict_df['column_name'].isin(columns)]
county_df = county_df.loc[:,county_df.columns.isin(['area_name', 'fips', 'state_abbreviation', 'State',
                                                    columns[0], columns[1], columns[2], columns[3], columns[4],
                                                    columns[5], columns[6], columns[7], columns[8]])]
county_df.head()

# Make shorter column name descriptions
county_dict_df['short_description'] = ['2014_pop', 'pct_pop_delt10to14', '2010_pop', 'pct_hs_grad', 'pct_post_bach_grad',
                                       'per_capita_income', 'median_household_income', 'sq_miles', 'pop_per_sq_mile']

# Map column names of county_df to short description
column_name_dict = dict(zip(county_dict_df['column_name'], county_dict_df['short_description']))
county_df = county_df.rename(column_name_dict, axis = 1)

# Remove "County" from county column and rename
county_df = county_df.rename({'area_name':'county'}, axis = 1)
county_df['county'] = county_df['county'].str.replace(" County", "")

In [178]:
county_df.head().to_csv('Results/county_df.csv')

In [179]:
police_df.head().to_csv('Results/police_df.csv')

In [180]:
county_dict_df.head().to_csv('Results/county_dict_df.csv')

In [181]:
voting_df.head().to_csv('Results/voting_df.csv')

In [182]:
county_df.head()

Unnamed: 0,fips,county,state_abbreviation,2014_pop,pct_pop_delt10to14,2010_pop,pct_post_hs_grad,pct_post_bach_grad,per_capita_income,median_household_income,sq_miles,pop_per_sq_mile
2,1001,Autauga,AL,55395,1.5,54571,85.6,20.9,24571,53682,594.44,91.8
3,1003,Baldwin,AL,200111,9.8,182265,89.1,27.7,26766,50221,1589.78,114.6
4,1005,Barbour,AL,26887,-2.1,27457,73.7,13.4,16829,32911,884.88,31.0
5,1007,Bibb,AL,22506,-1.8,22915,77.5,12.1,17427,36447,622.58,36.8
6,1009,Blount,AL,57719,0.7,57322,77.0,12.1,20730,44145,644.78,88.9


In [183]:
police_df.head()

Unnamed: 0,county,total_law_enforcement_employees,total_officers,total_civilians,state_name
0,Anoka,256.0,129.0,127.0,Minnesota
1,Benton,70.0,24.0,46.0,Minnesota
2,Blue Earth,69.0,31.0,38.0,Minnesota
3,Carlton,52.0,22.0,30.0,Minnesota
4,Carver,148.0,77.0,71.0,Minnesota


In [184]:
county_dict_df.head()

Unnamed: 0,column_name,description,short_description
0,PST045214,"Population, 2014 estimate",2014_pop
2,PST120214,"Population, percent change - April 1, 2010 to ...",pct_pop_delt10to14
3,POP010210,"Population, 2010",2010_pop
19,EDU635213,"High school graduate or higher, percent of per...",pct_post_hs_grad
20,EDU685213,"Bachelor's degree or higher, percent of person...",pct_post_bach_grad


In [185]:
voting_df.head()

Unnamed: 0,state_name,state_abbreviation,county,fips,party,candidate,votes,fraction_votes
0,Alabama,AL,Autauga,1001,Democrat,Bernie Sanders,544,0.182
1,Alabama,AL,Autauga,1001,Democrat,Hillary Clinton,2387,0.8
2,Alabama,AL,Baldwin,1003,Democrat,Bernie Sanders,2694,0.329
3,Alabama,AL,Baldwin,1003,Democrat,Hillary Clinton,5290,0.647
4,Alabama,AL,Barbour,1005,Democrat,Bernie Sanders,222,0.078
