In [17]:
import pandas as pd
from sqlalchemy import create_engine
from census import Census
from config import census_api_key, mysql_pw
import requests
import json
import os
import gmaps
import csv

# Loading public health statistics for cities to dataframe

**Source:** Kaggle (https://www.kaggle.com/cdc/500-cities)

**Description:** This data was collected by Centers for Disease Control and Prevention, National Center for Chronic Disease Prevention and Health Promotion, Division of Population Health (2016)

**Data collection and preparation credit:** David Chen

In [14]:
csv_file = "500_Cities_CDC.csv"
old_cityhealth_data_df = pd.read_csv(csv_file)
old_cityhealth_data_df.head()

Unnamed: 0,StateAbbr,PlaceName,PlaceFIPS,Population2010,ACCESS2_CrudePrev,ACCESS2_Crude95CI,ACCESS2_AdjPrev,ACCESS2_Adj95CI,ARTHRITIS_CrudePrev,ARTHRITIS_Crude95CI,...,SLEEP_Adj95CI,STROKE_CrudePrev,STROKE_Crude95CI,STROKE_AdjPrev,STROKE_Adj95CI,TEETHLOST_CrudePrev,TEETHLOST_Crude95CI,TEETHLOST_AdjPrev,TEETHLOST_Adj95CI,Geolocation
0,AL,Birmingham,107000,212237,22.6,"(22.1, 23.0)",21.4,"(21.0, 21.8)",32.6,"(32.5, 32.8)",...,"(46.6, 47.0)",5.0,"( 5.0, 5.1)",5.0,"( 5.0, 5.1)",26.1,"(25.1, 27.2)",25.9,"(25.0, 26.9)","(33.52756637730, -86.7988174678)"
1,AL,Hoover,135896,81619,10.6,"(10.2, 11.1)",10.2,"( 9.7, 10.7)",26.3,"(26.0, 26.6)",...,"(34.2, 35.0)",2.3,"( 2.2, 2.3)",2.2,"( 2.1, 2.3)",9.6,"( 8.6, 10.8)",9.5,"( 8.5, 10.9)","(33.37676027290, -86.8051937568)"
2,AL,Huntsville,137000,180105,17.4,"(16.9, 17.8)",16.3,"(15.9, 16.7)",30.0,"(29.8, 30.2)",...,"(39.4, 40.0)",3.3,"( 3.3, 3.4)",3.2,"( 3.1, 3.2)",14.9,"(14.1, 15.7)",14.7,"(13.8, 15.5)","(34.69896926710, -86.6387042882)"
3,AL,Mobile,150000,195111,20.0,"(19.6, 20.4)",19.1,"(18.7, 19.5)",33.1,"(32.9, 33.2)",...,"(42.0, 42.4)",4.3,"( 4.3, 4.4)",4.1,"( 4.0, 4.1)",24.3,"(23.4, 25.3)",24.1,"(23.1, 25.0)","(30.67762486480, -88.1184482714)"
4,AL,Montgomery,151000,205764,19.7,"(19.2, 20.2)",18.5,"(18.1, 19.0)",31.0,"(30.8, 31.2)",...,"(41.0, 41.5)",4.0,"( 3.9, 4.1)",4.1,"( 4.0, 4.1)",21.2,"(20.3, 22.2)",21.2,"(20.1, 22.2)","(32.34726453330, -86.2677059552)"


In [16]:
city_health_data_df = old_cityhealth_data_df[['StateAbbr', 'PlaceName', 'PlaceFIPS', 'Geolocation','Population2010','ARTHRITIS_CrudePrev']].copy()
city_health_data_df.head()

Unnamed: 0,StateAbbr,PlaceName,PlaceFIPS,Geolocation,Population2010,ARTHRITIS_CrudePrev
0,AL,Birmingham,107000,"(33.52756637730, -86.7988174678)",212237,32.6
1,AL,Hoover,135896,"(33.37676027290, -86.8051937568)",81619,26.3
2,AL,Huntsville,137000,"(34.69896926710, -86.6387042882)",180105,30.0
3,AL,Mobile,150000,"(30.67762486480, -88.1184482714)",195111,33.1
4,AL,Montgomery,151000,"(32.34726453330, -86.2677059552)",205764,31.0


# Loading Census Data

**Source:** Census API (https://api.census.gov)

**Description:** This data was called by county FIPS for all
**Data collection and preparation credit:** David Chen

In [18]:
url = "https://api.census.gov/data/2017/acs/acs5?get=B01003_001E,B01001_020E,B01001_021E,B01001_022E,B01001_023E,B01001_024E,B01001_025E,B01001_044E,B01001_045E,B01001_046E,B01001_047E,B01001_048E,B01001_049E,B02001_002E,B02001_003E,B02001_004E,B02001_005E,B02001_006E,B02001_007E,B01001I_001E,B09020_001E,B15003_002E,B15003_017E,B15003_022E,B15003_023E,B15003_025E,B17001_002E,B27001_002E,B27001_030E,NAME&for=place:*&key=" + census_api_key

response = requests.get(url)
response_json = response.json()

census_df = pd.DataFrame(response_json)

census_df.columns = census_df.iloc[0]
census_df = census_df[1:]

census_df.head()

Unnamed: 0,B01003_001E,B01001_020E,B01001_021E,B01001_022E,B01001_023E,B01001_024E,B01001_025E,B01001_044E,B01001_045E,B01001_046E,...,B15003_017E,B15003_022E,B15003_023E,B15003_025E,B17001_002E,B27001_002E,B27001_030E,NAME,state,place
1,174,0,0,0,0,0,0,0,0,0,...,0,0,0,0,45,69,105,"Abanda CDP, Alabama",1,100
2,2594,51,40,35,60,40,29,8,27,109,...,567,112,112,4,500,1248,1164,"Abbeville city, Alabama",1,124
3,4404,49,67,84,46,67,0,25,65,201,...,755,283,103,38,701,2063,2326,"Adamsville city, Alabama",1,460
4,725,15,12,38,13,5,0,24,14,29,...,171,59,38,0,246,385,340,"Addison town, Alabama",1,484
5,318,0,5,2,0,1,6,2,2,14,...,52,5,1,0,155,139,179,"Akron town, Alabama",1,676


In [19]:
column_names = {"B01003_001E":"total_pop", "B01001_020E":"male_65_66", "B01001_021E":"male_67_69",
                "B01001_022E":"male_70_74", "B01001_023E":"male_75_79", "B01001_024E":"male_80_84",
                "B01001_025E":"male_over_85", "B01001_044E":"female_65_66", "B01001_045E":"female_67_69",
                "B01001_046E":"female_70_74", "B01001_047E":"female_75_79", "B01001_048E":"female_80_84",
                "B01001_049E":"female_over_85", "B02001_002E":"white_pop", "B02001_003E":"black_pop",
                "B02001_004E":"native_amer_pop", "B02001_005E":"asian_pop", "B02001_006E":"pac_island_pop",
                "B02001_007E":"other_race_pop", "B01001I_001E":"hispanic_pop", "B15003_002E":"no_high_school",
                "B15003_017E":"high_school_grad", "B15003_022E":"bachelor_deg", "B15003_023E":"master_deg", 
                "B15003_025E":"doctorate_deg", "B17001_002E":"below_poverty", "B27001_002E":"male_w_health_ins",
                "B27001_030E":"female_w_health_ins", "place":"city_FIPS"}

census_df.rename(columns=column_names, 
                 inplace=True)


In [20]:
for column in census_df.iloc[:, 0:29]:
    census_df[column] = pd.to_numeric(census_df[column])
    
census_df["with_health_ins"] = census_df["male_w_health_ins"] + census_df["female_w_health_ins"]
census_df.head()

Unnamed: 0,total_pop,male_65_66,male_67_69,male_70_74,male_75_79,male_80_84,male_over_85,female_65_66,female_67_69,female_70_74,...,bachelor_deg,master_deg,doctorate_deg,below_poverty,male_w_health_ins,female_w_health_ins,NAME,state,city_FIPS,with_health_ins
1,174,0,0,0,0,0,0,0,0,0,...,0,0,0,45,69,105,"Abanda CDP, Alabama",1,100,174
2,2594,51,40,35,60,40,29,8,27,109,...,112,112,4,500,1248,1164,"Abbeville city, Alabama",1,124,2412
3,4404,49,67,84,46,67,0,25,65,201,...,283,103,38,701,2063,2326,"Adamsville city, Alabama",1,460,4389
4,725,15,12,38,13,5,0,24,14,29,...,59,38,0,246,385,340,"Addison town, Alabama",1,484,725
5,318,0,5,2,0,1,6,2,2,14,...,5,1,0,155,139,179,"Akron town, Alabama",1,676,318


In [21]:
census_df["city_FIPS"] = census_df["state"] + census_df["city_FIPS"]
census_df.head()

Unnamed: 0,total_pop,male_65_66,male_67_69,male_70_74,male_75_79,male_80_84,male_over_85,female_65_66,female_67_69,female_70_74,...,bachelor_deg,master_deg,doctorate_deg,below_poverty,male_w_health_ins,female_w_health_ins,NAME,state,city_FIPS,with_health_ins
1,174,0,0,0,0,0,0,0,0,0,...,0,0,0,45,69,105,"Abanda CDP, Alabama",1,100100,174
2,2594,51,40,35,60,40,29,8,27,109,...,112,112,4,500,1248,1164,"Abbeville city, Alabama",1,100124,2412
3,4404,49,67,84,46,67,0,25,65,201,...,283,103,38,701,2063,2326,"Adamsville city, Alabama",1,100460,4389
4,725,15,12,38,13,5,0,24,14,29,...,59,38,0,246,385,340,"Addison town, Alabama",1,100484,725
5,318,0,5,2,0,1,6,2,2,14,...,5,1,0,155,139,179,"Akron town, Alabama",1,100676,318


In [22]:
census_df["pop_over_65"] = census_df.iloc[:,1:13].sum(axis=1)
census_df["with_degree"] = census_df.iloc[:, 23:26].sum(axis=1)

In [23]:
census_demographics_df=census_df[["total_pop", "with_health_ins", "pop_over_65", "white_pop", "black_pop", "native_amer_pop",
                                "asian_pop", "pac_island_pop", "other_race_pop", "hispanic_pop", "no_high_school",
                                "high_school_grad", "with_degree", "below_poverty", "city_FIPS"]]

census_demographics_df.head()

Unnamed: 0,total_pop,with_health_ins,pop_over_65,white_pop,black_pop,native_amer_pop,asian_pop,pac_island_pop,other_race_pop,hispanic_pop,no_high_school,high_school_grad,with_degree,below_poverty,city_FIPS
1,174,174,0,150,24,0,0,0,0,0,0,0,0,45,100100
2,2594,2412,640,1463,1083,0,0,0,14,117,34,567,228,500,100124
3,4404,4389,814,2077,2284,0,0,0,7,7,23,755,424,701,100460
4,725,725,174,685,0,3,0,0,0,0,5,171,97,246,100484
5,318,318,34,70,248,0,0,0,0,0,1,52,6,155,100676


In [24]:
for column in census_demographics_df.iloc[:, 1:14]:
    census_demographics_df[column] = census_demographics_df[column]/census_demographics_df.iloc[:,0]

census_demographics_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,total_pop,with_health_ins,pop_over_65,white_pop,black_pop,native_amer_pop,asian_pop,pac_island_pop,other_race_pop,hispanic_pop,no_high_school,high_school_grad,with_degree,below_poverty,city_FIPS
1,174,1.0,0.0,0.862069,0.137931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258621,100100
2,2594,0.929838,0.246723,0.563994,0.417502,0.0,0.0,0.0,0.005397,0.045104,0.013107,0.218581,0.087895,0.192753,100124
3,4404,0.996594,0.184832,0.471617,0.518619,0.0,0.0,0.0,0.001589,0.001589,0.005223,0.171435,0.096276,0.159173,100460
4,725,1.0,0.24,0.944828,0.0,0.004138,0.0,0.0,0.0,0.0,0.006897,0.235862,0.133793,0.33931,100484
5,318,1.0,0.106918,0.220126,0.779874,0.0,0.0,0.0,0.0,0.0,0.003145,0.163522,0.018868,0.487421,100676


# DataBase Creation and Upload

In [25]:
connection_string = f'root:{mysql_pw}@localhost/'
engine = create_engine(f'mysql://{connection_string}')

connection = engine.connect()

Exception during reset or similar
Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\sqlalchemy\pool.py", line 709, in _finalize_fairy
    fairy._reset(pool)
  File "C:\Anaconda\lib\site-packages\sqlalchemy\pool.py", line 880, in _reset
    pool._dialect.do_rollback(self)
  File "C:\Anaconda\lib\site-packages\sqlalchemy\dialects\mysql\base.py", line 1805, in do_rollback
    dbapi_connection.rollback()
MySQLdb._exceptions.OperationalError: (2006, 'MySQL server has gone away')


In [26]:
connection.execute('use city_health_demographics;')

<sqlalchemy.engine.result.ResultProxy at 0x23c0a4a46d8>

In [27]:
connection.execute('drop table city_demographics;')

<sqlalchemy.engine.result.ResultProxy at 0x23c06c92b70>

In [28]:
db_connection_string = f"root:{mysql_pw}@localhost/City_Health_Demographics"
db_engine = create_engine(f'mysql://{db_connection_string}')

In [None]:
cdc_data_df_update_column.to_sql(name='City_Health', con=db_engine, if_exists='append', index=True)

In [29]:
census_demographics_df.to_sql(name='City_Demographics', con=db_engine, if_exists='append', index=True)



In [44]:
# Confirm tables
db_engine.table_names()

['city_demographics', 'city_health']

In [43]:
db_engine.execute('''alter table city_demographics 
                     drop column index;''')

ProgrammingError: (MySQLdb._exceptions.ProgrammingError) (1064, "You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'index' at line 2") [SQL: 'alter table city_demographics \n                     drop column index;'] (Background on this error at: http://sqlalche.me/e/f405)

In [45]:
db_engine.execute('''create table city_health_demographics
                    select cd.*, ch.
                    from city_demographics cd
                    inner join city_health ch
                    on cd.city_FIPS = ch.PlaceFIPS;''')

OperationalError: (MySQLdb._exceptions.OperationalError) (1060, "Duplicate column name 'index'") [SQL: 'create table city_health_demographics\n                    select *\n                    from city_demographics cd\n                    inner join city_health ch\n                    on cd.city_FIPS = ch.PlaceFIPS;'] (Background on this error at: http://sqlalche.me/e/e3q8)