# Tables for covariates
On this code, I select varibles from datasets downloaded from Gloogle Cloud

In [37]:
import pandas as pd
import glob

## 1. ACS 1-year



### 1.1. Make table with all ACS-1 year
#### list all available files and paths

In [38]:
# all ACS files downloaded
f = [file[-12:] for file in glob.glob("../input2/ACS_*")]
f.sort()
f

['ACS_2010.csv',
 'ACS_2011.csv',
 'ACS_2012.csv',
 'ACS_2013.csv',
 'ACS_2014.csv',
 'ACS_2015.csv',
 'ACS_2016.csv',
 'ACS_2017.csv',
 'ACS_2018.csv']

In [39]:
all_files = glob.glob("../input2/ACS_*")
all_files.sort()
all_files

['../input2/ACS_2010.csv',
 '../input2/ACS_2011.csv',
 '../input2/ACS_2012.csv',
 '../input2/ACS_2013.csv',
 '../input2/ACS_2014.csv',
 '../input2/ACS_2015.csv',
 '../input2/ACS_2016.csv',
 '../input2/ACS_2017.csv',
 '../input2/ACS_2018.csv']

#### check missing variables accross files

The columns (variables) from imported files are not the same, as seen bellow. 

In [40]:
# number of columns for each file
dic = dict()
for f in all_files:
    dic.update({f[-12:-4]:len(pd.read_csv(f, index_col=0,nrows=1).columns)})
df = pd.DataFrame.from_dict(dic, orient='index').reset_index()
df.columns = ['file','n. variables']
df

Unnamed: 0,file,n. variables
0,ACS_2010,247
1,ACS_2011,252
2,ACS_2012,252
3,ACS_2013,252
4,ACS_2014,252
5,ACS_2015,246
6,ACS_2016,252
7,ACS_2017,252
8,ACS_2018,252


#### Investigate variables left out

In [41]:
# list with columns for each df
cols = [list(pd.read_csv(f, index_col=0,nrows=1).columns) for f in all_files]

In [42]:
# lengths do not match
[len(col) for col in cols], 

([247, 252, 252, 252, 252, 246, 252, 252, 252],)

In [43]:
all_files[-1]

'../input2/ACS_2018.csv'

In [44]:
# check variables left out
# I will use for reference last df (2018)
reference_col = set(cols[-1])
# variables missing (relative to last dataframe)
[reference_col - set(c) for c in cols]

[{'armed_forces',
  'civilian_labor_force',
  'employed_pop',
  'not_in_labor_force',
  'pop_16_over',
  'pop_in_labor_force',
  'unemployed_pop'},
 set(),
 set(),
 set(),
 set(),
 {'pop_15_and_over',
  'pop_divorced',
  'pop_never_married',
  'pop_now_married',
  'pop_separated',
  'pop_widowed'},
 set(),
 set(),
 set()]

Conclusion: data for 2010 and 2015 have missing columns. As I am not using 2010, for now, I will keep employment and unemployment, and drop variables missing on 2015 from concatenated final table.

In [45]:
# remove 2010 form files list
all_files.remove('../input2/ACS_2010.csv')
all_files

['../input2/ACS_2011.csv',
 '../input2/ACS_2012.csv',
 '../input2/ACS_2013.csv',
 '../input2/ACS_2014.csv',
 '../input2/ACS_2015.csv',
 '../input2/ACS_2016.csv',
 '../input2/ACS_2017.csv',
 '../input2/ACS_2018.csv']

#### concatenate files
I will concatenate files for 2011 to 2018, and drop columns not available for 2015. I will also drop geoids that are not FIPS.

In [46]:
# concatenate all files
df = pd.concat((pd.read_csv(f, index_col=0) for f in all_files))
df

Unnamed: 0,geo_id,nonfamily_households,family_households,median_year_structure_built,rent_burden_not_computed,rent_over_50_percent,rent_40_to_50_percent,rent_35_to_40_percent,rent_30_to_35_percent,rent_25_to_30_percent,...,speak_only_english_at_home,speak_spanish_at_home,speak_spanish_at_home_low_english,pop_15_and_over,pop_never_married,pop_now_married,pop_separated,pop_widowed,pop_divorced,do_date
0,26055,11109.0,22290.0,1985.0,151.0,2346.0,597.0,409.0,702.0,1082.0,...,,,,73364.0,19754.0,38686.0,723.0,3608.0,10332.0,2011
1,37035,16362.0,40016.0,1982.0,1425.0,3519.0,753.0,1330.0,1583.0,1184.0,...,,,,124575.0,35123.0,64074.0,2925.0,8574.0,13266.0,2011
2,40131,7143.0,26362.0,1988.0,600.0,954.0,497.0,436.0,676.0,945.0,...,,,,69611.0,12467.0,45910.0,1571.0,3511.0,6146.0,2011
3,41071,9191.0,23487.0,1984.0,723.0,2363.0,669.0,924.0,765.0,1494.0,...,,,,80186.0,24263.0,40392.0,2172.0,4641.0,7684.0,2011
4,36091,32017.0,58213.0,1979.0,538.0,4705.0,1651.0,2669.0,1543.0,2463.0,...,,,,181368.0,49757.0,100158.0,4914.0,9222.0,16547.0,2011
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
833,51199,8233.0,16763.0,1989.0,280.0,983.0,677.0,358.0,1192.0,844.0,...,,,,,,,,,,2018
834,41003,14720.0,20800.0,1978.0,798.0,5402.0,826.0,685.0,940.0,1137.0,...,,,,,,,,,,2018
835,4003,18646.0,31105.0,1985.0,1151.0,2766.0,1024.0,989.0,1329.0,1345.0,...,,,,,,,,,,2018
836,53029,10416.0,24156.0,1985.0,533.0,1670.0,929.0,619.0,955.0,888.0,...,,,,,,,,,,2018


In [47]:
# save local
df.to_csv('../output/tbl_ACS.csv', index=False)

### 1.2. Choose variables

Bellow I list all the variables available. I have edited the output form `df.columns` to manually organize subjects.

In [48]:
# copy/paste the results from df.columns to manually organize subjects, as bellow

variables = ['geo_id',
 
 # population age/sex
 'total_pop',
 'median_age',
 'not_us_citizen_pop',
 
 'population_1_year_and_over',
 'population_3_years_over',
 'pop_5_years_over',
 'pop_15_and_over',
 'pop_25_years_over',
 'pop_25_64',    # labour age

 'male_pop',
 'male_under_5',
 'male_5_to_9',
 'male_10_to_14',
 'male_15_to_17',
 'male_18_to_19',
 'male_20',
 'male_21',
 'male_22_to_24',
 'male_25_to_29',
 'male_30_to_34',
 'male_35_to_39',
 'male_40_to_44',
 'male_45_to_49',
 'male_50_to_54',
 'male_55_to_59',
 'male_60_61',
 'male_62_64',
 'male_65_to_66',
 'male_67_to_69',
 'male_70_to_74',
 'male_75_to_79',
 'male_80_to_84',
 'male_85_and_over',
 'male_45_to_64',   # on highest income bin
 
 'female_pop',
 'female_under_5',
 'female_5_to_9',
 'female_10_to_14',
 'female_15_to_17',
 'female_18_to_19',
 'female_20',
 'female_21',
 'female_22_to_24',
 'female_25_to_29',
 'female_30_to_34',
 'female_35_to_39',
 'female_40_to_44',
 'female_45_to_49',
 'female_50_to_54',
 'female_55_to_59',
 'female_60_to_61',
 'female_62_to_64',
 'female_65_to_66',
 'female_67_to_69',
 'female_70_to_74',
 'female_75_to_79',
 'female_80_to_84',
 'female_85_and_over',
 
 'children',
 
 # household composition
 'households',
             
 'nonfamily_households',
 'family_households',
 'married_households',
            
 'female_female_households',
 'male_male_households',
             
 'children_in_single_female_hh',
             
 'families_with_young_children',
 'one_parent_families_with_young_children',
 'two_parent_families_with_young_children',
             
 'two_parents_father_in_labor_force_families_with_young_children',
 'two_parents_mother_in_labor_force_families_with_young_children',
 'two_parents_in_labor_force_families_with_young_children',
 'two_parents_not_in_labor_force_families_with_young_children',
 
 'father_in_labor_force_one_parent_families_with_young_children',
 'father_one_parent_families_with_young_children',

 # income
 'median_income',
 'income_per_capita',
 'households_retirement_income',
 'income_less_10000',
 'income_10000_14999',
 'income_15000_19999',
 'income_20000_24999',
 'income_25000_29999',
 'income_30000_34999',
 'income_35000_39999',
 'income_40000_44999',
 'income_45000_49999',
 'income_50000_59999',
 'income_60000_74999',
 'income_75000_99999',
 'income_100000_124999',
 'income_125000_149999',
 'income_150000_199999',
 'income_200000_or_more',
   
 # schooling 
 'in_grades_1_to_4',
 'in_grades_5_to_8',
 'in_grades_9_to_12',
 'in_school',
 'in_undergrad_college',
 
 'less_than_high_school_graduate',
 'high_school_diploma',
 'some_college_and_associates_degree',
 'associates_degree',
 'bachelors_degree',
 'masters_degree',
 'graduate_professional_degree',

 'bachelors_degree_2',
 'bachelors_degree_or_higher_25_64',
 'high_school_including_ged',
 
 'less_one_year_college', # college only 
 'one_year_more_college', # college only 
 
 # sex/age/schooling - males only
 'male_45_64_associates_degree',
 'male_45_64_bachelors_degree',
 'male_45_64_graduate_degree',
 'male_45_64_less_than_9_grade',
 'male_45_64_grade_9_12',
 'male_45_64_high_school',
 'male_45_64_some_college',
 # male_45_64 (already on 'demograhics')
 
 # employment/occupation SECTOR
 'employed_agriculture_forestry_fishing_hunting_mining',
 'employed_arts_entertainment_recreation_accommodation_food',
 'employed_construction',
 'employed_education_health_social',
 'employed_finance_insurance_real_estate',
 'employed_information',
 'employed_manufacturing',
 'employed_other_services_not_public_admin',
 'employed_public_administration',
 'employed_retail_trade',
 'employed_science_management_admin_waste',
 'employed_transportation_warehousing_utilities',
 'employed_wholesale_trade',
 
 'occupation_management_arts',
 'occupation_natural_resources_construction_maintenance',
 'occupation_production_transportation_material',
 'occupation_sales_office',
 'occupation_services',
 
 'management_business_sci_arts_employed',
 'sales_office_employed',
 
 'worked_at_home',
 'workers_16_and_over',
 
 # inequality / poverty
 'gini_index',
 'households_public_asst_or_food_stamps',
 'poverty',
 'pop_determined_poverty_status', 
 
 # marital status
 'pop_never_married',
 'pop_now_married',
 'pop_separated',
 'pop_widowed',
 'pop_divorced',
 #'married_households', # added to 'households'
 
 # race and race/age            
 'white_pop',
 'amerindian_pop',
 'asian_pop',
 'black_pop',
 'hispanic_pop', 
 'other_race_pop',
 'not_hispanic_pop',
 'two_or_more_races_pop',
 
 'hispanic_any_race',
 'amerindian_including_hispanic',
 'asian_including_hispanic', 
 'black_including_hispanic',
 'white_including_hispanic',

 'asian_male_45_54',
 'asian_male_55_64',
 'black_male_45_54',
 'black_male_55_64', 
 'hispanic_male_45_54',
 'hispanic_male_55_64',
 'white_male_45_54',
 'white_male_55_64',
    
 'speak_only_english_at_home',
 'speak_spanish_at_home',
 'speak_spanish_at_home_low_english',
             
  # commute/accessibility
 'aggregate_travel_time_to_work',

 'commute_5_9_mins',
 'commute_35_39_mins',
 'commute_40_44_mins',
 'commute_60_89_mins',
 'commute_90_more_mins',
 
 'commute_less_10_mins',
 'commute_35_44_mins',
 'commute_60_more_mins',
 
 'commuters_by_public_transportation',
 'commuters_by_bus',
 'commuters_by_car_truck_van',
 'commuters_by_carpool',
 'commuters_by_subway_or_elevated',
 'commuters_drove_alone',
 'walked_to_work',
             
 'commute_10_14_mins',
 'commute_15_19_mins',
 'commute_20_24_mins',
 'commute_25_29_mins',
 'commute_30_34_mins',
 'commute_45_59_mins',
 
 'commuters_16_over', # commuters not children

 'no_car',
 'no_cars',
 'one_car',
 'two_cars',
 'three_cars',
 'four_more_cars',
            
 #------------------------------------------------------------------------------------------
  # housing
 #------------------------------------------------------------------------------------------
 'housing_units',
 'million_dollar_housing_units',
 'mortgaged_housing_units',
 'median_year_structure_built',
 
 # housing structure
 'dwellings_1_units_detached',
 'dwellings_1_units_attached',
 'dwellings_2_units',
 'dwellings_3_to_4_units',
 'dwellings_5_to_9_units',
 'dwellings_10_to_19_units',
 'dwellings_20_to_49_units',
 'dwellings_50_or_more_units',
 
 'housing_built_2005_or_later',
 'housing_built_2000_to_2004',
 'housing_built_1939_or_earlier',    
             
 # occupancy/vacancy
 'occupied_housing_units',
 'housing_units_renter_occupied',
 'owner_occupied_housing_units',
 'owner_occupied_housing_units_lower_value_quartile',
 'owner_occupied_housing_units_median_value',
 'owner_occupied_housing_units_upper_value_quartile',
 'vacant_housing_units',
 'vacant_housing_units_for_rent',
 'vacant_housing_units_for_sale',

 #other dwellings         
 'group_quarters',  
 'mobile_homes',
 
 # rental burden
 'median_rent',
 'percent_income_spent_on_rent',
 'rent_10_to_15_percent',
 'rent_15_to_20_percent',
 'rent_20_to_25_percent',
 'rent_25_to_30_percent',
 'rent_30_to_35_percent',
 'rent_35_to_40_percent',
 'rent_40_to_50_percent',
 'rent_burden_not_computed',
             
 'rent_over_50_percent',
 'rent_under_10_percent',
             
 'renter_occupied_housing_units_paying_cash_median_gross_rent',
  
 # movers           
 'different_house_year_ago_different_city',
 'different_house_year_ago_same_city',
           
 #year
 'do_date']

#### list chosen variables
These are the variables I chose from ACS-1. I will use them either as **covariates** or for **descriptive statistics**.

In [49]:
var_chosen = [
  
 'geo_id',   # FIPS
 'do_date', # year
    
 ### size/scale
 'total_pop', 
    
 ### Age structure 
 'pop_25_years_over',  # subtract from 'pop_25_64' to get elderly
 'pop_25_64',          # labour age  
  #'median_age',

 # cosmopolitan county?
 'not_us_citizen_pop', # how international
    
 # income distribution: later will bin, cutting on 25k, 50k, 100k
 'income_less_10000',
 'income_10000_14999',
 'income_15000_19999',
 'income_20000_24999',
 'income_25000_29999',
 'income_30000_34999',
 'income_35000_39999',
 'income_40000_44999',
 'income_45000_49999',
 'income_50000_59999',
 'income_60000_74999',
 'income_75000_99999',
 'income_100000_124999',
 'income_125000_149999',
 'income_150000_199999',
 'income_200000_or_more',
    
# employment
 'employed_pop',
 'employed_arts_entertainment_recreation_accommodation_food',
 'employed_finance_insurance_real_estate',
 'employed_information',
 'employed_public_administration',
 'employed_science_management_admin_waste',

 # schooling
 'bachelors_degree_or_higher_25_64',

 # indicator for central/suburban/residential area
 'aggregate_travel_time_to_work',
    
# movers           
 'different_house_year_ago_different_city',

 # for descriptive statistics
 'male_pop',
 'female_pop',
 'asian_including_hispanic', 
 'black_including_hispanic',
 'white_including_hispanic',
]

In [50]:
df = pd.read_csv('../output/tbl_ACS.csv')

In [51]:
df = df[var_chosen]
df.head(3)

Unnamed: 0,geo_id,do_date,total_pop,pop_25_years_over,pop_25_64,not_us_citizen_pop,income_less_10000,income_10000_14999,income_15000_19999,income_20000_24999,...,employed_public_administration,employed_science_management_admin_waste,bachelors_degree_or_higher_25_64,aggregate_travel_time_to_work,different_house_year_ago_different_city,male_pop,female_pop,asian_including_hispanic,black_including_hispanic,white_including_hispanic
0,26055,2011,88349.0,62370.0,49054.0,1351.0,1521.0,2099.0,2817.0,2178.0,...,1155.0,4066.0,15372.0,853610.0,11227.0,43962.0,44387.0,565.0,1976.0,83703.0
1,37035,2011,154181.0,104803.0,82218.0,7853.0,3723.0,3563.0,2968.0,4491.0,...,1834.0,4620.0,18739.0,1395380.0,13511.0,75406.0,78775.0,4709.0,13430.0,127753.0
2,40131,2011,87706.0,57866.0,45681.0,1086.0,1537.0,1492.0,1347.0,1961.0,...,2047.0,3130.0,12354.0,957410.0,,43259.0,44447.0,1120.0,1144.0,66501.0


In [52]:
# % missing for variables with NAs
100 * df.isna().sum().sort_values(ascending=False)[:15]/len(df)

different_house_year_ago_different_city                      26.163579
employed_finance_insurance_real_estate                       14.670884
employed_science_management_admin_waste                      14.670884
employed_arts_entertainment_recreation_accommodation_food    14.670884
employed_information                                         14.670884
employed_public_administration                               14.670884
black_including_hispanic                                      5.061003
white_including_hispanic                                      5.061003
asian_including_hispanic                                      5.061003
not_us_citizen_pop                                            2.364814
pop_25_years_over                                             0.346438
bachelors_degree_or_higher_25_64                              0.105438
aggregate_travel_time_to_work                                 0.105438
pop_25_64                                                     0.105438
total_

In [53]:
# number missing for a specific variable, per year
df.do_date[df.employed_finance_insurance_real_estate.isna()].value_counts()

2011    147
2017    141
2018    139
2016    116
2013    116
2015    110
2014    104
2012    101
Name: do_date, dtype: int64

Despite over 14% of NAs for employemnt by sector, the missing data is scattered accross years. I will leave any adjustments for later, after I select only the FIPS needed.

### 1.3  Edit columns

#### 1.3.1. income - reduce number of categories

In [54]:
income_cols = df.columns[df.columns.str.contains('income')]
income_cols 

Index(['income_less_10000', 'income_10000_14999', 'income_15000_19999',
       'income_20000_24999', 'income_25000_29999', 'income_30000_34999',
       'income_35000_39999', 'income_40000_44999', 'income_45000_49999',
       'income_50000_59999', 'income_60000_74999', 'income_75000_99999',
       'income_100000_124999', 'income_125000_149999', 'income_150000_199999',
       'income_200000_or_more'],
      dtype='object')

In [55]:
# concat 'df.' and use it to copy/paste for condensed categories bellow
pd.Series.add_prefix(income_cols.to_series(), 'df.').index

Index(['df.income_less_10000', 'df.income_10000_14999',
       'df.income_15000_19999', 'df.income_20000_24999',
       'df.income_25000_29999', 'df.income_30000_34999',
       'df.income_35000_39999', 'df.income_40000_44999',
       'df.income_45000_49999', 'df.income_50000_59999',
       'df.income_60000_74999', 'df.income_75000_99999',
       'df.income_100000_124999', 'df.income_125000_149999',
       'df.income_150000_199999', 'df.income_200000_or_more'],
      dtype='object')

In [56]:
# total number of income respondents - sum all income categories for each row
income_respondents = df[df.columns[df.columns.str.contains('income')]].sum(axis=1)

# condensed categories
df['income_less_25k']  = (df.income_less_10000  + 
                          df.income_10000_14999 + 
                          df.income_15000_19999 + 
                          df.income_20000_24999)/income_respondents 
            
df['income_25k_50k']   = (df.income_25000_29999 +
                          df.income_30000_34999 +
                          df.income_35000_39999 +
                          df.income_40000_44999 +
                          df.income_45000_49999)/income_respondents 
            
df['income_50k_100k']  = (df.income_50000_59999 +
                          df.income_60000_74999 +
                          df.income_75000_99999)/income_respondents 
               
df['income_100k_plus'] = (df.income_100000_124999 +
                          df.income_125000_149999 +
                          df.income_150000_199999 +
                          df.income_200000_or_more)/income_respondents 

In [57]:
df.drop(income_cols,1, inplace = True)

#### 1.3.2.  Age - make '65 plus' column and show percentages

In [58]:
df.total_pop
    
### Age structure 
df['age_65plus'] = (df.pop_25_years_over - df.pop_25_64)/df.total_pop
df['age_25_64']= df.pop_25_64/df.total_pop
df['bachelors'] = df.bachelors_degree_or_higher_25_64/df.pop_25_64
df.drop(['pop_25_years_over','bachelors_degree_or_higher_25_64'],1, inplace = True)

#### 1.3.2.  Employment per sector - percentages


In [59]:
employed_cols = df.columns[df.columns.str.contains('employed')].to_list()
employed_cols.remove('employed_pop')
employed_cols

['employed_arts_entertainment_recreation_accommodation_food',
 'employed_finance_insurance_real_estate',
 'employed_information',
 'employed_public_administration',
 'employed_science_management_admin_waste']

In [60]:
# percentages - divide all employment columns by total employment
df[employed_cols] = df[employed_cols].div(df.employed_pop, axis=0)


In [61]:
df.rename(columns={'employed_arts_entertainment_recreation_accommodation_food': 'emp_hospitality',
                   'employed_finance_insurance_real_estate': 'emp_finance', 
                   'employed_information':'emp_information',
                   'employed_public_administration':'emp_public_adm',
                   'employed_science_management_admin_waste':'emp_science_admin'},inplace=True)

#### 1.3.3. Race

In [62]:
# make percentages
df.asian_including_hispanic = df.asian_including_hispanic/df.total_pop
df.black_including_hispanic = df.black_including_hispanic/df.total_pop
df.white_including_hispanic = df.white_including_hispanic/df.total_pop


In [63]:
df.rename(columns={'asian_including_hispanic': 'asian',
                   'black_including_hispanic': 'black', 
                   'white_including_hispanic':'white',},inplace=True)

#### 1.3.4. Sex

In [64]:
# make percentages
df.male_pop = df.male_pop/df.total_pop
df.female_pop = df.female_pop/df.total_pop

In [65]:
df.rename(columns={'male_pop': 'male',
                   'female_pop': 'female',},inplace=True)

#### 1.3.5. Other columns to percentage

In [66]:
df.not_us_citizen_pop = df.not_us_citizen_pop/df.total_pop
df.employed_pop = df.employed_pop/df.pop_25_64
# movers           
df.different_house_year_ago_different_city = df.different_house_year_ago_different_city/df.total_pop
df.drop('pop_25_64',1, inplace=True)

In [67]:
df.rename(columns={'not_us_citizen_pop': 'not_us_citizen',
                   'employed_pop': 'employment',
                   'aggregate_travel_time_to_work':'commute_time',
                   'do_date':'year',
                   'total_pop':'population',
                   'different_house_year_ago_different_city':'immigration'},inplace=True)

### 1.4. add density and centroid coordinates 

In [68]:
df_area = pd.read_csv('../input2/UScounty_boundaries.csv', index_col=0)

df_area.rename(columns={'int_point_lat':'centroid_lat', 
                   'int_point_lon':'centroid_lon',},inplace=True)
df_area.head(3)
                      

Unnamed: 0,geo_id,area_land_meters,area_water_meters,centroid_lat,centroid_lon
0,1013,2012002531,2701198,31.751667,-86.681969
1,1059,1641841404,32643981,34.441989,-87.842814
2,1041,1576952799,5388562,31.732826,-86.319222


In [69]:
df = df.merge(df_area.drop('area_water_meters',1), how='left', on='geo_id')

In [70]:
df['density'] = df.population/df.area_land_meters
df.drop('area_land_meters',1, inplace=True)

### 1.5. Normalize remaining variables to  [0,1] range
Later, once we chose FIPS for analysis, I will normalize `Population`, `commute time` and `density` to [0,1], so units are similar to the other variables,  keeping originals (not normalized) for robustness check. I willleave code bellow for reference.

In [71]:
def maxmin_scale(s, smax=0, smin=1):
    s_std = (s - s.min()) / (s.max() - s.min())
    s = s_std * (smax - smin) + smin
    return s

In [72]:
df['population_norm'] = maxmin_scale(df.population)
df['commute_norm'] = maxmin_scale(df.commute_time)
df['density_norm'] = maxmin_scale(df.density)

### 1.6. save



In [73]:
# save local -  Name 'ready' for ready to use!
df.to_csv('../output/tbl_ACS_ready.csv', index=False)

In [75]:
# test reading it
f = pd.read_csv('../output/tbl_ACS_ready.csv')
f.head()

Unnamed: 0,geo_id,year,population,not_us_citizen,employment,emp_hospitality,emp_finance,emp_information,emp_public_adm,emp_science_admin,...,income_100k_plus,age_65plus,age_25_64,bachelors,centroid_lat,centroid_lon,density,population_norm,commute_norm,density_norm
0,26055,2011,88349.0,0.015292,0.853651,0.116943,0.080573,0.021994,0.027582,0.097099,...,0.191084,0.15072,0.55523,0.313369,44.718688,-85.553848,7.3e-05,0.997373,0.99642,0.997461
1,37035,2011,154181.0,0.050934,0.826765,0.07797,0.042074,0.01324,0.026981,0.067966,...,0.143567,0.146484,0.533256,0.227918,35.661883,-81.214906,0.000148,0.99086,0.992758,0.994821
2,40131,2011,87706.0,0.012382,0.880322,0.053812,0.047421,0.016089,0.050903,0.077834,...,0.206178,0.13893,0.520842,0.270441,36.377794,-95.601383,5e-05,0.997436,0.995719,0.998284
3,41071,2011,100000.0,0.05227,0.897886,0.100885,0.050126,0.00798,0.058216,0.094083,...,0.172165,0.13709,0.5108,0.198434,45.247827,-123.316399,5.4e-05,0.99622,0.99538,0.998149
4,36091,2011,220882.0,0.021699,0.911223,0.085904,0.085073,0.022258,0.078181,0.102075,...,0.279563,0.138984,0.555808,0.421266,43.106135,-73.855387,0.000105,0.984262,0.984308,0.996338


In [77]:
f.describe()

Unnamed: 0,geo_id,year,population,not_us_citizen,employment,emp_hospitality,emp_finance,emp_information,emp_public_adm,emp_science_admin,...,income_100k_plus,age_65plus,age_25_64,bachelors,centroid_lat,centroid_lon,density,population_norm,commute_norm,density_norm
count,6639.0,6639.0,6639.0,6482.0,6632.0,5665.0,5665.0,5665.0,5665.0,5665.0,...,6639.0,6611.0,6632.0,6632.0,6639.0,6639.0,6639.0,6639.0,6632.0,6639.0
mean,30534.095647,2014.513782,327623.6,0.044343,0.889347,0.096121,0.060532,0.01818,0.050512,0.100119,...,0.225644,0.152573,0.516849,0.295786,37.776895,-89.510628,0.000337,0.973702,0.975928,0.988169
std,16178.101571,2.291912,577491.5,0.039731,0.097308,0.027483,0.02247,0.008187,0.028269,0.033982,...,0.100351,0.041362,0.029954,0.109421,5.618361,15.119939,0.001319,0.057129,0.052816,0.046494
min,1003.0,2011.0,61792.0,0.000201,0.481008,0.028177,0.009004,0.001197,0.005537,0.023703,...,0.009595,0.062596,0.31376,0.067995,18.001717,-159.705965,1e-06,0.0,0.0,0.0
25%,17115.0,2013.0,94726.0,0.017293,0.830893,0.079731,0.044896,0.01276,0.032372,0.076653,...,0.153921,0.126189,0.502165,0.214281,34.277239,-94.888456,5.4e-05,0.974074,0.978142,0.992082
50%,34007.0,2015.0,155810.0,0.031718,0.893188,0.092042,0.057374,0.016966,0.042009,0.09511,...,0.203315,0.148584,0.520562,0.281161,38.82712,-85.353048,0.000104,0.990699,0.99124,0.996369
75%,42095.0,2017.0,323861.0,0.056642,0.945926,0.106857,0.072318,0.022139,0.060183,0.118011,...,0.276602,0.171286,0.535078,0.356591,41.549013,-79.640119,0.000226,0.996742,0.995943,0.998158
max,72139.0,2018.0,10170290.0,0.25453,1.354425,0.319878,0.237129,0.066441,0.220409,0.31825,...,0.662944,0.568169,0.66927,0.78713,64.676044,-65.968778,0.028365,1.0,1.0,1.0


------------
## 2. Air quality

##### load data

In [None]:
%ls ../input2/airquality*

In [None]:
df = pd.read_csv('../input2/airquality_2019.csv', index_col=0)

In [None]:
df.head(3)

##### add FIPS column

In [None]:
def add_FIPS(df):
    
    '''adds FIPS columns  to dataframe'''
    
    # new column with fips codes
    fips_S = [str(code).zfill(2) for code in df['state_code']]
    fips_C = [str(code1).zfill(3) for code1 in df['county_code']] 
    loc = df.columns.get_loc('county_code')
    df.insert(loc=loc, column='FIPS', value=[S+C for S,C in zip(fips_S, fips_C)]) 
    df.drop(['state_code','county_code'],1, inplace=True) 
    

In [None]:
add_FIPS(df)

-------
### Variables available
Now I explore the variables from `airquality` dataset, in order to chose some to use.

##### number of variables and counties

In [None]:
print(f'variables: {len(df.parameter_name.unique())}')
print(f'counties: {len(df.FIPS.unique())}')

In [None]:
# function to list variables containing a specific string
def check_variables(string):
    data = df.parameter_name[df.parameter_name.str.contains(string)].unique()
    return pd.DataFrame(data=data)

##### temperature

In [None]:
check_variables('emperature')

##### wind

In [None]:
check_variables('ind')

##### carbon

In [None]:
check_variables('arbon')

##### particles

In [None]:
check_variables('article')

### Choose variables

In [None]:
# used code bellow to copy/paste variables to manually edit

#variables=[]
#for string in ['emperature', 'arbon', 'artic']:
#    variables.extend(list(df.parameter_name[df.parameter_name.str.contains(string)].unique()))
#variables    

In [None]:
variables = ['Outdoor Temperature',
     'Temperature Difference',
     'Indoor Temperature',
     'Average Ambient Temperature',
     'Ambient Max Temperature',
     'Ambient Min Temperature',
     'Carbon monoxide',
     'Carbon dioxide',
     'Total hydrocarbons',
     'Suspended particulate (TSP)',
     'Solar radiation',   # function of latitude, pollution and weather conditions
     'Wind Speed - Resultant',
     'Relative Humidity ',]