In [1]:
# import statements
import pandas as pd
import numpy as np
from io import StringIO
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
# Display settings: raise pandas' row/column/width limits for tables shown in this notebook.
display_limits = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [3]:
# Load the aggregated, non-spatial TN dataset; the CSV's first column is used as the index.
tn_data = pd.read_csv(
    '../data/TN_aggregated_data.csv',
    index_col=[0],
)
# Preview the first rows to confirm the load worked.
tn_data.head()

FileNotFoundError: [Errno 2] File ../data/TN_aggregated_data.csv does not exist: '../data/TN_aggregated_data.csv'

In [None]:
# Summarize the loaded table: column names, dtypes, and non-null counts.
tn_data.info()

## Step 1. Identify numeric variables of interest that need to be "reverse-coded"  

LOWER NUMBERS = LOWER RISK/VULNERABILITY
HIGHER NUMBERS = HIGHER RISK/VULNERABILITY  

#### Variables that need to be created:  
tn_data['pop_density'] = tn_data['population_est'] / tn_data['area_miles2']   

tn_data['pct_edu_HSgrad'] = tn_data['edu_HSgrad'] / tn_data['population_est']   **percent with education above the "less than HS" level**  
tn_data['pct_edu_assoc'] = tn_data['edu_assoc'] / tn_data['population_est']  
tn_data['pct_edu_min_bac'] = tn_data['edu_min_bac'] / tn_data['population_est']  

**calculate the percentile rank for remaining edu variables; .rank(pct = True) ranks each county on a scale from 0 (lowest %) to 1 (highest %); after creating, these need to be flipped for analyses so that high values = high vulnerability**  
tn_data['pctrnk_edu_HSgrad'] = tn_data.pct_edu_HSgrad.rank(pct = True)    
tn_data['pctrnk_edu_assoc'] = tn_data.pct_edu_assoc.rank(pct = True)      
tn_data['pctrnk_edu_min_bac'] = tn_data.pct_edu_min_bac.rank(pct = True)    

**perform the same function as above for poverty bands using the variables below**  
tot_w_poverty_lvl_data  
tot_poverty_138_399  
tot_poverty_400_up 

#### Variables that need to be retrieved:


#### Variables that need to be "reverse-coded" (ranked in descending order so that high raw values = low risk):  
tn_data['pctrnk_edu_HSgrad']   
tn_data['pctrnk_edu_assoc']   
tn_data['pctrnk_edu_min_bac']   
tot_poverty_138_399  (as pct rank)  
tot_poverty_400_up  (as pct rank)  
gdp_2018  **(higher values should mean lower risk)**  
gdp_per_cap_2018  
depth_mean  **(lower values should mean higher risk)**  
depth_median  
depth_min

#### Variables going in the correct direction:  
**[higher number = higher risk]**  
total_housing_units_2019   
population_est    
socioeconomic  
household_comp_and_disability  
minority_status_and_language  
housing_type_and_transportation  
total_vulnerability  
pct_uninsured  
mag_mean  
mag_median  
mag_max  
num_earthquakes  
pct_prob_100y_median  
pct_prob_100y_max  

**[lower number = lower risk]**
we don't have any variables that fall into this category

#### Variables that will not be used in risk calculation equation (because they are already part of SVI or other components):  
pct_children_under_18  
pct_people_of_color    
pct_people_with_disabilities    
pct_senior_citizens  
pct_insured    **(use pct_uninsured instead)**  
race_afam  
race_asam  
race_latn  
race_natam  
race_paci  
race_white  
race_othr  
race_multi  
disability_y  
disability_n 
edu_lessHS  
edu_HSgrad  
edu_assoc  
edu_min_bac  
employed_est  
unemployed_est  
pct_people_living_in_rural_areas    **(leaving off because population density will be more useful for risk calc)**  
tot_poverty_less100    **(already present in SES data)**  
tot_poverty_101_138  

In [4]:
# Derived variable: population density = population_est divided by area_miles2.
tn_data['pop_density'] = tn_data['population_est'].div(tn_data['area_miles2'])

NameError: name 'tn_data' is not defined

In [5]:
# # (disabled) get percent rank in default order so that higher percentages correspond with higher risk rankings;
# # superseded by the reverse-ranked (ascending = False) education cell further down
# tn_data['pctrnk_edu_HSgrad'] = tn_data.pct_edu_HSgrad.rank(pct = True)    
# tn_data['pctrnk_edu_assoc'] = tn_data.pct_edu_assoc.rank(pct = True)      
# tn_data['pctrnk_edu_min_bac'] = tn_data.pct_edu_min_bac.rank(pct = True)

In [6]:
# # check output
# tn_data.head()

In [7]:
# Reverse percent rank for education: a larger share at an education level gets a
# smaller rank, so higher percentages correspond with lower risk rankings.
for edu_level in ('HSgrad', 'assoc', 'min_bac'):
    pct_col = 'pct_edu_' + edu_level
    # share of the population estimate counted at this education level
    tn_data[pct_col] = tn_data['edu_' + edu_level] / tn_data['population_est']
    tn_data['pctrnk_edu_' + edu_level] = tn_data[pct_col].rank(pct=True, ascending=False)

NameError: name 'tn_data' is not defined

In [8]:
# check output: preview the first rows to confirm the pct_edu_* / pctrnk_edu_* columns were added
tn_data.head()

NameError: name 'tn_data' is not defined

In [9]:
# Reverse percent rank for the two upper poverty bands: a larger share of people in a
# band gets a smaller rank, so higher percentages correspond with lower risk rankings.
for band in ('138_399', '400_up'):
    pct_col = 'pct_poverty_' + band
    # share of everyone with poverty-level data who falls in this band
    tn_data[pct_col] = tn_data['tot_poverty_' + band] / tn_data['tot_w_poverty_lvl_data']
    tn_data['pctrnk_poverty_' + band] = tn_data[pct_col].rank(pct=True, ascending=False)

NameError: name 'tn_data' is not defined

In [10]:
# Reverse percent rank for GDP and depth: higher raw values get smaller ranks,
# so higher values correspond with lower risk rankings.
reverse_ranked_cols = (
    'gdp_2018',
    'gdp_per_cap_2018',
    'depth_mean',
    'depth_median',
    'depth_min',
)
for source_col in reverse_ranked_cols:
    tn_data['pctrnk_' + source_col] = tn_data[source_col].rank(pct=True, ascending=False)

NameError: name 'tn_data' is not defined

In [11]:
# Percentile rank for variables already going in the correct direction
# (higher raw values correspond with higher risk rankings).
# Maps each new rank column to the raw column it is computed from.
rank_sources = {
    'pctrnk_housing_units': 'total_housing_units_2019',
    'pctrnk_pop_est': 'population_est',
    'pctrnk_uninsured': 'pct_uninsured',
    'pctrnk_mag_mean': 'mag_mean',
    'pctrnk_mag_median': 'mag_median',
    'pctrnk_mag_max': 'mag_max',
    'pctrnk_earthquakes': 'num_earthquakes',
    'pctrnk_prob_100y_median': 'pct_prob_100y_median',
    'pctrnk_prob_100y_max': 'pct_prob_100y_max',
}
for rank_col, raw_col in rank_sources.items():
    tn_data[rank_col] = tn_data[raw_col].rank(pct=True)

NameError: name 'tn_data' is not defined

Low numbers = low risk  
High numbers = high risk  

For fully aggregated data set (prioritize - #1):  
* for every variable, does it follow the right pattern/direction?
* reverse-coded: depth, GDP, GDP per capita (as neg)
* depth (median and max)
* magnitude (median and max)

Generate histograms for each variable of interest; if skewed (probably pop, magnitude), create new column with np.log() to normalize it (prioritize - #3)

scikit-learn (for weighting) (prioritize - #2)

Calculations
* Cum H, Ex, V
   * Average of each category (weighing every variable equally, regardless of how many components)
* Cum H, Ex, V
   * Sum of each category (biases towards areas with more data points)
(Either way should go in the same direction)

Plug in both cum calc into risk formulas for two columns
risk_avg (also w/np.log() columns)
risk_sum (also w/np.log() columns)

Map risk_avg and risk_sum; throw into widget 

If not informative, do we want to double weights? Play around with formula to get a cleaner visualization to make a case for opportunities to invest

## Step 2 - Calculate Values for Risk Categories    
  
EmR / DR / CatR = H + Ex × V    
  
#### Code for calculating hazard score column:    
tn_data['hazzard_wcs'] = tn_data['pctrnk_earthquakes'] + tn_data['pctrnk_mag_max'] + tn_data['pctrnk_prob_100y_max'] + tn_data['pctrnk_depth_min']
  
#### Code for calculating exposure score column:  
tn_data['exposure_wcs'] = tn_data['pctrnk_pop_est'] + tn_data['pctrnk_gdp_2018'] + tn_data['pctrnk_gdp_per_cap_2018'] + tn_data['pctrnk_housing_units']
  
#### Code for calculating vulnerability score column:  
tn_data['vulnerability_wcs'] = tn_data['socioeconomic'] + tn_data['household_comp_and_disability'] + tn_data['minority_status_and_language'] + tn_data['housing_type_and_transportation'] + tn_data['pctrnk_uninsured']
  
#### Code for calculating risk score column:  
tn_data['risk_calc_wcs'] = tn_data['hazzard_wcs'] + tn_data['exposure_wcs'] * tn_data['vulnerability_wcs']  
  
The 'wcs' ending refers to 'worst case scenario', since all of these calculations use the max values for mag and prob_100y and the min value for depth. We could use a different ending/abbreviation for the inclusion of mean and median values if desired.

In [12]:
# Worst-case-scenario ('wcs') risk scores, combined as risk = H + Ex * V.
# Each component score is a plain sum of its inputs, and its percentile rank across
# rows is stored next to it in a pctrnk_* column.

# Hazard (H): earthquake count, max magnitude, max 100-year probability, and the
# reverse-ranked minimum depth.
tn_data['hazzard_wcs'] = (
    tn_data['pctrnk_earthquakes']
    + tn_data['pctrnk_mag_max']
    + tn_data['pctrnk_prob_100y_max']
    + tn_data['pctrnk_depth_min']
)
tn_data['pctrnk_hazzard_wcs'] = tn_data['hazzard_wcs'].rank(pct=True)

# Exposure (Ex): population, housing units, and the reverse-ranked GDP columns.
tn_data['exposure_wcs'] = (
    tn_data['pctrnk_pop_est']
    + tn_data['pctrnk_gdp_2018']
    + tn_data['pctrnk_gdp_per_cap_2018']
    + tn_data['pctrnk_housing_units']
)
tn_data['pctrnk_exposure_wcs'] = tn_data['exposure_wcs'].rank(pct=True)

# Vulnerability (V): the four theme columns plus the uninsured percentile rank.
tn_data['vulnerability_wcs'] = (
    tn_data['socioeconomic']
    + tn_data['household_comp_and_disability']
    + tn_data['minority_status_and_language']
    + tn_data['housing_type_and_transportation']
    + tn_data['pctrnk_uninsured']
)
tn_data['pctrnk_vul_wcs'] = tn_data['vulnerability_wcs'].rank(pct=True)

# Risk: H + Ex * V, computed from the un-ranked component scores.
risk_score_raw = (
    tn_data['hazzard_wcs']
    + tn_data['exposure_wcs'] * tn_data['vulnerability_wcs']
)
# 'risk_calc_wcs' keeps its existing meaning and column position: the percentile
# rank of the risk score.
tn_data['risk_calc_wcs'] = risk_score_raw.rank(pct=True)
# Previously the un-ranked score was overwritten by its rank and lost. Keep it in a
# trailing column, as is already done for hazard, exposure, and vulnerability.
tn_data['risk_calc_wcs_raw'] = risk_score_raw

NameError: name 'tn_data' is not defined

In [13]:
# Display the table, including the score and rank columns added above.
tn_data

NameError: name 'tn_data' is not defined

In [14]:
# Save the table, with all derived rank and score columns, as a CSV in the data folder.
tn_data.to_csv('../data/tn_data_pctrnk_risk_calc.csv')

NameError: name 'tn_data' is not defined