In [1]:
import geopandas as gp
from shapely import wkt
from shapely.geometry import Point, Polygon
from shapely.ops import unary_union
import pandas as pd
import numpy as np
from pprint import pprint
import os
import glob
import openpyxl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import plotly.express as px #if using plotly
import folium
import warnings
import uuid
from sklearn.preprocessing import StandardScaler

In [2]:
pd.set_option('display.max_columns', None)  # display all columns
pd.options.display.float_format = '{:20,.2f}'.format  # suppress scientific notation

### RQ #2) Which specific flares are the most impactful? 

### RQ #3) Which specific block grous are being most impacted? 


Exploratory question; no hypothesis testing

EJ index = (The Environmental Indicator Percentile for Block Group) x (Demographic Index for Block Group)
  
a) Determine flaring impact metric. 

Something like: buffer_population x EJ Index x flare volume  

b) Calculate impact per flare  

c) Visualize top_x flares by impact  


### Read in files

In [3]:
#ca_state = gp.read_file("data/CA_State_TIGER2016.shp")  # CA state
ca_counties = gp.read_file("data/CA_Counties_TIGER2016.shp")  # CA counties
ca_counties.rename(columns={'NAMELSAD':'cnty_name'}, inplace=True)  # old:new. Match col names for merging

In [4]:
#ca_bg = gp.read_file("data/ca_bg_clean.shp")  # CA block groups, cleaned
# CA block groups merged with EJscreen data
ca_bg_joined = gp.read_file("data/ca_bg_joined_clean.shp")

# update col name for correct area b/c it gets saved as a truncated version when saved as a shapefile out of the 
# main data cleaning notebook
ca_bg_joined.rename(columns={'shape_ar_1':'shape_area_new'}, inplace=True)  # old:new.
ca_bg_joined.rename(columns={'CNTY_NAME':'cnty_name'}, inplace=True)  # old:new.
print(f'BGs found: {len(ca_bg_joined)}')

BGs found: 25607


In [5]:
# just CA flares
ca_flares = gp.read_file("data/ca_flares_clean.shp")

# renaming 
ca_flares.rename(columns={'NAMELSAD':'cnty_name'}, inplace=True)  # old:new. Match col names for merging

print(f'Flares found: {len(ca_flares)}')

Flares found: 117


In [6]:
ca_flares.sample(1)

Unnamed: 0,Country,ISO Code,Catalog ID,id #,Latitude,Longitude,BCM_2019,avg_temp,Ellipticit,Detection_,clr_obs19,Type,ID 2020,BCM_2020,Avg. temp.,Det_freq20,clr_obs20,ID 2021,BCM_2021,Det_freq21,clr_obs21,id_key_201,Avg_Temp_K,Det_freq17,clr_obs17,ISO_Code,BCM_2017,id_key,BCM_2012,BCM_2013,BCM_2014,BCM_2015,BCM_2016,clr_obs12,clr_obs13,clr_obs14,clr_obs15,clr_obs16,Det_freq12,Det_freq13,Det_freq14,Det_freq15,Det_freq16,Catalog _1,BCM_2018,Det_freq18,clr_obs18,flare_cate,Detection,Det_freq19,flare_id,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,cnty_name,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
54,United States,,,,35.45,-119.73,,,1.6,,,upstream,,,,,,,,,,,1770.14,,,USA,,VNF_e2012-2015_n6003_x1197319W_y354492N_v1.0,0.0,0.0,0.0,0.0,0.0,352.0,440.0,424.0,423.0,401.0,0.0,0.45,0.24,0.71,1.5,,,,,flares_upstream,,,15cf0f89596146,6,29,2054176,6029,Kern,Kern County,6,H1,G4020,,12540,,A,21061589620,79574760,35.3466288,-118.7295064,POINT (24309.461 -285273.837)


In [7]:
ca_flares.geometry

0      POINT (101702.097 -392465.466)
1       POINT (50045.686 -404732.075)
2      POINT (111804.156 -403954.218)
3       POINT (67527.269 -411223.881)
4       POINT (62783.972 -410781.392)
                    ...              
112    POINT (165197.510 -469510.426)
113    POINT (165197.511 -469609.028)
114     POINT (-239947.045 35040.908)
115    POINT (245587.514 -437618.082)
116    POINT (225403.428 -439194.020)
Name: geometry, Length: 117, dtype: geometry

In [8]:
# set common crs for project
# epsg3310: https://epsg.io/3310-1739
# units: meters
meters_crs = 3310  # Projected crs. this should be good for this overlay() calculation and all of project. 

ca_counties = ca_counties.to_crs(meters_crs)
ca_flares = ca_flares.to_crs(meters_crs)
ca_bg_joined = ca_bg_joined.to_crs(meters_crs)

### Update ca_flares df

In [9]:
# set col list for BCM_avg calculation
bcm_list = ['BCM_2012','BCM_2013','BCM_2014','BCM_2015','BCM_2016','BCM_2017',
            'BCM_2018','BCM_2019','BCM_2020','BCM_2021']

# add new column for average BCM across all years
ca_flares['BCM_avg'] = ca_flares[bcm_list].mean(axis=1) 

In [10]:
# subset to only columns needed for Tableau
col_list = ['flare_id','flare_cate','cnty_name','BCM_2012','BCM_2013','BCM_2014','BCM_2015','BCM_2016','BCM_2017',
            'BCM_2018','BCM_2019','BCM_2020','BCM_2021','BCM_avg', 'geometry']

ca_flares_sub = ca_flares[col_list].copy()

In [11]:
ca_flares_sub.sample(5)

Unnamed: 0,flare_id,flare_cate,cnty_name,BCM_2012,BCM_2013,BCM_2014,BCM_2015,BCM_2016,BCM_2017,BCM_2018,BCM_2019,BCM_2020,BCM_2021,BCM_avg,geometry
96,139d4801336969,flares_oil_downstream,Contra Costa County,,,,,,,,,,0.0,0.0,POINT (-180447.932 3117.456)
5,7ea48fa2652174,flares_upstream,Ventura County,,,,,,,,,0.0,,0.0,POINT (101710.719 -392440.949)
13,0f78932c848460,flares_upstream,Ventura County,,,,,,,,,,0.0,0.0,POINT (101739.009 -392441.506)
61,57da3077196030,flares_upstream,Kern County,0.0,0.0,0.0,0.0,0.0,,,,,,0.0,POINT (65896.751 -284128.218)
75,11625c0b606442,flares_upstream,Fresno County,,,,,,,,0.0,,,0.0,POINT (-9984.386 -168129.034)


### Update ca_bg_joined df

In [12]:
# There are five BGs that seem to just be aquatic buffers around the actual county land.
# Dropping them for now 
# Explore if needed: ca_bg_joined.explore()

ids_to_drop = [60839900000, 61119901000, 60379902000, 60379903000, 60599901000]

# Drop the rows with those IDs
ca_bg_joined = ca_bg_joined[~ca_bg_joined['bg_id'].isin(ids_to_drop)]

In [13]:
# subset to only cols needed
ca_bg_joined_sub = ca_bg_joined[['bg_id', 'cnty_name', 'ACSTOTPOP', 'MINORPOP',
                           'D_PM25_2', 'shape_area_new', 'geometry']].copy()

ca_bg_joined_sub.rename(columns={'geometry':'bg_geom'}, inplace=True)  # old:new. Match col names for merging

# counties_sub = ca_counties[['cnty_name', 'geometry']].copy()

# counties_sub.rename(columns={'geometry':'cnty_geom'}, inplace=True)  # old:new. Match col names for merging

# # add county names to social dataframe
# ca_bg_joined_sub = pd.merge(ca_bg_joined_sub, counties_sub, 
#                              on='cnty_name', how='left')

### Gather data by BG and export shapefile for Tableau

In [14]:


# social_df = social_df.set_geometry('bg_geom')

# #flares_df = set_geometry_buffer(flares_df, buffer_size)
# buffer_col = f"buffer_{buffer_size}m"
# flares_df[buffer_col] = flares_df['geometry'].buffer(distance=buffer_size)

# flares_df = flares_df.set_geometry(buffer_col)

# # temp = flares_df.unary_union
# # all_buffers = gp.GeoDataFrame({'geometry': [temp]}, crs=flares_df.crs)  # convert back to geodf for processing

# test = social_df[social_df['bg_id'] == 60290033042].copy()

In [15]:
social_df = ca_bg_joined_sub
flares_df = ca_flares

In [16]:
social_df.sample(1)

Unnamed: 0,bg_id,cnty_name,ACSTOTPOP,MINORPOP,D_PM25_2,shape_area_new,bg_geom
18069,60371902011,Los Angeles County,1463,902,63.6,148906.52,"POLYGON ((153764.726 -433349.262, 153861.940 -..."


In [17]:
flares_df.sample(1)

Unnamed: 0,Country,ISO Code,Catalog ID,id #,Latitude,Longitude,BCM_2019,avg_temp,Ellipticit,Detection_,clr_obs19,Type,ID 2020,BCM_2020,Avg. temp.,Det_freq20,clr_obs20,ID 2021,BCM_2021,Det_freq21,clr_obs21,id_key_201,Avg_Temp_K,Det_freq17,clr_obs17,ISO_Code,BCM_2017,id_key,BCM_2012,BCM_2013,BCM_2014,BCM_2015,BCM_2016,clr_obs12,clr_obs13,clr_obs14,clr_obs15,clr_obs16,Det_freq12,Det_freq13,Det_freq14,Det_freq15,Det_freq16,Catalog _1,BCM_2018,Det_freq18,clr_obs18,flare_cate,Detection,Det_freq19,flare_id,STATEFP,COUNTYFP,COUNTYNS,GEOID,NAME,cnty_name,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry,BCM_avg
42,United States,USA,USA_UPS_2015_119.6421W_35.4429N_v0.2,1002.0,35.44,-119.64,0.0,1943.07,1.6,9.04,387.0,upstream,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,flares_upstream,,,5d31933b909186,6,29,2054176,6029,Kern,Kern County,6,H1,G4020,,12540,,A,21061589620,79574760,35.3466288,-118.7295064,POINT (32463.729 -285943.545),0.0


In [18]:
social_df = social_df.set_geometry('bg_geom')

social_df.geometry

0        POLYGON ((154828.423 -457797.384, 154927.488 -...
1        POLYGON ((154836.126 -458134.127, 154835.934 -...
2        POLYGON ((152442.207 -458985.139, 152441.728 -...
3        POLYGON ((152453.177 -459429.971, 152452.671 -...
4        POLYGON ((151858.050 -459765.962, 151869.518 -...
                               ...                        
25602    POLYGON ((-162287.592 -153450.003, -162255.599...
25603    POLYGON ((-153230.185 -157896.463, -153187.391...
25604    POLYGON ((-151685.856 -155314.179, -151680.464...
25605    POLYGON ((-151231.659 -155261.166, -151189.377...
25606    POLYGON ((52493.300 -162907.653, 52493.519 -16...
Name: bg_geom, Length: 25602, dtype: geometry

In [19]:
flares_df.rename(columns={'geometry':'flare_pts'}, inplace=True)  # old:new. Match col names for merging

In [20]:
#flares_df = set_geometry_buffer(flares_df, buffer_size)
buffer_size=2000
buffer_col = f"buffer_{buffer_size}m"
flares_df[buffer_col] = flares_df['flare_pts'].buffer(distance=buffer_size)

flares_df = flares_df.set_geometry('buffer_2000m')


In [21]:
# subset of desired columns from flares_df
#flares_df = flares_df.drop(['flare_pts'], axis=1)

flares_subset = flares_df[['buffer_2000m', 'BCM_avg', 'flare_id', 'flare_cate']]

# perform spatial join
intersect = gp.sjoin(social_df, flares_subset, how='left', predicate='intersects')


In [22]:
 # bring in buffer_2000 col that is not brought in by the sjoin() function for some unknown reason
intersect_all = pd.merge(intersect, flares_subset[['flare_id', 'buffer_2000m']], 
                             on='flare_id', how='left')

In [23]:
len(intersect_all)

25865

In [24]:
# Check for null or invalid geometries in buffer_2000m column
intersect_all['buffer_valid'] = intersect_all['buffer_2000m'].is_valid

# Initialize int_prop and int_area columns with NaN values
intersect_all['b_int_prop'] = np.nan
intersect_all['b_int_area'] = np.nan

# Loop through each block group and flare pairing
for index, row in intersect_all.iterrows():
    # Check if buffer_2000m geometry is valid
    if row['buffer_valid']:
        # Calculate intersection area
        intersection = row['buffer_2000m'].intersection(row['bg_geom'])  # prop of buffer in the bg geom
        intersection_area = intersection.area
        # Calculate buffer area
        buffer_area = row['buffer_2000m'].area
        # Calculate intersection proportion
        buf_int_prop = intersection_area / buffer_area
        # Update int_prop and int_area columns with calculated values
        intersect_all.at[index, 'b_int_prop'] = buf_int_prop
        intersect_all.at[index, 'b_int_area'] = intersection_area
        intersect_all.at[index, 'buff_area'] = buffer_area

    else:
        # Assign 0 to int_prop and int_area if buffer_2000m geometry is invalid
        intersect_all.at[index, 'b_int_prop'] = 0
        intersect_all.at[index, 'b_int_area'] = 0
        intersect_all.at[index, 'buff_area'] = 0

        
# Drop buffer_valid column
intersect_all.drop('buffer_valid', axis=1, inplace=True)

# Create new dataframe to store output
#output_df = intersect_all[['bg_id', 'flare_id', 'int_prop', 'int_area']].copy()


### I confirm this works as intended in the Folium mapping at the bottom of the notebook

In [25]:
len(intersect_all)

25865

In [26]:
intersect_all.sample(1)

Unnamed: 0,bg_id,cnty_name,ACSTOTPOP,MINORPOP,D_PM25_2,shape_area_new,bg_geom,index_right,BCM_avg,flare_id,flare_cate,buffer_2000m,b_int_prop,b_int_area,buff_area
5502,60855100013,Santa Clara County,1174,400,15.5,658858.43,"POLYGON ((-184620.463 -71030.285, -184619.518 ...",,,,,,0.0,0.0,0.0


In [27]:
temp = flares_df.unary_union
all_buffers = gp.GeoDataFrame({'geometry': [temp]}, crs=flares_df.crs)  # convert back to geodf for processing
intersect_all = intersect_all.set_crs(meters_crs)
intersect_all = intersect_all.set_geometry('bg_geom')

intersect_temp = gp.overlay(intersect_all, all_buffers, how='intersection')  # could look at keep_geom=False
intersect_temp.rename(columns={'geometry':'bg_int_geom'}, inplace=True)  # old:new. Match col names for merging
intersect_temp = intersect_temp.set_geometry('bg_int_geom')  # set to the buffers rather than the points

# Create new 'area' column for the areas of the intersections
intersect_temp['bg_int_area'] = intersect_temp.area


# Calculate the proportion of each block group within the buffer zone
intersect_temp['bg_int_prop'] = intersect_temp['bg_int_area'] / intersect_temp['shape_area_new']

intersect_all = pd.merge(intersect_all, 
                             intersect_temp[['bg_id','bg_int_prop','bg_int_geom']], 
                             on=['bg_id'], 
                             how='left').fillna({'bg_int_prop': 0})


In [28]:
# Apply the proportion to each demographic variable to find counts by variable
demo_vars = ['ACSTOTPOP', 'MINORPOP']
for var in demo_vars:
    intersect_all[var + '_int_cnt'] = intersect_all[var] * intersect_all['bg_int_prop']

# find overall proportions for each demo var by dividing var count by respective total population    
for var in demo_vars:
    intersect_all[var + '_bg_totprop'] = intersect_all[var] / intersect_all['ACSTOTPOP']

In [29]:
intersect_all.sample(5)

Unnamed: 0,bg_id,cnty_name,ACSTOTPOP,MINORPOP,D_PM25_2,shape_area_new,bg_geom,index_right,BCM_avg,flare_id,flare_cate,buffer_2000m,b_int_prop,b_int_area,buff_area,bg_int_prop,bg_int_geom,ACSTOTPOP_int_cnt,MINORPOP_int_cnt,ACSTOTPOP_bg_totprop,MINORPOP_bg_totprop
11441,60133451111,Contra Costa County,1996,1332,35.32,1484053.49,"POLYGON ((-172884.832 -26698.813, -172867.471 ...",,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.67
24533,60855050111,Santa Clara County,2447,1646,43.63,451883.58,"POLYGON ((-173806.427 -66005.985, -173798.404 ...",,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.67
23428,60375509021,Los Angeles County,1593,1324,55.64,155144.53,"POLYGON ((172515.388 -450188.750, 172548.303 -...",,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.83
24197,60290032121,Kern County,1831,1014,30.24,1895703.72,"POLYGON ((77600.098 -297341.232, 77600.453 -29...",,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.55
6218,60373016011,Los Angeles County,1339,771,42.92,493536.99,"POLYGON ((156641.282 -426144.400, 156743.273 -...",,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.58


#### Standardize variables

EJ index and population are already encoded at the level of a BG, so they're ready to normalize. 

Flare volume needs to multiplied by the proportion of the flare's buffer zone that resides in a BG, and then all (in the case of multiple flares in a BG) adjusted flare volumes summed up to generate a variable for total flare volume a BG is exposed to.  

In [30]:
def calc_adj_flare_vol(df):
    """
    "bcm_adj" = adjusted bcm value for a given flare_id and bg_id, based on the proportion of the buffer zone that 
    overlaps with a given bg_id. Equal to the bcm_avg * b_int_prop (the overlap)
    
    "bcm_adj_tot" = the sum of the bcm_adj values for a given bg_id
    """
    
    # calculate the product of bcm_avg and b_int_prop for each flare and bg combination
    df['bcm_adj'] = df['BCM_avg'] * df['b_int_prop']
    # sum the adjusted volumes for all flare and bg combinations
    adj_vol = df.groupby(['flare_id', 'bg_id'])['bcm_adj'].sum().reset_index()
    # sum the adjusted volumes for all flares in the block group
    bg_vol = adj_vol.groupby('bg_id')['bcm_adj'].sum().reset_index()
    # merge the block group volumes with the original dataframe
    df = pd.merge(df, bg_vol, on='bg_id', how='left', suffixes=('', '_tot'))
    return df


In [31]:
intersect_all = intersect_all.groupby('bg_id').apply(calc_adj_flare_vol)

In [32]:
# Assuming your dataframe is named `intersect_all`
df_uniq = intersect_all.drop_duplicates(subset=['bg_id', 'flare_id'])

In [33]:
type(df_uniq)

pandas.core.frame.DataFrame

In [34]:
df_uniq.columns

Index(['bg_id', 'cnty_name', 'ACSTOTPOP', 'MINORPOP', 'D_PM25_2',
       'shape_area_new', 'bg_geom', 'index_right', 'BCM_avg', 'flare_id',
       'flare_cate', 'buffer_2000m', 'b_int_prop', 'b_int_area', 'buff_area',
       'bg_int_prop', 'bg_int_geom', 'ACSTOTPOP_int_cnt', 'MINORPOP_int_cnt',
       'ACSTOTPOP_bg_totprop', 'MINORPOP_bg_totprop', 'bcm_adj',
       'bcm_adj_tot'],
      dtype='object')

In [35]:
# Define the variables to standardize

#'bcm_adj_tot' = Sum of the adjusted flare volume(s) that intersect w the BG. 
#'D_PM25_2' = EJ index for the BG
#'ACSTOTPOP_int_cnt' = BG population that resides in the intersection. = Totpop * intersection_proportion

varlist = ['bcm_adj_tot', 'D_PM25_2', 'ACSTOTPOP_int_cnt']

# Create a StandardScaler object that will transform selected variables to have a mean of zero and 
# sd of 1.
scaler = StandardScaler()

# Fit and transform the selected variables
intersect_norm = scaler.fit_transform(df_uniq[varlist])

# Create new variables with the standardized values
intersect_norm = pd.DataFrame(intersect_norm, columns=[var + '_norm' for var in varlist])

intersect_norm = intersect_norm.reset_index(drop=True)
df_uniq = df_uniq.reset_index(drop=True)

# Concatenate the new variables with the original DataFrame
df_uniq = pd.concat([df_uniq, intersect_norm], axis=1)


In [36]:
df_uniq.columns

Index(['bg_id', 'cnty_name', 'ACSTOTPOP', 'MINORPOP', 'D_PM25_2',
       'shape_area_new', 'bg_geom', 'index_right', 'BCM_avg', 'flare_id',
       'flare_cate', 'buffer_2000m', 'b_int_prop', 'b_int_area', 'buff_area',
       'bg_int_prop', 'bg_int_geom', 'ACSTOTPOP_int_cnt', 'MINORPOP_int_cnt',
       'ACSTOTPOP_bg_totprop', 'MINORPOP_bg_totprop', 'bcm_adj', 'bcm_adj_tot',
       'bcm_adj_tot_norm', 'D_PM25_2_norm', 'ACSTOTPOP_int_cnt_norm'],
      dtype='object')

In [37]:
# check calculations against two BGs
# Should see a single "bcm_adj_tot" value per BG, and different adjusted "bcm_adj" values per flare_id 
#depending on the amount of overlap.

# filter the dataframe and format specified columns to display 4 decimal places
filtered_df = df_uniq[df_uniq['bg_id'].isin([60190086003, 60133150001])][['bg_id', 'flare_id', 'BCM_avg', 'b_int_prop', 'bcm_adj', 'bcm_adj_tot','bcm_adj_tot_norm','bg_int_prop','ACSTOTPOP','ACSTOTPOP_int_cnt', 'b_int_area', 'buff_area']]
filtered_df[['BCM_avg', 'bcm_adj', 'bcm_adj_tot','bcm_adj_tot_norm']] = filtered_df[['BCM_avg', 'bcm_adj', 'bcm_adj_tot','bcm_adj_tot_norm']].apply(lambda x: x.apply(lambda y: '{:.4f}'.format(y)))

In [38]:
filtered_df

Unnamed: 0,bg_id,flare_id,BCM_avg,b_int_prop,bcm_adj,bcm_adj_tot,bcm_adj_tot_norm,bg_int_prop,ACSTOTPOP,ACSTOTPOP_int_cnt,b_int_area,buff_area
1579,60133150001,36412cae112862,0.0,0.79,0.0,0.0116,-0.0818,0.31,885,277.64,9945665.64,12546193.96
1580,60133150001,84f8fcac876528,0.0024,0.87,0.0021,0.0116,-0.0818,0.31,885,277.64,10959783.68,12546193.96
1581,60133150001,139d4801336969,0.0001,0.88,0.0001,0.0116,-0.0818,0.31,885,277.64,10988061.42,12546193.96
1582,60133150001,662fb5e3427683,0.0009,0.09,0.0001,0.0116,-0.0818,0.31,885,277.64,1117509.78,12546193.96
1583,60133150001,fb9dd7aa678530,0.0008,0.09,0.0001,0.0116,-0.0818,0.31,885,277.64,1088992.69,12546193.96
2972,60190086003,7bd77a95283749,0.0001,0.84,0.0001,0.0106,-0.1045,0.26,763,201.8,10519605.06,12546193.96
2973,60190086003,dc2822a7141447,0.0026,0.84,0.0022,0.0106,-0.1045,0.26,763,201.8,10491607.39,12546193.96
2974,60190086003,6c333571717831,0.0016,0.83,0.0013,0.0106,-0.1045,0.26,763,201.8,10473400.02,12546193.96


In [39]:
# save shapefile for visualization in Tableau

df_uniq.rename(columns={'shape_area_new':'shape_area'}, inplace=True)  # old:new.
df_uniq.rename(columns={'intersect_prop':'int_prop'}, inplace=True)  # old:new.
df_uniq.rename(columns={'ACSTOTPOP_int_cnt':'pop'}, inplace=True)  # old:new.
df_uniq.rename(columns={'ACSTOTPOP_int_cnt_norm':'pop_norm'}, inplace=True)  # old:new.
df_uniq.rename(columns={'bcm_adj_tot_norm':'bcm_norm'}, inplace=True)  # old:new.
df_uniq.rename(columns={'D_PM25_2':'pm25'}, inplace=True)  # old:new.
df_uniq.rename(columns={'D_PM25_2_norm':'pm25_norm'}, inplace=True)  # old:new.
df_uniq.rename(columns={'bg_int_geom':'int_geom'}, inplace=True)  # old:new.

# rename the flare categories for readability
df_uniq["flare_cate"] = df_uniq["flare_cate"].replace({"flares_oil_downstream": "down_oil",
                                                        "flares_upstream": "upstream"})

In [44]:
df_uniq = df_uniq.set_geometry('int_geom')

In [45]:
df_uniq.drop('bg_geom', axis=1, inplace=True)
df_uniq.drop('buffer_2000m', axis=1, inplace=True)

KeyError: "['bg_geom'] not found in axis"

In [46]:
df_uniq.columns

Index(['bg_id', 'cnty_name', 'ACSTOTPOP', 'MINORPOP', 'pm25', 'shape_area',
       'index_right', 'BCM_avg', 'flare_id', 'flare_cate', 'b_int_prop',
       'b_int_area', 'buff_area', 'bg_int_prop', 'int_geom', 'pop',
       'MINORPOP_int_cnt', 'ACSTOTPOP_bg_totprop', 'MINORPOP_bg_totprop',
       'bcm_adj', 'bcm_adj_tot', 'bcm_norm', 'pm25_norm', 'pop_norm'],
      dtype='object')

In [48]:
df_uniq.sample(20)

Unnamed: 0,bg_id,cnty_name,ACSTOTPOP,MINORPOP,pm25,shape_area,index_right,BCM_avg,flare_id,flare_cate,b_int_prop,b_int_area,buff_area,bg_int_prop,int_geom,pop,MINORPOP_int_cnt,ACSTOTPOP_bg_totprop,MINORPOP_bg_totprop,bcm_adj,bcm_adj_tot,bcm_norm,pm25_norm,pop_norm
13171,60590865021,Orange County,1878,1861,68.54,597026.5,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.99,,,,1.39,-0.09
8652,60375407001,Los Angeles County,1118,936,62.67,719552.35,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.84,,,,1.11,-0.09
14121,60650304004,Riverside County,1856,1663,59.43,479705.61,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.9,,,,0.96,-0.09
13404,60590888013,Orange County,1685,1476,60.02,265449.78,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.88,,,,0.99,-0.09
25042,61070041031,Tulare County,1201,1061,72.87,959617.09,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.88,,,,1.59,-0.09
6985,60374006022,Los Angeles County,1576,1359,46.29,499666.0,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.86,,,,0.34,-0.09
2916,60190078021,Fresno County,644,594,74.42,413398121.61,82.0,0.0,f32d2397880993,upstream,0.4,4982164.17,12546193.96,0.01,"POLYGON ((-14782.871 -176379.589, -14707.810 -...",7.76,7.16,1.0,0.92,0.0,0.0,-0.35,1.66,-0.03
6085,60372377101,Los Angeles County,2551,2551,89.83,178330.81,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,1.0,,,,2.39,-0.09
10442,60379102183,Los Angeles County,1304,846,16.3,1548902.35,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.65,,,,-1.07,-0.09
10911,60411150004,Marin County,1663,138,5.68,671646.34,,,,,0.0,0.0,0.0,0.0,,0.0,0.0,1.0,0.08,,,,-1.57,-0.09


In [49]:
df_uniq.to_file("data/df_bg_impactmetric_shp.shp", driver='ESRI Shapefile')

  """Entry point for launching an IPython kernel.


### Group by Flare ID and export shapefile for Tableau

### Create buffer

In [None]:
def create_buffer_intersection(flares_df, social_df, buffer_size):
    """
    Creates a new Geodf containing the intersection between census BGs
    and buffers of {buffer size} around flare sites.
    
    Parameters
    ----------
    flares_df : A Geodf containing point geometries representing flare sites.
    social_df : A Geodf containing polygon geometries representing social data from EPA EJScreen
    buffer_size : int
        The size of the buffer around each flare site, in meters.
    
    Returns
    -------
    tuple of GeoDataFrames
        A tuple containing two GeoDataFrames:
        1. The original flares_df with a new buffer column added.
        2. A GeoDataFrame containing the intersection between social_df and the 
        union of all buffer geometries in flares_df.
    """
    
    social_df = social_df.set_geometry('bg_geom')
    
    #flares_df = set_geometry_buffer(flares_df, buffer_size)
    buffer_col = f"buffer_{buffer_size}m"
    flares_df[buffer_col] = flares_df['geometry'].buffer(distance=buffer_size)

    flares_df = flares_df.set_geometry(buffer_col)
    
    temp = flares_df.unary_union
    all_buffers = gp.GeoDataFrame({'geometry': [temp]}, crs=flares_df.crs)  # convert back to geodf for processing
    
    intersect_temp = gp.overlay(social_df, all_buffers, how='intersection')  # could look at keep_geom=False
    
    intersect_temp.rename(columns={'geometry':'intersect_geom'}, inplace=True)  # old:new. Match col names for merging
    
    intersect_temp = intersect_temp.set_geometry('intersect_geom')

    intersect_temp2 = gp.sjoin(intersect_temp, flares_df, how = "left", predicate = 'intersects')

    return flares_df, intersect_temp2

In [None]:
buffer_sizes = [2000]
#buffer_sizes = [100, 400, 800, 1000, 1600, 2000, 5000, 7500, 10000]

for buffer_size in buffer_sizes:
    ca_flares_new, intersect = create_buffer_intersection(ca_flares_sub, ca_bg_joined_sub, buffer_size)

In [None]:
ca_flares_new.geometry

In [None]:
intersect_final.columns

In [None]:
intersect_final.sample(1)

In [None]:
# grab only necessary cols
intersect_forflares = intersect_final[['flare_id','bg_id', 'cnty_name_left', 'flare_cate',
                                 'BCM_2012', 'BCM_2013', 'BCM_2014', 'BCM_2015','BCM_2016', 
                                 'BCM_2017', 'BCM_2018', 'BCM_2019', 'BCM_2020', 'BCM_2021','BCM_avg',
                                 'buffer_2000m','pop','pm25','bcm_norm']].copy()

In [None]:
# Define the variables to aggregate
agg_dict = {'pm25': 'mean', 'pop': 'sum'}

# Group the block groups by flare ID and calculate the aggregation for the variables
ca_flares_agg = intersect_forflares.groupby('flare_id').agg(agg_dict)

# Rename the output variables
ca_flares_agg = ca_flares_agg.rename(columns={'pm25': 'pm25_avg', 'pop': 'pop_sum'})

# Join the aggregated variables with the original columns
ca_flares_merged = intersect_forflares.merge(ca_flares_agg, on='flare_id')


In [None]:
# Define the variables to standardize
varlist = ['pm25_avg', 'pop_sum']  # because bcm is already attached to the flare unit of analysis

# Create a StandardScaler object that will transform selected variables to have a mean of zero and 
# sd of 1.
scaler = StandardScaler()

# Fit and transform the selected variables
intersect_norm = scaler.fit_transform(ca_flares_merged[varlist])

# Create new variables with the standardized values
intersect_norm = pd.DataFrame(intersect_norm, columns=[var + '_n' for var in varlist])

# Concatenate the new variables with the original DataFrame
ca_flares_merged_f = pd.concat([ca_flares_merged, intersect_norm], axis=1)

In [None]:
ca_flares_merged_f.sample(3)

In [None]:
ca_flares_merged_f = gp.GeoDataFrame(ca_flares_merged_f, geometry='buffer_2000m', crs=meters_crs)


In [None]:
ca_flares_merged_f.to_file("data/df_flare_impactmetric_shp.shp", driver='ESRI Shapefile')

## Calculating single instance of weights here in python

In [None]:
# # Define the weights for each variable
# bcm_weight = 0
# pm25_weight = 1
# pop_weight = 0

In [None]:
# # Calculate the weighted variables
# intersect['BCM_weighted'] = intersect['BCM_avg_norm'] * bcm_weight
# intersect['D_PM25_2_weighted'] = intersect['D_PM25_2_norm'] * pm25_weight
# intersect['ACSTOTPOP_weighted'] = intersect['ACSTOTPOP_intersect_count_norm'] * pop_weight

# # Define the variables to sum
# varlist_weighted = ['BCM_weighted', 'D_PM25_2_weighted', 'ACSTOTPOP_weighted']

# # Group the block groups by flare ID and sum the weighted variables
# ca_flares_merged = intersect.groupby('flare_id')[varlist_weighted].sum()

# # Calculate the impact metric as the sum of the weighted variables
# ca_flares_merged['impact_metric'] = ca_flares_merged[varlist_weighted].sum(axis=1)

# # Sort the flares by impact metric in descending order
# ca_flares_merged = ca_flares_merged.sort_values('impact_metric', ascending=False)

In [None]:
# # Show the results
# ca_flares_merged.head()

In [None]:
# # Define the variables to sum
# varlist_norm = ['BCM_avg_norm', 'D_PM25_2_norm', 'ACSTOTPOP_intersect_count_norm']

# # Group the block groups by flare ID and sum the standardized variables
# ca_flares_merged = intersect.groupby('flare_id')[varlist_norm].sum()

# # Calculate the impact metric as the sum of the standardized variables
# ca_flares_merged['impact_metric'] = ca_flares_merged[varlist_norm].sum(axis=1)

# # Sort the flares by impact metric in descending order
# ca_flares_merged = ca_flares_merged.sort_values('impact_metric', ascending=False)

In [None]:
# df_final = pd.merge(ca_flares_merged, intersect, on='flare_id')

In [None]:
# invalid_geoms = df_final[~df_final.is_valid]

In [None]:
# invalid_geoms

In [None]:
# # save for use in tableau
# df_final.to_csv(F"data/df_impactmetric_csv.csv", index=False)

In [None]:
# # check for missing values
# missing_values = df_final.isnull().sum()

# # filter columns with missing values
# missing_cols = missing_values[missing_values > 0]

# # print column names and number of missing values
# for col in missing_cols.index:
#     print(f"Column '{col}' has {missing_cols[col]} missing values")

In [None]:
# no_missing = missing_values[missing_values == 0]
# # print column names and number of missing values
# for col in no_missing.index:
#     print(f"Column '{col}' has {no_missing[col]} missing values")

In [None]:
# df_final.sample(2)

## Top ten

In [None]:
# # display the top ten flares by impact metric
# top_ten = for_map.nlargest(10, 'impact_metric')
# top_ten = top_ten.set_geometry('buffer_2000m')

In [None]:
# # check for missing values
# missing_values = for_map.isnull().sum()

# # filter columns with missing values
# missing_cols = missing_values[missing_values > 0]

# # print column names and number of missing values
# for col in missing_cols.index:
#     print(f"Column '{col}' has {missing_cols[col]} missing values")

In [None]:
# no_missing = missing_values[missing_values == 0]
# # print column names and number of missing values
# for col in no_missing.index:
#     print(f"Column '{col}' has {no_missing[col]} missing values")

## Folium Mapping

In [None]:
# for_map = df_final[['flare_id', 'BCM_avg', 'D_PM25_2', 'ACSTOTPOP_intersect_count', 'impact_metric', 'buffer_2000m']]
# for_map = gp.GeoDataFrame(for_map, geometry='buffer_2000m', crs=meters_crs)

# for_map.to_file("data/df_impactmetric_shp.shp", driver='ESRI Shapefile')

In [None]:

# Define the color scale and number of bins
#color_scale = 'Reds'
#num_bins = 10

# Create a map centered on the first flare
# Create a folium map with a center location
m = folium.Map(location=[38.377158,-121.645792], zoom_start=6, tiles=None,overlay=False)  #start w lat/long roughly in center of CA
base_map = folium.FeatureGroup(name='Base map', overlay=True, control=False)
folium.TileLayer(tiles='OpenStreetMap').add_to(base_map)
base_map.add_to(m)




In [None]:
# Feature Group: Buffers

def style_function3(feature):
    return {
        'fillColor': 'red',
        'color': 'red',
        'fillOpacity': 0.05
    }

inter_all_bg = folium.FeatureGroup(name='Intersect BG', overlay=True)
folium.GeoJson(
    data=test["buffer_2000m"],
    style_function=style_function3
).add_to(inter_all_bg)
inter_all_bg.add_to(m)

In [None]:
# Feature Group: block group

def style_function2(feature):
    return {
        'fillColor': 'blue',
        'color': 'blue',
        'fillOpacity': 0.05
    }

bg_bound = folium.FeatureGroup(name='Intersect BG', overlay=True)
folium.GeoJson(
    data=test["bg_geom"],
    style_function=style_function2
).add_to(bg_bound)
bg_bound.add_to(m)

In [None]:
# style_function = lambda x: {'fillColor': '#ffffff', 
#                             'color':'#000000', 
#                             'fillOpacity': 0.1, 
#                             'weight': 0.1}
# highlight_function = lambda x: {'fillColor': '#999999', 
#                                 'color':'#999999', 
#                                 'fillOpacity': 0.50, 
#                                 'weight': 0.1}
# NIL = folium.features.GeoJson(
#     data = for_map,
#     style_function=style_function, 
#     control=False,
#     highlight_function=highlight_function, 
#     tooltip=folium.features.GeoJsonTooltip(
#         fields=['flare_id', 'BCM_avg_norm', 'D_PM25_2_norm', 
#              'ACSTOTPOP_intersect_count_norm', 'impact_metric'],# 'D_PM25_2', 'ACSTOTPOP', 'MINORPOP','shape_area_new', 'intersect_prop', 'intersect_area', 'MINORPOP_bg_totprop'],
#         style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;") 
#     )
# )

In [None]:
# # add hover functionality as child to map, add layering, display map
# m.add_child(NIL)
# m.keep_in_front(NIL)
# folium.LayerControl().add_to(m)

In [None]:
# Display the map
m