In [1]:
import geopandas as gp
from shapely import wkt
from shapely.geometry import Point, Polygon
from shapely.ops import unary_union
import pandas as pd
import numpy as np
from pprint import pprint
import os
import glob
import openpyxl
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import plotly.express as px #if using plotly
import folium
import warnings
import uuid
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the California reference layers in one pass: state outline, county
# boundaries, and the cleaned block groups.
ca_state, ca_counties, ca_bg = (
    gp.read_file(shapefile)
    for shapefile in (
        "data/CA_State_TIGER2016.shp",     # CA state
        "data/CA_Counties_TIGER2016.shp",  # CA counties
        "data/ca_bg_clean.shp",            # CA block groups, cleaned
    )
)

In [3]:
# Flare locations restricted to California.
ca_flares = gp.read_file("data/ca_flares_clean.shp")
print(f'Flares found: {len(ca_flares)}')

# California block groups already merged with EJScreen data.
# The corrected-area column is written out with a truncated name when the main
# data-cleaning notebook saves it as a shapefile, so give it a readable name again.
ca_bg_joined = (
    gp.read_file("data/ca_bg_joined_clean.shp")
    .rename(columns={'shape_ar_1': 'shape_area_new'})  # old:new
)
print(f'BGs found: {len(ca_bg_joined)}')

Flares found: 117
BGs found: 25607


In [4]:
# Not all flares have a unique identifier — both the CatalogID and ID columns have
# missing values — so build a new "flare_id" column.
#
# The IDs are derived from each row's position rather than from unseeded random numbers
# and uuid4: positional IDs are guaranteed unique and are identical on every run, so the
# exported CSV / shapefile can be compared and joined across executions of the notebook.
id_width = max(4, len(str(len(ca_flares))))  # zero-pad so IDs sort lexicographically
ca_flares['flare_id'] = [f"flare_{pos:0{id_width}d}" for pos in range(len(ca_flares))]

In [7]:
# Some block groups seem to be nothing more than aquatic buffers around the actual
# county land, so they are dropped for now.
# Originally identified IDs: 60839900000, 61119901000, 60379902000.
# NOTE(review): the list below also contains 60379903000 and 60599901000 — presumably
# the same kind of water-only block group; confirm.
# Explore if needed: ca_bg_joined.explore()
ids_to_drop = [
    60839900000,
    61119901000,
    60379902000,
    60379903000,
    60599901000,
]

# Keep every row whose bg_id is not in the drop list.
is_dropped = ca_bg_joined['bg_id'].isin(ids_to_drop)
ca_bg_joined = ca_bg_joined[~is_dropped]

In [8]:
# Common CRS for the whole project.
# EPSG:3310: https://epsg.io/3310-1739
# units: meters
meters_crs = 3310  # projected CRS, suitable for the buffer() / overlay() / area calculations

ca_state = ca_state.to_crs(meters_crs)
ca_counties = ca_counties.to_crs(meters_crs)
ca_bg = ca_bg.to_crs(meters_crs)

# The flare buffers (distance given in CRS units) and the flare / block-group overlay are
# computed from these two layers, so they must be in the metre-based CRS as well.
# Reprojecting is a no-op for a layer that is already stored in EPSG:3310.
ca_flares = ca_flares.to_crs(meters_crs)
ca_bg_joined = ca_bg_joined.to_crs(meters_crs)

### 1) Which specific flares are the most impactful? 

Exploratory question; no hypothesis testing

EJ index = (The Environmental Indicator Percentile for Block Group) x (Demographic Index for Block Group)
  
a) Determine flaring impact metric. 

Something like: buffer_population x EJ Index x flare volume  

How best to normalize these different units? Use percentiles compared to rest of CA? 

b) Calculate impact per flare  

c) Visualize top_x flares by impact  


In [9]:
# Yearly flare-volume columns (BCM_2012 ... BCM_2021) that feed the average.
col_list = [f'BCM_{year}' for year in range(2012, 2022)]

# Row-wise mean over all yearly columns.
yearly_bcm = ca_flares[col_list]
ca_flares['BCM_avg'] = yearly_bcm.mean(axis=1)

In [None]:
# Buffer radius around each flare, in units of the flares' CRS.
buffer_size = 2000
buffer_col = f"buffer_{buffer_size}m"

# Keep only the block-group columns that are needed downstream.
bg_keep_cols = ['bg_id', 'CNTY_NAME', 'Shape_Area', 'ACSTOTPOP', 'MINORPOP',
                'D_PM25_2', 'shape_area_new', 'geometry']
ca_bg_joined = ca_bg_joined[bg_keep_cols]

# Store the buffer polygons in their own column and make them the active geometry,
# so that spatial operations act on the buffers rather than on the flare points.
ca_flares[buffer_col] = ca_flares['geometry'].buffer(buffer_size)
ca_flares = ca_flares.set_geometry(buffer_col)

# Dissolve all buffers into a single geometry, wrapped in a one-row GeoDataFrame
# so it can be processed like any other layer.
temp = ca_flares.unary_union
all_buffers = gp.GeoDataFrame({'geometry': [temp]}, crs=ca_flares.crs)

# One output row per (flare buffer, block group) pair that overlaps.
# could look at keep_geom=False
intersect = gp.overlay(ca_flares, ca_bg_joined, how='intersection')

In [None]:
# Columns carried forward from the overlay result: identifiers, yearly and average
# flare volumes, block-group demographics, and the intersection geometry.
id_cols = ['flare_id', 'bg_id', 'CNTY_NAME', 'flare_cate']
bcm_cols = ['BCM_2012', 'BCM_2013', 'BCM_2014', 'BCM_2015', 'BCM_2016', 'BCM_2017',
            'BCM_2018', 'BCM_2019', 'BCM_2020', 'BCM_2021', 'BCM_avg']
bg_cols = ['ACSTOTPOP', 'MINORPOP', 'D_PM25_2', 'shape_area_new', 'geometry']

# Rename the geometry column (old:new) and register it as the active geometry.
intersect = (
    intersect[id_cols + bcm_cols + bg_cols]
    .rename(columns={'geometry': 'intersect_geom'})
    .set_geometry('intersect_geom')
)

In [None]:
# Area of every buffer / block-group intersection polygon (active geometry).
intersect['intersect_area'] = intersect.geometry.area

# Share of each block group's area that falls inside the buffer zone.
intersect['intersect_prop'] = intersect['intersect_area'].div(intersect['shape_area_new'])

In [None]:
# Number of flare-buffer / block-group intersection rows produced by the overlay.
len(intersect)

In [None]:
# Number of distinct flares that appear in at least one intersection row.
intersect['flare_id'].nunique()

In [None]:
demo_vars = ['ACSTOTPOP', 'MINORPOP']

# Area-weighted counts: scale each block-group total by the share of the block
# group that lies inside the buffer.
for demo in demo_vars:
    intersect[f'{demo}_intersect_count'] = intersect[demo].mul(intersect['intersect_prop'])

# Block-group-wide proportion of each demographic variable relative to the
# block group's total population.
for demo in demo_vars:
    intersect[f'{demo}_bg_totprop'] = intersect[demo].div(intersect['ACSTOTPOP'])

In [None]:
# Spot-check one random intersection row.
intersect.sample(1)

In [None]:
# Variables that enter the impact metric; each gets a z-scored "<name>_norm" column.
varlist = ['BCM_avg', 'D_PM25_2', 'ACSTOTPOP_intersect_count']

# Standardize to zero mean / unit variance across all intersection rows.
scaler = StandardScaler()
intersect_norm = pd.DataFrame(
    scaler.fit_transform(intersect[varlist]),
    columns=[var + '_norm' for var in varlist],
    index=intersect.index,  # align explicitly instead of relying on a 0..n-1 index
)

# Assign column by column rather than pd.concat(axis=1): concat silently produces NaN
# rows when the two indexes differ and appends duplicate "_norm" columns every time the
# cell is re-run, whereas assignment simply overwrites.
for norm_col in intersect_norm.columns:
    intersect[norm_col] = intersect_norm[norm_col]


In [None]:
# Spot-check one random intersection row.
intersect.sample(1)

In [None]:
# Weight applied to each standardized component of the impact metric:
# flare volume (BCM), PM2.5 indicator, and population inside the buffer.
bcm_weight, pm25_weight, pop_weight = 0, 1, 0

In [None]:
# Weighted column -> (standardized source column, weight).
weighted_sources = {
    'BCM_weighted': ('BCM_avg_norm', bcm_weight),
    'D_PM25_2_weighted': ('D_PM25_2_norm', pm25_weight),
    'ACSTOTPOP_weighted': ('ACSTOTPOP_intersect_count_norm', pop_weight),
}
for weighted_col, (norm_col, weight) in weighted_sources.items():
    intersect[weighted_col] = intersect[norm_col] * weight

# Columns that are summed into the metric.
varlist_weighted = list(weighted_sources)

# Per flare: sum the weighted columns over its block groups, add them up into the
# impact metric, and rank flares from highest to lowest impact.
ca_flares_merged = (
    intersect.groupby('flare_id')[varlist_weighted].sum()
    .assign(impact_metric=lambda sums: sums[varlist_weighted].sum(axis=1))
    .sort_values('impact_metric', ascending=False)
)

In [None]:
# Show the first rows of the per-flare results (the frame is sorted by
# impact_metric in descending order, so these are the highest-impact flares).
ca_flares_merged.head()

In [None]:
# Standardized columns that are summed into the (unweighted) metric.
varlist_norm = ['BCM_avg_norm', 'D_PM25_2_norm', 'ACSTOTPOP_intersect_count_norm']

# NOTE(review): this reassigns ca_flares_merged, replacing the weighted version
# computed earlier in the notebook — confirm which of the two metrics is intended
# to feed the exports and the map.
# Per flare: sum the standardized columns over its block groups, add them up into
# the impact metric, and rank flares from highest to lowest impact.
ca_flares_merged = (
    intersect.groupby('flare_id')[varlist_norm].sum()
    .assign(impact_metric=lambda sums: sums[varlist_norm].sum(axis=1))
    .sort_values('impact_metric', ascending=False)
)

In [None]:
# Attach the per-flare sums and impact metric to every intersection row.
# - The GeoDataFrame is the left operand so the result stays a GeoDataFrame (a merge
#   whose left side is a plain DataFrame returns a plain DataFrame, which has no
#   geometry accessors such as .is_valid).
# - Columns present on both sides keep their row-level name; the per-flare sums get an
#   explicit "_flare_sum" suffix instead of the opaque default "_x" / "_y".
# - Rows are ordered by impact metric (descending), as when merging from the sorted
#   per-flare frame.
df_final = (
    intersect.merge(
        ca_flares_merged,
        left_on='flare_id',
        right_index=True,
        suffixes=('', '_flare_sum'),
    )
    .sort_values('impact_metric', ascending=False, kind='stable')
    .reset_index(drop=True)
)

In [None]:
# Spot-check one random row of the merged frame.
df_final.sample(1)

In [None]:
# Check the dtype of the geometry column. It was renamed to 'intersect_geom' before the
# merge, so there is no 'geometry' column to index.
df_final['intersect_geom'].dtype

In [None]:
# Rows whose intersection geometry is invalid. Validity is evaluated on the geometry
# column explicitly, so this works whether df_final is a GeoDataFrame or a plain
# DataFrame (which has no .is_valid attribute).
invalid_geoms = df_final[~gp.GeoSeries(df_final['intersect_geom']).is_valid]

In [None]:
# Display the rows flagged as having an invalid geometry (empty if there are none).
invalid_geoms

In [None]:
# Export the merged frame (without the index) for use in Tableau.
csv_path = "data/df_impactmetric_csv.csv"
df_final.to_csv(csv_path, index=False)

In [None]:
# # check for missing values
# missing_values = df_final.isnull().sum()

# # filter columns with missing values
# missing_cols = missing_values[missing_values > 0]

# # print column names and number of missing values
# for col in missing_cols.index:
#     print(f"Column '{col}' has {missing_cols[col]} missing values")

In [None]:
# no_missing = missing_values[missing_values == 0]
# # print column names and number of missing values
# for col in no_missing.index:
#     print(f"Column '{col}' has {no_missing[col]} missing values")

In [None]:
# Spot-check two random rows of the merged frame.
df_final.sample(2)

In [None]:
# Attribute columns shown on the map.
map_cols = ['flare_id', 'BCM_avg', 'D_PM25_2', 'ACSTOTPOP_intersect_count', 'impact_metric']

# The overlay drops the flares' buffer geometry (only the intersection geometry
# survives), so the buffer column is not available in df_final. Re-attach it from
# ca_flares by flare_id.
# NOTE(review): df_final holds one row per (flare, block group) pair, so each flare's
# buffer is repeated once per intersecting block group — confirm whether the map layer
# should be aggregated to one row per flare.
flare_buffers = ca_flares[['flare_id', buffer_col]]
for_map = pd.DataFrame(df_final[map_cols]).merge(flare_buffers, on='flare_id', how='left')

# Use the CRS the buffers actually carry rather than asserting one.
for_map = gp.GeoDataFrame(for_map, geometry=buffer_col, crs=ca_flares.crs)

for_map.to_file("data/df_impactmetric_shp.shp", driver='ESRI Shapefile')

In [None]:
# # display the top ten flares by impact metric
# top_ten = for_map.nlargest(10, 'impact_metric')
# top_ten = top_ten.set_geometry('buffer_2000m')

In [None]:
#top_ten

In [None]:
# Count missing values per column of the map layer.
missing_values = for_map.isna().sum()

# Keep only the columns that have at least one missing value.
missing_cols = missing_values.loc[missing_values.gt(0)]

# Report each affected column together with its number of missing values.
for col, n_missing in missing_cols.items():
    print(f"Column '{col}' has {n_missing} missing values")

In [None]:
# Columns without any missing value, reported in the same format.
no_missing = missing_values.loc[missing_values.eq(0)]
for col, n_missing in no_missing.items():
    print(f"Column '{col}' has {n_missing} missing values")

## Folium Mapping

In [None]:

# Colour scale and number of bins used by the choropleth layer.
color_scale, num_bins = 'Reds', 10

# Folium map without default tiles, centred on a fixed lat/long roughly in the
# centre of CA.
m = folium.Map(
    location=[38.377158, -121.645792],
    zoom_start=6,
    tiles=None,
    overlay=False,
)

# The OpenStreetMap tiles live in their own feature group that is hidden from the
# layer control.
base_map = folium.FeatureGroup(name='Base map', overlay=True, control=False)
base_map.add_child(folium.TileLayer(tiles='OpenStreetMap'))
base_map.add_to(m)




In [None]:
# Choropleth layer coloured by impact metric.
# `columns` must be [key column, value column]: folium binds the first entry to `key_on`
# and colours by the second. Listing further columns meant the layer was coloured by
# BCM_avg while the legend said "Impact Metric".
folium.Choropleth(
    geo_data=for_map,
    name='Impact Metric',
    data=for_map,
    columns=['flare_id', 'impact_metric'],
    key_on='feature.properties.flare_id',
    fill_color=color_scale,
    fill_opacity=0.7,
    line_opacity=0.2,
    bins=num_bins,
    legend_name='Impact Metric'
).add_to(m)

In [None]:
def style_function(feature):
    """Default style of the hover layer: near-transparent fill with a hairline outline."""
    return {'fillColor': '#ffffff',
            'color': '#000000',
            'fillOpacity': 0.1,
            'weight': 0.1}


def highlight_function(feature):
    """Style applied while a feature is hovered: semi-opaque grey."""
    return {'fillColor': '#999999',
            'color': '#999999',
            'fillOpacity': 0.50,
            'weight': 0.1}


# Invisible hover layer that only provides the tooltips.
# The tooltip fields must be columns of `for_map`; the standardized "*_norm" columns are
# not part of that layer, and folium raises an AssertionError for unknown fields.
NIL = folium.features.GeoJson(
    data=for_map,
    style_function=style_function,
    control=False,
    highlight_function=highlight_function,
    tooltip=folium.features.GeoJsonTooltip(
        fields=['flare_id', 'BCM_avg', 'D_PM25_2',
                'ACSTOTPOP_intersect_count', 'impact_metric'],
        style=("background-color: white; color: #333333; font-family: arial; font-size: 12px; padding: 10px;")
    )
)

In [None]:
# Attach the hover/tooltip layer, keep it above the other layers so the tooltips
# stay reachable, then add the layer control.
NIL.add_to(m)
m.keep_in_front(NIL)
folium.LayerControl().add_to(m)

In [None]:
# Display the map (a bare last expression renders it as the cell output)
m