In [1]:
import geopandas as gp
from shapely import wkt
import pandas as pd
import numpy as np
import pprint
import os
import glob
import openpyxl
import matplotlib.pyplot as plt
import plotly.express as px #if using plotly

In [2]:
pd.set_option('display.max_columns', None)  # display all columns
pd.options.display.float_format = '{:20,.2f}'.format  # suppress scientific notation

## Read in files

In [3]:
ca_state = gp.read_file("data/CA_State_TIGER2016.shp")
ca_counties = gp.read_file("data/CA_Counties_TIGER2016.shp")
ca_bg = gp.read_file("data/tl_2022_06_bg.shp")

In [4]:
# Load census block group level data from EJscreen
# Source: https://www.epa.gov/ejscreen/download-ejscreen-data
ejscreen = pd.read_excel("data/CA_EJSCREEN_2022_Full_with_AS_CNMI_GU_VI.xlsx", index_col=None, header=0)

In [5]:
# CA social vulnerability data at level of census tract
# source: https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-40
# calenv4_df = gp.read_file("data/CES4 Final Shapefile.shp")
# calenv4_df['Tract'] = calenv4_df['Tract'].astype(np.int64)  # change from float to int

In [6]:
# read in cleaned and combined flares data
all_flares = gp.read_file("data/all_flares.shp")

In [7]:
# set common crs for project
projcrs = 4326
ca_state = ca_state.to_crs(projcrs)
ca_counties = ca_counties.to_crs(projcrs)
ca_bg = ca_bg.to_crs(projcrs)

In [8]:
ca_bg.rename(columns={'GEOID':'ID'}, inplace=True)  # match column names for merging
ca_bg['ID'] = ca_bg['ID'].astype(np.int64)

In [9]:
print(f"{len(ejscreen['ID'])} block groups in the EJScreen data \n")
print(f"{len(ca_bg['ID'])} block groups in the CA block group shapefile\n")
ca_bg_joined = pd.merge(ca_bg, ejscreen, on='ID')
print(f"{len(ca_bg['ID'].unique())-len(ca_bg_joined)} block groups are missing after merge")

25607 block groups in the EJScreen data 

25607 block groups in the CA block group shapefile

0 block groups are missing after merge


## Subset flares df to just CA

In [10]:
ca_flares = gp.sjoin(all_flares, ca_counties, how = "inner", predicate = 'within')
print(f'Flares found: {len(ca_flares)}')
ca_flares.drop('index_right', axis=1, inplace=True)

col_list = ['BCM_2012','BCM_2013','BCM_2014','BCM_2015','BCM_2016','BCM_2017',
            'BCM_2018','BCM_2019','BCM_2020','BCM_2021']

# add new column for average BCM across all years
ca_flares['BCM_avg'] = ca_flares[col_list].mean(axis=1)  

Flares found: 117


## Regression (in progress)

#### New col: distance between BG centroid and nearest flare

In [11]:
ca_polys_subset = gp.sjoin(ca_bg_joined, ca_flares, how='inner', predicate='contains')

In [12]:
# epsg3310: https://epsg.io/3310-1739
# units: meters
ca_polys_subset = ca_polys_subset.to_crs(epsg=3310)
ca_flares = ca_flares.to_crs(epsg=3310)
ca_bg_joined = ca_bg_joined.to_crs(epsg=3310)

In [13]:
# calculate distance between each centroid and nearest flare
ca_bg_joined['dist_flare'] = ca_bg_joined.geometry.centroid.apply(lambda x: ca_flares.distance(x).min())

In [14]:
# Create binary dependent variable
ca_bg_joined = ca_bg_joined.assign(flares_present=ca_bg_joined['geometry'].isin(ca_polys_subset['geometry']).astype(int))

In [23]:
print(f"Block groups containing flares (0=no, 1=yes):\n")
print(ca_bg_joined['flares_present'].value_counts())

Block groups containing flares (0=no, 1=yes):

0    25570
1       37
Name: flares_present, dtype: int64


In [32]:
ca_bg_joined.head(2)

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,BLKGRPCE,ID,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry,OBJECTID,STATE_NAME,ST_ABBREV,CNTY_NAME,REGION,ACSTOTPOP,ACSIPOVBAS,ACSEDUCBAS,ACSTOTHH,ACSTOTHU,ACSUNEMPBAS,VULEOPCT,MINORPOP,MINORPCT,LOWINCOME,LOWINCPCT,UNEMPLOYED,UNEMPPCT,LINGISO,LINGISOPCT,LESSHS,LESSHSPCT,UNDER5,UNDER5PCT,OVER64,OVER64PCT,PM25,OZONE,DSLPM,CANCER,RESP,PTRAF,PRE1960,PRE1960PCT,PNPL,PRMP,PTSDF,UST,PWDIS,D_PM25_2,D_OZONE_2,D_DSLPM_2,D_CANCR_2,D_RESP_2,D_PTRAF_2,D_LDPNT_2,D_PNPL_2,D_PRMP_2,D_PTSDF_2,D_UST_2,D_PWDIS_2,P_VULEOPCT,P_MINORPCT,P_LWINCPCT,P_UNEMPPCT,P_LNGISPCT,P_LESHSPCT,P_UNDR5PCT,P_OVR64PCT,P_PM25,P_OZONE,P_DSLPM,P_CANCR,P_RESP,P_PTRAF,P_LDPNT,P_PNPL,P_PRMP,P_PTSDF,P_UST,P_PWDIS,P_PM25_D2,P_OZONE_D2,P_DSLPM_D2,P_CANCR_D2,P_RESP_D2,P_PTRAF_D2,P_LDPNT_D2,P_PNPL_D2,P_PRMP_D2,P_PTSDF_D2,P_UST_D2,P_PWDIS_D2,B_VULEOPCT,B_MINORPCT,B_LWINCPCT,B_UNEMPPCT,B_LESHSPCT,B_LNGISPCT,B_UNDR5PCT,B_OVR64PCT,B_PM25,B_OZONE,B_DSLPM,B_CANCR,B_RESP,B_PTRAF,B_LDPNT,B_PNPL,B_PRMP,B_PTSDF,B_UST,B_PWDIS,B_PM25_D2,B_OZONE_D2,B_DSLPM_D2,B_CANCR_D2,B_RESP_D2,B_PTRAF_D2,B_LDPNT_D2,B_PNPL_D2,B_PRMP_D2,B_PTSDF_D2,B_UST_D2,B_PWDIS_D2,T_VULEOPCT,T_MINORPCT,T_LWINCPCT,T_UNEMPPCT,T_LNGISPCT,T_LESHSPCT,T_UNDR5PCT,T_OVR64PCT,T_PM25,T_OZONE,T_DSLPM,T_CANCR,T_RESP,T_PTRAF,T_LDPNT,T_PNPL,T_PRMP,T_PTSDF,T_UST,T_PWDIS,T_PM25_D2,T_OZONE_D2,T_DSLPM_D2,T_CANCR_D2,T_RESP_D2,T_PTRAF_D2,T_LDPNT_D2,T_PNPL_D2,T_PRMP_D2,T_PTSDF_D2,T_UST_D2,T_PWDIS_D2,EXCEED_COUNT_80,AREALAND,AREAWATER,NPL_CNT,TSDF_CNT,Shape_Length,Shape_Area,dist_flare,flares_present
0,6,37,650001,2,60376500012,Block Group 2,G5030,S,279786,0,33.8825685,-118.3221453,"POLYGON ((154828.423 -457797.384, 154927.488 -...",21287,California,CA,Los Angeles County,9,864,864,626,238,238,394,0.36,576,0.67,42,0.05,16,0.04,0,0.0,16,0.03,92,0.11,154,0.18,12.59,40.1,0.52,30.0,0.4,654.23,195,0.82,0.33,1.72,16.38,3.76,,34.33,11.44,31.47,29.68,28.97,26.11,32.9,32.9,31.11,35.05,25.39,,59,76,8,52,0,21,87,60,96.0,32.0,88.0,83.0,81.0,73.0,92,92,87,98,71.0,,81.0,51.0,77.0,66.0,68.0,71.0,82,81,77,80,72.0,,6,8,1,6,3,1,9,7,11,4,9,9,9,8,10,10,9,11,8,0,9,6,8,7,7,8,9,9,8,9,8,0,59 %ile,76 %ile,8 %ile,52 %ile,0 %ile,21 %ile,87 %ile,60 %ile,96 %ile,32 %ile,88 %ile,83 %ile,81 %ile,73 %ile,92 %ile,92 %ile,87 %ile,98 %ile,71 %ile,,81 %ile,51 %ile,77 %ile,66 %ile,68 %ile,71 %ile,82 %ile,81 %ile,77 %ile,80 %ile,72 %ile,,4,279786,0,0,0,3087.2,406992.5,3172.97,0
1,6,37,650001,4,60376500014,Block Group 4,G5030,S,279874,17869,33.8796469,-118.3217696,"POLYGON ((154836.126 -458134.127, 154835.934 -...",21289,California,CA,Los Angeles County,9,748,748,547,318,365,500,0.49,646,0.86,80,0.11,10,0.02,38,0.12,7,0.01,0,0.0,162,0.22,12.59,40.1,0.52,30.0,0.4,131.84,276,0.76,0.46,1.73,15.81,3.62,,46.59,15.53,42.71,40.28,39.31,18.44,43.19,45.62,42.22,47.56,33.97,,73,87,19,33,87,13,0,72,96.0,32.0,88.0,83.0,81.0,38.0,89,94,87,98,70.0,,89.0,61.0,86.0,77.0,79.0,60.0,88,89,85,89,80.0,,8,9,2,4,2,9,1,8,11,4,9,9,9,4,9,10,9,11,8,0,9,7,9,8,8,7,9,9,9,9,9,0,73 %ile,87 %ile,19 %ile,33 %ile,87 %ile,13 %ile,0 %ile,72 %ile,96 %ile,32 %ile,88 %ile,83 %ile,81 %ile,38 %ile,89 %ile,94 %ile,87 %ile,98 %ile,70 %ile,,89 %ile,61 %ile,86 %ile,77 %ile,79 %ile,60 %ile,88 %ile,89 %ile,85 %ile,89 %ile,80 %ile,,7,279874,17869,0,0,3442.38,433085.47,2838.01,0


In [49]:
allcols=['D_PM25_2', 'dist_flare', 'flares_present']
for_model = ca_bg_joined[allcols].dropna()  # drop na for now


cols=['D_PM25_2', 'dist_flare']
x=for_model[cols]
y=for_model['flares_present']

In [51]:
import statsmodels.api as sm
logit_model=sm.Logit(y,x)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.007447
         Iterations 15
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.317     
Dependent Variable: flares_present   AIC:              385.1562  
Date:               2022-11-22 13:46 BIC:              401.4561  
No. Observations:   25590            Log-Likelihood:   -190.58   
Df Model:           1                LL-Null:          -278.92   
Df Residuals:       25588            LLR p-value:      2.5735e-40
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     15.0000                                      
------------------------------------------------------------------
              Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
------------------------------------------------------------------
D_PM25_2     -0.0371    0.0057   -6.4535  0.0000  -0.0483  -0.0258
dist_flare   -0.0006    0.0001  -11.4646  0.0000  -0.0007  -0

In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=0)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression()

In [53]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 1.00


In [54]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test, y_pred)
print(confusion_matrix)

[[7669    0]
 [   8    0]]


In [55]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7669
           1       0.00      0.00      0.00         8

    accuracy                           1.00      7677
   macro avg       0.50      0.50      0.50      7677
weighted avg       1.00      1.00      1.00      7677



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
