#### Workbook to read in data from different sources and prep for processing

In [1]:
import geopandas as gp
import pandas as pd
import os
import glob
import requests

## Flares data

In [2]:
projcrs = 4326

In [3]:
def convert_geopandas(df):
    '''Convert pandas df to geodataframe, drop orig Lat/Long cols, and set crs to common project crs'''
    
    df = gp.GeoDataFrame(
        df, geometry=gp.points_from_xy(df["Longitude"],
                                       df["Latitude"]), 
        crs = projcrs)

    #df.drop(['Latitude', 'Longitude'], axis=1, inplace=True) 
    return(df)

In [4]:
def read_excel(sheet_name):
    
    all_files = glob.glob("data/*.xlsx")
    temp = []

    for filename in all_files:
        df = pd.read_excel(filename, index_col=None, header=0, sheet_name=sheet_name)
        temp.append(df)

    df = pd.concat(temp, axis=0, ignore_index=True)

    # some data cleanup
    df.rename(columns={'BCM 2019':'BCM_2019', 'Avg. temp':'avg_temp',
                       'BCM 2018':'BCM_2018', 'BCM 2020':'BCM_2020',
                       'BCM 2021':'BCM_2021',
                       'Detection_frequency_2012':'Det_freq12',
                       'Detection_frequency_2013':'Det_freq13',
                       'Detection_frequency_2014':'Det_freq14',
                       'Detection_frequency_2015':'Det_freq15',
                       'Detection_frequency_2016':'Det_freq16',
                       'Detection_frequency_2017':'Det_freq17',
                       'Detection frequency 2018':'Det_freq18',
                       'Detection freq. 2019':'Det_freq19',
                       'Detection frequency 2020':'Det_freq20',
                       'Detection frequency 2021':'Det_freq21',
                       'Clear_obs_2012': 'clr_obs12',
                       'Clear_obs_2013': 'clr_obs13',
                       'Clear_obs_2014': 'clr_obs14',
                       'Clear_obs_2015': 'clr_obs15',
                       'Clear_obs_2016': 'clr_obs16',
                       'Clear_obs_2017': 'clr_obs17',
                       'Clear obs 2018': 'clr_obs18',
                       'Clear obs. 2019': 'clr_obs19',
                       'Clear Obs. 2020': 'clr_obs20',
                       'Clear Obs. 2021': 'clr_obs21'}, inplace=True)
    
    df["flare_category"] = sheet_name  # add new column for overall flare type, to make summary stats easier
    
    return(df)

In [5]:
# define sheet names
all_upstream="flares_upstream"
oil_downstream="flares_oil_downstream"
gas_downstream="flares_gas_downstream"

# create dfs by flare category
flares_upstream = read_excel(all_upstream)
oil_downstream = read_excel(oil_downstream)
gas_downstream = read_excel(gas_downstream)

# combine into single df and convert to geodataframe
all_flares = convert_geopandas(pd.concat([flares_upstream, oil_downstream, gas_downstream]))

In [6]:
all_flares.to_file("data/all_flares.shp")

  """Entry point for launching an IPython kernel.


## Social vulnerability data from CalEnviroScreen

No cleaning necessary; shapefile read in directly to analysis notebooks

## Refineries

No cleaning necessary; csv read in directly to analysis notebooks

## 2010 US Census data

In [None]:
keys = pd.read_csv("api_keys.csv")
census_api_key = keys.loc[keys['api']=='census']['key'].iloc[0]

### 2/16/23 note: not using census API right now. Maybe later in project. 
1) need to confirm variables of interest with advisor / RMI

Czolowsky et al used: 
total population, Hispanic, minority, non-Hispanic minority, 5y and younger, under 18y, and 75y and older.

Var list

P001001: total population

P004001: total hispanic or latino origin

H008002: Total races tallied for householders — White alone or in combination with one or more other races


In [144]:
# helpful vid: https://www.youtube.com/watch?v=LW-M_UC0VTE
# dec/sf1 variables: https://api.census.gov/data/2010/dec/sf1/variables.html
# dec/sf1 example api calls: https://api.census.gov/data/2010/dec/sf1/examples.html
# CA FIPS codes: https://www.weather.gov/hnx/cafips
# FYI California has 8,057 census tracts, 23,212 block groups, and 710,145 blocks

# set search query and parameters
url = "https://api.census.gov/data/2010/dec/sf1?get=NAME,TRACT,P001001,P004001,H008002&for=block:*&in=state:06&in=county:001&in=tract:*&key={0}".format(census_api_key)

# make call to api
response = requests.request("GET", url)

In [145]:
# clean up column names that come from the api
col_names = ["name", "tract_id", "total_pop", "total_hisp", "race_white", "state", "county", "tract", "block"]

# construct df, beg. at first row so headers are skipped
df = pd.DataFrame(columns=col_names, data=response.json()[1:]) 

In [146]:
# check a few rows
df.sample(5)

Unnamed: 0,name,tract_id,total_pop,total_hisp,race_white,state,county,tract,block
2194,"Block 1007, Block Group 1, Census Tract 4043, ...",404300,0,0,0,6,1,404300,1007
18306,"Block 1014, Block Group 1, Census Tract 4431.0...",443103,12,12,1,6,1,443103,1014
21099,"Block 1065, Block Group 1, Census Tract 4507.4...",450743,0,0,0,6,1,450743,1065
5062,"Block 1005, Block Group 1, Census Tract 4089, ...",408900,0,0,0,6,1,408900,1005
14094,"Block 1066, Block Group 1, Census Tract 4371.0...",437101,0,0,0,6,1,437101,1066
20825,"Block 4028, Block Group 4, Census Tract 4507.0...",450701,0,0,0,6,1,450701,4028
20730,"Block 3168, Block Group 3, Census Tract 4507.0...",450701,0,0,0,6,1,450701,3168
16546,"Block 1267, Block Group 1, Census Tract 4415.0...",441503,0,0,0,6,1,441503,1267
15957,"Block 2015, Block Group 2, Census Tract 4412, ...",441200,48,48,16,6,1,441200,2015
8294,"Block 1005, Block Group 1, Census Tract 4251.0...",425102,0,0,0,6,1,425102,1005


In [147]:
# save to a csv for further processing in other notebooks
df.to_csv("data/ca_blocklevel_demographics.csv")