In [None]:
import pandas as pd
import numpy as np
import time

# for haversine function for transmission distance calculations
from math import radians, cos, sin, asin, sqrt
from collections import Counter

In [None]:
# Model switches controlling how production-area leakage is attributed.

# allocate_prod_leakage_mode: how leakage from production areas is allocated,
# based on the energy fraction of dry gas (out of total dry gas + NGLs + oil).
#   'simple':  allocate production-area leakage based on the dry gas share of energy
#   'complex': assign a portion of leakage to well sites and a portion to gathering & processing,
#              and calculate energy shares separately for each
#              (because gathering & processing doesn't include oil)
allocate_prod_leakage_mode = 'simple'

# oil_based_leakage_fract: how a leakage rate is assigned to areas without measurements.
#   True:  use the dry gas fraction of DNGO to set a leakage rate
#   False: use the same leakage rate for all production areas, regardless of oil/gas mix
# NOTE(review): "DNGO" presumably means dry gas + NGL + oil (functions below spell it "dgno") - confirm.
oil_based_leakage_fract = False

In [None]:
# File paths: every input directory below is derived from lca_path by string concatenation,
# so each component must keep its trailing slash.
# NOTE(review): lca_path is an absolute, machine-specific location; anyone else running the
# notebook has to edit it before the cells below can find their input files.
lca_path = '/Users/masoninman/Dropbox/GEM/LCA of natural gas use/'
model_path = lca_path + 'US cities LCA of gas model/'
# EIA workbooks (used by the read_* functions that prefix file names with eia_path)
eia_path = lca_path + 'EIA data for LCA of gas/EIA gas data released 2020-10-30 (production, consumption, trade)/'
# all other model input workbooks, including the parameters file
inputs_path = model_path + 'US gas model inputs/'

In [None]:
# Open the parameters workbook once; parameters_excel is reused later to read other sheets.
parameters_file = 'GIM parameters file.xlsx'
parameters_excel = pd.ExcelFile(inputs_path + parameters_file)
# Sheet 'main parameters' becomes a Series: index = 'parameter name', values = 'parameter value'.
parameters_main = pd.read_excel(
    parameters_excel, 'main parameters'
).set_index('parameter name')['parameter value']

# Pull individual parameters into module-level names that the functions below read as globals.
# data_year is cast to int because it is compared against integer year indexes.
data_year = int(parameters_main.at['data_year'])
ch4_fract_in_ng_consumer_grade = parameters_main.at['ch4_fract_in_ng_consumer_grade']
ch4_kg_per_mcf = parameters_main.at['ch4_kg_per_mcf']
dry_gas_en_dens_btu_per_cf = parameters_main.at['dry_gas_en_dens_btu_per_cf']
oil_en_dens_mmbtu_per_bbl = parameters_main.at['oil_en_dens_mmbtu_per_bbl']
# NOTE(review): the value's type depends on the spreadsheet cell (presumably a boolean) - confirm.
contiguous_us_only = parameters_main.at['contiguous_us_only']

In [None]:
# Input parameters from previous steps.

# These identify the output file of the gas flows module: the iteration count and timestamp
# are embedded in the file name below, so they must match the run whose results are wanted.
num_iter = 5000
gas_flows_timestamp = '2020-11-12_0701'
supply_fract_file = f'GIM gas flows supp_fract_prod_sums_all from gross trade for {data_year}, {num_iter} iterations ({gas_flows_timestamp}).xlsx'

In [None]:
# Input files from external sources.
# The EIA release date is embedded in most of the file names below.
eia_release_date = '2020-10-30'

# EIA gross gas withdrawals
# https://www.eia.gov/dnav/ng/ng_prod_sum_a_EPG0_FGW_mmcf_a.htm
eia_gross_prod_file = f'EIA gas gross production NG_PROD_SUM_A_EPG0_FGW_MMCF_A released {eia_release_date}.xls'

# EIA energy production files (data by state): dry gas, NGLs, and crude oil
dry_gas_prod_file = f'EIA gas dry production NG_PROD_SUM_A_EPG0_FPD_MMCF_A released {eia_release_date}.xls'
ngl_prod_file = f'EIA natural gas plant processing - liquids extracted NG_PROD_PP_A_EPL0_YGT_MBBL_A released {eia_release_date}.xls'
oil_prod_file = f'EIA crude oil production PET_CRD_CRPDN_ADC_MBBL_A released {eia_release_date}.xls'

# EIA shale gas production by play (file name is not tied to eia_release_date)
eia_shale_by_play_file = 'EIA Nat Gas Weekly - shale gas by play thru 2020-09.xlsx'

# Natural Gas Plant Liquids (NGLs) by PADD
# https://www.eia.gov/dnav/pet/pet_pnp_gp_dc_nus_mbbl_m.htm
# '_PADD_NUM_' is a placeholder that is substituted with the PADD number (it appears twice).
ngl_padd_file_template = f"EIA Natural Gas Plant Field Production - PADD _PADD_NUM_ PET_PNP_GP_DC_R_PADD_NUM_0_MBBL_A released {eia_release_date}.xls"

# EIA Drilling Productivity Report (DPR)
# https://www.eia.gov/petroleum/drilling/
dpr_file = 'EIA - Drilling Productivity Report (DPR) 2020-10 - data.xlsx'

In [None]:
# GAS (AND OIL) PRODUCTION INPUT FILES
# State-level data files. Names built with f-strings follow data_year; the plain strings
# are fixed downloads whose names do not change with data_year.

# Pennsylvania: separate workbooks for unconventional and conventional wells
pa_unconv_year_file = f'PA DEP oil and gas well production - {data_year} unconventional monthly.xlsx'
pa_conv_year_file = f'PA DEP oil and gas well production - {data_year} conventional annual.xlsx'

# Ohio: combined conventional + unconventional workbook
oh_file = f'Ohio DNR - oil and gas - Combined Production {data_year} (conventional & unconventional).xlsx'

# New Mexico: data goes through May 2020;
# 'county_name' is a placeholder replaced with each county's name
new_mexico_file_template = 'New Mexico OCD - county_name County production by month from 1994 (dl 2020-07-05).xlsx'

# Colorado: includes 2019, 2018, & 2015
colorado_file = 'Colorado COGIS oil and gas production by county (dl 2020-08-22).xlsx'

# Texas
texas_file = 'Texas oil and gas production by county - 2018-2019 (PDQ dl 2020-10-12).xlsx'

# North Dakota & Montana: data include 2019
# NOTE(review): nd_oil_file names a 2018-only download - confirm it is appropriate when data_year != 2018.
nd_gas_file = 'North Dakota DMR gas produced by county monthly 2015-01 to 2019-12 (dl 2020-08-22).xlsx'
nd_oil_file = 'North Dakota DMR - Oil production by county 2018 (dl 2020-11-09).xlsx'
montana_bakken_file = 'Montana Board of Oil & Gas Conservation - annual production Bakken region counties 1986-2019.xlsx'

# Arkansas: downloaded through 2019 for the Fayetteville region;
# 'DATA_YEAR' and 'COUNTY' look like placeholders to be substituted by the reader code (not shown here)
arkansas_file_name_template = "Arkansas Oil and Gas Commission - Gas production DATA_YEAR - COUNTY County.xlsx"

# Utah
utah_by_county_file = 'Utah Department of Natural Resources - oil and gas production by county annual 2010-2019.csv'

# Louisiana
# NOTE(review): file name is fixed to 2018 - confirm it is appropriate when data_year != 2018.
la_file = 'Louisiana DNR - oil and gas production by parish 2018 (dl 2020-10-20).xlsx'

# California
ca_by_county_file = 'California Dept of Cons - oil and gas production by county.xlsx'

In [None]:
# Energy density of NGLs:
# calculated in the model, based on the NGL mix and the energy densities of individual NGLs
# (ethane, propane, etc.), which are listed in the file below.
ngl_en_dens_file = 'GIM input file - energy densities of NGLs.xlsx'

# Lookup table assigning each producing state to a PADD (read by read_state_padds).
state_padd_file = 'US states by PADD.xlsx'

In [None]:
# Standard conversions. Names read as "<numerator unit>_per_<denominator unit>".
# NOTE(review): the values imply "ton" means metric ton (1e6 g) - confirm.
g_per_kg = 1e3
g_per_ton = 1e6
gg_per_ton = 1e-3  # gigagrams per ton

# gas volume prefixes: Mcf = thousand cubic feet, MMcf = million, Bcf = billion
mcf_per_mmcf = 1e3
mmcf_per_bcf = 1e3

# conversion to metric units: billion cubic feet per billion cubic meters
bcf_per_bcm = 35.315

In [None]:
# List of jurisdictions to calculate leakage for, based on their gas consumption:
# the contiguous 48 states + DC (49 entries; Alaska and Hawaii are deliberately absent).
contig_48_states_dc = [
    'Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
    'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New Hampshire', 'New Jersey', 'New Mexico', 'New York',
    'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West Virginia', 'Wisconsin', 'Wyoming'
]

## functions

In [None]:
def read_dry_gas_prod(dry_gas_prod_file):
    """
    Load EIA dry gas production by area and return the values for `data_year`.

    EIA data on dry gas production from https://www.eia.gov/dnav/ng/ng_prod_sum_a_EPG0_VGM_mmcf_a.htm

    The workbook's 'Data 1' and 'Data 2' sheets are joined on their 'Date' column,
    and the area columns are stripped of their descriptive suffixes.

    Parameters
    ----------
    dry_gas_prod_file : str
        Workbook file name; it is appended to the module-level `eia_path`.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        Dry gas production in MMcf (the file's original units) and the same
        quantity multiplied by the module-level `dry_gas_en_dens_btu_per_cf`
        (labelled mmbtu). Both are indexed by area name.
    """
    workbook_path = eia_path + dry_gas_prod_file
    sheet_1, sheet_2 = (
        pd.read_excel(workbook_path, header=2, sheet_name=sheet)
        for sheet in ('Data 1', 'Data 2')
    )
    merged = sheet_1.merge(sheet_2, left_on='Date', right_on='Date')

    # the year is taken from the text before the first '-' of the stringified date
    date_text = merged['Date'].astype(str)
    merged['year'] = date_text.str.split('-').str[0].astype(int)

    # only keep years from 2000 through data_year
    # (prior to 1997 used different accounting, and there was a discrepancy in 1999)
    merged = merged.loc[merged['year'].between(2000, data_year)]
    dry = merged.set_index('year').drop(columns='Date')

    # EIA words the column suffix in several ways; the first variant found in a name is removed
    suffix_variants = (
        ' Dry Natural Gas Production (MMcf)',
        ' Dry Natural Gas Production (Million Cubic Feet)',
        ' Dry Natural Gas Production  (Million Cubic Feet)',
        ' Natural Gas Dry Production (Million Cubic Feet)',
        ' Dry Production of Natural Gas (Million Cubic Feet)',
    )
    short_names = {}
    for col in dry.columns:
        matched = next((suffix for suffix in suffix_variants if suffix in col), None)
        if matched is None:
            # surface any column whose name follows none of the known patterns
            print(col)
        else:
            short_names[col] = col.replace(matched, '')
    dry = dry.rename(columns=short_names)

    # rename Fed GOM so the label matches the other production tables
    dry = dry.rename(columns={'Federal Offshore--Gulf of Mexico': 'GOM (federal)'})

    gas_dry_year_mmcf = dry.loc[data_year].rename('dry gas production (MMcf)')
    # MMcf * (Btu per cf) gives million Btu
    gas_dry_year_mmbtu = (
        gas_dry_year_mmcf * dry_gas_en_dens_btu_per_cf
    ).rename('dry gas production (mmbtu)')

    return (gas_dry_year_mmcf, gas_dry_year_mmbtu)

In [None]:
def read_ngl_prod(ngl_prod_file):
    """
    Load EIA NGL production volumes by state and return the row for `data_year`.

    Natural Gas Plant Processing, Total Liquids Extracted
    https://www.eia.gov/dnav/ng/ng_prod_pp_a_EPL0_ygt_Mbbl_a.htm

    The data sheet is in thousand bbl per year; the result is converted to bbl.

    Parameters
    ----------
    ngl_prod_file : str
        Workbook file name; it is appended to the module-level `eia_path`.

    Returns
    -------
    pandas.Series
        'NGL production (bbl)' for `data_year`, indexed by area name.
    """
    raw = pd.read_excel(eia_path + ngl_prod_file, sheet_name='Data 1', header=2)
    by_year = (
        raw
        .assign(year=raw['Date'].dt.year)
        .drop(columns='Date')
        .set_index('year')
    )

    # discard columns whose names contain 'Extracted in ' or 'Onshore'
    unwanted = [
        col for col in by_year.columns
        if 'Extracted in ' in col or 'Onshore' in col
    ]
    by_year = by_year.drop(columns=unwanted)

    def _area_label(col):
        # keep only the area name that precedes the descriptive suffix
        label = col.split(' Natural Gas Total Liquids Extracted')[0]
        label = label.split(' Total Liquids Extracted')[0]  # for anomaly in ID column
        return label.strip()

    by_year = by_year.rename(columns=_area_label)

    # rename 'Gulf Of Mexico' so the label matches the other production tables
    by_year = by_year.rename(columns={'Gulf Of Mexico': 'GOM (federal)'})

    # thousand bbl (starting file) -> bbl
    ngl_prod_bbl_year = (by_year * 1000).loc[data_year]
    ngl_prod_bbl_year.name = 'NGL production (bbl)'

    return ngl_prod_bbl_year

In [None]:
def read_oil_prod(oil_prod_file):
    """
    Load EIA crude oil production by state and return the row for `data_year`.

    EIA page "Crude Oil Production":
    https://www.eia.gov/dnav/pet/pet_crd_crpdn_adc_mbbl_a.htm

    The file is in thousands of barrels; the result is converted to barrels.

    Parameters
    ----------
    oil_prod_file : str
        Workbook file name; it is appended to the module-level `eia_path`.

    Returns
    -------
    pandas.Series
        'oil production (bbl)' for `data_year`, indexed by area name.
    """
    raw = pd.read_excel(eia_path + oil_prod_file, sheet_name='Data 1', header=2)
    by_year = (
        raw
        .assign(year=raw['Date'].dt.year)
        .drop(columns='Date')
        .set_index('year')
    )

    # keep only the area name that precedes the descriptive suffix
    by_year = by_year.rename(
        columns=lambda col: col.split(' Field Production of Crude Oil')[0]
    )

    # rename Fed GOM so the label matches the other production tables
    by_year = by_year.rename(columns={'Federal Offshore--Gulf of Mexico': 'GOM (federal)'})

    # thousand barrels -> bbl, then pick the model year
    oil_prod_bbl_year = (by_year * 1000).loc[data_year]
    oil_prod_bbl_year.name = 'oil production (bbl)'

    return oil_prod_bbl_year

In [None]:
def read_leakage_rates_and_methane_share_parameters(parameters_excel):
    """
    Read input sheets with natural gas leakage rates and shares of methane in natural gas.

    (For gross natural gas extracted, prior to processing.)

    Parameters
    ----------
    parameters_excel : path or pandas.ExcelFile
        Parameters workbook, passed straight to `pd.read_excel`. It must contain
        the sheets 'production area leakage' and 'methane share in gross gas'.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        leak_fractions : values of 'gas leakage % (relative to gross gas extraction)',
            indexed by 'region & subregion'. The label is 'region (subregion)' when a
            subregion is given and plain 'region' otherwise.
        ch4_shares_of_wet_gas : values of '% CH4 by volume in gross gas', indexed by 'region'.
    """
    leak_table = pd.read_excel(parameters_excel, 'production area leakage')

    region = leak_table['region'].astype(str)
    subregion = leak_table['subregion']
    # Build the combined label only where a subregion exists. This replaces the earlier
    # approach of stringifying missing values to 'nan' and stripping ' (nan)' with a
    # regex written in a non-raw string (invalid '\(' escape).
    leak_table['region & subregion'] = region.where(
        subregion.isna(),
        region + ' (' + subregion.astype(str) + ')',
    )
    leak_fractions = leak_table.set_index('region & subregion')[
        'gas leakage % (relative to gross gas extraction)'
    ]

    # -----------
    ch4_table = pd.read_excel(parameters_excel, sheet_name='methane share in gross gas')

    # transform into series
    ch4_shares_of_wet_gas = ch4_table.set_index('region')['% CH4 by volume in gross gas']

    return (leak_fractions, ch4_shares_of_wet_gas)

In [None]:
def read_dpr_basin_prod_stats(dpr_file):
    """
    Read data from EIA Drilling Productivity Report (DPR)
    https://www.eia.gov/petroleum/drilling/

    Daily production rates for each DPR region are multiplied by the number of
    days in the month and summed over the months of `data_year`.

    Parameters
    ----------
    dpr_file : str
        Workbook file name; it is appended to the module-level `inputs_path`.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        dpr_counties : the 'RegionCounties' sheet as read.
        dpr_basin_prod_stats : one row per '<basin> region', with columns
            'gas production Bcf/y' and 'oil production bbl/y'.
    """
    basins = ['Anadarko', 'Appalachia', 'Bakken', 'Eagle Ford', 'Haynesville', 'Niobrara', 'Permian']
    stat_columns = ['gas production Bcf/y', 'oil production bbl/y']

    # the context manager closes the workbook handle once all sheets are read
    with pd.ExcelFile(inputs_path + dpr_file) as dpr_excel:
        # read EIA DPR sheet with county names
        dpr_counties = pd.read_excel(dpr_excel, sheet_name='RegionCounties')

        labels = []
        rows = []
        for basin in basins:
            dpr = pd.read_excel(dpr_excel, sheet_name=f'{basin} Region', header=1)
            dpr = dpr.rename(columns={
                'Month': 'date',
                'Total production': 'oil production (bbl/d)',
                'Total production.1': 'gas production (Mcf/d)'
            })

            # actual calendar length of each month, so leap-year Februaries count 29 days
            days = dpr['date'].dt.days_in_month
            in_data_year = dpr['date'].dt.year == data_year

            # convert from Mcf/d to Bcf/mo, and from bbl/d to bbl/mo
            gas_bcf_per_mo = dpr['gas production (Mcf/d)'] * days / 1e6
            oil_bbl_per_mo = dpr['oil production (bbl/d)'] * days

            # calculate sums for data_year
            labels.append(f'{basin} region')
            rows.append([
                gas_bcf_per_mo.loc[in_data_year].sum(),
                oil_bbl_per_mo.loc[in_data_year].sum(),
            ])

    # assemble the table in one step instead of enlarging an empty frame cell by cell
    dpr_basin_prod_stats = pd.DataFrame(rows, index=labels, columns=stat_columns).astype(float)

    return (dpr_counties, dpr_basin_prod_stats)

In [None]:
def calculate_ngl_energy(ngl_prod_bbl_year, ngls_padd_en_dens, state_padd):
    """
    Attach an NGL energy density to each producing area and compute NGL energy output.

    Each area receives the energy density of its PADD. Areas without a PADD match
    receive the production-weighted average density of the areas that do have one.

    Parameters
    ----------
    ngl_prod_bbl_year : pandas.Series
        NGL volumes named 'NGL production (bbl)', indexed by area.
    ngls_padd_en_dens : pandas.Series
        Energy densities named 'NGL energy density (mmbtu/bbl)', indexed by PADD label.
    state_padd : pandas.DataFrame
        Indexed by area, with a 'PADD' column whose labels match `ngls_padd_en_dens`.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ngl_energy_density : 'NGL energy density (mmbtu/bbl)' per area.
        ngl_prod_mmbtu_year : 'NGL production (mmbtu)' per area; NaN where the
            production volume is missing.
    """
    volume_col = 'NGL production (bbl)'
    density_col = 'NGL energy density (mmbtu/bbl)'

    # merge in PADD info
    df = pd.merge(
        ngl_prod_bbl_year,
        state_padd[['PADD']],
        left_index=True, right_index=True,
        how='left'
    )

    # merge in NGL energy density for the corresponding PADD.
    # A left join keeps exactly the producing areas; an outer join would append a
    # phantom row for any PADD that has no producing area.
    df = pd.merge(
        df,
        ngls_padd_en_dens,
        left_on='PADD', right_index=True,
        how='left',
    )

    # production-weighted average density over areas with a known density
    known = df.dropna(subset=[density_col])
    energy_known = known[[volume_col, density_col]].product(axis=1, skipna=False).sum()
    average = energy_known / known[volume_col].sum()

    # For rows with no entry for NGL energy density, fill in the national average.
    # Only the density column is filled: a frame-wide fillna would also overwrite
    # missing production volumes and PADD labels with the density value.
    df[density_col] = df[density_col].fillna(average)

    ngl_energy_density = df[density_col]

    # calculate energy content of NGLs (NaN production stays NaN)
    ngl_prod_mmbtu_year = df[[volume_col, density_col]].product(axis=1, skipna=False)
    ngl_prod_mmbtu_year.name = 'NGL production (mmbtu)'

    return (ngl_energy_density, ngl_prod_mmbtu_year)

In [None]:
def calculate_gas_fraction_of_energy_by_state(
    gas_dry_year_mmcf,
    gas_dry_year_mmbtu,
    ngl_prod_bbl_year,
    ngl_energy_density,
    ngl_prod_mmbtu_year,
    oil_prod_bbl_year,
    oil_en_dens_mmbtu_per_bbl,
):
    """
    Combine dry gas, NGL and oil production by area and compute each fuel's energy share.

    The input series are placed side by side (aligned on area name), aggregate and
    sub-area rows are removed, and the energy of dry gas + NGL + oil ('dgno mmbtu')
    is used as the denominator for the three energy fractions. A short summary for
    the 'U.S.' row is printed in quads.

    Parameters
    ----------
    gas_dry_year_mmcf, gas_dry_year_mmbtu : pandas.Series
        Dry gas production by area, in MMcf and mmbtu.
    ngl_prod_bbl_year, ngl_energy_density, ngl_prod_mmbtu_year : pandas.Series
        NGL volume, energy density and energy by area.
    oil_prod_bbl_year : pandas.Series
        Oil production by area, in bbl.
    oil_en_dens_mmbtu_per_bbl : float
        Energy density applied to every barrel of oil.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        table_en_fract : per-area table sorted by 'dgno mmbtu', largest first.
        dry_gas_en_fract_of_dgno_by_state : the dry gas energy fraction, named
            'DG fract of DGNO'.

    The dry gas energy density column is filled from the module-level
    `dry_gas_en_dens_btu_per_cf`, and the printed header uses the module-level `data_year`.
    """
    quad_per_mmbtu = 1e-9

    oil_prod_mmbtu_year = (
        oil_prod_bbl_year * oil_en_dens_mmbtu_per_bbl
    ).rename('oil production (mmbtu)')

    # merge gas, NGL, oil energy data
    inputs = [
        gas_dry_year_mmcf,
        gas_dry_year_mmbtu,
        ngl_prod_bbl_year,
        ngl_energy_density,
        ngl_prod_mmbtu_year,
        oil_prod_bbl_year,
        oil_prod_mmbtu_year,
    ]
    df = pd.concat(inputs, axis=1)

    # constant densities (the NGL energy density column arrived with the inputs above)
    df['dry gas energy density (mmbtu/MMcf)'] = dry_gas_en_dens_btu_per_cf
    df['oil energy density (mmbtu/bbl)'] = oil_en_dens_mmbtu_per_bbl

    # Rows to leave out: onshore/offshore breakdowns, 'Other States', whole-PADD
    # rows, and the Alaska sub-area rows.
    excluded_labels = [
        'Other States',
        'East Coast (PADD 1)', 'Midwest (PADD 2)', 'Gulf Coast (PADD 3)',
        'Rocky Mountain (PADD 4)', 'West Coast (PADD 5)',
        'Alaska South',
        'Alaska North Slope Crude Oil Production (Thousand Barrels)',
    ]
    is_excluded = (
        df.index.str.contains('Onshore')
        | df.index.str.contains('Offshore')
        | df.index.isin(excluded_labels)
    )
    df = df.loc[~is_excluded]

    # total energy of dry gas + NGL + oil, then each fuel's share of it
    energy_col_by_fract = {
        'dry gas en fract': 'dry gas production (mmbtu)',
        'NGL en fract': 'NGL production (mmbtu)',
        'oil en fract': 'oil production (mmbtu)',
    }
    df['dgno mmbtu'] = df[list(energy_col_by_fract.values())].sum(axis=1)
    for fract_col, energy_col in energy_col_by_fract.items():
        df[fract_col] = df[energy_col] / df['dgno mmbtu']

    df = df.sort_values(by='dgno mmbtu', ascending=False)

    # reorder columns (note: index is states/areas)
    column_order = [
        'dry gas production (MMcf)', 'dry gas energy density (mmbtu/MMcf)', 'dry gas production (mmbtu)',
        'NGL production (bbl)', 'NGL energy density (mmbtu/bbl)', 'NGL production (mmbtu)',
        'oil production (bbl)', 'oil energy density (mmbtu/bbl)', 'oil production (mmbtu)',
        'dgno mmbtu', 'dry gas en fract', 'NGL en fract', 'oil en fract'
    ]
    df = df[column_order]
    table_en_fract = df

    # sanity-check printout for the national row
    us_row = df.loc['U.S.']
    print(f"show values in quads for whole US (incl Alaska) for {data_year}:") # for db
    print(f"DGNO quads: {round(us_row.at['dgno mmbtu'] * quad_per_mmbtu, 1)}")
    table_en_fract_quads = (us_row * quad_per_mmbtu).round(1)
    for label, energy_col in [
        ('dry gas', 'dry gas production (mmbtu)'),
        ('oil', 'oil production (mmbtu)'),
        ('NGL', 'NGL production (mmbtu)'),
    ]:
        print(f"{label} (quads): {table_en_fract_quads.at[energy_col]}")
    print("for comparison, EIA says production for 2018 was (in quads): gas: ~32, oil: ~23, NGL: ~6")
    # source: https://www.eia.gov/todayinenergy/detail.php?id=39392; values eyeballed from graph

    dry_gas_en_fract_of_dgno_by_state = df['dry gas en fract']
    dry_gas_en_fract_of_dgno_by_state.name = 'DG fract of DGNO'

    return (table_en_fract, dry_gas_en_fract_of_dgno_by_state)

In [None]:
def calculate_production_leakage_allocated_to_gas_selected_region(
    state, region,
    sel_gross_gas_yr_mcf, sel_leak_fract,
    leak_by_area
):
    """
    Compute the volume and mass of CH4 leaked from one production region and
    append the result as a new row of `leak_by_area`.

    Parameters
    ----------
    state, region : str
        Labels stored in the new row. `region` also selects the CH4 share; the
        Appalachia sub-regions are mapped onto the two Appalachia CH4-share entries.
    sel_gross_gas_yr_mcf : float
        Gross gas extracted in the region, in Mcf.
    sel_leak_fract : float
        Fraction of gross gas that leaks.
    leak_by_area : pandas.DataFrame
        Accumulator table; it is modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        `leak_by_area` with one more row: state, region, gross gas (Bcf), leak
        fraction, CH4 share, CH4 leakage (Bcf), CH4 leakage (ch4_kg_per_mcf / 1000 per Mcf).

    Reads the module-level `ch4_shares_of_wet_gas` and `ch4_kg_per_mcf`.
    """
    # special handling for Appalachia CH4 fractions
    outside_ne_pa = 'Appalachia region - outside northeast Pennsylvania'
    ch4_share_region_aliases = {
        'Appalachia region - Pennsylvania northeast': 'Appalachia region - northeast Pennsylvania',
        'Appalachia region - Pennsylvania southwest': outside_ne_pa,
        'Appalachia region - Pennsylvania other': outside_ne_pa,
        'Appalachia region - West Virginia': outside_ne_pa,
        'Appalachia region - Ohio': outside_ne_pa,
    }
    region_for_ch4_share = ch4_share_region_aliases.get(region, region)

    # gross gas and leaked natural gas
    sel_gross_gas_yr_bcf = sel_gross_gas_yr_mcf / 1e6
    sel_leak_ng_mcf = sel_gross_gas_yr_mcf * sel_leak_fract

    # methane portion of the leaked gas, by volume and by mass
    ch4_shares_of_gross_gas_region = ch4_shares_of_wet_gas.at[region_for_ch4_share]
    sel_leak_ch4_mcf = sel_leak_ng_mcf * ch4_shares_of_gross_gas_region
    sel_leak_ch4_bcf = sel_leak_ch4_mcf / 1e6
    sel_leak_ch4_ton = sel_leak_ch4_mcf * (ch4_kg_per_mcf/1000)

    # append as the next row of leak_by_area
    leak_by_area.loc[len(leak_by_area)] = [
        state,
        region,
        sel_gross_gas_yr_bcf,
        sel_leak_fract,
        ch4_shares_of_gross_gas_region,
        sel_leak_ch4_bcf,
        sel_leak_ch4_ton,
    ]

    return leak_by_area

In [None]:
def fill_in_docs_table_3_1(
    prod_area,
    gas_wet_yr_mcf,
    methane_leak_fract,
    docs_table_3_1
):
    """
    Append one production area to `docs_table_3_1`.

    The new row holds: the area name, its gross gas production in Bcf, that
    production as a fraction of the module-level `us_cont_gross_prod_bcf`, and
    the leak fraction passed in. `docs_table_3_1` is modified in place and returned.
    """
    production_bcf = gas_wet_yr_mcf / 1e6

    # share of the contiguous-US gross production total
    share_of_cont_us = production_bcf / us_cont_gross_prod_bcf

    next_row = len(docs_table_3_1)
    docs_table_3_1.loc[next_row] = [
        prod_area,
        production_bcf,
        share_of_cont_us,
        methane_leak_fract,
    ]

    return docs_table_3_1

In [None]:
def compile_eia_gross_prod_all_states(eia_gross_prod_excel, data_year):
    """
    Return EIA gross gas withdrawals for every area in `data_year`.

    Sheets 'Data 1' and 'Data 2' of the workbook are indexed by year, placed side
    by side, and their column names reduced to the area name.

    Parameters
    ----------
    eia_gross_prod_excel : path or pandas.ExcelFile
        Workbook passed straight to `pd.read_excel`.
    data_year : int
        Year whose row is returned.

    Returns
    -------
    pandas.Series
        'gross gas production (MMcf)' indexed by area. Values are left in the
        file's units; no unit conversion is applied.
    """
    def _sheet_by_year(sheet_name):
        sheet = pd.read_excel(eia_gross_prod_excel, sheet_name=sheet_name, header=2)
        sheet['year'] = sheet['Date'].dt.year
        return sheet.set_index('year').drop('Date', axis=1)

    combined = pd.concat(
        [_sheet_by_year('Data 1'), _sheet_by_year('Data 2')],
        axis=1,
    )

    # descriptive fragments removed from column names, in this order
    # (the last one handles the differently worded Idaho column)
    fragments = (
        ' (MMcf)',
        ' Natural Gas Gross Withdrawals',
        ' Gross Withdrawals of Natural Gas (Million Cubic Feet)',
    )

    def _area_label(col):
        for fragment in fragments:
            col = col.replace(fragment, '')
        return col

    combined = combined.rename(columns=_area_label)

    # rename Fed GOM so the label matches the other production tables
    combined = combined.rename(columns={'Federal Offshore--Gulf of Mexico': 'GOM (federal)'})

    eia_gross_prod_all_yr_mmcf = combined.loc[data_year]
    eia_gross_prod_all_yr_mmcf.name = 'gross gas production (MMcf)'

    return eia_gross_prod_all_yr_mmcf

In [None]:
def read_eia_gross_production_contiguous_us(eia_gross_prod_excel):
    """
    Return gross gas withdrawals for the U.S. excluding Alaska, in Bcf.

    Reads sheet 'Data 1', takes the U.S. and Alaska columns for the module-level
    `data_year`, and converts the difference from MMcf to Bcf.
    """
    us_column = 'U.S. Natural Gas Gross Withdrawals (MMcf)'
    alaska_column = 'Alaska Natural Gas Gross Withdrawals (MMcf)'

    main_sheet = pd.read_excel(eia_gross_prod_excel, sheet_name='Data 1', header=2)
    by_year = main_sheet.assign(year=main_sheet['Date'].dt.year).set_index('year')

    us_total_mmcf = by_year.at[data_year, us_column]
    alaska_mmcf = by_year.at[data_year, alaska_column]

    # MMcf -> Bcf
    return (us_total_mmcf - alaska_mmcf)/1e3

### functions to read state production data

In [None]:
def get_annual_production_from_eia_shale_by_play(column_name, data_year):
    """
    Sum one play's daily production rates into an annual total for `data_year`.

    Reads the module-level `eia_shale_by_play` table, which must have a datetime
    'Date' column with one row per month and the requested play column.

    Parameters
    ----------
    column_name : str
        Column of `eia_shale_by_play` holding the play's production rate (Bcf/d).
    data_year : int
        Year to total.

    Returns
    -------
    float
        Production in MMcf for `data_year`.
    """
    # select first, then copy, so the new columns are added to an independent frame
    df = eia_shale_by_play[['Date', column_name]].copy()

    # convert from Bcf/d to MMcf/mo, using the real calendar length of each month
    # (a fixed 28-day February would undercount leap years)
    df['days per mo'] = df['Date'].dt.days_in_month
    df['gas production MMcf/mo'] = df[column_name] * df['days per mo'] * 1000

    df['year'] = df['Date'].dt.year
    annual_mmcf = df.groupby('year')['gas production MMcf/mo'].sum()
    play_prod_year_mmcf = annual_mmcf.loc[data_year]

    return play_prod_year_mmcf

In [None]:
def read_pennsylvania_state_data(pa_unconv_year_file, pa_conv_year_file):
    """
    Read the Pennsylvania unconventional and conventional well production workbooks.

    Both file names are appended to the module-level `inputs_path`.

    Returns
    -------
    tuple of pandas.DataFrame
        (pa_year, pa_unconv_year, pa_conv_year): the two sheets stacked with the
        'Total' rows removed, followed by each sheet as read.
    """
    def _read_well_sheet(file_name):
        return pd.read_excel(
            inputs_path + file_name,
            sheet_name='Oil_Gas_Well_Production',
            header=10)

    pa_unconv_year = _read_well_sheet(pa_unconv_year_file)
    pa_conv_year = _read_well_sheet(pa_conv_year_file)

    # stack both well types, then drop the totals rows
    stacked = pd.concat([pa_unconv_year, pa_conv_year], sort=False)
    pa_year = stacked.loc[stacked['Well Permit #'] != 'Total']

    return (pa_year, pa_unconv_year, pa_conv_year)

In [None]:
def read_west_virginia_production_data():
    """
    Read West Virginia well-level oil, gas and NGL production for `data_year`.

    As described in the source workbook's sheet 'DATA DESCRIPTIONS':
    gas (production) units are Mcf
    oil (production) units are bbl
    NGL (production) units are bbl (note: "Only HOR6A horizontal wells are required to report NGL")

    Files exist for 2018 and 2019. For any other `data_year` a warning is printed
    and the 2019 file is used, assuming production is the same.

    Returns
    -------
    pandas.DataFrame
        Columns 'Year', 'API', 'County', 'Well Type', 'Total_Gas', 'Total_NGL', 'Total_Oil'.

    Reads the module-level `data_year` and `inputs_path`.
    """
    # file name, sheet name and header row for each year that has been entered
    wv_layouts = {
        2018: ('West Virginia oil and gas production totals 2018 (2019-06-17).xlsx', 'DATA', 1),
        2019: ('West Virginia oil and gas production totals 2019 (2020-07-31).xlsx', 'Sheet1', 0),
    }

    if data_year in wv_layouts:
        layout_year = data_year
    else:
        print("Error!" + f" Haven't entered WV data for {data_year}")
        print("Use data for prior year, assuming production is the same.")
        # The fallback needs the whole 2019 layout (sheet name and header row too);
        # previously only the file name was set, so the read below raised NameError.
        layout_year = 2019

    wv_file, sel_sheet, header_row = wv_layouts[layout_year]

    # passing the path lets pandas open and close the workbook itself
    df = pd.read_excel(inputs_path + wv_file, sheet_name=sel_sheet, header=header_row)

    # normalise column names that differ between the yearly files
    df = df.rename(columns={'Total_Gas ': 'Total_Gas'})
    if layout_year == 2018:
        df = df.rename(columns={'Well_type': 'Well Type'})

    wv_year = df[['Year', 'API', 'County', 'Well Type', 'Total_Gas', 'Total_NGL', 'Total_Oil']]

    return wv_year

In [None]:
def read_ohio_production_data():
    """
    Read Ohio well production for `data_year`.

    Ohio Department of Natural Resources
    https://ohiodnr.gov/wps/portal/gov/odnr/discover-and-learn/safety-conservation/about-odnr/oil-gas/oil-gas-resources/production
    "Each record has the following fields: production year, county, township, API well number, owner name, well name, well number,
    oil (1 barrel = 42 US gallons), gas (1 MCF = 1,000 cubic feet of gas), brine (1 barrel = 42 US gallons), days in production,
    date completed, and the plugging date (if applicable)."

    The combined workbook (`oh_file`, sheet named after the year) is tried first.
    If that fails, the separate conventional and horizontal workbooks for the year
    are read and stacked.

    Returns
    -------
    pandas.DataFrame or None
        From the combined workbook: the sheet as read, with the quarter column
        renamed to 'QUARTER'. From the split workbooks: columns 'YEAR', 'QUARTER',
        'API WELL NUMBER', 'COUNTY', 'OIL', 'GAS'. None (after printing an error)
        when neither source could be read.

    Reads the module-level `inputs_path`, `oh_file` and `data_year`.
    """
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit are not swallowed.
    try:
        oh_year = pd.read_excel(inputs_path + oh_file, sheet_name=str(data_year), header=0)
        return oh_year.rename(columns={'QUARTER: N/A, 1, 2, 3, 4': 'QUARTER'})
    except Exception:
        # deliberate fallback: there isn't a usable combined data file,
        # so try the files that are split into conventional & unconventional
        pass

    try:
        oh_conv_file = f'Ohio DNR - oil and gas - Conventional Production Annual {data_year}.xls'
        oh_unconv_file = f'Ohio DNR - oil and gas - Horizontal Production All Quarters {data_year}.xls'

        oh_conv = pd.read_excel(inputs_path + oh_conv_file, sheet_name='Sheet1', header=0)
        oh_unconv = pd.read_excel(inputs_path + oh_unconv_file, sheet_name='Sheet1', header=0)

        # the quarter column is titled differently from file to file
        quarters_dict = {
            'QUARTER 1,2,3,4': 'QUARTER',
            'QUARTER: N/A, 1, 2, 3, 4': 'QUARTER',
            'QUARTER n/a,1,2,3,4:': 'QUARTER',
        }
        oh_conv = oh_conv.rename(columns=quarters_dict)
        oh_unconv = oh_unconv.rename(columns=quarters_dict)

        # align the horizontal-well column names with the conventional file
        oh_unconv = oh_unconv.rename(columns={
            'OIL_': 'OIL', 'GAS_': 'GAS', 'BRINE_': 'BRINE',
            'PRODUCTION YEAR': 'YEAR',
        })

        oh_year = pd.concat([oh_conv, oh_unconv], sort=False)

        return oh_year[['YEAR', 'QUARTER', 'API WELL NUMBER', 'COUNTY', 'OIL', 'GAS']]

    except Exception as err:
        print("Error!" + f" Haven't entered OH data for {data_year}")
        # show the real cause (missing file, missing sheet, missing column, ...)
        print(f"(underlying error: {err!r})")
        return None

In [None]:
def read_new_mexico_production_data(new_mexico_file_template):
    """
    Total New Mexico gas and oil production by county for `data_year`.

    New Mexico data from Oil Conservation Division (OCD)
    https://wwwapps.emnrd.state.nm.us/ocd/ocdpermitting//Reporting/Production/CountyProductionInjectionSummary.aspx

    There is one file per county, with data for each month from 1994 onward;
    each year of data repeats the header row.

    Units are not stated in the NM OCD files.
    EIA gross withdrawals for NM in 2018: 1,527,319 MMcf
    sum from the counties below, if units are Mcf: 1,491,957 MMcf
    The values are similar, so units in the NM OCD data must be Mcf.

    Also, pages for individual wells state gas production as Mcf:
    https://wwwapps.emnrd.state.nm.us/ocd/ocdpermitting/Data/WellDetails.aspx?api=30-025-34112

    Parameters
    ----------
    new_mexico_file_template : str
        File name containing the placeholder 'county_name'; it is appended to
        the module-level `inputs_path` after substitution.

    Returns
    -------
    pandas.DataFrame
        One row per county with columns 'County', 'Produced Gas', 'Produced Oil'.
    """
    counties = ['Chaves', 'Eddy', 'Lea', 'McKinley', 'Rio Arriba', 'Roosevelt', 'San Juan', 'Sandoval']

    def _load_county(county):
        file_name = new_mexico_file_template.replace('county_name', county)
        table = pd.read_excel(inputs_path + file_name, header=5)
        # exclude the repeated header rows & drop empty rows
        table = table.loc[table['Year'] != 'Year'].dropna(how='all')
        return table.assign(County=county)

    all_counties = pd.concat([_load_county(county) for county in counties], sort=False)

    in_data_year = all_counties.loc[all_counties['Year'] == data_year]
    new_mexico_year = (
        in_data_year
        .groupby('County')[['Produced Gas', 'Produced Oil']]
        .sum()
        .reset_index()
    )

    return new_mexico_year

In [None]:
def read_colorado_production_data(colorado_file):
    """
    Read Colorado county-level production for `data_year` and clean the pasted values.

    Colorado data from COGIS: https://cogcc.state.co.us/cogis/ProductionSearch.asp
    (can show annual totals by county; results were copied from the webpage into Excel).

    The sheet named after `data_year` is read from `inputs_path + colorado_file`.
    Non-breaking-space residue is stripped from the county names and from the
    three quantity columns, which are then converted to float (blank -> 0).
    """
    colorado = pd.read_excel(inputs_path + colorado_file, sheet_name=str(data_year))

    # clean up county names
    colorado['County'] = colorado['County'].str.replace(' \xa0', '').str.strip()

    # clean up gas & oil data
    quantity_columns = ['Gas Sales (MCF)', 'Gas Production (MCF)', 'Oil Production (barrels)']
    for col in quantity_columns:
        as_text = colorado[col].astype(str).str.replace('\xa0 ', '').str.strip()
        colorado[col] = as_text.replace('', '0').astype(float)

    return colorado

In [None]:
def read_wyoming_production_data():
    """
    Read data from state of Wyoming for oil and gas production, totalled by county.

    Note: 2019 file downloaded from the state website seemed to have a problem. Had to open in Excel and save as xlsx.

    Files exist for 2018 and 2019 only.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'Countyname', 'gas (Mcf)', 'oil (bbl)', one row per county.
        None (after printing a message) when `data_year` has no file or the
        file could not be read.

    Reads the module-level `data_year` and `inputs_path`.
    """
    wyoming_file = '' # initialize
    wyoming_wells = None
    try:
        if data_year == 2018:
            wyoming_file = 'Wyoming Oil and Gas Conservation Commission - oil and gas production 2018.csv'
            wyoming_wells = pd.read_csv(inputs_path + wyoming_file, sep='\t')
        elif data_year == 2019:
            wyoming_file = 'Wyoming Oil and Gas Conservation Commission - oil and gas production 2019.xlsx'
            wyoming_wells = pd.read_excel(inputs_path + wyoming_file)
    except Exception as err:
        # Stop here: previously execution continued with `wyoming_file` set but
        # `wyoming_wells` undefined, which ended in a NameError.
        print("Error!" + f" Wyoming data not entered for {data_year}.")
        print(f"(underlying error: {err!r})")
        return None

    if wyoming_wells is None:
        print(f"Didn't attempt to processing wyoming_file: {wyoming_file}")
        return None

    # appears units are Mcf; value for 2018 is then 1,814,790 MMcf,
    # which is ~5% higher than EIA's value for gross extraction in 2018 of 1,720,897
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    by_county = wyoming_wells.groupby('Countyname')

    # per-county sum of each monthly column, then summed across the twelve months
    wyoming_gas = by_county[[f'{mon}_gas' for mon in months]].sum().sum(axis=1).rename('gas (Mcf)')
    wyoming_oil = by_county[[f'{mon}_oil' for mon in months]].sum().sum(axis=1).rename('oil (bbl)')

    wyoming = pd.concat([wyoming_gas, wyoming_oil], axis=1).reset_index()

    return wyoming

In [None]:
def read_texas_production_data(texas_file):
    """
    Load Texas county-level production for `data_year` and total it by stream.

    Source: Railroad Commission of Texas, statewide query view
    (which doesn't allow for download):
    http://webapps.rrc.texas.gov/PDQ

    Reads the sheet named after `data_year` from `texas_file` (under
    `inputs_path`), adds 'Gas total (Mcf)' (casinghead + gas-well gas) and
    'Oil total (bbl)' (oil + condensate), then drops the four component columns.

    Returns the resulting DataFrame; if `data_year` is not 2018 or 2019,
    prints an error message and returns None.
    """
    # guard: only sheets for these years are expected in the workbook
    if data_year not in [2018, 2019]:
        print('Error!' + f" Data set not entered for Texas for {data_year}")
        return None

    gas_parts = ['Casinghead (MCF)', 'GW Gas (MCF)']
    oil_parts = ['Oil (BBL)', 'Condensate (BBL)']

    texas = pd.read_excel(inputs_path + texas_file, sheet_name=str(data_year))
    texas['Gas total (Mcf)'] = texas[gas_parts].sum(axis=1)
    texas['Oil total (bbl)'] = texas[oil_parts].sum(axis=1)

    # keep only the totals (plus whatever identifying columns the sheet has)
    return texas.drop(columns=gas_parts + oil_parts)

In [None]:
def read_state_padds(state_padd_file):
    """
    Load the sheet that assigns each producing state to a PADD.

    The result is used to assign an NGL energy density by PADD.

    Returns a DataFrame indexed by 'producing state' (with 'Gulf Of Mexico'
    relabelled 'GOM (federal)') whose 'PADD' column holds strings prefixed
    with 'PADD '.
    """
    state_col = 'producing state'
    padd_lookup = pd.read_excel(inputs_path + state_padd_file)

    # relabel the Gulf of Mexico row before it becomes an index label
    padd_lookup[state_col] = padd_lookup[state_col].replace(
        {'Gulf Of Mexico': 'GOM (federal)'})
    padd_lookup = padd_lookup.set_index(state_col)

    # prefix each PADD value with 'PADD '
    padd_lookup['PADD'] = 'PADD ' + padd_lookup['PADD'].astype(str)

    return padd_lookup

In [None]:
def calculate_padd_ngl_energy_density(ngl_en_dens_file, ngl_padd_file_template):
    """
    Calculate the production-weighted energy density of NGLs in each PADD.

    Inputs read:
    * `ngl_en_dens_file` (under `inputs_path`), sheet 'data': energy density
      (million Btu per barrel of liquid) for each 'NGL category'.
    * `ngl_padd_file_template` (under `eia_path`) with '_PADD_NUM_' replaced
      by 1..5, sheet 'Data 1': EIA field production by NGL type, from:

      Natural Gas Plant Field Production
      https://www.eia.gov/dnav/pet/pet_pnp_gp_dc_r10_mbbl_a.htm

    For each PADD, the `data_year` volumes (thousand barrels, converted to
    barrels) are weighted by the matching energy density:
    sum(bbl * mmbtu/bbl) / sum(bbl).

    Only NGL categories that have BOTH a reported volume and an energy density
    enter the average. (The earlier version used an outer merge with
    `product(axis=1, skipna=True)`, which treats a missing factor as 1: a
    category without a volume added its bare energy density to the energy
    total, and a category without an energy density added its bare volume.)
    Categories with a volume but no energy density are reported in a warning.

    Returns a Series named 'NGL energy density (mmbtu/bbl)', indexed by
    'PADD 1' ... 'PADD 5'.
    """
    en_dens_col = 'million Btu per barrel of liquid'
    ngl_en_dens = pd.read_excel(inputs_path + ngl_en_dens_file, sheet_name='data')
    ngl_en_dens = ngl_en_dens.set_index('NGL category')[en_dens_col]

    ngls_padd_en_dens = pd.Series(dtype=float)
    ngls_padd_en_dens.name = 'NGL energy density (mmbtu/bbl)'

    for padd_num in range(1, 5+1):
        ngl_padd_file = ngl_padd_file_template.replace('_PADD_NUM_', str(padd_num))
        ngl_padd = pd.read_excel(eia_path + ngl_padd_file, sheet_name='Data 1', header=2)

        # index the production table by calendar year
        ngl_padd['year'] = ngl_padd['Date'].dt.year
        ngl_padd = ngl_padd.drop('Date', axis=1)
        ngl_padd = ngl_padd.set_index('year')

        # shorten column names to the bare NGL category by stripping the
        # PADD prefix, the 'Field Production of ' lead-in, and the units suffix
        for col in ngl_padd.columns:
            new_col = col
            new_col = new_col.split(f' (PADD {padd_num}) ')[-1]
            new_col = new_col.split('Field Production of ')[-1]
            new_col = new_col.split(' (Thousand Barrels)')[0]
            ngl_padd = ngl_padd.rename(columns={col: new_col})

        ngl_padd = ngl_padd.drop([
            'Gas Plant Production of Pentanes Plus',
            'Gas Plant Production of Liquified Petroleum Gases'
        ], axis=1)

        ngl_padd_year = ngl_padd.loc[data_year]
        ngl_padd_year.index.name = 'NGL category'

        # convert units from kbbl to bbl; also add units to series name
        ngl_padd_year_bbl = ngl_padd_year * 1000
        ngl_padd_year_bbl.name = 'bbl liquids'

        # change 'Natural Gasoline' to 'Pentanes Plus' to match energy density input sheet
        ngl_padd_year_bbl = ngl_padd_year_bbl.rename({'Natural Gasoline': 'Pentanes Plus'})

        # drop total NGLs volume
        ngl_padd_year_bbl = ngl_padd_year_bbl.drop('Natural Gas Liquids')

        # make unmatched categories visible instead of silently mis-weighting them
        reported_categories = ngl_padd_year_bbl.dropna().index
        no_density = reported_categories.difference(ngl_en_dens.dropna().index)
        if len(no_density) > 0:
            print(
                f"Warning: PADD {padd_num}: no energy density for {list(no_density)}; "
                "excluded from weighted average"
            )

        # calculate energy of NGLs, using only categories present in both inputs
        # so that the numerator and denominator cover the same set of categories
        merged = pd.merge(ngl_padd_year_bbl, ngl_en_dens, left_index=True, right_index=True, how='inner')
        merged = merged.dropna(subset=['bbl liquids', en_dens_col])
        merged['mmbtu NGLs'] = merged['bbl liquids'] * merged[en_dens_col]

        # weighted average energy density of NGLs
        ngls_padd_en_dens_mmbtu_per_bbl = merged['mmbtu NGLs'].sum() / merged['bbl liquids'].sum()
        print(f"PADD {padd_num}: {round(ngls_padd_en_dens_mmbtu_per_bbl, 2)} mmbtu/bbl")

        ngls_padd_en_dens.at[f'PADD {padd_num}'] = ngls_padd_en_dens_mmbtu_per_bbl

    return ngls_padd_en_dens

In [None]:
def read_eia_shale_gas_by_play(eia_shale_by_play_file):
    """
    Load EIA shale gas production by play (from Natural Gas Weekly).

    Download location:
    https://www.eia.gov/naturalgas/weekly/img/shale_gas_202009.xlsx
    (Change the date in the URL to download a later release;
    as of 2020-11-10 the latest version was for 2020-09.)

    Units: bcf/day

    Returns the workbook's first sheet as a DataFrame, read from
    `inputs_path` with pandas defaults and no further processing.
    """
    shale_by_play_path = inputs_path + eia_shale_by_play_file
    return pd.read_excel(shale_by_play_path)

## read files

In [None]:
# read the production-area leakage fractions and the methane shares of wet gas
# from the parameters workbook; both are looked up by region label further below
(leak_fractions, ch4_shares_of_wet_gas) = read_leakage_rates_and_methane_share_parameters(parameters_excel)

In [None]:
# read EIA data
# read in EIA data on US gross gas production
# calculate value for contiguous US (total minus Alaska)
# use for calculating share of contiguous US in documentation Table 1
# (the workbook is opened once and handed to both readers)
eia_gross_prod_excel = pd.ExcelFile(eia_path + eia_gross_prod_file)
us_cont_gross_prod_bcf = read_eia_gross_production_contiguous_us(eia_gross_prod_excel)
eia_gross_prod_all_yr_mmcf = compile_eia_gross_prod_all_states(eia_gross_prod_excel, data_year)

# shale gas production by play
eia_shale_by_play = read_eia_shale_gas_by_play(eia_shale_by_play_file)

# EIA DPR: county-to-region assignments and per-region production statistics
(dpr_counties, dpr_basin_prod_stats) = read_dpr_basin_prod_stats(dpr_file)

# dry gas production for the data year, by volume and by energy
(gas_dry_year_mmcf, gas_dry_year_mmbtu) = read_dry_gas_prod(dry_gas_prod_file)

# NGL production (bbl) for the data year
ngl_prod_bbl_year = read_ngl_prod(ngl_prod_file)

# oil production (bbl) for the data year; contiguous US = U.S. total minus Alaska
oil_prod_bbl_year = read_oil_prod(oil_prod_file)
oil_prod_cont_us = oil_prod_bbl_year.at['U.S.'] - oil_prod_bbl_year.at['Alaska']

# state -> PADD lookup, and the NGL energy density for each PADD
state_padd = read_state_padds(state_padd_file)
ngls_padd_en_dens = calculate_padd_ngl_energy_density(ngl_en_dens_file, ngl_padd_file_template)

In [None]:
# read state data
# one reader per state agency data set; the results are used in the
# region-by-region sections below
# Pennsylvania: combined table plus separate unconventional / conventional tables
(pa_year, pa_unconv_year, pa_conv_year) = read_pennsylvania_state_data(
    pa_unconv_year_file, pa_conv_year_file)
wv_year = read_west_virginia_production_data()
oh_year = read_ohio_production_data()
colorado = read_colorado_production_data(colorado_file)
new_mexico_year = read_new_mexico_production_data(new_mexico_file_template)
wyoming = read_wyoming_production_data()
texas = read_texas_production_data(texas_file)

### end of read files

In [None]:
# empty results table for leakage by production area; it is passed into, and
# reassigned from, each calculate_production_leakage_allocated_to_gas_selected_region call below
leak_by_area = pd.DataFrame(
    columns=[
        'state', 'region',
        'gross gas production (Bcf)',
        'leakage fraction', 'CH4 fraction in gross gas',
        'CH4 leaked (Bcf)', 'CH4 leaked (tons)',
    ]
)

In [None]:
# empty summary table for the documentation; it is passed into, and reassigned
# from, each fill_in_docs_table_3_1 call below
docs_table_3_1 = pd.DataFrame(
    columns=[
        'Production area',
        'Gross gas (Bcf)',
        'cont US fract',
        'methane leak fract',
    ]
)

## split energy between gas & other

In [None]:
# NGL energy: combines NGL volumes with the per-PADD energy densities via the state -> PADD lookup
(ngl_energy_density, ngl_prod_mmbtu_year) = calculate_ngl_energy(ngl_prod_bbl_year, ngls_padd_en_dens, state_padd)

# energy split by state between dry gas, NGLs and oil
# NOTE(review): "dgno" in the result name presumably stands for dry gas + NGLs + oil — confirm
(table_en_fract, 
 dry_gas_en_fract_of_dgno_by_state) = calculate_gas_fraction_of_energy_by_state(
    gas_dry_year_mmcf,
    gas_dry_year_mmbtu,
    ngl_prod_bbl_year,
    ngl_energy_density,
    ngl_prod_mmbtu_year,
    oil_prod_bbl_year,
    oil_en_dens_mmbtu_per_bbl)

## Calculate quantity of methane leaked by state
* Take data on gross production (MMcf NG) by producing region/basin
* Separate region/basin by state as needed
* For each producing region/basin, apply the leakage rate from the literature to get quantity of NG leaked
  * E.g., for TX portion of Permian, apply Permian leakage rate of 3.7%
* Get value for methane fraction of NG
  * E.g., gross gas produced in Permian is 80% methane (or whatever the value is from the literature)
* Calculate volume of methane leaked (MMcf)
* Convert to mass (Gg) and energy of methane leaked (MMBtu)
* After doing for each region/basin with measured leakage rates, apply default values for remaining production
  * Default for NG leakage rate: 2.3%?
  * Default for methane fraction in gross NG production: 80%?
* In the end, for each state, have quantities of methane leaked in terms of volume (MMcf), mass (Gg), and energy (MMBtu)

## Calculate energy content of gas, oil, and NGLs by state
* Take data on gross oil and gas/NGL production by state
* Calculate energy content of each stream
  * If the state distinguishes NGLs from gas, as WV does, calculate energy content of each
    * gas: assume energy density of consumer-grade gas, because NGLs are already separated in the data (even though not separated in the hydrocarbon stream)
    * NGL: apply conversion factor for energy density of NGLs produced in the region
  * If the state doesn't distinguish NGLs from gas, have to estimate split between gas and NGLs
    * EIA reports, by state, gross gas production, marketed gas production, and dry gas production 
    * EIA also reports NGL production (both in terms of bbl and gaseous equivalent, based on energy content)
    * Thus can calculate the energy content of gross gas, dry gas (consumer-grade), and NGLs produced
      * Should be that dry gas + NGLs ~= gross gas - reinjection
      * If Prudhoe Bay oil field is representative, gross gas reinjected leads to negligible leakage
    * And can calculate the share of energy from gas (excluding NGLs)
* To calculate energy density of NGLs produced in the region:
  * Use EIA data on NGL production volumes, separated by type of NGL (ethane, propane, etc.), for each PADD & sub-PADD
  * Use EIA values for the energy density of each type of NGL (https://www.eia.gov/dnav/ng/ng_prod_sum_a_EPG0_VG9_mmcf_a.htm)
  * Calculate total energy content of NGLs for each PADD & sub-PADD
  * Calculate average energy density of NGLs for each PADD & sub-PADD (total energy / total volume)

## Appalachia region

In [None]:
# read leakage rates for the two Appalachia sub-regions listed in the parameters:
# northeast PA, and southwest PA & northern WV
# (no averaging happens in this cell; the values are read as-is)
pa_ne_leak_fract = leak_fractions.at['Appalachia region (northeast PA)']
pa_sw_and_wv_leak_fract = leak_fractions.at['Appalachia region (southwest PA & northern WV)']

In [None]:
# assign leakage rate to PA SW
pa_sw_leak_fract = pa_sw_and_wv_leak_fract

# assume other PA production has intermediate leakage rate
# this covers small remaining production, primarily in central and northwest PA
# there are many older, conventional wells in these areas (as with SW PA)
# NOTE(review): the comment above says "intermediate", but the code assigns the
# SW PA & WV rate unchanged — confirm which is intended
pa_other_leak_fract = pa_sw_and_wv_leak_fract

In [None]:
# Appalachia: well-level production from PA, WV & OH is assigned to the
# Appalachia region using the county list from EIA DPR
appalachia = dpr_counties[dpr_counties['Region'] == 'Appalachia Region']

# one county list per state, in the order PA, WV, OH
appalachia_pa_counties, appalachia_wv_counties, appalachia_oh_counties = (
    appalachia.loc[appalachia['State'] == state_abbrev, 'County'].tolist()
    for state_abbrev in ('PA', 'WV', 'OH')
)

### Pennsylvania

In [None]:
# statewide gross gas, then the split by whether the well's county is on the
# DPR Appalachia county list
pa_gross_gas_yr_mcf = pa_year['Gas Quantity (Mcf)'].sum()
pa_gross_gas_yr_appal_mcf = pa_year.loc[
    pa_year['Well County'].isin(appalachia_pa_counties), 'Gas Quantity (Mcf)'
].sum()
pa_gross_gas_yr_nonappal_mcf = pa_year.loc[
    ~pa_year['Well County'].isin(appalachia_pa_counties), 'Gas Quantity (Mcf)'
].sum()

print(f"In Applachia counties (Bcf): {int(pa_gross_gas_yr_appal_mcf/1e6)}")
print(f"Not in Appalachia counties (Bcf): {int(pa_gross_gas_yr_nonappal_mcf/1e6)}")

# Conclusion drawn from the printed values:
# PA Appalachia production is essentially the same as PA production (at least for gas),
# so "other" PA below includes all remaining counties.

In [None]:
# Split Pennsylvania production into northeast, southwest, and 'other'
# (everything not in either county list), within the DPR-defined Appalachia region.

# northeast PA
# note: for 2018, no gas or oil production in Columbia, Lackawanna, Luzerne, Pike, Wayne;
# not showing up in county sums
pa_ne_counties = [
    'BRADFORD', 'LYCOMING', 'SULLIVAN', 'SUSQUEHANNA', 'TIOGA', 'WYOMING',
    'COLUMBIA', 'LACKAWANNA', 'LUZERNE', 'PIKE', 'WAYNE',
]
pa_ne = pa_year[pa_year['Well County'].isin(pa_ne_counties)]
pa_ne_gross_gas_yr_mcf = pa_ne['Gas Quantity (Mcf)'].sum()

# southwest PA
# note: all SW counties listed here had at least some gas production in 2018
pa_sw_counties = [
    'ALLEGHENY', 'ARMSTRONG', 'BEAVER', 'BUTLER', 'CAMBRIA', 'FAYETTE',
    'GREENE', 'INDIANA', 'SOMERSET', 'WASHINGTON', 'WESTMORELAND',
]
pa_sw = pa_year[pa_year['Well County'].isin(pa_sw_counties)]
pa_sw_gross_gas_yr_mcf = pa_sw['Gas Quantity (Mcf)'].sum()

# remainder of the state: statewide total minus the two named sub-regions
pa_other_gross_gas_yr_mcf = pa_gross_gas_yr_mcf - (pa_ne_gross_gas_yr_mcf + pa_sw_gross_gas_yr_mcf)

In [None]:
# add one leak_by_area entry per Pennsylvania sub-region:
# (state, region label, gross gas in Mcf, leakage fraction)

# northeast PA
leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Pennsylvania', 'Appalachia region - Pennsylvania northeast', 
    pa_ne_gross_gas_yr_mcf, pa_ne_leak_fract, 
    leak_by_area)

# southwest PA (uses the shared SW PA & WV leakage fraction directly)
leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Pennsylvania', 'Appalachia region - Pennsylvania southwest', 
    pa_sw_gross_gas_yr_mcf, pa_sw_and_wv_leak_fract, 
    leak_by_area)

# remaining PA production
leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Pennsylvania', 'Appalachia region - Pennsylvania other', 
    pa_other_gross_gas_yr_mcf, pa_other_leak_fract, 
    leak_by_area)

In [None]:
# oil production (oil + condensate), for the same three PA sub-regions
pa_ne_oil_year = pa_ne[['Oil Quantity (Bbl)', 'Condensate Quantity (Bbl)']].sum().sum()
pa_sw_oil_year = pa_sw[['Oil Quantity (Bbl)', 'Condensate Quantity (Bbl)']].sum().sum()
pa_other_oil_year = (
    pa_year[['Oil Quantity (Bbl)', 'Condensate Quantity (Bbl)']].sum().sum()
    - pa_ne_oil_year
    - pa_sw_oil_year
)

# start the per-area oil production series; later sections add more areas to it
oil_prod_year = pd.Series(
    {
        'Appalachia, PA NE': pa_ne_oil_year,
        'Appalachia, PA SW': pa_sw_oil_year,
        'Appalachia, PA other': pa_other_oil_year,
    },
    dtype=float,
)

In [None]:
# FOR REFERENCE ONLY (shows the average; not used below):
# total NG leakage for PA = sum over sub-regions of (gross gas * leakage fraction)
pa_leak_gross_gas_mcf = (
    pa_ne_gross_gas_yr_mcf * pa_ne_leak_fract
    + pa_sw_gross_gas_yr_mcf * pa_sw_and_wv_leak_fract
    + pa_other_gross_gas_yr_mcf * pa_other_leak_fract
)

# volume-weighted average leakage fraction for the whole state
pa_leak_fract_avg = pa_leak_gross_gas_mcf / pa_gross_gas_yr_mcf
print(f"PA avg leak rate: {round(pa_leak_fract_avg*100, 2)}%")

### West Virginia

In [None]:
# WV gas, split by whether the (upper-cased) county is on the DPR Appalachia list
wv_gross_gas_yr_appal_mcf = wv_year.loc[
    wv_year['County'].str.upper().isin(appalachia_wv_counties), 'Total_Gas'
].sum()
wv_gross_gas_yr_nonappal_mcf = wv_year.loc[
    ~wv_year['County'].str.upper().isin(appalachia_wv_counties), 'Total_Gas'
].sum()

print(f"In Applachia counties (Bcf): {wv_gross_gas_yr_appal_mcf/1e6}")
print(f"Not in Appalachia counties (Bcf): {wv_gross_gas_yr_nonappal_mcf/1e6}")

In [None]:
# WV oil, split the same way as gas
wv_oil_yr_appal_bbl = wv_year.loc[
    wv_year['County'].str.upper().isin(appalachia_wv_counties), 'Total_Oil'
].sum()
wv_oil_yr_nonappal_bbl = wv_year.loc[
    ~wv_year['County'].str.upper().isin(appalachia_wv_counties), 'Total_Oil'
].sum()

print(f"In Applachia counties (million bbl): {wv_oil_yr_appal_bbl/1e6}")
print(f"Not in Appalachia counties (million bbl): {wv_oil_yr_nonappal_bbl/1e6}")

In [None]:
# note: all WV production (of oil, NGLs, and gas) is within Appalachia counties defined by EIA
# so values for WV Appalachia are same as for whole state
# WV uses the shared SW PA & WV leakage fraction
leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'West Virginia', 'Appalachia region - West Virginia', 
    wv_gross_gas_yr_appal_mcf, pa_sw_and_wv_leak_fract,
    leak_by_area)

In [None]:
# record WV oil (Appalachia counties only) in the per-area oil series
oil_prod_year.at['Appalachia, WV'] = wv_oil_yr_appal_bbl

### Ohio

In [None]:
# Ohio gas, split by whether the county is on the DPR Appalachia list.
# Units are Mcf, according to the Ohio DNR page:
# https://ohiodnr.gov/wps/portal/gov/odnr/business-and-industry/energy-resources/well-information/production
oh_gross_gas_yr_appal_mcf = oh_year.loc[
    oh_year['COUNTY'].isin(appalachia_oh_counties), 'GAS'
].sum()
oh_gross_gas_yr_nonappal_mcf = oh_year.loc[
    ~oh_year['COUNTY'].isin(appalachia_oh_counties), 'GAS'
].sum()

print(f"In Appalachia counties (Bcf): {int(oh_gross_gas_yr_appal_mcf/1e6)}")
print(f"Not in Appalachia counties (Bcf): {int(oh_gross_gas_yr_nonappal_mcf/1e6)}")

In [None]:
# Ohio oil, inside and outside the Appalachia counties
# (only the Appalachia figure is recorded in oil_prod_year; the other is printed for comparison)
oh_oil_yr_appal_bbl = oh_year.loc[
    oh_year['COUNTY'].isin(appalachia_oh_counties), 'OIL'
].sum()
oh_oil_yr_nonappal_bbl = oh_year.loc[
    ~oh_year['COUNTY'].isin(appalachia_oh_counties), 'OIL'
].sum()

print(f"In Appalachia counties (million bbl): {oh_oil_yr_appal_bbl/1e6}")
print(f"Not in Appalachia counties (million bbl): {oh_oil_yr_nonappal_bbl/1e6}")

In [None]:
# note: there is non-negligible OH oil production outside of Appalachia counties defined by EIA

# calculate leaked volume of NG, 
# for OH, assign leakage rate for SW PA & WV from above
# because both areas have more oil- and NGL-rich wells than NE PA,
# and because OH, like SW PA & WV, has many older conventional wells
oh_appal_leak_fract = pa_sw_and_wv_leak_fract

# only the Appalachia-county gas volume is passed for Ohio
leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Ohio', 'Appalachia region - Ohio', 
    oh_gross_gas_yr_appal_mcf, oh_appal_leak_fract, 
    leak_by_area)

In [None]:
# record Ohio oil (Appalachia counties only) in the per-area oil series
oil_prod_year.at['Appalachia, OH'] = oh_oil_yr_appal_bbl

### put Appalachia results into docs table 1

In [None]:
# documentation table row for northeast PA: label, gross gas (Mcf), leakage fraction
docs_table_3_1 = fill_in_docs_table_3_1(
    'Appalachia region (northeast PA)',
    pa_ne_gross_gas_yr_mcf,
    pa_ne_leak_fract,
    docs_table_3_1
)

In [None]:
# "Appalachia region (other)" = everything in Appalachia except northeast PA:
# SW PA, other PA, WV, and the Appalachia counties of OH

# gross gas for the remainder
app_remainder_gross_gas_mcf = (
    pa_sw_gross_gas_yr_mcf
    + pa_other_gross_gas_yr_mcf
    + wv_gross_gas_yr_appal_mcf
    + oh_gross_gas_yr_appal_mcf
)

# leaked gas for the remainder: each area's volume times its own leakage fraction
app_remainder_leak_gross_gas_mcf = (
    pa_sw_gross_gas_yr_mcf * pa_sw_and_wv_leak_fract
    + pa_other_gross_gas_yr_mcf * pa_other_leak_fract
    + wv_gross_gas_yr_appal_mcf * pa_sw_and_wv_leak_fract
    + oh_gross_gas_yr_appal_mcf * oh_appal_leak_fract
)

# volume-weighted leakage fraction for the remainder
app_remainder_leak_fract = app_remainder_leak_gross_gas_mcf / app_remainder_gross_gas_mcf

docs_table_3_1 = fill_in_docs_table_3_1(
    'Appalachia region (other)',
    app_remainder_gross_gas_mcf,
    app_remainder_leak_fract,
    docs_table_3_1,
)

#### For comparison, calculate Appalachia leakage: method #1 (regional leakage rates)

In [None]:
# weighted average leakage rate for all of Appalachia, weighted by gross gas volume:
# (NE PA leaked gas + remainder leaked gas) / (NE PA gas + remainder gas)
app_leak_rate_weighted_avg = (
    (pa_ne_gross_gas_yr_mcf * pa_ne_leak_fract)
    + (app_remainder_gross_gas_mcf * app_remainder_leak_fract)
) / (pa_ne_gross_gas_yr_mcf + app_remainder_gross_gas_mcf)

print(f"app_leak_rate_weighted_avg: {round(100*app_leak_rate_weighted_avg, 2)}%")

In [None]:
# weighted average leakage rate for all of Appalachia, weighted by CH4 volume:
# each area's gross gas is first scaled by its methane share of wet gas

# CH4 leaked = gross gas * CH4 share * leakage fraction, summed over the two areas
app_ch4_leak_total = (
    pa_ne_gross_gas_yr_mcf
    * ch4_shares_of_wet_gas.at['Appalachia region - northeast Pennsylvania']
    * pa_ne_leak_fract
) + (
    app_remainder_gross_gas_mcf
    * ch4_shares_of_wet_gas.at['Appalachia region - outside northeast Pennsylvania']
    * app_remainder_leak_fract
)

# CH4 produced = gross gas * CH4 share, summed over the two areas
app_ch4_prod_total = (
    pa_ne_gross_gas_yr_mcf
    * ch4_shares_of_wet_gas.at['Appalachia region - northeast Pennsylvania']
) + (
    app_remainder_gross_gas_mcf
    * ch4_shares_of_wet_gas.at['Appalachia region - outside northeast Pennsylvania']
)

print(f"Applachia weighted average leakage rate (based on total CH4 leaked and total CH4 produced): {round(100*app_ch4_leak_total/app_ch4_prod_total, 2)}%")

### Estimate Barkley 2017 leakage rate for all gas wells (conv & unconv) 

In [None]:
# northeast PA gas, separately for unconventional and conventional wells
pa_ne_unconv_mcf = pa_unconv_year.loc[
    pa_unconv_year['Well County'].isin(pa_ne_counties), 'Gas Quantity (Mcf)'
].sum()
pa_ne_conv_mcf = pa_conv_year.loc[
    pa_conv_year['Well County'].isin(pa_ne_counties), 'Gas Quantity (Mcf)'
].sum()

In [None]:
# displayed for reference: share of northeast PA gas that comes from unconventional wells
pa_ne_unconv_mcf/(pa_ne_unconv_mcf + pa_ne_conv_mcf)

In [None]:
# leaked gas in northeast PA by well type, using fixed percentage rates
# unconventional: 0.36% (the Barkley 2017 NE PA mean rate)
pa_ne_unconv_leak_mcf = pa_ne_unconv_mcf * 0.36/100
# conventional: 15%
# NOTE(review): source of 15% not stated here; presumably a rounded value of the
# 14.6% conventional-well rate used for method #2 — confirm
pa_ne_conv_leak_mcf = pa_ne_conv_mcf * 15/100
print(pa_ne_unconv_leak_mcf, pa_ne_conv_leak_mcf)

In [None]:
# displayed for reference: combined northeast PA leakage rate
# (total leaked gas / total gas, conventional + unconventional)
(pa_ne_unconv_leak_mcf + pa_ne_conv_leak_mcf)/(pa_ne_unconv_mcf + pa_ne_conv_mcf)

# Rounding to 2 significant digits, this is the same as the value reported in Barkley 2017
# which fits with Barkley personal communication, that essentially all the leakage is from unconventional wells
# (note that this is not the case when looking at PA as a whole, or Appalachia region as a whole)

#### Calculate Appalachia leakage: method #2 (conventional vs unconventional leakage rates)

In [None]:
# PA publishes separate data sheets for conventional and unconventional wells;
# pa_conv_year & pa_unconv_year were already loaded above

# statewide gas by well type
pa_year_conv_mcf, pa_year_unconv_mcf = (
    wells['Gas Quantity (Mcf)'].sum()
    for wells in (pa_conv_year, pa_unconv_year)
)

# number of distinct wells (by permit number) for each well type
pa_conv_well_count, pa_unconv_well_count = (
    wells['Well Permit #'].nunique(dropna=False)
    for wells in (pa_conv_year, pa_unconv_year)
)

In [None]:
# WV: wells of type 'VERT' or 'COAL' are counted as conventional; all others as unconventional
wv_conv_well_mask = wv_year['Well Type'].isin(['VERT', 'COAL'])

# gas by well type
wv_year_conv_mcf = wv_year.loc[wv_conv_well_mask, 'Total_Gas'].sum()
wv_year_unconv_mcf = wv_year.loc[~wv_conv_well_mask, 'Total_Gas'].sum()

# number of distinct wells (by API number) for each well type
wv_conv_well_count = wv_year.loc[wv_conv_well_mask, 'API'].nunique(dropna=False)
wv_unconv_well_count = wv_year.loc[~wv_conv_well_mask, 'API'].nunique(dropna=False)

In [None]:
# Ohio distinguishes well types through the 'QUARTER' column:
# * unconventional wells report quarterly and carry 1, 2, 3 or 4
# * conventional wells report annually and carry 'N/A', which shows up as missing
oh_conv_row_mask = oh_year['QUARTER'].isna()

# gas by well type
oh_year_conv_mcf = oh_year.loc[oh_conv_row_mask, 'GAS'].sum()
oh_year_unconv_mcf = oh_year.loc[~oh_conv_row_mask, 'GAS'].sum()

# number of distinct wells (by well name) for each well type
oh_conv_well_count = oh_year.loc[oh_conv_row_mask, 'WELL NAME'].nunique(dropna=False)
oh_unconv_well_count = oh_year.loc[~oh_conv_row_mask, 'WELL NAME'].nunique(dropna=False)

In [None]:
# Appalachia well counts (PA + WV + OH), by well type
conv_well_count = pa_conv_well_count + wv_conv_well_count + oh_conv_well_count
unconv_well_count = pa_unconv_well_count + wv_unconv_well_count + oh_unconv_well_count

# share of wells that are unconventional
# (the numerator was previously conv_well_count, so the value printed as
# "% that are unconventional" was actually the conventional share)
unconv_well_count_fract = unconv_well_count / (conv_well_count + unconv_well_count)
print(f"Appalachia wells: % that are unconventional: {round(100*unconv_well_count_fract, 1)}%")

In [None]:
# Method #2: apply one leakage rate to all unconventional production and another
# to all conventional production across PA, WV and OH.

# leakage rates for unconventional wells only, from the literature:
# Omara 2016: mean rate: 0.6% (GIM calculation)
# Barkley 2017 (NE PA): mean rate: 0.36%
# Barkley 2019 (SW PA & WV): mean rate: 0.5%; would be slightly lower if they had used
#   mean rate from Omara 2016 for conventional wells
# Omara 2018: ???
# Caulton 2019: 0.53%
pawvoh_unconv_leak_fract = 0.53/100 # aka 0.53%; based on Caulton 2019
pawvoh_conv_leak_fract = 14.6/100 # aka 14.6%, mean rate calculated from OMARA 2016

# production by well type, and in total
pawvoh_unconv_mcf = pa_year_unconv_mcf + wv_year_unconv_mcf + oh_year_unconv_mcf
pawvoh_conv_mcf = pa_year_conv_mcf + wv_year_conv_mcf + oh_year_conv_mcf
pawvoh_mcf = pawvoh_unconv_mcf + pawvoh_conv_mcf

# leakage by well type, and in total
pawvoh_unconv_leak_mcf = pawvoh_unconv_mcf * pawvoh_unconv_leak_fract
pawvoh_conv_leak_mcf = pawvoh_conv_mcf * pawvoh_conv_leak_fract
pawvoh_leak_mcf = pawvoh_unconv_leak_mcf + pawvoh_conv_leak_mcf

# average leakage rate: total leaked / total produced
pawvoh_avg_leak_method2 = pawvoh_leak_mcf / pawvoh_mcf

In [None]:
# report the method #2 average leakage rate as a percentage
print(f"Appalachia average leakage rate (method #2): {round(pawvoh_avg_leak_method2*100, 2)}%")

In [None]:
# compare conventional wells' share of production with their share of leakage (method #2)
print(f"percent of production from conventional wells: {round(100*pawvoh_conv_mcf/pawvoh_mcf, 2)}%")
print(f"percent of leakage from conventional wells: {round(100*pawvoh_conv_leak_mcf/pawvoh_leak_mcf, 2)}%")

## Permian: Texas & New Mexico

In [None]:
# read leak rate from input file for production area leakage rates
# (a single fraction is used for both the TX and NM portions of the Permian)
permian_leak_fract = leak_fractions.at['Permian region']

In [None]:
# Permian: county lists for TX & NM, taken from the DPR county-to-region table;
# used below to pick out county-level state data and calculate shares
permian = dpr_counties[dpr_counties['Region'] == 'Permian Region']

permian_tx_counties, permian_nm_counties = (
    permian.loc[permian['State'] == state_abbrev, 'County'].tolist()
    for state_abbrev in ('TX', 'NM')
)

In [None]:
# Texas portion of the Permian: total gas in the Permian counties
tx_permian_gross_gas_mcf = texas.loc[
    texas['County'].isin(permian_tx_counties), 'Gas total (Mcf)'
].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Texas', 'Permian region',
    tx_permian_gross_gas_mcf, permian_leak_fract,
    leak_by_area,
)

In [None]:
# New Mexico portion of the Permian: county names are upper-cased to match the DPR list
nm_permian_gross_gas_mcf = new_mexico_year.loc[
    new_mexico_year['County'].str.upper().isin(permian_nm_counties), 'Produced Gas'
].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'New Mexico', 'Permian region',
    nm_permian_gross_gas_mcf, permian_leak_fract,
    leak_by_area,
)

In [None]:
# whole-Permian gross gas = TX portion + NM portion
permian_gross_gas_mcf = tx_permian_gross_gas_mcf + nm_permian_gross_gas_mcf

# documentation table row for the Permian: label, gross gas (Mcf), leakage fraction
docs_table_3_1 = fill_in_docs_table_3_1(
    'Permian region',
    permian_gross_gas_mcf,
    permian_leak_fract,
    docs_table_3_1
)

In [None]:
# oil production in the Permian counties, recorded per state in oil_prod_year

# Texas
tx_permian_oil_bbl = texas.loc[
    texas['County'].isin(permian_tx_counties), 'Oil total (bbl)'
].sum()
oil_prod_year.at['Permian, TX'] = tx_permian_oil_bbl

# New Mexico (county names upper-cased to match the DPR list)
nm_permian_oil_bbl = new_mexico_year.loc[
    new_mexico_year['County'].str.upper().isin(permian_nm_counties), 'Produced Oil'
].sum()
oil_prod_year.at['Permian, NM'] = nm_permian_oil_bbl

In [None]:
# Comparison only; nothing computed in this cell is used later.

# county-level state data (TX + NM) vs the DPR figure for the Permian region
permian_gross_gas_mcf = tx_permian_gross_gas_mcf + nm_permian_gross_gas_mcf
permian_gross_gas_bcf = permian_gross_gas_mcf / 1e6
permian_gross_gas_dpr = dpr_basin_prod_stats.at['Permian region', 'gas production Bcf/y']
permian_pct_diff_vs_dpr = 100*((permian_gross_gas_dpr - permian_gross_gas_bcf)/permian_gross_gas_bcf)
print(f"Permian pct diff county data vs DPR: {round(permian_pct_diff_vs_dpr, 1)}%")

# each state's share of Permian gross gas
permian_tx_fract = tx_permian_gross_gas_mcf / permian_gross_gas_mcf
permian_nm_fract = nm_permian_gross_gas_mcf / permian_gross_gas_mcf

for state_abbrev, state_fract in (('TX', permian_tx_fract), ('NM', permian_nm_fract)):
    print(f"% of Permian from {state_abbrev}: {round(100*state_fract, 1)}%")

### Haynesville region

In [None]:
# get Haynesville general data
# leakage fraction (shared by the TX, AR and LA portions below)
haynesville_leak_fract = leak_fractions.at['Haynesville region']
# rows of the DPR county-to-region table that belong to the Haynesville region
haynesville_dpr_counties = dpr_counties.loc[dpr_counties['Region']=='Haynesville Region']

In [None]:
# Texas portion of Haynesville
haynesville_tx_counties = haynesville_dpr_counties.loc[
    haynesville_dpr_counties['State'] == 'TX', 'County'
].tolist()

# keep the filtered table: its oil column is read again further below
tx_haynes = texas[texas['County'].isin(haynesville_tx_counties)]
tx_haynes_gross_gas_mcf = tx_haynes['Gas total (Mcf)'].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Texas', 'Haynesville region',
    tx_haynes_gross_gas_mcf, haynesville_leak_fract,
    leak_by_area,
)

In [None]:
# Arkansas portion of Haynesville: one input workbook per county
haynesville_ar_counties = haynesville_dpr_counties.loc[
    haynesville_dpr_counties['State'] == 'AR', 'County'
].tolist()

# per-county totals, keyed by title-cased county name (as used in the file names)
county_ser = pd.Series(dtype=float)
for county in map(str.title, haynesville_ar_counties):
    arkansas_file_name = (
        arkansas_file_name_template
        .replace('DATA_YEAR', str(data_year))
        .replace('COUNTY', county)
    )
    df = pd.read_excel(inputs_path + arkansas_file_name, header=1)

    # a workbook with this column header holds no data; leave that county out
    if 'No data records available' not in df.columns:
        total = df.loc[df['Month'] == 'Total:', 'Prod Total'].sum()
        county_ser.at[county] = total

ar_haynes_gross_gas_mcf = county_ser.sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Arkansas', 'Haynesville region',
    ar_haynes_gross_gas_mcf, haynesville_leak_fract,
    leak_by_area,
)

In [None]:
# Louisiana portion of Haynesville (the DPR 'County' entries are matched against 'Parish')
haynesville_la_counties = haynesville_dpr_counties.loc[
    haynesville_dpr_counties['State'] == 'LA', 'County'
].tolist()

la_data = pd.read_excel(inputs_path + la_file)

# keep the filtered table: its oil column is read again further below
la_haynes = la_data[la_data['Parish'].isin(haynesville_la_counties)]
la_haynes_gross_gas_mcf = la_haynes['Gas Total'].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Louisiana', 'Haynesville region',
    la_haynes_gross_gas_mcf, haynesville_leak_fract,
    leak_by_area,
)

In [None]:
# whole-Haynesville gross gas = LA + TX + AR portions
haynesville_gross_gas_mcf = (
    la_haynes_gross_gas_mcf
    + tx_haynes_gross_gas_mcf
    + ar_haynes_gross_gas_mcf
)

# documentation table row for Haynesville: label, gross gas (Mcf), leakage fraction
docs_table_3_1 = fill_in_docs_table_3_1(
    'Haynesville region',
    haynesville_gross_gas_mcf,
    haynesville_leak_fract,
    docs_table_3_1,
)

In [None]:
# compare against EIA DPR
haynesville_year_dpr_bcf = dpr_basin_prod_stats.at['Haynesville region', 'gas production Bcf/y']
print(f"Haynesville Bcf, EIA:    {int(haynesville_year_dpr_bcf)}")
# NOTE(review): the label below says "LA DNR", but haynesville_gross_gas_mcf is the
# sum of the LA, TX and AR portions — confirm the label is still what is intended
print(f"Haynesville Bcf, LA DNR: {int(haynesville_gross_gas_mcf/1e6)}")

In [None]:
# for reference
# each state's share of Haynesville gross gas (printed in the order LA, TX, AR)
haynesville_la_fract = la_haynes_gross_gas_mcf / haynesville_gross_gas_mcf
haynesville_tx_fract = tx_haynes_gross_gas_mcf / haynesville_gross_gas_mcf
haynesville_ar_fract = ar_haynes_gross_gas_mcf / haynesville_gross_gas_mcf

print(haynesville_la_fract, haynesville_tx_fract, haynesville_ar_fract)

In [None]:
# oil production in the Haynesville counties/parishes, recorded per state

# Louisiana portion
oil_prod_year.at['Haynesville, LA'] = la_haynes['Oil Total'].sum()

# Texas portion gets its own key, following the '<region>, <state>' pattern used
# for the other regions. (It previously reused 'Haynesville, LA', which overwrote
# Louisiana's value with Texas's and left Texas without an entry.)
oil_prod_year.at['Haynesville, TX'] = tx_haynes['Oil total (bbl)'].sum()

### Anadarko region

In [None]:
# leakage fraction for the Anadarko region (used for both the TX and OK portions)
anadarko_leak_fract = leak_fractions.at['Anadarko region']

In [None]:
# Anadarko: county-level data is available for TX only, so the state split is
# derived from the Texas data and the DPR regional total.
# (Couldn't find data from OK, other than well-level data, which the Oklahoma
# Geological Survey warns may be incomplete.)
anadarko_dpr_counties = dpr_counties[dpr_counties['Region'] == 'Anadarko Region']
anadarko_tx_counties = anadarko_dpr_counties.loc[
    anadarko_dpr_counties['State'] == 'TX', 'County'
].tolist()

# Texas portion: total gas in the Anadarko counties
tx_anadarko_gross_gas_mcf = texas.loc[
    texas['County'].isin(anadarko_tx_counties), 'Gas total (Mcf)'
].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Texas', 'Anadarko region',
    tx_anadarko_gross_gas_mcf, anadarko_leak_fract,
    leak_by_area,
)

In [None]:
# No data directly from Oklahoma: treat OK as whatever is left of the DPR
# regional total after subtracting the Texas share.
anadarko_gross_gas_bcf_dpr = dpr_basin_prod_stats.at['Anadarko region', 'gas production Bcf/y']
tx_anadarko_gross_gas_bcf = tx_anadarko_gross_gas_mcf/1e6

# state shares of the DPR regional total
anadarko_tx_gas_fract = tx_anadarko_gross_gas_bcf / anadarko_gross_gas_bcf_dpr
anadarko_ok_gas_fract = 1 - anadarko_tx_gas_fract

# OK volume: share of the DPR total, converted from Bcf to Mcf
ok_anadarko_gross_gas_mcf = anadarko_ok_gas_fract * anadarko_gross_gas_bcf_dpr * 1e6

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Oklahoma', 'Anadarko region',
    ok_anadarko_gross_gas_mcf, anadarko_leak_fract,
    leak_by_area,
)

In [None]:
# whole-Anadarko gross gas = TX portion (state data) + OK portion (inferred from DPR)
anadarko_gross_gas_mcf = tx_anadarko_gross_gas_mcf + ok_anadarko_gross_gas_mcf

# documentation table row for Anadarko: label, gross gas (Mcf), leakage fraction
docs_table_3_1 = fill_in_docs_table_3_1(
    'Anadarko region',
    anadarko_gross_gas_mcf,
    anadarko_leak_fract,
    docs_table_3_1
)

In [None]:
# for reference: TX and OK shares of Anadarko gross gas
print(anadarko_tx_gas_fract, anadarko_ok_gas_fract)

In [None]:
# oil production in the Anadarko region, recorded per state in oil_prod_year

# Texas portion: from county-level state data
tx_anadarko_oil_bbl = texas.loc[
    texas['County'].isin(anadarko_tx_counties), 'Oil total (bbl)'
].sum()
oil_prod_year.at['Anadarko, TX'] = tx_anadarko_oil_bbl

# Oklahoma portion: no data directly from Oklahoma, so it is inferred as the
# remainder of the DPR regional total after subtracting the Texas share
anadarko_oil_bbl_dpr = dpr_basin_prod_stats.at['Anadarko region', 'oil production bbl/y']
anadarko_tx_oil_fract = tx_anadarko_oil_bbl / anadarko_oil_bbl_dpr
anadarko_ok_oil_fract = 1 - anadarko_tx_oil_fract

print(anadarko_tx_oil_fract, anadarko_ok_oil_fract )

anadarko_ok_oil_bbl = anadarko_ok_oil_fract * anadarko_oil_bbl_dpr
oil_prod_year.at['Anadarko, OK'] = anadarko_ok_oil_bbl

### Eagle Ford

In [None]:
# Eagle Ford is split into a west and an east part, each with its own leakage
# fraction; the county split is based on the map in Peischl 2018 showing flight paths.
eagle_ford_west_dpr_counties = [
    'DIMMIT', 'FRIO', 'LA SALLE', 'MAVERICK', 'WEBB', 'ZAVALA',
]
eagle_ford_east_dpr_counties = [
    'ATASCOSA', 'LIVE OAK', 'MCMULLEN',
    'BASTROP', 'BEE', 'BRAZOS', 'BURLESON', 'DEWITT', 'FAYETTE', 'GONZALES',
    'KARNES', 'LAVACA', 'LEE', 'LEON', 'MADISON', 'MILAM', 'WILSON',
]

# --- west ---
eagle_ford_west_leak_fract = leak_fractions.at['Eagle Ford region west']
# keep the filtered table: its oil column is read again further below
eagle_ford_west = texas[texas['County'].isin(eagle_ford_west_dpr_counties)]
eagle_ford_west_gross_gas_mcf = eagle_ford_west['Gas total (Mcf)'].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Texas', 'Eagle Ford region west',
    eagle_ford_west_gross_gas_mcf, eagle_ford_west_leak_fract,
    leak_by_area,
)
docs_table_3_1 = fill_in_docs_table_3_1(
    'Eagle Ford region west',
    eagle_ford_west_gross_gas_mcf, eagle_ford_west_leak_fract,
    docs_table_3_1,
)

# --- east ---
eagle_ford_east_leak_fract = leak_fractions.at['Eagle Ford region east']
# keep the filtered table: its oil column is read again further below
eagle_ford_east = texas[texas['County'].isin(eagle_ford_east_dpr_counties)]
eagle_ford_east_gross_gas_mcf = eagle_ford_east['Gas total (Mcf)'].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Texas', 'Eagle Ford region east',
    eagle_ford_east_gross_gas_mcf, eagle_ford_east_leak_fract,
    leak_by_area,
)
docs_table_3_1 = fill_in_docs_table_3_1(
    'Eagle Ford region east',
    eagle_ford_east_gross_gas_mcf, eagle_ford_east_leak_fract,
    docs_table_3_1,
)

In [None]:
print(f"Eagle Ford west Bcf: {round(eagle_ford_west_gross_gas_mcf/1e6, 1)}")
print(f"Eagle Ford east Bcf: {round(eagle_ford_east_gross_gas_mcf/1e6, 1)}")

In [None]:
# comparison: get production from EIA DPR
eagle_ford_gross_gas_mcf_dpr = dpr_basin_prod_stats.at['Eagle Ford region', 'gas production Bcf/y'] * 1e6

# note: EIA DPR value for Eagle Ford gas is substantially higher (~10%) than state data (RRC)
# the state data is for total gas (gas well gas + casinghead gas)
# the same method for Permian found much closer agreement between RRC & DPR data
# so it's unclear what the reason for the discrepancy would be; 
# seems unlikely to be significantly more late reporting in Eagle Ford than Permian
txrrc_eagle_ford = eagle_ford_west_gross_gas_mcf + eagle_ford_east_gross_gas_mcf
print(f"Texas RRC: {int(txrrc_eagle_ford/1e6)} Bcf")
print(f"EIA DPR:   {int(eagle_ford_gross_gas_mcf_dpr/1e6)} Bcf")

In [None]:
# calculate oil production
oil_prod_year.at['Eagle Ford west, TX'] = eagle_ford_west['Oil total (bbl)'].sum()
oil_prod_year.at['Eagle Ford east, TX'] = eagle_ford_east['Oil total (bbl)'].sum()

### Greater Green River basin
(within EIA's Niobrara region)

In [None]:
# counties within basin based on USGS map:
# USGS Southwestern Wyoming Province Assessment Team
# Petroleum Systems and Geologic Assessment of Oil and Gas in the Southwestern Wyoming Province, Wyoming, Colorado, and Utah; Chapter 1
# https://pubs.usgs.gov/dds/dds-069/dds-069-d/reports.html
# & 
# The Greater Green River Basin
# https://www.usgs.gov/media/images/greater-green-river-basin

green_river_wy_counties = ['Lincoln', 'Sublette', 'Sweetwater', 'Uinta']
green_river_wy = wyoming.loc[wyoming['Countyname'].str.title().isin(green_river_wy_counties)]
green_river_wy_raw_gas_mcf = green_river_wy['gas (Mcf)'].sum()

green_river_co_counties = ['Moffat', 'Routt']
green_river_co = colorado.loc[colorado['County'].str.title().isin(green_river_co_counties)]
green_river_co_raw_gas_mcf = green_river_co['Gas Production (MCF)'].sum()

green_river_raw_gas_mcf = green_river_wy_raw_gas_mcf + green_river_co_raw_gas_mcf
green_river_wy_fract = green_river_wy_raw_gas_mcf/green_river_raw_gas_mcf
green_river_co_fract = green_river_co_raw_gas_mcf/green_river_raw_gas_mcf
print(f"Green River gas production: WY: {round(100*green_river_wy_fract, 1)}%, CO: {round(100*green_river_co_fract, 1)}%")

In [None]:
greater_green_river_leakage_rate = leak_fractions.at['Greater Green River region']

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Wyoming', 'Greater Green River region', 
    green_river_wy_raw_gas_mcf, greater_green_river_leakage_rate, 
    leak_by_area
)
leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Colorado', 'Greater Green River region', 
    green_river_co_raw_gas_mcf, greater_green_river_leakage_rate, 
    leak_by_area
)
docs_table_3_1 = fill_in_docs_table_3_1(
    'Greater Green River region',
    green_river_raw_gas_mcf, greater_green_river_leakage_rate,
    docs_table_3_1
)

In [None]:
# calculate oil production
oil_prod_year.at['Green River, WY'] = green_river_wy['oil (bbl)'].sum()
oil_prod_year.at['Green River, CO'] = green_river_co['Oil Production (barrels)'].sum()

### San Juan basin

In [None]:
# split between Colorado & New Mexico

# based on "The San Juan Basin, a Complex Giant Gas Field, New Mexico and Colorado,"
# James E. Fassett, Search and Discovery Article #10254 (2013)
san_juan_nm_counties = ['McKinley', 'Rio Arriba', 'San Juan', 'Sandoval']
san_juan_co_counties = ['Montezuma', 'La Plata', 'Archuleta']

# make upper case
san_juan_nm_counties = [x.upper() for x in san_juan_nm_counties]
san_juan_co_counties = [x.upper() for x in san_juan_co_counties]

san_juan_co = colorado.loc[colorado['County'].isin(san_juan_co_counties)]
san_juan_co_raw_gas_mcf = san_juan_co['Gas Production (MCF)'].sum()

san_juan_nm = new_mexico_year.loc[new_mexico_year['County'].str.upper().isin(san_juan_nm_counties)]
san_juan_nm_raw_gas_mcf = san_juan_nm['Produced Gas'].sum()

san_juan_gross_gas_mcf = san_juan_nm_raw_gas_mcf + san_juan_co_raw_gas_mcf

san_juan_nm_fract = san_juan_nm_raw_gas_mcf / san_juan_gross_gas_mcf
san_juan_co_fract = san_juan_co_raw_gas_mcf / san_juan_gross_gas_mcf

print(f"San Juan gas production: NM: {round(100*san_juan_nm_fract, 1)}%, CO: {round(100*san_juan_co_fract, 1)}%")

In [None]:
san_juan_co_raw_gas_mcf, san_juan_nm_raw_gas_mcf

In [None]:
san_juan_leak_fract = leak_fractions.at['San Juan region']

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'New Mexico', 'San Juan region', 
    san_juan_nm_raw_gas_mcf, san_juan_leak_fract, 
    leak_by_area)

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Colorado', 'San Juan region', 
    san_juan_co_raw_gas_mcf, san_juan_leak_fract, 
    leak_by_area)

docs_table_3_1 = fill_in_docs_table_3_1(
    'San Juan region',
    san_juan_gross_gas_mcf,
    san_juan_leak_fract,
    docs_table_3_1
)

In [None]:
# calculate oil production
oil_prod_year.at['San Juan, NM'] = san_juan_nm['Produced Oil'].sum()
oil_prod_year.at['San Juan, CO'] = san_juan_co['Oil Production (barrels)'].sum()

### Barnett region

In [None]:
barnett_leak_fract = leak_fractions.at['Barnett region']

# based on maps from Texas Commission on Environmental Quality
# https://www.tceq.texas.gov/airquality/barnettshale/bshale-maps
barnett_counties = [
    'Montague', 'Cooke', 'Jack', 'Wise', 'Denton',
    'Palo Pinto', 'Parker', 'Tarrant', 'Dallas', 'Erath',
    'Hood', 'Somervell', 'Johnson', 'Ellis', 'Hill'
]

# get production from state data
barnett_tx = texas.loc[texas['County'].str.title().isin(barnett_counties)]
barnett_gross_gas_mcf = barnett_tx['Gas total (Mcf)'].sum()

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Texas', 'Barnett region', 
    barnett_gross_gas_mcf, barnett_leak_fract, 
    leak_by_area)

docs_table_3_1 = fill_in_docs_table_3_1(
    'Barnett region',
    barnett_gross_gas_mcf,
    barnett_leak_fract,
    docs_table_3_1
)

In [None]:
# for comparison against EIA data for gas production by play
# (but EIA's value from this sheet is dry gas production)
barnett_dry_gas_yr_mcf_eia = get_annual_production_from_eia_shale_by_play('Barnett (TX)', data_year) * 1000

barnett_gross_gas_mcf/1e6, barnett_dry_gas_yr_mcf_eia/1e6

In [None]:
oil_prod_year.at['Barnett, TX'] = barnett_tx['Oil total (bbl)'].sum()

### Offshore GOM (state & federal)
* split between Texas, Alabama, Louisiana, and federal
* EIA data has units MMcf

In [None]:
# get production rates
gom_fed_gross_gas_yr_mcf = eia_gross_prod_all_yr_mmcf.at['GOM (federal)'] * 1000
gom_la_gross_gas_yr_mcf = eia_gross_prod_all_yr_mmcf.at['Louisiana--State Offshore'] * 1000
gom_tx_gross_gas_yr_mcf = eia_gross_prod_all_yr_mmcf.at['Texas--State Offshore'] * 1000
gom_al_gross_gas_yr_mcf = eia_gross_prod_all_yr_mmcf.at['Alabama--State Offshore'] * 1000

gom_year_total_mcf = sum([
    gom_fed_gross_gas_yr_mcf,
    gom_la_gross_gas_yr_mcf,
    gom_tx_gross_gas_yr_mcf,
    gom_al_gross_gas_yr_mcf
])

In [None]:
# show:
# for state waters, fraction from Alabama:
gom_al_gross_gas_yr_mcf/sum([gom_la_gross_gas_yr_mcf, gom_tx_gross_gas_yr_mcf, gom_al_gross_gas_yr_mcf])

In [None]:
# view:
# shows nearly all the gas production is from federal waters
gom_fed_fract = gom_fed_gross_gas_yr_mcf/gom_year_total_mcf
gom_la_fract = gom_la_gross_gas_yr_mcf/gom_year_total_mcf
gom_tx_fract = gom_tx_gross_gas_yr_mcf/gom_year_total_mcf
gom_al_fract = gom_al_gross_gas_yr_mcf/gom_year_total_mcf

print(round(gom_fed_fract, 3), 
      round(gom_la_fract, 3),
      round(gom_tx_fract, 3),
      round(gom_al_fract, 3))

In [None]:
gom_leak_fract = leak_fractions.at['Offshore Gulf of Mexico (state & federal waters)']

# One entry per jurisdiction: federal waters first, then Louisiana, Texas
# and Alabama state waters. All of them use gom_leak_fract.
gom_gross_gas_by_jurisdiction_mcf = [
    ('GOM (federal)', gom_fed_gross_gas_yr_mcf),
    ('Louisiana', gom_la_gross_gas_yr_mcf),
    ('Texas', gom_tx_gross_gas_yr_mcf),
    ('Alabama', gom_al_gross_gas_yr_mcf),
]

for gom_jurisdiction, gom_jurisdiction_gas_mcf in gom_gross_gas_by_jurisdiction_mcf:
    leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
        gom_jurisdiction, 'Offshore Gulf of Mexico',
        gom_jurisdiction_gas_mcf, gom_leak_fract,
        leak_by_area
    )

In [None]:
# for documentation, group all of GOM (state & federal) together
docs_table_3_1 = fill_in_docs_table_3_1(
    'Offshore Gulf of Mexico (state & federal)',
    gom_year_total_mcf,
    gom_leak_fract,
    docs_table_3_1
)

In [None]:
oil_prod_year.at['GOM federal waters'] = oil_prod_bbl_year.at['GOM (federal)']

# Estimate oil from state waters, assuming state waters supply the same share
# of total GOM oil as they do of total GOM gas.
# With s = 1 - gom_fed_fract (state share of the GOM total):
#   state_oil / (state_oil + fed_oil) = s   =>   state_oil = fed_oil * s / (1 - s)
# Multiplying fed_oil by s alone would treat the federal volume as the GOM total
# and understate the state-waters volume by a factor of gom_fed_fract.
gom_state_fract = 1 - gom_fed_fract
oil_prod_year.at['GOM state waters'] = (
    oil_prod_bbl_year.at['GOM (federal)'] * gom_state_fract / gom_fed_fract
)

In [None]:
# show fraction covered so far
oil_prod_year.sum()/oil_prod_cont_us

### Denver-Julesburg region
(within Niobrara region defined by EIA DPR)

In [None]:
# Colorado production from counties within Denver-Julesburg basin

# https://www.usgs.gov/centers/cersc/science/denver-basin-oil-and-gas-assessments
dj_counties_co = [
    'Weld', 'Yuma', 'Larimer', 'Adams', 
    'Arapahoe', 'Boulder', 'Washington', 'Phillips', 'Logan', 
    'Morgan', 'Kit Carson', 'Sedgwick', 
    'Jefferson', # no production?
]

# counties I thought might be part of DJ, but aren't in EIA's Niobrara region:
# 'Denver', 'Lincoln', 'Elbert', 'El Paso', 'Douglas'

# Colorado part of the basin: filter to the DJ counties once, then sum gas and oil
colorado_dj = colorado.loc[colorado['County'].str.title().isin(dj_counties_co)]
dj_co_raw_gas_mcf = colorado_dj['Gas Production (MCF)'].sum()
dj_co_oil_bbl = colorado_dj['Oil Production (barrels)'].sum()

# note: Denver-Julesburg production in Colorado is 57% of gas produced in EIA DPR's Niobrara region within Colorado production
# EIA DPR's Niobrara Region also includes Colorado's Piceance basin

In [None]:
# Wyoming production from counties within Denver-Julesburg basin
# note that Wyoming's share of production from this basin is ~10% of that of Colorado's share, both for oil & gas

# https://www.usgs.gov/centers/cersc/science/denver-basin-oil-and-gas-assessments
dj_counties_wy = [
    'Platte', 'Goshen', 'Laramie',
    'Albany', 'Carbon',
]

# Wyoming part of the basin: filter to the DJ counties once, then sum gas and oil
wyoming_dj = wyoming.loc[wyoming['Countyname'].str.title().isin(dj_counties_wy)]
dj_wy_gross_gas_mcf = wyoming_dj['gas (Mcf)'].sum()
dj_wy_oil_bbl = wyoming_dj['oil (bbl)'].sum()

In [None]:
wyoming['gas (Mcf)'].sum()

In [None]:
# from Nebraska Energy Office, "Natural Gas Production by County in Nebraska"
# https://neo.ne.gov/programs/stats/inf/31.html
# all of Nebraska's gas production is within Denver-Julesburg basin
# however, it is small relative to production from CO & WY
dj_ne_gas_mcf_2016 = 526420

print(f"Percent of DJ gas from Nebraska (approx.): {round(100*dj_ne_gas_mcf_2016/(dj_ne_gas_mcf_2016 + dj_co_raw_gas_mcf + dj_wy_gross_gas_mcf),2)}%")

In [None]:
# given that the production is so small, and that there isn't recent data (latest year is 2016), 
# then GIM excludes the Nebraska part of DJ
# that production would fall into the category of miscellaneous production assigned the default leakage rate

In [None]:
# dj_total_gas_mcf = dj_co_gas_mcf + dj_wy_gas_mcf
# dj_total_gas_boe = dj_co_gas_boe + dj_wy_gas_boe
# dj_total_oil_bbl = dj_co_oil_bbl + dj_wy_oil_bbl

# dj_total_gas_fract = dj_total_gas_boe/(dj_total_gas_boe + dj_total_oil_bbl)

In [None]:
# gim_region_prod_stats.at['Denver-Julesburg basin', 'gas production Bcf/y'] = dj_total_gas_mcf/1e6
# gim_region_prod_stats.at['Denver-Julesburg basin', 'gas fract of energy'] = dj_total_gas_fract

In [None]:
dj_leak_fract = leak_fractions.at['Denver-Julesburg region']
dj_gross_gas_mcf = dj_co_raw_gas_mcf + dj_wy_gross_gas_mcf

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Colorado', 'Denver-Julesburg region', 
    dj_co_raw_gas_mcf, dj_leak_fract, 
    leak_by_area)

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Wyoming', 'Denver-Julesburg region', 
    dj_wy_gross_gas_mcf, dj_leak_fract, 
    leak_by_area)

docs_table_3_1 = fill_in_docs_table_3_1(
    'Denver-Julesburg region',
    dj_gross_gas_mcf,
    dj_leak_fract,
    docs_table_3_1
)

In [None]:
oil_prod_year.at['Denver-Julesburg, CO'] = dj_co_oil_bbl
oil_prod_year.at['Denver-Julesburg, WY'] = dj_wy_oil_bbl
# assume Nebraska's DJ oil production is negligible
print(f"billion bbl oil/y added so far to oil_prod_year: {round(oil_prod_year.sum()/1e9, 2)}")

In [None]:
# show fraction covered so far
oil_prod_year.sum()/oil_prod_cont_us

### Bakken region

In [None]:
# split between North Dakota and Montana

# North Dakota Department of Mineral Resources data page:
# https://www.dmr.nd.gov/oilgas/stats/statisticsvw.asp
# downloaded "Monthly gas production totals by county" (pdf); units are Mcf
# notes that confidential wells are excluded; may be an issue for data < 1 year old
# extracted data using Tabula

bakken = dpr_counties.loc[dpr_counties['Region']=='Bakken Region']
bakken_nd_counties = bakken.loc[bakken['State']=='ND']['County'].tolist()
bakken_mt_counties = bakken.loc[bakken['State']=='MT']['County'].tolist()

# Read the North Dakota gas table and collapse it to one row per calendar year
df = pd.read_excel(inputs_path + nd_gas_file)
df = (
    df.assign(year=df['Date'].dt.year)
    .drop(columns='Date')
    .groupby('year')
    .sum()
)

# Row for the data year; its labels (the table's remaining columns) are the counties
ser = df.loc[data_year]
ser.index.name = 'County'
north_dakota_year = ser.rename('gas Mcf').reset_index()

# Keep only the counties that EIA DPR places in the Bakken region
in_bakken_nd = north_dakota_year['County'].str.upper().isin(bakken_nd_counties)
bakken_nd = north_dakota_year.loc[in_bakken_nd]
bakken_nd_gross_gas_mcf = bakken_nd['gas Mcf'].sum()

In [None]:
bakken_nd_gross_gas_mcf

In [None]:
# Read the North Dakota oil table and collapse it to one row per calendar year
df = pd.read_excel(inputs_path + nd_oil_file)
df = (
    df.assign(year=df['Date'].dt.year)
    .drop(columns='Date')
    .groupby('year')
    .sum()
)

# Row for the data year; its labels (the table's remaining columns) are the counties
ser = df.loc[data_year]
ser.index.name = 'County'
north_dakota_year = ser.rename('oil bbl').reset_index()

# Keep only the counties that EIA DPR places in the Bakken region
in_bakken_nd = north_dakota_year['County'].str.upper().isin(bakken_nd_counties)
bakken_nd = north_dakota_year.loc[in_bakken_nd]
bakken_nd_oil_bbl = bakken_nd['oil bbl'].sum()

In [None]:
# Montana Board of Oil & Gas Conservation
# Annual Production by County
# http://www.bogc.dnrc.mt.gov/WebApps/DataMiner/Production/ProdAnnualCounty.aspx
# downloaded only the counties listed by EIA DPR as being within Bakken region
# (Dawson, McCone, Richland, Roosevelt, Sheridan)

montana_bakken = pd.read_excel(inputs_path + montana_bakken_file)

# Rows for the data year only; copied so the added column lives on an independent frame
is_data_year = montana_bakken['Year'] == data_year
montana_bakken_year = montana_bakken.loc[is_data_year].copy()

# Total gas = 'Nat. Gas' + 'Assoc. Gas' (row-wise sum, which skips missing values)
gas_columns = ['Nat. Gas', 'Assoc. Gas']
montana_bakken_year['Total Gas'] = montana_bakken_year[gas_columns].sum(axis=1)

bakken_mt_gross_gas_mcf = montana_bakken_year['Total Gas'].sum()
bakken_mt_oil_bbl = montana_bakken_year['Oil'].sum()

In [None]:
bakken_gross_gas_mcf = bakken_nd_gross_gas_mcf + bakken_mt_gross_gas_mcf
bakken_gross_gas_bcf = bakken_gross_gas_mcf/1e6
bakken_year_dpr = dpr_basin_prod_stats.at['Bakken region', 'gas production Bcf/y']
print(f"Bakken production state data: {round(bakken_gross_gas_bcf, 1)} Bcf/y")
print(f"Bakken production EIA DPR:    {round(bakken_year_dpr, 1)} Bcf/y")

In [None]:
bakken_nd_fract = bakken_nd_gross_gas_mcf/bakken_gross_gas_mcf
bakken_mt_fract = bakken_mt_gross_gas_mcf/bakken_gross_gas_mcf
print(bakken_nd_fract)
print(bakken_mt_fract)

In [None]:
bakken_leak_fract = leak_fractions.at['Bakken region']

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'North Dakota', 'Bakken region', 
    bakken_nd_gross_gas_mcf, bakken_leak_fract, 
    leak_by_area)

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Montana', 'Bakken region', 
    bakken_mt_gross_gas_mcf, bakken_leak_fract, 
    leak_by_area)

docs_table_3_1 = fill_in_docs_table_3_1(
    'Bakken region',
    bakken_gross_gas_mcf,
    bakken_leak_fract,
    docs_table_3_1
)

In [None]:
oil_prod_year.at['Bakken, ND'] = bakken_nd_oil_bbl
oil_prod_year.at['Bakken, MT'] = bakken_mt_oil_bbl
print(f"billion bbl oil/y added so far to oil_prod_year: {round(oil_prod_year.sum()/1e9, 2)}")

In [None]:
# show fraction covered so far
oil_prod_year.sum()/oil_prod_cont_us

## Fayetteville

In [None]:
# based on Gulen 2014; only a very small number of wells outside of these counties, on the margins of the play
fayetteville_counties = [
    'Van Buren', 'Cleburne', 'Independence',
    'Conway', 'Faulkner', 'White'
]

# data on gross gas production from Arkansas Oil and Gas Commission; monthly data by county
# http://www.aogc.state.ar.us/data/monthly.aspx
# Annual gross gas per county, filled in one county file at a time
county_ser = pd.Series(dtype=float)
for county in fayetteville_counties:
    arkansas_file_name = (
        arkansas_file_name_template
        .replace('DATA_YEAR', str(data_year))
        .replace('COUNTY', county)
    )
    df = pd.read_excel(inputs_path + arkansas_file_name, header=1)

    # Files with no records carry this marker as a column header; skip them
    if 'No data records available' in df.columns:
        print(f"No gas production for {county} County")
        continue

    # The annual figure is on the row(s) labeled 'Total:' in the 'Month' column
    is_total_row = df['Month'] == 'Total:'
    county_ser.at[county] = df.loc[is_total_row, 'Prod Total'].sum()

fayetteville_gross_gas_mcf = county_ser.sum()

fayetteville_leak_fract = leak_fractions.at['Fayetteville region']

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Arkansas', 'Fayetteville region', 
    fayetteville_gross_gas_mcf, fayetteville_leak_fract, 
    leak_by_area)

docs_table_3_1 = fill_in_docs_table_3_1(
    'Fayetteville region',
    fayetteville_gross_gas_mcf,
    fayetteville_leak_fract,
    docs_table_3_1
)

In [None]:
# for comparison:
fayetteville_dry_gas_year_mcf_eia = get_annual_production_from_eia_shale_by_play('Fayetteville (AR)', data_year) * 1000

print(f"EIA dry gas: {int(fayetteville_dry_gas_year_mcf_eia/1e6)}")
print(f"AOGC gross gas: {int(fayetteville_gross_gas_mcf/1e6)}")

## Uintah basin

In [None]:
# based on Utah Geological Survey map:
# https://geology.utah.gov/map-pub/survey-notes/uinta-basin-produced-water/
# count production in Uintah County and Duchesne County
# within the basin, only small, scattered gas fields in Grand County and Carbon County
# but those counties have other gas fields outside the boundary of Uintah basin
# since Uintah leakage rate is higher than US average, conservative approach for leakage is to draw boundaries more tightly
uintah_basin_counties = ['UINTAH', 'DUCHESNE']

# Utah oil & gas production from Department of Natural Resources
# online oil & gas information: https://oilgas.ogm.utah.gov/oilgasweb/live-data-search/lds-main.xhtml
# annual data by county: https://oilgas.ogm.utah.gov/oilgasweb/live-data-search/lds-prod/prod-by-cnty-lu.xhtml

# in Uintah basin, there are also two small gas fields in Carbon County;
# https://www.shaleexperts.com/plays/uinta-basin/Overview
# gas production (Mcf):
# from annual data by field: https://oilgas.ogm.utah.gov/oilgasweb/live-data-search/lds-prod/prod-by-fld-lu.xhtml
# In 2018, those were < 5% of the production in Uintah & Duchesne counties

utah_by_county = pd.read_csv(inputs_path + utah_by_county_file)
utah_by_county = utah_by_county.set_index('County')

utah_by_county_year = utah_by_county.loc[utah_by_county['Year']==data_year]

uintah_ut = utah_by_county_year.loc[utah_by_county_year.index.isin(uintah_basin_counties)]
uintah_year_gross_gas_mcf = uintah_ut['Natural Gas (MCF)'].sum()
uintah_year_oil_bbl = uintah_ut['Oil (BBLs)'].sum()

In [None]:
# uintah_year_oil_mmbtu = uintah_year_oil_bbl * oil_en_dens_mmbtu_per_bbl
# uintah_year_oil_mmbtu / (uintah_year_gross_gas_mmbtu_approx + uintah_year_oil_mmbtu)

In [None]:
uintah_leak_fract = leak_fractions.at['Uintah region']

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'Utah', 'Uintah region', 
    uintah_year_gross_gas_mcf, uintah_leak_fract, 
    leak_by_area)

docs_table_3_1 = fill_in_docs_table_3_1(
    'Uintah region',
    uintah_year_gross_gas_mcf,
    uintah_leak_fract,
    docs_table_3_1
)

In [None]:
oil_prod_year.at['Uintah, UT'] = uintah_year_oil_bbl

In [None]:
# show oil fraction covered so far
oil_prod_year.sum()/oil_prod_cont_us

## San Joaquin Valley (California)

In [None]:
# California Department of Conservation:
# Annual Reports of the State Supervisor of Oil and Gas
# https://www.conservation.ca.gov/calgem/pubs_stats/annual_reports/Pages/annual_reports.aspx

# Includes oil, gas, water production by county
# for 2018, report states gross production
# 2017 report stated net production
# 2019 report covering 2014-2018 also states 2018 as gross and previous years as net
# so there was apparently a change in reporting

In [None]:
# California county-level data is only entered here for these data years
if data_year in [2018, 2019]:
    ca_by_county = pd.read_excel(
        inputs_path + ca_by_county_file, sheet_name=f"{data_year} data")

    # clean data
    ca_by_county['County'] = ca_by_county['County'].str.strip()

else:
    # Stop here instead of printing and continuing: the following cells need
    # ca_by_county and would otherwise fail with a NameError (or silently
    # reuse a stale frame left in the kernel from an earlier run).
    raise ValueError(
        f"Data year {data_year} not covered in California data by county here. "
        "Check website for new data: "
        "https://www.conservation.ca.gov/calgem/pubs_stats/annual_reports/Pages/annual_reports.aspx"
    )

In [None]:
sjv_counties = ['Kern', 'San Joaquin', 'Fresno', 'Madera', 'Kings', 'Merced', 'Stanislaus', 'Tulare']
sjv_data = ca_by_county.loc[ca_by_county['County'].isin(sjv_counties)]
sjv_prod_gas_mcf = sjv_data['Total gross gas production (Mcf)'].sum()
sjv_prod_oil_bbl = sjv_data['Oil & condensate produced (bbl)'].sum()

In [None]:
sjv_leakage_rate = leak_fractions.at['San Joaquin Valley']

leak_by_area = calculate_production_leakage_allocated_to_gas_selected_region(
    'California', 'San Joaquin Valley',
    sjv_prod_gas_mcf, sjv_leakage_rate, 
    leak_by_area
)
docs_table_3_1 = fill_in_docs_table_3_1(
    'San Joaquin Valley',
    sjv_prod_gas_mcf, sjv_leakage_rate, 
    docs_table_3_1
)

In [None]:
# Same year coverage as the California county data loaded above
if data_year in [2018, 2019]:
    oil_prod_year.at['San Joaquin Valley, CA'] = sjv_prod_oil_bbl
else:
    # Stop instead of printing and continuing, so that the San Joaquin Valley
    # entry cannot be silently left out of oil_prod_year.
    raise ValueError(f"Don't have SJV production rate entered yet for {data_year}.")

## Average leakage & summary statistics

In [None]:
def calculate_weighted_average_leak_fract_gross(leak_by_area):
    """
    Production-weighted average leak fraction over the rows of leak_by_area.

    Each row's 'gross gas production (Bcf)' is multiplied by its
    'leakage fraction' to give the gross gas leaked; the sum of those products
    divided by the summed gross production is the weighted average.

    Prints the result as a percentage and returns it as a fraction.
    The input frame is not modified.
    """
    production_col = 'gross gas production (Bcf)'

    leaked_by_row = leak_by_area[[production_col, 'leakage fraction']].product(axis=1)
    total_leaked = leaked_by_row.sum()
    total_production = leak_by_area[production_col].sum()

    weighted_average_leak_fract = total_leaked / total_production
    print(f"Average leakage rate: {round(100*weighted_average_leak_fract, 3)}%")

    return weighted_average_leak_fract

In [None]:
weighted_average_leak_fract = calculate_weighted_average_leak_fract_gross(leak_by_area)

In [None]:
# show oil fraction covered so far (approx., because GOM state waters is approx.)
oil_prod_year.sum()/oil_prod_cont_us

In [None]:
# show oil fraction covered so far, excluding GOM state waters
(oil_prod_year.sum() - oil_prod_year.at['GOM state waters'])/oil_prod_cont_us

In [None]:
def reformat_docs_table_3_1_for_display(docs_table_3_1):
    """
    Build a presentation version of docs_table_3_1.

    Rows are ordered by 'Gross gas (Bcf)' (largest first) and indexed by
    'Production area'; a 'Contiguous U.S. total' row is appended; gross gas is
    rounded to whole numbers; and the two fraction columns are replaced by
    percentage strings ('cont US %', 'methane leak %').

    Returns the new frame; the input frame is not modified.
    """
    total_label = 'Contiguous U.S. total'

    table = (
        docs_table_3_1
        .sort_values(by='Gross gas (Bcf)', ascending=False)
        .set_index('Production area')
    )

    # production-weighted average leak fraction over the individual areas
    gas_leaked_bcf = table[['Gross gas (Bcf)', 'methane leak fract']].product(axis=1)
    table.at[total_label, 'methane leak fract'] = (
        gas_leaked_bcf.sum() / table['Gross gas (Bcf)'].sum()
    )

    # column totals; the new row's own cell is still empty when each sum is taken,
    # so it does not count itself
    for column in ('cont US fract', 'Gross gas (Bcf)'):
        table.at[total_label, column] = table[column].sum()

    # display formatting
    table['Gross gas (Bcf)'] = table['Gross gas (Bcf)'].round(0).astype(int)
    table['cont US %'] = (table['cont US fract']*100).round(1).astype(str) + '%'
    table['methane leak %'] = (table['methane leak fract']*100).round(2).astype(str) + '%'

    docs_table_3_1_display = table.drop(columns=['cont US fract', 'methane leak fract'])

    return docs_table_3_1_display

In [None]:
# run function to show result:
reformat_docs_table_3_1_for_display(docs_table_3_1)

## split each region's production between states
based on county-level data

In [None]:
def calculate_totals_by_state_measured_areas(leak_by_area):
    """
    Sum production and leakage by state over the rows of leak_by_area.

    These are not a state's total leakage values: they cover only the areas
    present in leak_by_area (areas measured by methane leakage studies).

    The numeric columns are cast to float, 'gross gas leaked (Bcf)' is computed
    as gross production times leakage fraction, and the result is grouped by
    'state'. Returns a new frame indexed by state; the input is not modified.
    """
    numeric_cols = [
        'gross gas production (Bcf)',
        'leakage fraction',
        'CH4 fraction in gross gas',
        'CH4 leaked (Bcf)',
        'CH4 leaked (tons)',
    ]
    summed_cols = [
        'gross gas production (Bcf)',
        'gross gas leaked (Bcf)',
        'CH4 leaked (Bcf)',
        'CH4 leaked (tons)',
    ]

    areas = leak_by_area.astype({col: float for col in numeric_cols})
    areas['gross gas leaked (Bcf)'] = areas[
        ['gross gas production (Bcf)', 'leakage fraction']
    ].product(axis=1)

    measured_leak_by_state = areas.groupby('state')[summed_cols].sum()
    return measured_leak_by_state

In [None]:
def clean_eia_gross_gas_prod(eia_gross_prod_all_yr_mmcf):
    """
    Convert EIA gross gas production from MMcf to Bcf and trim its entries.

    Drops the 'U.S.', 'Other States' and 'Alaska' entries, relabels
    'Federal Offshore--Gulf of Mexico' as 'GOM (federal)', and then removes
    every remaining entry whose label contains 'offshore' or 'onshore'
    (case-insensitive).

    Returns a new Series named 'gross gas prod (Bcf) total';
    the input is not modified.
    """
    # convert from MMcf to Bcf
    prod_bcf = (eia_gross_prod_all_yr_mmcf / 1000).rename('gross gas prod (Bcf) total')

    # remove aggregate/excluded entries, then relabel federal GOM
    # (relabeling first keeps it from matching the 'offshore' filter below)
    prod_bcf = (
        prod_bcf
        .drop(['U.S.', 'Other States', 'Alaska'])
        .rename(index={'Federal Offshore--Gulf of Mexico': 'GOM (federal)'})
    )

    # keep only labels that mention neither 'offshore' nor 'onshore'
    labels_lower = [label.lower() for label in prod_bcf.index]
    keep = [
        'offshore' not in label and 'onshore' not in label
        for label in labels_lower
    ]
    eia_gross_prod_all_yr_bcf = prod_bcf.loc[keep]

    return eia_gross_prod_all_yr_bcf

In [None]:
def fill_in_leakage_for_unmeasured_areas(
    measured_leak_by_state, 
    eia_gross_prod_all_yr_mmcf, 
    dry_gas_en_fract_of_dgno_by_state, 
    default_ch4_share_in_gross_gas, 
    weighted_average_leak_fract
):
    """
    For production areas in contiguous US that were not covered by methane leakage studies, calculate leakage.

    For each state, the gross gas production already attributed to measured areas
    is subtracted from the state's total gross production (EIA data, cleaned with
    clean_eia_gross_gas_prod). Any positive remainder is multiplied by a default
    leakage fraction, and that leakage is added to the measured leakage.

    The default leakage fraction depends on the module-level flag
    oil_based_leakage_fract:
      * False: weighted_average_leak_fract is used for every state.
      * True: a linear function of the state's dry gas energy fraction
        (dry_gas_en_fract_of_dgno_by_state) is used.

    Parameters
    ----------
    measured_leak_by_state : DataFrame
        Indexed by state, with columns 'gross gas production (Bcf)',
        'gross gas leaked (Bcf)' and 'CH4 leaked (tons)'.
    eia_gross_prod_all_yr_mmcf : Series
        EIA gross gas production (MMcf); passed to clean_eia_gross_gas_prod.
    dry_gas_en_fract_of_dgno_by_state : Series
        Looked up by state with .at; only read when oil_based_leakage_fract is True.
    default_ch4_share_in_gross_gas : float
        CH4 fraction applied to the gross gas leaked from uncovered production.
    weighted_average_leak_fract : float
        Default leak fraction used when oil_based_leakage_fract is False.

    Returns
    -------
    DataFrame indexed by state, holding the measured, uncovered and total columns.

    Raises
    ------
    ValueError
        If oil_based_leakage_fract is neither True nor False.
    """
    mcf_per_bcf = 1e6
    kg_per_ton = 1000

    # Validate the mode once, up front. Previously an unexpected value only printed
    # a message inside the loop and then fell through, leaving uncovered_leak_fract
    # undefined (or stale from the previous state).
    if oil_based_leakage_fract == False:
        use_oil_based_fract = False
        print(f"Using nationwide average leakage % as default: {round(100*weighted_average_leak_fract, 1)}%")
    elif oil_based_leakage_fract == True:
        use_oil_based_fract = True
        # estimate leakage fraction based on correlation between DG fract of DGNO & leakage rate from other states
        # did the fit in Excel, for top 13 producing states, except for WY, since these were those with nearly all production with methane measurements
        # (so they had little input from default calculations)
        # the states were: TX, PA, OK, LA, OH, CO, WV, NM, GOM (Federal), ND, AR, UT, CA
        # got fit R^2 = 0.465
        # the fit was very close to the line connecting ND (5.8% leak, 16.4% DG in DGNO) to LA (1.5% leak, 89.1% DG in DGNO)
        # so use those values
        y2 = 5.8/100
        y1 = 1.5/100
        x2 = 16.4/100
        x1 = 89.1/100
        slope = (y2 - y1)/(x2 - x1)
        intercept = y2 - (slope * x2)
    else:
        raise ValueError(
            f"Unexpected value for oil_based_leakage_fract: {oil_based_leakage_fract}")

    df = measured_leak_by_state.copy()
    df = df.rename(columns={'gross gas production (Bcf)': 'gross gas prod (Bcf) measured'})

    # get EIA data on gross production; clean up
    eia_gross_prod_all_yr_bcf = clean_eia_gross_gas_prod(eia_gross_prod_all_yr_mmcf)

    # merge in total gross gas production for each state
    # (outer join: states without measured areas are kept, and vice versa)
    df = pd.merge(
        df, 
        eia_gross_prod_all_yr_bcf,
        left_index=True,
        right_index=True,
        how='outer',
        suffixes=(' measured', ' total')
    )

    # states present on only one side of the merge have no value for the other side's columns
    df = df.fillna(0)

    # calculate volume of gas production not covered by measurements
    df['uncovered gross gas (Bcf)'] = df['gross gas prod (Bcf) total'].sub(
        df['gross gas prod (Bcf) measured'], fill_value=0)

    # note: despite the name, this is a fraction (measured / total), not a percentage
    df['pct measured'] = df['gross gas prod (Bcf) measured'].div(df['gross gas prod (Bcf) total'])

    # fill in leakage for areas not measured by methane leakage studies
    # for those with negative values in 'uncovered gross gas (Bcf)', don't add any more leakage
    # (negative values mean that state data for areas covered summed to slightly more than EIA data for those states)
    for state in df.index:
        uncovered_wg_bcf = df.at[state, 'uncovered gross gas (Bcf)']
        if uncovered_wg_bcf > 0:
            # for uncovered, multiply by default leakage rate to calculate leakage volume
            if use_oil_based_fract:
                uncovered_leak_fract = slope * dry_gas_en_fract_of_dgno_by_state.at[state] + intercept
                print(f"{state}: default leakage rate: {round(100*uncovered_leak_fract, 1)}%") # for db
            else:
                uncovered_leak_fract = weighted_average_leak_fract

            # gross gas leaked (Bcf), then CH4 leaked (tons):
            # Bcf -> Mcf -> CH4 share -> kg -> tons
            uncovered_leak_wg_bcf = uncovered_wg_bcf * uncovered_leak_fract
            uncovered_leak_ch4_tons = (
                uncovered_leak_wg_bcf * mcf_per_bcf
                * default_ch4_share_in_gross_gas * (ch4_kg_per_mcf/kg_per_ton)
            )

            df.at[state, 'uncovered gross gas leaked (Bcf)'] = uncovered_leak_wg_bcf
            df.at[state, 'uncovered CH4 leaked (tons)'] = uncovered_leak_ch4_tons
        else:
            df.at[state, 'uncovered gross gas leaked (Bcf)'] = 0
            df.at[state, 'uncovered CH4 leaked (tons)'] = 0

    # NOTE(review): this column is the sum of two Bcf columns, so its values are in Bcf
    # even though the label says "(tons)". The label is kept unchanged here because
    # later cells may read it by name; rename together with any downstream uses.
    df['gross gas leaked (tons) total'] = df[['gross gas leaked (Bcf)', 'uncovered gross gas leaked (Bcf)']].sum(axis=1)
    df['CH4 leaked (tons) total'] = df[['CH4 leaked (tons)', 'uncovered CH4 leaked (tons)']].sum(axis=1)
    # overall leak fraction per state: total gross gas leaked / total gross gas produced
    df['leakage fraction'] = df['gross gas leaked (tons) total'].div(df['gross gas prod (Bcf) total'], fill_value=None)

    leak_by_state = df

    return leak_by_state

In [None]:
measured_leak_by_state = calculate_totals_by_state_measured_areas(leak_by_area)

default_ch4_share_in_gross_gas = leak_by_area[
    ['gross gas production (Bcf)', 'CH4 fraction in gross gas']
].product(axis=1).sum()/leak_by_area['gross gas production (Bcf)'].sum()

leak_by_state = fill_in_leakage_for_unmeasured_areas(
    measured_leak_by_state, eia_gross_prod_all_yr_mmcf, 
    dry_gas_en_fract_of_dgno_by_state, default_ch4_share_in_gross_gas, 
    weighted_average_leak_fract)

In [None]:
cont_us_leak_prod_oil_gas_ton_ch4 = leak_by_state['CH4 leaked (tons) total'].sum()
print(f"GIM production area leakage (O&G): {round(cont_us_leak_prod_oil_gas_ton_ch4/1e6, 4)} Tg")

In [None]:
# # export:
# save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
# leak_by_state_display.to_excel(
#     lca_path + f'GIM production area leakage by producing state {save_timestamp}.xlsx'
# )

In [None]:
# for each state, sum gross gas production
# calculate remaining gross gas production without methane measurements
# put in default leakage fraction and default CH4 fraction in gross gas
# calculate CH4 leaked (Bcf & tons)
# then groupby sum for each state
# compare with other version I made to see if there were mistakes

gas_producing_states = [
    'Alaska', 'Arkansas', 'California', 'Colorado', 'GOM (federal)', 'Louisiana', 
    'Montana', 'New Mexico', 'North Dakota', 'Ohio', 'Oklahoma', 'Pennsylvania', 
    'Texas', 'Utah', 'West Virginia', 'Wyoming',
    'Alabama', 'Arizona', 'Florida', 'Idaho', 'Illinois', 'Indiana', 'Kentucky',
    'Maryland', 'Michigan', 'Mississippi', 'Missouri', 'Nebraska', 'Nevada',
    'New York', 'Oregon', 'South Dakota', 'Tennessee', 'Virginia']

if contiguous_us_only==True:
    gas_producing_states.remove('Alaska')

# NEW FUNCTION FOR DOCUMENTATION:
def fill_in_production_without_methane_leak_measurements(leak_by_area, eia_gross_prod_all_yr_mmcf):
    """
    Build a per-area leakage table that accounts for all gross gas production in each state.

    For every state in ``gas_producing_states``, any gross gas production that is not
    covered by a row of ``leak_by_area`` is added as a row with region 'other', using
    the default leakage fraction and default CH4 fraction. One 'all regions' row per
    state is then appended carrying whole-state production of dry gas, NGLs and oil,
    the state's summed CH4 leakage, the energy share of dry gas, and the leakage
    allocated to dry gas.

    Besides its arguments, the function reads these notebook-level names:
    ``gas_producing_states``, ``weighted_average_leak_fract``,
    ``default_ch4_share_in_gross_gas``, ``ch4_kg_per_mcf``, ``gas_dry_year_mmcf``,
    ``ngl_prod_bbl_year``, ``ngl_energy_density``, ``oil_prod_bbl_year``,
    ``dry_gas_en_dens_btu_per_cf`` and ``oil_en_dens_mmbtu_per_bbl``.

    Parameters
    ----------
    leak_by_area : pandas.DataFrame
        One row per area with leakage data. Columns read here: 'state', 'region',
        'gross gas production (Bcf)', 'CH4 leaked (Bcf)', 'CH4 leaked (tons)',
        'CH4 fraction in gross gas', 'leakage fraction'. The input is copied, not modified.
    eia_gross_prod_all_yr_mmcf : pandas.Series
        Gross gas production (MMcf) indexed by state name; must contain every
        state in ``gas_producing_states`` (a missing state raises KeyError).

    Returns
    -------
    pandas.DataFrame
        Rows sorted by whole-state gross production, then by row production (both
        descending), with the fixed column order set at the end of the function.
    """
    # unit conversion factors used below
    mcf_per_mmcf = 1e3
    ton_per_kg = 1e-3
    g_per_ton = 1e6

    gross_gas_mmcf = eia_gross_prod_all_yr_mmcf.copy()
    gross_gas_mmcf = gross_gas_mmcf.loc[gross_gas_mmcf.index.isin(gas_producing_states)]
    gross_gas_mmcf.name = 'gross gas production (MMcf) [whole state]'

    # work in MMcf throughout; the Bcf columns are replaced
    df = leak_by_area.copy()
    df['gross gas production (MMcf)'] = df['gross gas production (Bcf)'] * 1000
    df['CH4 leaked (MMcf)'] = df['CH4 leaked (Bcf)'] * 1000
    df = df.drop(['gross gas production (Bcf)', 'CH4 leaked (Bcf)'], axis=1)
    
    # add one 'other' row per state for production not covered by the input rows
    for state in gas_producing_states:
        gross_gas_state_mmcf = gross_gas_mmcf.at[state]
        gross_gas_state_meas_mmcf = df.loc[df['state']==state]['gross gas production (MMcf)'].sum()
        gross_gas_state_not_meas_mmcf = gross_gas_state_mmcf - gross_gas_state_meas_mmcf

        # skip states whose covered production already meets or exceeds the state total
        if gross_gas_state_not_meas_mmcf > 0:
            df_index_new = df.index.max() + 1
            df.at[df_index_new, 'state'] = state
            df.at[df_index_new, 'region'] = 'other'
            df.at[df_index_new, 'gross gas production (MMcf)'] = gross_gas_state_not_meas_mmcf
            df.at[df_index_new, 'leakage fraction'] = weighted_average_leak_fract
            df.at[df_index_new, 'CH4 fraction in gross gas'] = default_ch4_share_in_gross_gas
            ch4_leaked_mmcf = gross_gas_state_not_meas_mmcf * default_ch4_share_in_gross_gas * weighted_average_leak_fract
            df.at[df_index_new, 'CH4 leaked (MMcf)'] = ch4_leaked_mmcf
            ch4_leaked_tons = ch4_leaked_mmcf * mcf_per_mmcf * ch4_kg_per_mcf * ton_per_kg
            df.at[df_index_new, 'CH4 leaked (tons)'] = ch4_leaked_tons
    
    # position only matters until the explicit column reorder at the end
    df.insert(6, 'CH4 density (kg/Mcf)', ch4_kg_per_mcf)
    
    # concat totals: one 'all regions' row per state with whole-state production figures
    # (assumes the concatenated objects are indexed by state and provide the columns
    # 'dry gas production (MMcf)', 'NGL production (bbl)', 'NGL energy density (mmbtu/bbl)'
    # and 'oil production (bbl)', which are read further down)
    state_totals = pd.DataFrame(gross_gas_mmcf)
    state_totals = pd.concat([
        state_totals, 
        gas_dry_year_mmcf, 
        ngl_prod_bbl_year, ngl_energy_density,
        oil_prod_bbl_year
    ], axis=1)
    state_totals = state_totals.loc[state_totals.index.isin(gas_producing_states)]
    state_totals = state_totals.reset_index()
    state_totals = state_totals.rename(
        columns={'index': 'state', 
                 'gross gas production (MMcf) [whole state]': 'gross gas production (MMcf)'})
    state_totals['region'] = 'all regions'
    # Btu per cf and mmbtu per MMcf are numerically equal, so no conversion is needed
    state_totals['dry gas energy density (mmbtu/MMcf)'] = dry_gas_en_dens_btu_per_cf
    state_totals['oil energy density (mmbtu/bbl)'] = oil_en_dens_mmbtu_per_bbl
    df = pd.concat([df, state_totals], sort=False)
    
    # bring in gross gas production by state, for sorting
    df = pd.merge(df, gross_gas_mmcf, left_on='state', right_index=True, how='left')
    df = df.sort_values(by=[
        'gross gas production (MMcf) [whole state]', 
        'gross gas production (MMcf)'
    ], ascending=[False, False])
    df = df.drop('gross gas production (MMcf) [whole state]', axis=1)
    df = df.reset_index(drop=True)
    
    # Put each state's summed leakage on its 'all regions' row.
    # The per-state sums are computed once, before any total row is filled in
    # (those rows hold NaN at this point, so they do not contribute to the sums).
    total_leak_by_state = df.groupby('state')['CH4 leaked (tons)'].sum()
    for state in gas_producing_states:
        all_regions_index = df.loc[(df['state']==state) & (df['region']=='all regions')].index[0]
        df.at[all_regions_index, 'CH4 leaked (tons)'] = total_leak_by_state.at[state]

    # energy content of each product; only the 'all regions' rows carry these inputs
    dry_gas_mmbtu = df['dry gas production (MMcf)'] * df['dry gas energy density (mmbtu/MMcf)']
    df.insert(10, 'dry gas production (mmbtu)', dry_gas_mmbtu)
    
    ngl_mmbtu = df['NGL production (bbl)'] * df['NGL energy density (mmbtu/bbl)']
    df.insert(13, 'NGL production (mmbtu)', ngl_mmbtu)
    
    df['oil production (mmbtu)'] = df['oil production (bbl)'] * df['oil energy density (mmbtu/bbl)']
    
    # a zero total (all components missing or zero) becomes NaN to avoid dividing by zero
    df['total energy (mmbtu)'] = df[[
        'dry gas production (mmbtu)', 
        'NGL production (mmbtu)', 
        'oil production (mmbtu)']
    ].sum(axis=1, skipna=True).replace(0, np.nan)
    # NOTE(review): despite the '%' in the column name this is a plain ratio, not multiplied by 100
    df['% energy from dry gas'] = df['dry gas production (mmbtu)'] / df['total energy (mmbtu)']
    df['CH4 leaked alloc dry gas (tons)'] = df['CH4 leaked (tons)'] * df['% energy from dry gas']
    df['dry gas CH4 leakage rate (g CH4/Mcf dry gas)'] = df['CH4 leaked alloc dry gas (tons)'] * g_per_ton / (df['dry gas production (MMcf)'] * mcf_per_mmcf)
    
    # reorder columns:
    df = df[[
        'state',
        'region',
        'gross gas production (MMcf)', 'CH4 fraction in gross gas', 'leakage fraction',
        'CH4 leaked (MMcf)', 'CH4 density (kg/Mcf)', 'CH4 leaked (tons)', 
        'dry gas production (MMcf)', 'dry gas energy density (mmbtu/MMcf)', 'dry gas production (mmbtu)',
        'NGL production (bbl)', 'NGL energy density (mmbtu/bbl)', 'NGL production (mmbtu)', 
        'oil production (bbl)', 'oil energy density (mmbtu/bbl)', 'oil production (mmbtu)',
        'total energy (mmbtu)', '% energy from dry gas', 
        'CH4 leaked alloc dry gas (tons)',
        'dry gas CH4 leakage rate (g CH4/Mcf dry gas)'
    ]]
    
    # NOTE(review): values are copied unchanged from 'leakage fraction'; confirm whether
    # they are fractions or percentages, since the new header says '%'
    df = df.rename(columns={'leakage fraction': 'leakage rate (% of gross gas that leaks)'})
            
    leak_by_area_complete = df
    return leak_by_area_complete

In [None]:
# Build the complete per-area table: the rows of leak_by_area, one 'other' row per state
# for production without measurements, and one 'all regions' total row per state.
leak_by_area_complete = fill_in_production_without_methane_leak_measurements(leak_by_area, eia_gross_prod_all_yr_mmcf)

# export
# The minute-resolution timestamp in the file name means each run writes a new workbook
# under lca_path rather than overwriting the previous one.
save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
leak_by_area_complete.to_excel(
    lca_path + f'GIM leak by area complete {save_timestamp}.xlsx', 
    index=False)

In [None]:
# show:
# per-state totals only (the 'all regions' rows), highest dry-gas leakage rate first
leak_by_area_complete.loc[leak_by_area_complete['region']=='all regions'][
    ['state', 'dry gas production (MMcf)', 'dry gas CH4 leakage rate (g CH4/Mcf dry gas)']
].sort_values(by='dry gas CH4 leakage rate (g CH4/Mcf dry gas)', ascending=False)

In [None]:
# Total production-area CH4 leakage summed over the individual area rows only;
# the 'all regions' rows repeat each state's total and would double count.
# gg_per_ton is defined elsewhere in the notebook; presumably the tons-to-Gg factor -- TODO confirm.
prod_leak_gg = leak_by_area_complete.loc[leak_by_area_complete['region']!='all regions']['CH4 leaked (tons)'].sum() * gg_per_ton
prod_leak_gg

In [None]:
import math

# Leakage per region for the circle graphic, largest first.
# Region labels are collapsed to the text before the first ' - ', so sub-areas of the
# same region are summed together; totals are expressed in million tons of CH4.
df = (
    leak_by_area
    .assign(region=lambda frame: frame['region'].str.split(' - ').str[0])
    .groupby('region')[['CH4 leaked (tons)']]
    .sum()
    .assign(**{'CH4 leaked (million tons)': lambda frame: frame['CH4 leaked (tons)'] / 1e6})
    .drop(columns='CH4 leaked (tons)')
    .sort_values('CH4 leaked (million tons)', ascending=False)
)

# Circle area is proportional to leakage: area = pi * (diameter / 2)^2,
# hence diameter = 2 * sqrt(area / pi).
df['diameter'] = 2 * (df['CH4 leaked (million tons)'] / math.pi) ** 0.5

leak_by_area_for_graphic = df

In [None]:
# show the circle diameters (one per region) for the graphic
leak_by_area_for_graphic['diameter']

## Allocate leakage to dry gas (consumer-grade gas)
(Excludes leakage allocated to NGLs and oil)

In [None]:
def allocate_leakage_to_dry_gas_produced_simple(
    leak_by_state, 
    dry_gas_en_fract_of_dgno_by_state):
    """
    Allocate production area leakage based on energy shares.
    
    Simple version: 
    Calculate share of energy from dry gas; allocate leakage proportionally,
    i.e. multiply each state's total leakage by its dry gas share of
    DGNO (dry gas + NGL + oil) energy.

    Parameters
    ----------
    leak_by_state : pandas.DataFrame
        Indexed by state; must have the column 'CH4 leaked (tons) total'.
    dry_gas_en_fract_of_dgno_by_state : pandas.Series or pandas.DataFrame
        Indexed by state; must provide 'DG fract of DGNO' (as the Series name
        or as a column).

    Returns
    -------
    pandas.Series
        Named 'prod leak alloc DG CH4 tons', indexed by the union of both
        inputs' states, without 'U.S.' and 'Alaska'. A state that is missing
        either its leakage or its dry gas fraction gets NaN.
    """
    print("in fn allocate_leakage_to_dry_gas_produced, running simple allocation")

    prod = leak_by_state['CH4 leaked (tons) total']
    prod_df = pd.merge(prod, dry_gas_en_fract_of_dgno_by_state, left_index=True, right_index=True, how='outer')
    # 'U.S.' and 'Alaska' may legitimately be absent from both inputs
    prod_df = prod_df.drop(['U.S.', 'Alaska'], errors='ignore')

    # skipna=False: with the default (skipna=True) a missing factor is treated as 1,
    # so a state without leakage data would be assigned its fraction as "tons"
    # (or 1.0 ton when both values are missing) instead of NaN
    prod_leak_ch4_ton_dg = prod_df[['CH4 leaked (tons) total', 'DG fract of DGNO']].product(axis=1, skipna=False)
    prod_leak_ch4_ton_dg.name = 'prod leak alloc DG CH4 tons'

    return prod_leak_ch4_ton_dg

In [None]:
def allocate_leakage_to_dry_gas_produced_complex(
    leak_by_state, 
    dry_gas_en_fract_of_dgno_by_state, 
    dry_gas_en_fract_of_gross_gas_by_state):
    """
    Allocate production area leakage based on energy shares.

    Complex version:
    Assume 70% of leakage from well sites. Split based on dry gas share of DNGO (dry gas & NGL & oil).
    Assume 30% of leakage from gathering & processing. Split based on dry gas share of gross gas (dry gas & NGL).

    Parameters
    ----------
    leak_by_state : pandas.DataFrame
        Indexed by state; must have the column 'CH4 leaked (tons) total'.
    dry_gas_en_fract_of_dgno_by_state : pandas.Series or pandas.DataFrame
        Indexed by state; must provide 'DG fract of DGNO'.
    dry_gas_en_fract_of_gross_gas_by_state : pandas.Series or pandas.DataFrame
        Indexed by state; must provide 'DG fract of WG'.

    Returns
    -------
    tuple
        (prod_leak_ch4_ton_dg, prod_df, gath_proc_df): the total leakage allocated
        to dry gas as a Series named 'prod leak alloc DG CH4 tons', followed by the
        two intermediate tables (well-site and gathering & processing) it was built
        from. A state missing any input value gets NaN in the total.
    """
    print("in fn allocate_leakage_to_dry_gas_produced, running complex allocation")

    total_leak = leak_by_state['CH4 leaked (tons) total']

    # assume 70% of leakage in production areas is from production itself (based on Alvarez 2018)
    prod = total_leak * 0.7
    prod_df = pd.merge(prod, dry_gas_en_fract_of_dgno_by_state, left_index=True, right_index=True, how='outer')
    # 'U.S.' and 'Alaska' may legitimately be absent from both inputs
    prod_df = prod_df.drop(['U.S.', 'Alaska'], errors='ignore')

    # skipna=False: the default would treat a missing factor as 1 and return the other
    # factor (or 1.0) for states that lack leakage data or an energy fraction
    prod_leak_dg = prod_df[['CH4 leaked (tons) total', 'DG fract of DGNO']].product(axis=1, skipna=False)
    prod_leak_dg.name = 'production leak alloc DG CH4 tons'

    # -------

    # assume 30% of leakage in production areas is from gathering & processing (based on Alvarez 2018)
    gath_proc = total_leak * 0.3
    gath_proc_df = pd.merge(gath_proc, dry_gas_en_fract_of_gross_gas_by_state, left_index=True, right_index=True, how='outer')
    gath_proc_df = gath_proc_df.drop(['U.S.', 'Alaska'], errors='ignore')

    gath_proc_dg = gath_proc_df[['CH4 leaked (tons) total', 'DG fract of WG']].product(axis=1, skipna=False)
    gath_proc_dg.name = 'gath & proc leak alloc DG CH4 tons'

    # keep NaN when either component is unknown, rather than reporting a partial total or 0
    prod_leak_ch4_ton_dg = pd.concat([prod_leak_dg, gath_proc_dg], axis=1).sum(axis=1, skipna=False)
    prod_leak_ch4_ton_dg.name = 'prod leak alloc DG CH4 tons'

    return(prod_leak_ch4_ton_dg, prod_df, gath_proc_df)

In [None]:
def calculate_prod_leak_rate_g_ch4_per_mcf_dg(
    leak_by_state, prod_leak_ch4_ton_dg, gas_dry_year_mmcf, data_year):
    """
    Express the leakage allocated to dry gas as a rate per Mcf of dry gas produced.

    Returns a 3-tuple:
      1. per-state rate (g CH4 per Mcf dry gas), Series named 'prod leak g CH4/Mcf DG',
         with states whose rate is NaN removed;
      2. share of total production-area leakage that was allocated to dry gas;
      3. overall rate: summed allocated leakage divided by summed dry gas production.

    Uses the notebook-level conversion factors g_per_ton and mcf_per_mmcf.
    The data_year argument is accepted but not used.
    """
    merged = pd.merge(
        prod_leak_ch4_ton_dg, gas_dry_year_mmcf,
        left_index=True, right_index=True, how='left')

    # convert both quantities to the units of the reported rate
    leak_g = merged['prod leak alloc DG CH4 tons'] * g_per_ton
    dry_gas_mcf = merged['dry gas production (MMcf)'] * mcf_per_mmcf
    dry_gas_mcf_total = dry_gas_mcf.sum()

    print(f"show measured_area_dg_mcf.sum(): {dry_gas_mcf_total}")

    # per-state rate; NaN results (e.g. states with no production) are discarded
    rate_by_state = (leak_g / dry_gas_mcf).rename('prod leak g CH4/Mcf DG').dropna()
    overall_rate = leak_g.sum() / dry_gas_mcf_total

    # fraction of all oil & gas production-area leakage that ended up on dry gas
    total_leak_ton = leak_by_state['CH4 leaked (tons) total'].sum()
    dry_gas_share = prod_leak_ch4_ton_dg.sum() / total_leak_ton
    print(f"% of prod area leakage allocated to dry gas: {round(100*dry_gas_share, 2)}%")

    return (rate_by_state, dry_gas_share, overall_rate)

In [None]:
# Allocate production-area leakage to dry gas using the mode chosen at the top of the notebook.
if allocate_prod_leakage_mode == 'simple':
    prod_leak_ch4_ton_dg = allocate_leakage_to_dry_gas_produced_simple(
        leak_by_state, dry_gas_en_fract_of_dgno_by_state)

elif allocate_prod_leakage_mode == 'complex':
    (prod_leak_ch4_ton_dg, prod_df, gath_proc_df) = allocate_leakage_to_dry_gas_produced_complex(
        leak_by_state, dry_gas_en_fract_of_dgno_by_state, dry_gas_en_fract_of_gross_gas_by_state)

else:
    # fail loudly instead of continuing with an undefined (or stale, from an
    # earlier run of this cell) prod_leak_ch4_ton_dg
    raise ValueError(
        "allocate_prod_leakage_mode must be 'simple' or 'complex', "
        f"got {allocate_prod_leakage_mode!r}")
    
# Convert the allocated leakage into per-state and average rates (g CH4 per Mcf dry gas).
(prod_leak_g_ch4_per_mcf_dg, 
 prod_leak_share_dg, 
 prod_leak_g_ch4_per_mcf_avg) = calculate_prod_leak_rate_g_ch4_per_mcf_dg(
    leak_by_state, prod_leak_ch4_ton_dg, gas_dry_year_mmcf, data_year)

In [None]:
# shows shares of energy production
# (energy % as dry gas should match--at least, quite closely--the % of leakage allocated to dry gas)
# Drop the national total and Alaska rows so the sums below cover the contiguous U.S. only.
table_en_fract_cont_us = table_en_fract.drop(['U.S.', 'Alaska'])

# Each share is that product's summed energy divided by the summed 'dgno mmbtu' column
# (presumably dry gas + NGL + oil energy, per the DNGO notes at the top of the notebook -- TODO confirm).
gas_fract = table_en_fract_cont_us['dry gas production (mmbtu)'].sum() / table_en_fract_cont_us['dgno mmbtu'].sum()
ngl_fract = table_en_fract_cont_us['NGL production (mmbtu)'].sum() / table_en_fract_cont_us['dgno mmbtu'].sum()
oil_fract = table_en_fract_cont_us['oil production (mmbtu)'].sum() / table_en_fract_cont_us['dgno mmbtu'].sum()
print("for contiguous U.S.:")
print(f"gas: {round(100*gas_fract, 2)}%")
print(f"NGL: {round(100*ngl_fract, 2)}%")
print(f"oil: {round(100*oil_fract, 2)}%")
# sanity check: the three shares should add up to 100%
print(f"total: {round(100*(gas_fract + ngl_fract + oil_fract), 1)}%")

In [None]:
# preview the per-state energy table (still includes the 'U.S.' and 'Alaska' rows)
table_en_fract.head()

In [None]:
# total production area leakage contiguous US (from oil, NGLs, and gas); units are Gg
# (tons / 1e3 = thousand tons = Gg)
leak_by_state['CH4 leaked (tons) total'].sum()/1e3

In [None]:
# show kt (aka Gg) of CH4 from production areas allocated to dry gas
# (tons / 1000 = kt; .sum() skips states whose allocation is NaN)
prod_leak_ch4_dg_gg = prod_leak_ch4_ton_dg.sum()/1000
print(round(prod_leak_ch4_dg_gg, 1))

In [None]:
# show kt (aka Gg) of CH4 from production areas NOT allocated to dry gas
# (total production-area leakage minus the part allocated to dry gas, i.e. the NGL and oil share)
round((leak_by_state['CH4 leaked (tons) total'].sum() - prod_leak_ch4_ton_dg.sum())/1000, 1)

In [None]:
# average production-area leakage rate allocated to dry gas (g CH4 per Mcf dry gas),
# as returned by calculate_prod_leak_rate_g_ch4_per_mcf_dg
prod_leak_g_ch4_per_mcf_avg

In [None]:
# export
# Writes the state-level leakage table (index included) to lca_path;
# the timestamp in the file name keeps earlier exports from being overwritten.
save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
leak_by_state.to_excel(lca_path + f'GIM results - production area leakage by state {save_timestamp}.xlsx')

## Create tables for report

In [None]:
# Leakage allocated to dry gas, converted from tons to grams, then divided by
# dry gas production in Mcf to give g CH4 per Mcf dry gas for each state.
prod_leak_g_ch4 = prod_leak_ch4_ton_dg * g_per_ton
prod_leak_g_ch4_per_mcf = prod_leak_g_ch4 / (gas_dry_year_mmcf * mcf_per_mmcf)
prod_leak_g_ch4_per_mcf.name = 'prod leak g CH4/Mcf dry gas'

# dry gas production in Bcf (MMcf / 1000) for display
gas_dry_year_bcf = gas_dry_year_mmcf / 1000
gas_dry_year_bcf.name = 'dry gas production (Bcf)'

# Report table: production and leakage rate side by side, largest producers first;
# states missing either value are dropped.
df = pd.merge(
    gas_dry_year_bcf, prod_leak_g_ch4_per_mcf, 
    left_index=True, right_index=True, how='inner')
df = df.sort_values(by='dry gas production (Bcf)', ascending=False)
df = df.dropna(how='any')
table_5_4 = df

In [None]:
# show the 15 states with the largest dry gas production (table is sorted by production, descending)
table_5_4.head(15)

In [None]:
# notes about states with high estimated leakage rate per Mcf dry gas:

# Florida is an extreme case; has much smaller dry gas and marketed gas, compared to gross extraction
# In FL, most gross gas extracted is reinjected (repressuring)
# (although FL production is so low it doesn't matter, even for consumption within FL)

## Comparison with Alvarez 2018

In [None]:
# gross gas production for 2015; all units MMcf
# NOTE(review): these two 2015 figures are hard-coded; confirm them against the EIA source data.
us_gross_2015 = 32914647
alaska_gross_2015 = 3175301
# contiguous U.S. is approximated here as U.S. total minus Alaska
cont_us_2015 = us_gross_2015 - alaska_gross_2015
cont_us_share_2015 = cont_us_2015/us_gross_2015

# gross gas production for data_year; all units MMcf
# (same calculation, using the EIA series loaded earlier in the notebook)
us_gross_year = eia_gross_prod_all_yr_mmcf.at['U.S.']
alaska_gross_year = eia_gross_prod_all_yr_mmcf.at['Alaska']
cont_us_year = us_gross_year - alaska_gross_year
cont_us_share_year = cont_us_year/us_gross_year

In [None]:
# shares of US gas produced in contiguous US
# (displayed as a pair: share in 2015, share in data_year)
round(cont_us_share_2015, 3), round(cont_us_share_year, 3)

In [None]:
# Production-area leakage attributed to Alvarez 2018, in Tg CH4.
# NOTE(review): 10.9 is hard-coded; confirm the figure and which segments it covers.
alvarez_prod_area_leak_tg = 10.9
# Scale to the contiguous U.S. using the contiguous share of 2015 gross gas production.
alvarez_prod_area_leak_tg_cont_us = alvarez_prod_area_leak_tg * cont_us_share_2015

print(f"Alvarez 2018 production area leakage (O&G), contiguous US: {round(alvarez_prod_area_leak_tg_cont_us, 2)} Tg")

# GIM total for data_year, tons converted to Tg
gim_prod_area_leak_tg = leak_by_state['CH4 leaked (tons) total'].sum()/1e6
print(f"GIM production area leakage (O&G), contiguous US: {round(gim_prod_area_leak_tg, 2)} Tg")

# Relative difference in total leakage (Tg), GIM versus Alvarez.
# Despite the name, this is not a difference in leakage rate.
leak_fract_diff = (gim_prod_area_leak_tg - alvarez_prod_area_leak_tg_cont_us)/alvarez_prod_area_leak_tg_cont_us

print(f"Difference (GIM - Alvarez): {round(100*leak_fract_diff, 2)}%")

In [None]:
# Relative difference between GIM's weighted-average production-area leakage fraction
# and the 1.9% of gross gas that this notebook uses as the Alvarez 2018 figure.
# NOTE(review): confirm 1.9% against the paper.
# (A bare `leak_fract_diff` expression used to sit above the print; it was never displayed
# because it was not the last statement in the cell, so it has been removed.)
alvarez_prod_area_leak_fract = 1.9/100
print(f"production area leakage %: GIM % higher than Alvarez: {round(100*((weighted_average_leak_fract - alvarez_prod_area_leak_fract)/alvarez_prod_area_leak_fract), 1)}%")

In [None]:
# change in production
# (contiguous U.S. gross gas production in data_year relative to 2015)
change_in_prod_fract = (cont_us_year - cont_us_2015)/cont_us_2015
print(f"change in production (%): {round(100*change_in_prod_fract, 1)}%")

In [None]:
# given difference in leakage rates & difference in contiguous US production
# Expected change in total leakage = (1 + production change) * (1 + leakage-rate change) - 1.
# The leakage-rate change compares GIM's weighted-average leakage fraction with the 1.9%
# used for Alvarez 2018 elsewhere in this section. The observed Tg difference
# (leak_fract_diff) is what this expectation is meant to be compared against, so it must
# not be used as an input here.
leak_rate_diff_fract = (weighted_average_leak_fract - 1.9/100)/(1.9/100)
expected_increase_fract = ((1+change_in_prod_fract)*(1+leak_rate_diff_fract))-1
print(f"Expectation for production leakage Tg in GIM to be this much higher than Alvarez: {round(expected_increase_fract*100, 1)}%")

# Assign leakage to gas consumed

## Production-level leakage for gas consumed by each state

In [None]:
# for each state's gas supply, 
# for each producing state fraction,
# look up the production leakage rate in prod_leak_g_ch4_per_mcf_dg
# multiply the producing state fraction by the production leakage rate
# when finished for each consuming state, sum the values to get the weighted average leakage from production for the state's gas supply

In [None]:
def calculate_ch4_rate_for_consumers(
    supply_fract_file, gas_flows_timestamp, 
    prod_leak_g_ch4_per_mcf_dg, prod_leak_share_dg):
    """
    Calculate the production-area leakage rate embodied in each jurisdiction's gas supply.

    For each consuming jurisdiction, the fraction of its supply traced back to each
    producing jurisdiction is multiplied by that producer's leakage rate, and the
    products are summed to give a supply-weighted average rate.

    Reads these notebook-level names: ``inputs_path``, ``weighted_average_leak_fract``,
    ``ch4_fract_in_ng_consumer_grade``, ``ch4_kg_per_mcf`` and ``g_per_kg``.

    Parameters
    ----------
    supply_fract_file : str
        Name of the Excel file produced by the gas flows notebook. It must have a
        'consuming juris' column, one column per producing jurisdiction, and an
        'overseas' row and column.
    gas_flows_timestamp : str
        Timestamp identifying the 'GIM gas flows results <timestamp>/' folder
        under ``inputs_path`` that holds the file.
    prod_leak_g_ch4_per_mcf_dg : pandas.Series
        Leakage rate by producing jurisdiction, named 'prod leak g CH4/Mcf DG'.
    prod_leak_share_dg : float
        Share of production-area leakage allocated to dry gas; scales the default
        rate used for producers without a rate of their own.

    Returns
    -------
    pandas.Series
        Named 'prod leak g CH4/Mcf dry gas', indexed by consuming jurisdiction.

    Raises
    ------
    FileNotFoundError
        If the supply-fraction file does not exist (after printing a hint).
    """
    # for each state, get attribution of consumption traced back to production areas
    # import results from notebook 'GIM gas flows...'
    supply_fract_path = inputs_path + f'GIM gas flows results {gas_flows_timestamp}/' + supply_fract_file
    try:
        supply_fract = pd.read_excel(supply_fract_path)
    except FileNotFoundError:
        # give the hint, then stop: continuing would only fail later with a
        # confusing UnboundLocalError on supply_fract
        print("Error!" + " File not found. It may be that the file needs to be moved to the GIM inputs folder.")
        print(f"File name: {supply_fract_file}")
        raise
    supply_fract = supply_fract.set_index('consuming juris')

    # note: column 'overseas' has NaN for every consuming juris, except for consuming juris 'overseas', where the value is 0
    # so can drop the column 'overseas'
    # also drop the row 'overseas'; used in gas flows sub-model to allocate production back to each state as best as possible
    supply_fract = supply_fract.drop('overseas', axis=1)
    supply_fract = supply_fract.drop('overseas', axis=0)

    # default rate for producers without their own rate:
    # leakage fraction * CH4 content of consumer-grade gas * CH4 density, converted to grams,
    # then scaled by the share of production-area leakage allocated to dry gas
    prod_leak_default_g_ch4_per_mcf = weighted_average_leak_fract * ch4_fract_in_ng_consumer_grade * ch4_kg_per_mcf * g_per_kg
    prod_leak_default_g_ch4_per_mcf_alloc_dg = prod_leak_default_g_ch4_per_mcf * prod_leak_share_dg
    print(f"prod_leak_default_g_ch4_per_mcf_alloc_dg: {prod_leak_default_g_ch4_per_mcf_alloc_dg}")

    avg_leak_by_consumer = {} # consuming juris -> supply-weighted leakage rate

    for consump_juris in supply_fract.index:
        consump_supply_fract = supply_fract.loc[consump_juris]
        df = pd.concat(
            [consump_supply_fract, 
             prod_leak_g_ch4_per_mcf_dg], 
            axis=1, sort=False)
        
        # sort (useful if printing for debugging)
        df = df.sort_values(by=consump_juris, ascending=False)

        # if there are NaNs, fill with 0        
        df[consump_juris] = df[consump_juris].fillna(0)
        
        # fill in missing values for production leakage rate
        # (will fill in for Canada, Mexico, overseas)
        df['prod leak g CH4/Mcf DG'] = df['prod leak g CH4/Mcf DG'].fillna(prod_leak_default_g_ch4_per_mcf_alloc_dg)

        # keep only producers that actually contribute to this consumer's supply
        df = df.loc[df[consump_juris]>1e-9]
        df = df.dropna(subset=[consump_juris])
        df = df.sort_values(by=[consump_juris], ascending=False)

        df['weighted leak'] = df[consump_juris] * df['prod leak g CH4/Mcf DG']

        avg_leak_by_consumer[consump_juris] = df['weighted leak'].sum()

    consump_juris_avg_leak_from_prod = pd.Series(avg_leak_by_consumer, dtype=float)
    consump_juris_avg_leak_from_prod.name = 'prod leak g CH4/Mcf dry gas'
        
    return consump_juris_avg_leak_from_prod

In [None]:
# Supply-weighted production-area leakage rate (g CH4 per Mcf dry gas) for each consuming jurisdiction.
consump_juris_avg_leak_from_prod = calculate_ch4_rate_for_consumers(
    supply_fract_file, gas_flows_timestamp, 
    prod_leak_g_ch4_per_mcf_dg, prod_leak_share_dg)

In [None]:
# export
# Writes the per-consumer leakage rates as CSV under lca_path; the file name carries
# data_year and a timestamp, so earlier exports are not overwritten.
save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
consump_juris_avg_leak_from_prod.to_csv(lca_path + f'GIM production area leakage rate by consuming state for {data_year} {save_timestamp}.csv')

## Results: show main supply sources for each state

In [None]:
# Consuming state whose main supply sources are displayed.
consuming_state_sel = 'California'

# Supply fractions from the gas flows results, one row per consuming jurisdiction.
supply_fract = pd.read_excel(inputs_path + f'GIM gas flows results {gas_flows_timestamp}/' + supply_fract_file)
df = supply_fract.set_index('consuming juris')
all_consuming_juris_dfs = [] # initialize

juris = consuming_state_sel

# producing jurisdictions ranked by their share of this state's supply
juris_ser = df.loc[juris].sort_values(ascending=False)
juris_df = juris_ser.to_frame().reset_index().rename(columns={'index': 'producing state'})

# keep the seven largest sources (row labels 0 through 6, inclusive)
top = juris_df.loc[0:6].copy()
top_fract = top[juris].sum()

# everything outside the top rows is lumped into a single 'other' row
remainder_fract = 1 - top_fract
new_index = top.index.max()+1
top.at[new_index, 'producing state'] = 'other'
top.at[new_index, juris] = remainder_fract

print(consuming_state_sel)
top.T

#     all_consuming_juris_dfs += [df]

# all_consuming_juris = pd.concat(all_consuming_juris_dfs, sort=False)

# all_consuming_juris