In [None]:
# model settings

# choice of methods for services
# if True, leakage calculations for services are based on values for mains from Weller 2020
# if False, leakage calculations for services are based on values from EPA, Lamb 2015, and GTI 2016
services_based_on_mains = False

# NOTE(review): presumably controls whether distribution pipeline leakage is
# attributed only to residential & commercial deliveries; the code that reads
# this flag is not in this part of the notebook -- confirm.
allocate_dist_pipe_leak_to_res_comm_only = False

In [None]:
# key input files for GIM
# crosswalk of gas company IDs (EIA, PHMSA, EPA); read_crosswalk() reads it from lca_path
crosswalk_file = 'US gas company crosswalk (EIA, PHMSA, EPA) 2020-10-24.xlsx'

# workbook of model parameters; its sheets are read from inputs_path
parameters_file = 'GIM parameters file.xlsx'

In [None]:
# file paths
# NOTE: lca_path is an absolute, machine-specific location; change it to run elsewhere.
# Every path ends with '/' because file names are appended by plain string concatenation.
lca_path = '/Users/masoninman/Dropbox/GEM/LCA of natural gas use/'
inputs_path = lca_path + 'US cities LCA of gas model/US gas model inputs/'
eia_176_path = lca_path + 'EIA data for LCA of gas/EIA-176 data/'

In [None]:
import pandas as pd
import numpy as np
import math
import time

In [None]:
# read parameters
# The workbook name comes from parameters_file (defined with the key input files),
# so it is specified in one place only.
df = pd.read_excel(inputs_path + parameters_file, sheet_name='main parameters')

# Series of parameter values, indexed by parameter name
parameters_main = df.set_index('parameter name')['parameter value']

# year of the PHMSA / EIA data sets to read (used to build input file names)
data_year = int(parameters_main.at['data_year'])
# when equal to True, rows for AK, HI and PR are filtered out of the input data
contiguous_us_only = parameters_main.at['contiguous_us_only']
# the next two are multiplied together (with gg_per_kg) to convert Mcf of gas to Gg of CH4
ch4_fract_in_ng_consumer_grade = parameters_main.at['ch4_fract_in_ng_consumer_grade']
ch4_kg_per_mcf = parameters_main.at['ch4_kg_per_mcf']
# NOTE(review): presumably behind-the-meter leakage parameters for residential and
# commercial customers; they are not used in this part of the notebook -- confirm.
behind_meter_res = parameters_main.at['behind_meter_res']
behind_meter_comm = parameters_main.at['behind_meter_comm']

In [None]:
# standard conversions
min_per_year = 60 * 24 * 365 # minutes in a 365-day year
kg_per_ton = 1e3 # ton = megagram (Mg)
gg_per_kg = 1e-6 # Gg = gigagram
g_per_gg = 1e9
ft_per_mile = 5280

# convert volume NG to mass of CH4 it contains
# (Mcf NG) * (CH4 fraction of NG) * (kg CH4 per Mcf) * (Gg per kg) = Gg CH4
conversion_consumer_ng_mcf_to_ch4_gg = ch4_fract_in_ng_consumer_grade * ch4_kg_per_mcf * gg_per_kg

In [None]:
# input files from external sources

# PHMSA:
# built from lca_path so that the project root is defined in one place only
# (the resulting string is identical to the previously hard-coded absolute path)
phmsa_path = lca_path + 'PHMSA annual_gas_distribution_2010_present/'
phmsa_file = f'annual_gas_distribution_{data_year}.xlsx'

# EIA:
operations_year_file = f'EIA-176 Type of Operations and Sector Items {data_year}.xlsx'

# TO DO: delete; not used anymore
# # old file
# # other_file = 'EIA-176 Natural Gas Other Disposition Items 1997-2018 (incl leakage volumes).xlsx'
# if data_year <= 2019:
#     other_file = 'EIA-176 Natural Gas Other Disposition Items 1997-2019 (incl leakage volumes).xlsx'
# else:
#     print("Error!" + f" File EIA-176 Natural Gas Other Disposition Items does not cover {data_year}")

# GTI data on meter leakage (Moore 2019)
gti_meter_leak_file = 'GTI 2019 industrial and commercial meter leakage.xlsx'

# lookup of state abbreviations; read by read_state_abbreviations() from inputs_path
state_abbrev_file = 'US states and abbreviations.xlsx'

res_meters_outdoors_file = 'GIM input - residential meters outdoors by region and state.xlsx'

# EPA GHG Inventory 2020 (covers 1990-2018), natural gas systems annex tables
epa_ghgi_2020_annex_gas_path = lca_path + 'EPA data on GHG inventories and methane leakage/EPA GHGI 2020 (through 2018)/'
epa_ghgi_2020_annex_gas_file = 'EPA 2020 - Inventory of US Greenhouse Gas Emissions 1990-2018 - Natural gas systems - Annex 3.6 tables.xlsx'

## Read data

In [None]:
def read_state_abbreviations(state_abbrev_file):
    """
    Read the state-abbreviation workbook located in inputs_path.

    Returns a dict that maps each value of column 'abbrev' to the value of
    column 'state' in the same row.
    """
    abbrev_to_state = (
        pd.read_excel(inputs_path + state_abbrev_file)
        .set_index('abbrev')['state']
        .to_dict()
    )
    return abbrev_to_state

In [None]:
def read_crosswalk(crosswalk_file):
    """
    Load sheet 'crosswalk' of the gas company crosswalk workbook located in lca_path.

    Column 'EPA GHGRP ID' is read as a string. If the global parameter
    contiguous_us_only equals True, rows whose 'PHMSA STOP' is AK, HI or PR
    are removed.
    """
    crosswalk = pd.read_excel(
        lca_path + crosswalk_file,
        sheet_name='crosswalk',
        dtype={'EPA GHGRP ID': str},
    )

    if contiguous_us_only == True:
        # drop Alaska, Hawaii and Puerto Rico, leaving the contiguous US only
        outside_contiguous_us = crosswalk['PHMSA STOP'].isin(['AK', 'HI', 'PR'])
        crosswalk = crosswalk.loc[~outside_contiguous_us]

    return crosswalk

In [None]:
# run functions
# states_dict: state abbreviation -> state name
states_dict = read_state_abbreviations(state_abbrev_file)
# crosswalk: table of company IDs (includes 'PHMSA ID_ST' and 'EIA company ID',
# which are used later to attach EIA IDs to PHMSA rows)
crosswalk = read_crosswalk(crosswalk_file)

# EIA-176 gas utility data

In [None]:
def read_operations_file_and_calculate_sectoral_groups(operations_year_file, contiguous_us_only):
    """
    Read the EIA-176 "Type of Operations and Sector Items" workbook (from
    eia_176_path) and add delivery totals for each company.

    Columns added:
      * 'total deliv NG Mcf': sum of the five sectoral "Total Volume (Mcf)" columns
      * 'res-comm NG fract': (residential + commercial volume) / 'total deliv NG Mcf'

    If contiguous_us_only equals True, rows for AK, HI and PR are removed.
    Rows whose 'Company Name' is 'ADJUSTMENT COMPANY' are always removed.
    Total deliveries (Bcf) are printed after each of these stages.

    Returns the table restricted to ID columns, 'Losses from leaks', the
    sectoral volume and customer columns, and the two added columns.

    Note: for at least one utility (Midwest Energy, in Kansas), there is a mismatch
    between data in operations file and in deliveries file.
    In deliveries file, this utility's data is split between two different IDs,
    whereas in operations file, it's under one ID.
    Deliveries file has been removed from GIM.
    """
    sectors = ['Residential', 'Commercial', 'Industrial', 'Electric', 'Vehicle Fuel']
    volume_cols = [f'{sector} Total Volume (Mcf)' for sector in sectors]

    def total_bcf(frame):
        # total deliveries converted from Mcf to Bcf, for the progress printouts
        return round(frame['total deliv NG Mcf'].sum() / 1e6, 1)

    # use EIA-176 "Type of Operation" data set that has customer numbers
    ops = pd.read_excel(eia_176_path + operations_year_file, sheet_name='Sheet 1', header=1)
    ops = ops.rename(columns={
        'Company': 'EIA company ID',
        'Losses from leaks, etc. within State (Mcf)': 'Losses from leaks',
    })

    ops['total deliv NG Mcf'] = ops[volume_cols].sum(axis=1)
    print(f"all US deliveries (Bcf): {total_bcf(ops)}")

    # residential and commercial are the first two sectors in the list
    res_comm_volume = ops[volume_cols[:2]].sum(axis=1)
    ops['res-comm NG fract'] = res_comm_volume / ops['total deliv NG Mcf']

    # filter for only contiguous US (if parameter set to do so)
    if contiguous_us_only == True:
        ops = ops.loc[~ops['State'].isin(['AK', 'HI', 'PR'])]
    print(f"contiguous US deliveries (Bcf): {total_bcf(ops)}")

    # remove all adjustment companies
    # (none of them have any volumes or customer numbers anyway, at least for 2018)
    ops = ops.loc[ops['Company Name'] != 'ADJUSTMENT COMPANY']
    print(f"contiguous US deliveries excl. adjustment companies (Bcf): {total_bcf(ops)}")

    # columns to keep: IDs, losses, then volume & customers for each sector, then totals
    keep_cols = ['Year', 'State', 'EIA company ID', 'Company Name', 'Losses from leaks']
    for sector in sectors:
        keep_cols += [f'{sector} Total Volume (Mcf)', f'{sector} Total Customers']
    keep_cols += ['total deliv NG Mcf', 'res-comm NG fract']

    return ops[keep_cols]

In [None]:
# run function for EIA data
# operations_year: one row per EIA company, with sectoral delivery volumes and
# customer counts plus 'total deliv NG Mcf' and 'res-comm NG fract'
operations_year = read_operations_file_and_calculate_sectoral_groups(
    operations_year_file, contiguous_us_only)

## PHMSA data
* Note: AGA also has data "AGA 2020 - Distribution Pipe by Company Data" https://www.aga.org/research/data/distribution-pipe-by-company-annual-data-1990--2016/. It is PHMSA data, but with column headings written out in full.
* each operator can show up more than once because:
  * they may be operating in more than one state
  * each utility-state combination can have more than one report, for different commodities
* so filter only for natural gas, and then create operator + stop as unique ID

In [None]:
# ID columns to keep
# (create_mains_mat() selects these alongside the mains material columns)
phmsa_id_cols = ['EIA company ID', 'PHMSA operator_stop', 'PHMSA ID_ST'] 

# unneeded columns to drop
# (passed to import_phmsa_data_set(), which drops them from the PHMSA table;
# entries that are commented out below are therefore kept in the table)
phmsa_unneeded_cols = [
    # report metadata
    'DATAFILE_AS_OF',
    'REPORT_SUBMISSION_TYPE', 'REPORT_DATE', 'FILING_DATE', 'FORM_REV',
    'REPORT_YEAR', 'REPORT_NUMBER', 'SUPPLEMENTAL_NUMBER',
#     'OPERATOR_NAME',
#     'STOP',
    'OPERATOR_TYPE',
    'COMMODITY',
    # addresses and preparer contact details
    'OFFICE_ADDRESS_STREET', 'OFFICE_ADDRESS_CITY', 'OFFICE_ADDRESS_COUNTY', 'OFFICE_ADDRESS_STATE', 'OFFICE_ADDRESS_ZIP',
    'HQ_ADDRESS_STREET', 'HQ_ADDRESS_CITY', 'HQ_ADDRESS_COUNTY', 'HQ_ADDRESS_STATE', 'HQ_ADDRESS_ZIP',
    'PREPARERS_NAME', 'PREPARERS_TITLE', 'PREPARERS_PHONE', 'PREPARERS_FAX', 'PREPARERS_EMAIL',
    'ADDITIONAL_INFORMATION',
    # miles of mains by decade bin
    'MMILES_BY_DCD_UNK',
    'MMILES_BY_DCD_PRE1940',
    'MMILES_BY_DCD_1940_TO_1949',
    'MMILES_BY_DCD_1950_TO_1959',
    'MMILES_BY_DCD_1960_TO_1969',
    'MMILES_BY_DCD_1970_TO_1979',
    'MMILES_BY_DCD_1980_TO_1989',
    'MMILES_BY_DCD_1990_TO_1999',
    'MMILES_BY_DCD_2000_TO_2009',
    'MMILES_BY_DCD_2010_TO_2019',
    'MMILES_BY_DCD_TOTAL',
    # number of services by decade bin
    'NUM_SRVS_BY_DCD_UNK',
    'NUM_SRVS_BY_DCD_PRE1940',
    'NUM_SRVS_BY_DCD_1940_TO_1949',
    'NUM_SRVS_BY_DCD_1950_TO_1959',
    'NUM_SRVS_BY_DCD_1960_TO_1969',
    'NUM_SRVS_BY_DCD_1970_TO_1979',
    'NUM_SRVS_BY_DCD_1980_TO_1989',
    'NUM_SRVS_BY_DCD_1990_TO_1999',
    'NUM_SRVS_BY_DCD_2000_TO_2009',
    'NUM_SRVS_BY_DCD_2010_TO_2019',
    'NUM_SRVS_BY_DCD_TOTAL',
    # TOTAL_LEAKS_* columns (leaks repaired) are kept: they are commented out here
#     'TOTAL_LEAKS_COR_MAINS',
#     'TOTAL_LEAKS_NF_MAINS',
#     'TOTAL_LEAKS_EX_MAINS',
#     'TOTAL_LEAKS_OF_DAM_MAINS',
#     'TOTAL_LEAKS_MAT_WELD_MAINS',
#     'TOTAL_LEAKS_EQ_MAINS',
#     'TOTAL_LEAKS_OP_MAINS',
#     'TOTAL_LEAKS_OT_MAINS',
#     'TOTAL_LEAKS_COR_SRVS',
#     'TOTAL_LEAKS_NF_SRVS',
#     'TOTAL_LEAKS_EX_SRVS',
#     'TOTAL_LEAKS_OF_DAM_SRVS',
#     'TOTAL_LEAKS_MAT_WELD_SRVS',
#     'TOTAL_LEAKS_EQ_SRVS',
#     'TOTAL_LEAKS_OP_SRVS',
#     'TOTAL_LEAKS_OT_SRVS',
    # TOTAL_HAZLEAKS_* columns are dropped
    'TOTAL_HAZLEAKS_COR_MAINS',
    'TOTAL_HAZLEAKS_NF_MAINS',
    'TOTAL_HAZLEAKS_EX_MAINS',
    'TOTAL_HAZLEAKS_OF_DAM_MAINS',
    'TOTAL_HAZLEAKS_MAT_WELD_MAINS',
    'TOTAL_HAZLEAKS_EQ_MAINS',
    'TOTAL_HAZLEAKS_OP_MAINS',
    'TOTAL_HAZLEAKS_OT_MAINS',
    'TOTAL_HAZLEAKS_COR_SRVS',
    'TOTAL_HAZLEAKS_NF_SRVS',
    'TOTAL_HAZLEAKS_EX_SRVS',
    'TOTAL_HAZLEAKS_OF_DAM_SRVS',
    'TOTAL_HAZLEAKS_MAT_WELD_SRVS',
    'TOTAL_HAZLEAKS_EQ_SRVS',
    'TOTAL_HAZLEAKS_OP_SRVS',
    'TOTAL_HAZLEAKS_OT_SRVS',
#     'KNOWN_LEAKS',
    'FED_LAND_LEAKS_REPAIRED',
    'PERCENT_UNACC_GAS',
    # excavation damage columns
    'EXCAV_DAMAGES',
    'EXCAV_ONECALL',
    'EXCAV_LOCATING',
    'EXCAV_EXCAV',
    'EXCAV_OTHER',
    'EXCAV_TICKETS',
    'EFV_INSTALLED_CY',
    'EFV_IN_SYSTEM',
    'MMILES_OTHER_MATERIAL_DETAIL',
    'NUM_SRVS_OTHER_MATERIAL_DETAIL',
    'SHUTOFF_VALVE_INSTALLED_CY', 'SHUTOFF_VALVE_IN_SYSTEM',
]

In [None]:
def import_phmsa_data_set(phmsa_path, phmsa_file, phmsa_unneeded_cols):
    """
    Read one year of PHMSA gas distribution annual report data and prepare it for GIM.

    Uses the globals data_year (sheet name 'GD AR {data_year}') and
    contiguous_us_only (state filter).

    Steps:
      * keep only rows whose COMMODITY is 'Natural Gas'
      * if contiguous_us_only equals True, remove rows for AK, HI and PR
      * rename OPERATOR_ID to 'PHMSA OPERATOR_ID' and store it as a string
      * correct the known STOP error for Kuttawa (see fix_phmsa_error_kuttawa_city)
      * sort from largest to smallest by miles of mains, then number of services
      * create 'PHMSA operator_stop' (operator name + '_' + state) and drop OPERATOR_NAME
      * drop the columns listed in phmsa_unneeded_cols
      * rename AVERAGE_LENGTH to 'avg length (ft)'

    Parameters
    ----------
    phmsa_path : str; folder containing the PHMSA file (ends with '/').
    phmsa_file : str; file name of the PHMSA workbook.
    phmsa_unneeded_cols : list of str; columns to drop. Columns that are not
        present in the file are skipped (and reported), because the set of
        columns in the PHMSA files is not the same for every report year.

    Returns
    -------
    DataFrame with one row per natural gas operator and state.
    """
    # import PHMSA data for specified year
    phmsa = pd.read_excel(phmsa_path + phmsa_file, sheet_name=f'GD AR {data_year}', header=2)

    print(f"Successfully read PHMSA file for {data_year}")

    # keep only 'Natural Gas'
    # (excludes propane, nitrogen, hydrogen, landfill gas, synthetic gas)
    # landfill gas has only 7 rows; all pipelines are short
    # synthetic gas has only 2 rows; one is a long pipeline (458.6 mi)
    # when restricting to only 'Natural Gas', then PHMSA operator name + state is a unique identifier
    phmsa = phmsa.loc[phmsa['COMMODITY']=='Natural Gas']

    print("After filtering for the commodity natural gas only") # for UI

    if contiguous_us_only == True:
        # filter out Alaska, Hawaii & Puerto Rico; GIM is only for contiguous US
        # ('STOP' means 'state of operation')
        phmsa = phmsa.loc[~phmsa['STOP'].isin(['AK', 'HI', 'PR'])]
        print("And filtering for contiguous US only") # for UI

    print(f"len: {len(phmsa)}")
    print(f"sum of MMILES_TOTAL: {phmsa['MMILES_TOTAL'].sum()}")

    # set dtype for IDs & rename
    phmsa = phmsa.rename(columns={'OPERATOR_ID': 'PHMSA OPERATOR_ID'})
    phmsa['PHMSA OPERATOR_ID'] = phmsa['PHMSA OPERATOR_ID'].astype(str)

    # fix error
    phmsa = fix_phmsa_error_kuttawa_city(phmsa)

    # sort from largest to smallest utility,
    # in terms of miles of distribution mains, then by number of services
    phmsa = phmsa.sort_values(by=['MMILES_TOTAL', 'NUM_SRVCS_TOTAL'], ascending=False)
    phmsa = phmsa.reset_index(drop=True)

    # create operator name and stop
    phmsa['PHMSA operator_stop'] = phmsa['OPERATOR_NAME'] + '_' + phmsa['STOP']
    phmsa = phmsa.drop('OPERATOR_NAME', axis=1)

    # drop unneeded columns
    # Only columns that exist are dropped: drop() raises KeyError for a missing
    # label, and e.g. the decade-bin columns depend on the report year.
    cols_absent = [col for col in phmsa_unneeded_cols if col not in phmsa.columns]
    if cols_absent:
        print(f"Note: {len(cols_absent)} column(s) listed as unneeded are not in the PHMSA file: {cols_absent}")
    cols_to_drop = [col for col in phmsa_unneeded_cols if col in phmsa.columns]
    phmsa = phmsa.drop(cols_to_drop, axis=1)

    # change column name(s)
    # From AGA "Distribution Pipe by Company Data":
    # PHMSA column "Average Length" means "Average Service Length in feet"
    phmsa = phmsa.rename(columns={'AVERAGE_LENGTH': 'avg length (ft)'})

    return phmsa

In [None]:
def fix_phmsa_error_kuttawa_city(phmsa):
    """
    Sub-function of import_phmsa_data_set.

    Corrects the 'STOP' (state of operation) of one gas company: rows with
    operator ID '10350' and name 'KUTTAWA NATURAL GAS, CITY OF' that are
    listed as KS (Kansas) are changed to KY (Kentucky).

    The table is modified in place and also returned.
    """
    is_kuttawa = (
        (phmsa['PHMSA OPERATOR_ID'] == '10350')
        & (phmsa['OPERATOR_NAME'] == 'KUTTAWA NATURAL GAS, CITY OF')
    )
    # only rows that actually carry the wrong state are touched
    wrong_state = is_kuttawa & (phmsa['STOP'] == 'KS')
    phmsa.loc[wrong_state, 'STOP'] = 'KY'
    return phmsa

In [None]:
def combine_phmsa_companies(phmsa, eia_name, phmsa_names, phmsa_ids, stop):
    """
    Combines gas companies that are listed as separate in PHMSA, but reported as one in EIA.

    Parameters
    ----------
    phmsa : DataFrame with columns 'PHMSA OPERATOR_ID', 'PHMSA operator_stop',
        'STOP' and 'MMILES_TOTAL'. All remaining columns are summed when rows
        are combined.
    eia_name : str; name to give the combined company.
    phmsa_names : list of str; PHMSA operator names to combine.
    phmsa_ids : list of str; PHMSA operator IDs of those companies. The combined
        ID is these IDs joined with '_'.
    stop : str; state of operation of the companies.

    Returns
    -------
    DataFrame with the combined row(s) first, followed by the unchanged rows.
    The input table is not modified.

    Two checks are run; failures are printed with the prefix "Error!" (nothing
    is raised): the number of rows must shrink by len(phmsa_names) - 1, and the
    sum of MMILES_TOTAL must be unchanged (within 0.1 mi, in either direction).
    """
    len_phmsa_init = len(phmsa)
    phmsa_mmiles_init = phmsa['MMILES_TOTAL'].sum()

    phmsa_op_stops = [name + '_' + stop for name in phmsa_names]

    # split phmsa into two dfs: one to change, one not to change
    to_combine = phmsa['PHMSA operator_stop'].isin(phmsa_op_stops)
    phmsa_change = phmsa.loc[to_combine].copy()
    phmsa_no_change = phmsa.loc[~to_combine]

    # combine the companies listed in PHMSA,
    # so they correspond to the 1 company in EIA
    # (every row of phmsa_change matched one of phmsa_op_stops, so all are renamed)
    eia_name_stop = eia_name + '_' + stop
    phmsa_change['PHMSA operator_stop'] = eia_name_stop
    phmsa_change['STOP'] = stop

    # create combined ID, and give it to the rows that carry one of the listed IDs
    combined_id = '_'.join(phmsa_ids)
    has_listed_id = phmsa_change['PHMSA OPERATOR_ID'].isin(phmsa_ids)
    phmsa_change.loc[has_listed_id, 'PHMSA OPERATOR_ID'] = combined_id

    # groupby & sum; all columns quantities, other than operator ID and name
    phmsa_change = phmsa_change.groupby(
        ['PHMSA OPERATOR_ID', 'PHMSA operator_stop', 'STOP']).sum().reset_index()

    # recombine
    phmsa = pd.concat([phmsa_change, phmsa_no_change], sort=False)

    # TEST: number of rows should shrink by (number of names combined - 1)
    len_change = len(phmsa_names) - 1
    if len_phmsa_init - len_change != len(phmsa):
        print("Error!" + f" len_phmsa_init: {len_phmsa_init}; len phmsa final: {len(phmsa)}; names combined: {len(phmsa_names)}")
    # END OF TEST

    # TEST: check MMILES_TOTAL is unchanged
    # abs() so that a gain in miles is reported as well as a loss;
    # written as "not <" so that a NaN difference is reported too
    phmsa_mmiles_final = phmsa['MMILES_TOTAL'].sum()
    mmiles_diff = phmsa_mmiles_init - phmsa_mmiles_final
    print(f"for {eia_name_stop}, mmiles diff: {mmiles_diff}") # for db
    if not abs(mmiles_diff) < 1e-1:
        print("Error!" + f" phmsa_mmiles_init: {phmsa_mmiles_init}; phmsa_mmiles_final: {phmsa_mmiles_final}")

    return phmsa

In [None]:
# TO DO: delete; not used anymore

# def read_epa_ghgrp_dist_leak():
#     # read EPA file for distribution leakage methane emissions
#     epa_path = lca_path + 'EPA data for LCA of NG/'
#     df = pd.read_excel(
#         epa_path + ghgrp_dist_file, 
#         header=5, 
#         dtype={'GHGRP ID': str})
#     df = df.rename(columns={
#         'GHGRP ID': 'EPA GHGRP ID',
#         'FACILITY NAME': 'EPA FACILITY NAME'
#     })
    
#     df = df[['EPA FACILITY NAME', 'EPA GHGRP ID', 'GHG QUANTITY (METRIC TONS CO2e)']]
    
#     return df

In [None]:
def combine_epa_companies(df, eia_name, epa_names, epa_ids):
    """
    Combines facilities that are listed as separate in EPA GHGRP data, but reported as one company in EIA.

    Parameters
    ----------
    df : DataFrame with columns 'EPA FACILITY NAME', 'EPA GHGRP ID' and
        'GHG QUANTITY (METRIC TONS CO2e)'. All remaining columns are summed
        when rows are combined.
    eia_name : str; name to give the combined facility.
    epa_names : list of str; EPA facility names to combine.
    epa_ids : list of str; EPA GHGRP IDs of those facilities. The combined ID
        is these IDs joined with '_'.

    Returns
    -------
    DataFrame with the combined row(s) first, followed by the unchanged rows.
    The input table is not modified.

    Two checks are run; failures are printed with the prefix "Error!" (nothing
    is raised): the number of rows must shrink by len(epa_names) - 1, and the
    sum of GHG quantity must be unchanged (within 0.1 t, in either direction).
    """
    len_init = len(df)
    ghg_init = df['GHG QUANTITY (METRIC TONS CO2e)'].sum()

    # split df into two dfs: one to change, one not to change
    to_combine = df['EPA FACILITY NAME'].isin(epa_names)
    df_change = df.loc[to_combine].copy()
    df_no_change = df.loc[~to_combine]

    # combine the facilities listed in EPA data,
    # so they correspond to the 1 company in EIA
    # (every row of df_change matched one of epa_names, so all are renamed)
    df_change['EPA FACILITY NAME'] = eia_name

    # create combined ID, and give it to the rows that carry one of the listed IDs
    combined_id = '_'.join(epa_ids)
    has_listed_id = df_change['EPA GHGRP ID'].isin(epa_ids)
    df_change.loc[has_listed_id, 'EPA GHGRP ID'] = combined_id

    # groupby & sum; all columns quantities, other than facility ID and name
    df_change = df_change.groupby(
        ['EPA GHGRP ID', 'EPA FACILITY NAME']).sum().reset_index()

    # recombine
    df = pd.concat([df_change, df_no_change], sort=False)

    # TEST: number of rows should shrink by (number of names combined - 1)
    len_change = len(epa_names) - 1
    if len_init - len_change != len(df):
        print("Error!" + f" len_init: {len_init}; len final: {len(df)}; names combined: {len(epa_names)}")
    # END OF TEST

    # TEST: check GHG quantity is unchanged
    # abs() so that a gain is reported as well as a loss;
    # written as "not <" so that a NaN difference is reported too
    ghg_final = df['GHG QUANTITY (METRIC TONS CO2e)'].sum()
    ghg_diff = ghg_init - ghg_final
    print(f"for {eia_name}, tons CO2e diff (init - final): {ghg_diff}") # for db
    if not abs(ghg_diff) < 1e-1:
        print("Error!" + f" ghg_init: {ghg_init}; ghg_final: {ghg_final}")

    return df

In [None]:
def phmsa_merge_eia_company_ids_and_clean(phmsa, crosswalk):
    """
    Attach EIA company IDs to the PHMSA table, using the crosswalk.

    A unique ID 'PHMSA ID_ST' (operator ID + '_' + state of operation) is built
    for each PHMSA row and 'PHMSA OPERATOR_ID' is dropped. 'EIA company ID' is
    then looked up in crosswalk by 'PHMSA ID_ST' (left merge, so PHMSA rows
    without a match are kept, with a missing 'EIA company ID').

    Parameters
    ----------
    phmsa : DataFrame with columns 'PHMSA OPERATOR_ID' and 'STOP'.
    crosswalk : DataFrame with columns 'PHMSA ID_ST' and 'EIA company ID'.

    Returns
    -------
    New DataFrame with a fresh 0..n-1 index. Neither input is modified: the
    string conversion of the crosswalk key is done on a local copy, so missing
    values in the caller's crosswalk are not turned into strings.
    """
    df = phmsa.copy()

    df = df.reset_index(drop=True)

    # add column for combined operator ID and state (to serve as unique ID)
    df['PHMSA ID_ST'] = df['PHMSA OPERATOR_ID'].astype(str) + '_' + df['STOP']
    df = df.drop('PHMSA OPERATOR_ID', axis=1)

    df['PHMSA ID_ST'] = df['PHMSA ID_ST'].astype(str)

    # local copy of the two lookup columns, with the key as string to match df
    id_lookup = crosswalk[['PHMSA ID_ST', 'EIA company ID']].copy()
    id_lookup['PHMSA ID_ST'] = id_lookup['PHMSA ID_ST'].astype(str)

    # merge in EIA company IDs from crosswalk
    df = pd.merge(df, id_lookup, on='PHMSA ID_ST', how='left')

    return df

#### run functions for PHMSA data

In [None]:
# read and clean the PHMSA data for data_year
# (natural gas rows only; the columns in phmsa_unneeded_cols are dropped)
phmsa = import_phmsa_data_set(phmsa_path, phmsa_file, phmsa_unneeded_cols)

In [None]:
# Combine PHMSA operators that EIA reports as a single company.
# Each entry is: (EIA name, PHMSA operator names, PHMSA operator IDs, state of operation)
for eia_name, phmsa_names, phmsa_ids, stop in [
    # combine Atmos in Texas
    (
        'ATMOS ENERGY CORPORATION',
        ['ATMOS ENERGY CORPORATION - MID-TEX', 'ATMOS ENERGY CORPORATION - WEST TEXAS'],
        ['31348', '4473'],
        'TX',
    ),
    # combine Spire in Missouri
    (
        'SPIRE MISSOURI INC',
        ['SPIRE MISSOURI INC. WEST', 'SPIRE MISSOURI INC. EAST'],
        ['30769', '11032'],
        'MO',
    ),
    # combine Colonial Gas in Massachusetts
    (
        'COLONIAL GAS CO CO KEYSPAN',
        ['CAPE COD GAS CO (DIV OF COLONIAL GAS CO)', 'COLONIAL GAS CO - LOWELL DIV'],
        ['2066', '11856'],
        'MA',
    ),
]:
    phmsa = combine_phmsa_companies(phmsa, eia_name, phmsa_names, phmsa_ids, stop)

In [None]:
# note: merge crosswalk data only after changes above
# (the combined companies get new operator IDs, and 'PHMSA ID_ST' -- the key
# used to look up the EIA company ID -- is built from the operator ID)
phmsa = phmsa_merge_eia_company_ids_and_clean(phmsa, crosswalk)

In [None]:
# compare number of companies in PHMSA to number that don't have corresponding EIA ID in crosswalk
# first number: all PHMSA rows; second number: PHMSA rows with no EIA company ID
print(len(phmsa))
print(len(phmsa.loc[phmsa['EIA company ID'].isna()]))

## Share of gas sales from companies that own local distribution systems

In [None]:
# merge phmsa & operations_year, which has gas deliveries
# (outer merge: rows of either table without a partner are kept)
df = phmsa.merge(operations_year, on='EIA company ID', how='outer')

# LDCs: rows that have a PHMSA record; non-LDCs: rows found in the EIA data only
ldcs = df.loc[df['PHMSA operator_stop'].notna()]
non_ldcs = df.loc[df['PHMSA operator_stop'].isna()]

# deliveries of each group, as a fraction of all deliveries in operations_year
ldcs_fract = ldcs['total deliv NG Mcf'].sum() / operations_year['total deliv NG Mcf'].sum()
non_ldcs_fract = non_ldcs['total deliv NG Mcf'].sum() / operations_year['total deliv NG Mcf'].sum()

print(f"% of deliveries from LDCs:     {round(ldcs_fract*100, 1)}%")
print(f"% of deliveries from non-LDCs: {round(non_ldcs_fract*100, 1)}%")

In [None]:
# how much of gas to electric sector is from LDCs vs non-LDCs?
# (each fraction: volume delivered by non-LDCs / volume delivered by all companies in df)
print(
    "electric sector gas fraction from non-LDCs: "
    f"{round(100*(non_ldcs['Electric Total Volume (Mcf)'].sum()/df['Electric Total Volume (Mcf)'].sum()), 1)}%"
)
print(
    "industrial sector gas fraction from non-LDCs: "
    f"{round(100*(non_ldcs['Industrial Total Volume (Mcf)'].sum()/df['Industrial Total Volume (Mcf)'].sum()), 1)}%"
)

In [None]:
# share of non-LDC deliveries going to the industrial and electric sectors:
# (industrial + electric volume of non-LDCs) / (total deliveries of non-LDCs)
non_ldcs_ind_elec_fract = non_ldcs[['Industrial Total Volume (Mcf)', 'Electric Total Volume (Mcf)']].sum().sum()/non_ldcs['total deliv NG Mcf'].sum()
print(f"% of non-LDCs deliveries that are to industrial-electric: {round(non_ldcs_ind_elec_fract*100, 1)}%")

## Distribution mains

### distribution mains caveats
* One argument against Weller might be that older pipes (using older materials like cast iron) also tend to be smaller pipes, so they don't leak as much.
  * PHMSA data has miles of pipe for each material and in various bins for diameters
    * e.g., Main - Cast/wrought iron - Diameter unknown, < 2", 2-4", 4-8", 8-12", >12"
  * Could calculate average diameter for each material of pipe, in each utility
* However, Weller 2020 says, in the section on the *size* of leaks (g/min): "Pipeline age, diameter, and pressure were not predictive of emissions." 
  * That's interesting and surprising to me; I would have thought that with larger diameter and pressure, the leaks would be larger. Could it be that cast iron is larger diameter, but lower pressure, and carries less gas for a given diameter, but is leakier per mile? And thus much leakier per volume of gas transported?
* In Weller 2020, is pipeline diameter predictive of the *number* of leaks per mile? (More surface area that could corrode, etc.?)

Weller 2020 Table S4:
* Total length of distribution mains (2017): 1,296,011 miles
* That fits with the total I calculated from PHMSA (2018) of 1,307,576 miles

Weller 2020 Table 2:
* Total number of leaks is 659,100
* That fits with stated leaks per mile of 0.51

### leakage based on pipeline materials

Weller 2020: 
* "The material categories listed in the report include steel (bare and coated), ductile iron, copper, cast/wrought iron, plastic (with subcategories), reconditioned cast iron, and 'other'"
* "For our analysis, we considered four material types (bare steel, cast iron, coated steel, and plastic)."
* "We included copper pipe (<0.002% of the total pipeline miles) in the bare steel category. We likewise combined the rarer types of iron pipeline materials (i.e., ductile iron (0.04%), and reconditioned cast iron (0.02%)) with the cast iron category. Finally, we included pipe categorized as “other” (0.1%) in the coated steel category."

#### dist mains leakage
* based on materials for each pipeline reported to PHMSA
* use leakage quantity per mile from Weller 2020
* caveat: there can be a range of utilities, some with large mains on average (e.g., NYC) and some with small mains (e.g., Smallville)
  * However, Weller 2020 was not able to find a correlation between size of leaks (check) and pipeline diameter
  * I would have expected larger diameter pipelines to have larger leaks because they hold more gas per mile, so the lack of correlation is interesting
  * What about correlation between the number of leaks per mile and pipeline diameter?

In [None]:
# sandbox for table:
# miles of mains for each PHMSA material column, and share of total mains miles
mains_materials_cols = [
    'MMILES_STEEL_UNP_BARE', # steel: no cathodic protection, bare
    'MMILES_STEEL_UNP_COATED', # steel: no cathodic protection, coated
    'MMILES_STEEL_CP_BARE', # steel: cathodic protection, bare
    'MMILES_STEEL_CP_COATED', # steel: cathodic protection, coated
    'MMILES_PLASTIC',
    'MMILES_CI', # cast iron
    'MMILES_DI', # ductile iron
    'MMILES_CU', # copper
    'MMILES_OTHER',
    'MMILES_RCI', # reconditioned cast iron
    'MMILES_TOTAL',
]
for col in mains_materials_cols:
    print(f"{col}: {phmsa[col].sum()} mi")
    # the denominator is taken from the data (previously a hard-coded total),
    # so that the shares are right for any data_year and state filter
    print(f"{col}: {round(100*phmsa[col].sum()/phmsa['MMILES_TOTAL'].sum(), 1)}%")

In [None]:
def create_mains_mat(phmsa):
    """
    Build the table of distribution mains by material.

    Takes the ID columns (phmsa_id_cols) and the PHMSA miles-of-mains columns,
    and adds four grouped columns (following the grouping described in Weller 2020):
      * 'cast iron mi': cast iron + ductile iron + reconditioned cast iron
      * 'bare steel mi': bare steel (with & without cathodic protection) + copper
      * 'coated steel mi': coated steel (with & without cathodic protection) + other
      * 'plastic mi': plastic

    Prints an "Error!" message if the grouped miles do not add up to
    MMILES_TOTAL, then prints the share of each group and the shares of the
    'other' and copper categories.

    Returns the new table; phmsa is not modified.
    """
    raw_material_cols = [
        'MMILES_STEEL_UNP_BARE', # steel: no cathodic protection, bare
        'MMILES_STEEL_UNP_COATED', # steel: no cathodic protection, coated
        'MMILES_STEEL_CP_BARE', # steel: cathodic protection, bare
        'MMILES_STEEL_CP_COATED', # steel: cathodic protection, coated
        'MMILES_PLASTIC',
        'MMILES_CI', # cast iron
        'MMILES_DI', # ductile iron
        'MMILES_CU', # copper
        'MMILES_OTHER',
        'MMILES_RCI', # reconditioned cast iron
        'MMILES_TOTAL',
    ]

    # grouped column -> PHMSA columns that are summed into it
    material_groups = {
        'cast iron mi': ['MMILES_CI', 'MMILES_DI', 'MMILES_RCI'],
        'bare steel mi': ['MMILES_STEEL_UNP_BARE', 'MMILES_STEEL_CP_BARE', 'MMILES_CU'],
        'coated steel mi': ['MMILES_STEEL_UNP_COATED', 'MMILES_STEEL_CP_COATED', 'MMILES_OTHER'],
        'plastic mi': ['MMILES_PLASTIC'],
    }
    group_cols = list(material_groups)

    mains_mat = phmsa[phmsa_id_cols + raw_material_cols].copy()
    for group_col, member_cols in material_groups.items():
        mains_mat[group_col] = mains_mat[member_cols].sum(axis=1)

    # TEST: grouped miles must add up to the reported total
    checksum = mains_mat[group_cols].sum(axis=1).sum() - mains_mat['MMILES_TOTAL'].sum()
    if not abs(checksum) < 1e-3:
        print("Error!" + f" There was a difference in the miles of pipeline: {checksum}")

    # show fraction of pipes for each material (for the principal materials)
    sums_by_mat = mains_mat[group_cols].sum()
    fract_by_mat = (sums_by_mat / sums_by_mat.sum()).sort_values(ascending=False)
    pct_by_mat = (100*fract_by_mat).round(1).astype(str) + '%'
    print("Show % for each material (after grouping):")
    print(pct_by_mat)
    print()

    total_mi = mains_mat['MMILES_TOTAL'].sum()
    print("Percentage of mains length from other materials that are difficult to categorize:")
    print(f"other: {round(100*mains_mat['MMILES_OTHER'].sum() / total_mi, 3)}%")
    print(f"copper: {round(100*mains_mat['MMILES_CU'].sum() / total_mi, 3)}%")

    return mains_mat

In [None]:
def calc_num_leaks_and_mass_leak(mains_mat, mains_leak_df):
    """
    Add the estimated number of leaks and the leaked mass of CH4 to mains_mat.

    For each material group (cast iron, bare steel, coated steel, plastic):
      * 'num leaks <material>' = miles * leaks per mile
      * '<...> EF' = leaks per mile * size per leak; units g CH4/(mi-min)
      * 'leak CH4 Gg/y <material>' = miles * EF, converted from g/min to Gg/y
    plus the row totals 'num leaks total' and 'mains leak CH4 Gg/y'.

    Parameters
    ----------
    mains_mat : table with the columns '<material> mi' (see create_mains_mat);
        it is modified in place and also returned.
    mains_leak_df : table indexed by material, with columns
        'number of leaks per mile' and 'size per leak (g CH4/min)'.
    """
    # (material, name of its emissions factor column)
    materials = [
        ('cast iron', 'cast iron EF'),
        ('bare steel', 'steel bare EF'),
        ('coated steel', 'steel coat EF'),
        ('plastic', 'plastic EF'),
    ]

    leaks_per_mi = {
        material: mains_leak_df.at[material, 'number of leaks per mile']
        for material, _ in materials
    }
    leak_size = {
        material: mains_leak_df.at[material, 'size per leak (g CH4/min)']
        for material, _ in materials
    }

    # number of leaks
    for material, _ in materials:
        mains_mat[f'num leaks {material}'] = mains_mat[f'{material} mi'] * leaks_per_mi[material]
    mains_mat['num leaks total'] = mains_mat[
        [f'num leaks {material}' for material, _ in materials]
    ].sum(axis=1)

    # ef means "emissions factor"; units are g CH4/(mi-min)
    # (same value in every row; stored as a column of the table)
    for material, ef_col in materials:
        mains_mat[ef_col] = leaks_per_mi[material] * leak_size[material]

    # convert from g/min to Gg/y
    conversion_g_min_to_gg_y = min_per_year / 1e9

    for material, ef_col in materials:
        mains_mat[f'leak CH4 Gg/y {material}'] = (
            mains_mat[f'{material} mi'] * mains_mat[ef_col] * conversion_g_min_to_gg_y
        )
    mains_mat['mains leak CH4 Gg/y'] = mains_mat[
        [f'leak CH4 Gg/y {material}' for material, _ in materials]
    ].sum(axis=1)

    return mains_mat

In [None]:
# mean leaks per mile for each material
# from Weller 2020 Table 2
# mean leak size for each material (for distribution mains)
# Weller 2020, Table 3; units g/min of CH4
# this distinction is separate from cathodic protection (or lack thereof)

# read from sheet 'mains leakage Weller' of the GIM parameters file
mains_leak_df = pd.read_excel(inputs_path + parameters_file, sheet_name='mains leakage Weller')
# index by material, so that values can be looked up as mains_leak_df.at[material, column]
mains_leak_df = mains_leak_df.set_index('mains material')

In [None]:
# run functions
# 1) miles of mains for each material group; 2) number of leaks and leaked CH4 (Gg/y)
mains_mat = create_mains_mat(phmsa)
mains_mat = calc_num_leaks_and_mass_leak(mains_mat, mains_leak_df)

In [None]:
# sandbox:
# merge mains_mat and deliveries (operations_year)
# (outer merge on EIA company ID; rows without PHMSA data are filtered out further below)
test = pd.merge(
    mains_mat[[
        'EIA company ID', 'PHMSA operator_stop', 'PHMSA ID_ST',
        'MMILES_STEEL_UNP_BARE', 'MMILES_STEEL_UNP_COATED', 'MMILES_STEEL_CP_BARE', 'MMILES_STEEL_CP_COATED', 
        'MMILES_CI', 'MMILES_DI', 'MMILES_CU', 'MMILES_OTHER', 'MMILES_RCI', 'MMILES_PLASTIC',
        'MMILES_TOTAL', 
        'cast iron mi', 'bare steel mi', 'coated steel mi', 'plastic mi',
        'cast iron EF', 'steel bare EF', 'steel coat EF', 'plastic EF',
#         'num leaks cast iron', 'num leaks bare steel',
#         'num leaks coated steel', 'num leaks plastic', 'num leaks total',
        'leak CH4 Gg/y cast iron', 'leak CH4 Gg/y bare steel',
        'leak CH4 Gg/y coated steel', 'leak CH4 Gg/y plastic',
        'mains leak CH4 Gg/y']],
    operations_year[['EIA company ID', 'total deliv NG Mcf']], # 'res-comm NG fract'
    on='EIA company ID',
    how='outer'
)
# leakage intensity: (Gg CH4 -> g CH4) per Mcf of gas delivered
test['g CH4 dist main leak/Mcf deliv'] = (test['mains leak CH4 Gg/y']*g_per_gg) / (test['total deliv NG Mcf'])
test = test.sort_values(by='g CH4 dist main leak/Mcf deliv', ascending=False)

# leakage per mile of mains; the factor 1e6 converts Gg to kg
mains_mi = test[['cast iron mi', 'bare steel mi', 'coated steel mi', 'plastic mi']].sum(axis=1)
test['kg CH4 leak/mi'] = (test['mains leak CH4 Gg/y'] * 1e6) / mains_mi
# miles of mains per MMcf delivered; dividing by 1000 converts Mcf to MMcf
test['mi mains/MMcf deliv'] = test['MMILES_TOTAL'] / (test['total deliv NG Mcf']/1000)

# keep only rows that have PHMSA data and a non-zero length of mains
test = test.loc[test['PHMSA ID_ST'].isna()==False]
test = test.loc[test['MMILES_TOTAL']>0]

# # export
# save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
# test.to_excel(lca_path + f'GIM results - LDC leakage - mains_mat test {save_timestamp}.xlsx', index=False)

### Compare mains leakage vs Weller 2020

In [None]:
def compare_mains_vs_weller(mains_mat, data_year):
    """
    Print a comparison of GIM distribution-mains results against Weller 2020.

    Two national totals are compared: the number of leaks and the mass of CH4 leaked.

    Parameters
    ----------
    mains_mat : pandas.DataFrame
        Must contain the columns 'num leaks cast iron', 'num leaks bare steel',
        'num leaks coated steel', 'num leaks plastic' and 'mains leak CH4 Gg/y'.
    data_year : int
        Year the model is running for; a warning is printed if it is not the
        year analyzed by Weller 2020.

    Returns
    -------
    None
        Results are printed only.
    """
    weller_data_year = 2017
    if data_year != weller_data_year:
        print(f"Warning! Weller 2020 analyzed data for 2017; GIM is running with data_year {data_year}")
        print()

    # compare number of leaks
    num_leaks_cols = [
        'num leaks cast iron', 'num leaks bare steel', 'num leaks coated steel', 'num leaks plastic']
    total_leaks_mains_mat = mains_mat[num_leaks_cols].sum(axis=1).sum()
    print(f"for {data_year}, total leaks based on mains_mat: {int(total_leaks_mains_mat)}")
    weller_num_leaks = 659100
    print(f"Weller 2020, Table 2 for 2017: Total number of leaks is {weller_num_leaks}")
    fract_diff = (total_leaks_mains_mat-weller_num_leaks)/weller_num_leaks
    print(f"number of leaks, difference GEM vs Weller, as %: {round(fract_diff*100, 2)}%")
    print()

    # compare mass of leaked CH4
    total_leakage_mains_mat = mains_mat['mains leak CH4 Gg/y'].sum()
    print(f"For {data_year}, GIM total leakage from mains (CH4 Gg/y): {round(total_leakage_mains_mat, 2)}")
    total_leakage_mains_mat_Weller = 0.69 * 1000 # units here Gg/y; reported in paper as 0.69 Tg/y
    fract_diff = (total_leakage_mains_mat-total_leakage_mains_mat_Weller)/total_leakage_mains_mat_Weller
    # use the round() builtin (as above) rather than the numpy-only .round() method,
    # so this also works when the column sum comes back as a plain Python float
    print(f"Total leakage based on materials: difference from Weller: {round(fract_diff*100, 1)}%")

    # no return

In [None]:
# show comparisons
# (output is printed; compare_mains_vs_weller has no return value)
compare_mains_vs_weller(mains_mat, data_year)

In [None]:
# PHMSA reports leaks repaired in columns named "TOTAL_LEAKS_<code>_MAINS" for mains
# and "TOTAL_LEAKS_<code>_SRVS" for services (the <code> token is presumably the leak cause)
mains_leaks_repaired_cols = [
    f'TOTAL_LEAKS_{code}_MAINS'
    for code in ('COR', 'NF', 'EX', 'OF_DAM', 'MAT_WELD', 'EQ', 'OP', 'OT')
]

serv_leaks_repaired_cols = [
    f'TOTAL_LEAKS_{code}_SRVS'
    for code in ('COR', 'NF', 'EX', 'OF_DAM', 'MAT_WELD', 'EQ', 'OP', 'OT')
]

# national totals of leaks repaired: sum across columns for each operator, then across operators
phmsa_leaks_repaired_mains = phmsa[mains_leaks_repaired_cols].sum(axis=1).sum()
phmsa_leaks_repaired_serv = phmsa[serv_leaks_repaired_cols].sum(axis=1).sum()
fract_leak_repaired_serv = (
    phmsa_leaks_repaired_serv / (phmsa_leaks_repaired_serv + phmsa_leaks_repaired_mains)
)

print(f"US total mains leaks repaired (PHMSA): {int(phmsa_leaks_repaired_mains)}")
print(f"US total services leaks repaired (PHMSA): {int(phmsa_leaks_repaired_serv)}")
print(f"fraction of leaks repaired that were on service lines: {round(100*fract_leak_repaired_serv, 1)}%")

distribution_mains_leak_num_weller = 659100
# Note: Weller 2020 also said 630,000 leaks in abstract and on Figure 1;
# From discussion with him Oct 2020, it turned out the 630,000 value was an error due to a coding problem they had fixed;
# Weller is going to file a correction with the journal

print(f"Ratio of leaks in distribution mains in Weller/PHMSA leaks repaired: {round(distribution_mains_leak_num_weller/phmsa_leaks_repaired_mains, 1)}")
# PHMSA also reports leaks remaining at end of reporting cycle; apparently this is for mains & services together
print("Note: PHMSA reports leaks remaining at the end of the year (both mains & services?)")

# Note that Weller's total quantity leaked from distribution mains is 4.8X EPA estimate
# Weller's number of leaks is 5.3X PHMSA data (reported by companies).
# Interesting that those two ratios are in the same ballpark!

# But Weller also had higher average leak size than EPA.
# So the difference between Weller and EPA is due to both leak count (or leaks/mi) and leak size.

# service line leakage
* Weller 2020 says: "Pipeline age, diameter, and pressure were not predictive of emissions. As a result, we follow the GRI/EPA 1992 and Lamb 2015 studies and consider differences in emissions factors between materials." (The emissions factor in this case is the size of leak (g/min).)
* Also, EPA used similar emissions per mile for mains and service lines
* This may be due to higher number of leaks per mile, even if each individual leak is smaller
  * In PHMSA data, the rate of leak repairs per mile was ~4X higher for services than for mains
  * Might be partly due to service leaks more often being rated hazardous because of being close to buildings
  * Nonetheless suggests that services suffer a higher number of leaks per mile than mains
* Therefore, use Weller 2020 leakage rates per mile for mains to apply to service lines
* Split ind-elec and res-comm based on number of customers
  * Caveat: there can be more res-comm customers than the number of service lines
  * Presumably this is because there are large buildings that have one service line, but then it splits out to many customers
  * For cases like this, the length of services may be much longer than the average length for the utility
  * So it is reasonable to use the number of customers as a way of splitting res-comm and ind-elec leakage

In [None]:
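# # calculate emissions for each utility
# NOTE: the function below is commented out, but the cell that dispatches on `services_based_on_mains`
# still calls `calc_serv_leakage_based_on_mains` when that setting is True. Unless the function is defined
# elsewhere in the notebook, re-enable it before switching that setting on.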

# def calc_serv_leakage_based_on_mains(phmsa, mains_leak_df):
#     """
#     Use parameters from mains to apply to service lines as well.
#     """
       
#     cols_to_keep = [
#         'PHMSA operator_stop',
#         'PHMSA ID_ST',
#         'EIA company ID',
#         'NUM_SRVS_STEEL_UNP_BARE', 'NUM_SRVS_STEEL_UNP_COATED',
#         'NUM_SRVS_STEEL_CP_BARE', 'NUM_SRVS_STEEL_CP_COATED',
#         'NUM_SRVS_PLASTIC', 'NUM_SRVS_CI',
#         'NUM_SRVS_DI', 'NUM_SRVS_CU',
#         'NUM_SRVS_OTHER', 'NUM_SRVS_RCI',
#         'avg length (ft)',
#     ]
#     # note: above excludes 'NUM_SRVCS_TOTAL'
    
#     serv_mat = phmsa.copy()[cols_to_keep]
    
#     bare_steel_leaks_per_mi = mains_leak_df.at['bare steel mi', 'number of leaks per mi']
#     coated_steel_leaks_per_mi = mains_leak_df.at['coated steel mi', 'number of leaks per mi']
#     plastic_leaks_per_mi = mains_leak_df.at['plastic mi', 'number of leaks per mi']

#     # leak sizes have units of g CH4/min
#     leak_size_bare_steel = mains_leak_df.at['bare steel mi', 'size per leak (g CH4/min)']
#     leak_size_coated_steel = mains_leak_df.at['coated steel mi', 'size per leak (g CH4/min)']
#     leak_size_plastic = mains_leak_df.at['plastic mi', 'size per leak (g CH4/min)']

#     steel_bare_mats = [
#         'NUM_SRVS_STEEL_UNP_BARE', 'NUM_SRVS_STEEL_CP_BARE', 
#         'NUM_SRVS_CI', 'NUM_SRVS_DI', 'NUM_SRVS_RCI', 
#         'NUM_SRVS_CU'
#         ]
#     num_serv_steel_bare = serv_mat[steel_bare_mats].sum(axis=1)
#     num_serv_steel_bare.name = 'NUM_SRVS steel bare'

#     steel_coated_mats = ['NUM_SRVS_STEEL_UNP_COATED', 'NUM_SRVS_STEEL_CP_COATED']
#     num_serv_steel_coat = serv_mat[steel_coated_mats].sum(axis=1)
#     num_serv_steel_coat.name = 'NUM_SRVS steel coated'

#     plastic_mats = ['NUM_SRVS_PLASTIC', 'NUM_SRVS_OTHER']
#     num_serv_plastic = serv_mat[plastic_mats].sum(axis=1)
#     num_serv_plastic.name = 'NUM_SRVS plastic'
    
#     # leaks per mile and leak size from above (in distribution mains section)
#     # ef means "emissions factor"; units are annual g CH4/mi
#     steel_bare_ef = bare_steel_leaks_per_mi * leak_size_bare_steel * min_per_year
#     steel_coated_ef = coated_steel_leaks_per_mi * leak_size_coated_steel * min_per_year
#     plastic_ef = plastic_leaks_per_mi * leak_size_plastic * min_per_year
    
#     # emissions = number of services * average length (mi) * emissions factor (g CH4/mi/min) * (1 Gg/1e6) * conversion factor
#     # where average length (mi) = average length (ft) / (ft/mi)
#     # resulting units for emissions: Gg CH4/y
#     serv_length_mi = serv_mat['avg length (ft)'] / ft_per_mile
          
#     # emissions factors have units of g/mi-yr; emissions results are Gg/yr
#     gg_per_g = 1e-9
#     serv_steel_bare_em = num_serv_steel_bare * serv_length_mi * steel_bare_ef * gg_per_g
#     serv_steel_bare_em.name = 'serv steel bare CH4 Gg'
#     serv_steel_coat_em = num_serv_steel_coat * serv_length_mi * steel_coated_ef * gg_per_g
#     serv_steel_coat_em.name = 'serv steel coated CH4 Gg'
#     serv_plastic_em    = num_serv_plastic    * serv_length_mi * plastic_ef * gg_per_g
#     serv_plastic_em.name = 'serv plastic CH4 Gg'

#     all_serv_em = [serv_steel_bare_em, serv_steel_coat_em, serv_plastic_em]
#     serv_mat = pd.concat([serv_mat] + all_serv_em, sort=False, axis=1)
    
#     all_serv_em_sum = pd.concat(all_serv_em, sort=False, axis=1).sum(axis=1)
#     serv_mat['service leak CH4 Gg'] = all_serv_em_sum
    
#     return serv_mat

In [None]:
# calculate emissions for each utility

def calc_serv_leakage_based_on_epa(phmsa, serv_leak_epa_df):
    """
    Estimate service-line CH4 leakage for each row of `phmsa` from per-service leakage factors.

    The per-service factors (kg CH4/service/y) are converted to per-foot factors by dividing
    by the national average service length, then applied to each row's number of services
    (grouped by material) times that row's average service length. This assumes that, within
    a row, services are the same length regardless of material.

    Parameters
    ----------
    phmsa : pandas.DataFrame
        Must contain 'PHMSA operator_stop', 'PHMSA ID_ST', 'EIA company ID', the
        'NUM_SRVS_*' material columns, 'NUM_SRVCS_TOTAL' and 'avg length (ft)'.
    serv_leak_epa_df : pandas.DataFrame
        Must contain a 'services material' column with rows 'protected steel',
        'unprotected steel', 'plastic' and 'copper', and a
        'leakage per service line (kg CH4/service/y)' column.

    Returns
    -------
    pandas.DataFrame
        The selected `phmsa` columns plus one emissions column per material group
        and their row-wise sum in 'service leak CH4 Gg' (all in Gg CH4/y).
    """
    print("Calculating services leakage based on EPA")

    length_col = 'avg length (ft)'
    ef_col = 'leakage per service line (kg CH4/service/y)'

    cols_to_keep = [
        'PHMSA operator_stop',
        'PHMSA ID_ST',
        'EIA company ID',
        'NUM_SRVS_STEEL_UNP_BARE', 'NUM_SRVS_STEEL_UNP_COATED',
        'NUM_SRVS_STEEL_CP_BARE', 'NUM_SRVS_STEEL_CP_COATED',
        'NUM_SRVS_PLASTIC', 'NUM_SRVS_CI',
        'NUM_SRVS_DI', 'NUM_SRVS_CU',
        'NUM_SRVS_OTHER', 'NUM_SRVS_RCI',
        'NUM_SRVCS_TOTAL',
        length_col,
    ]
    serv_mat = phmsa[cols_to_keep].copy()

    # national average service length (ft) = total feet of services / total number of services
    # NOTE(review): rows with a missing average length drop out of the numerator (NaN product,
    # skipped by the sum) but their services still count in the denominator -- confirm intended
    ft_of_services_us_tot = serv_mat[[length_col, 'NUM_SRVCS_TOTAL']].product(axis=1, skipna=False).sum()
    ft_per_service_us_avg = ft_of_services_us_tot / serv_mat['NUM_SRVCS_TOTAL'].sum()

    serv_leak_epa_df = serv_leak_epa_df.set_index('services material')

    # (row label in the factor table, output column name, PHMSA count columns in the group)
    # note: emissions factors are from GHGI, not from GHGRP
    # TO DO: check GHGI documentation to see if they explained how they handled cast iron (CI), ductile iron (DI), and reconditioned cast iron (RCI)
    # nonetheless, relevant to note that GHGRP regulations did not seem to include explanation of how to group materials into categories listed
    # note: plastic has lowest emissions rate,
    # so grouping 'other' with plastic is most conservative approach
    # NOTE(review): 'serv coppy CH4 Gg' looks like a typo for "copper"; the name is kept
    # unchanged here because code outside this function may read the column by that name
    material_groups = [
        ('protected steel', 'serv steel prot CH4 Gg',
         ['NUM_SRVS_STEEL_CP_COATED', 'NUM_SRVS_STEEL_CP_BARE']),
        ('unprotected steel', 'serv steel unprot CH4 Gg',
         ['NUM_SRVS_STEEL_UNP_BARE', 'NUM_SRVS_STEEL_UNP_COATED',
          'NUM_SRVS_CI', 'NUM_SRVS_DI', 'NUM_SRVS_RCI']),
        ('plastic', 'serv plastic CH4 Gg',
         ['NUM_SRVS_PLASTIC', 'NUM_SRVS_OTHER']),
        ('copper', 'serv coppy CH4 Gg',
         ['NUM_SRVS_CU']),
    ]

    all_serv_em = []
    for epa_material, em_col_name, num_cols in material_groups:
        # emissions factor (ef): kg CH4 per year per ft of services
        leak_ef = serv_leak_epa_df.at[epa_material, ef_col] / ft_per_service_us_avg
        num_serv = serv_mat[num_cols].sum(axis=1)
        # emissions = ef (kg CH4/serv-ft/y) * number of services * (avg length (ft)/serv) * (1 Gg/1e6 kg)
        # resulting units for emissions: Gg CH4/y
        serv_em = leak_ef * num_serv * serv_mat[length_col] * gg_per_kg
        serv_em.name = em_col_name
        all_serv_em.append(serv_em)

    serv_mat = pd.concat([serv_mat] + all_serv_em, sort=False, axis=1)
    serv_mat['service leak CH4 Gg'] = pd.concat(all_serv_em, sort=False, axis=1).sum(axis=1)

    print(f"total service line leakage: {round(serv_mat['service leak CH4 Gg'].sum(), 1)} Gg")

    return serv_mat

In [None]:
# note: this is all services leakage; not split between sectors
if services_based_on_mains == True:
    # NOTE(review): the definition of calc_serv_leakage_based_on_mains is commented out in the
    # cell above; unless it is defined elsewhere, this branch raises NameError until it is restored
    serv_mat = calc_serv_leakage_based_on_mains(phmsa, mains_leak_df)

elif services_based_on_mains == False:
    serv_leak_epa_df = pd.read_excel(inputs_path + parameters_file, sheet_name='services leakage EPA')
    serv_mat = calc_serv_leakage_based_on_epa(phmsa, serv_leak_epa_df)

else:
    # stop here instead of printing and carrying on: otherwise serv_mat would be
    # undefined (or left over from an earlier run) in every later cell
    raise ValueError(f"Unexpected value for services_based_on_mains: {services_based_on_mains}")

In [None]:
# sandbox:
# share of total service-line length accounted for by each material category
serv_mat_cols = [
    f'NUM_SRVS_{material}'
    for material in (
        'STEEL_CP_COATED', 'STEEL_CP_BARE', 'STEEL_UNP_BARE', 'STEEL_UNP_COATED',
        'CI', 'DI', 'RCI', 'PLASTIC', 'OTHER', 'CU',
    )
] + ['NUM_SRVCS_TOTAL']

serv_mat_by_mat = serv_mat.copy()[[*serv_mat_cols, 'avg length (ft)']]
# total length = sum over rows of (number of services * average length)
serv_len_total = serv_mat[['NUM_SRVCS_TOTAL', 'avg length (ft)']].product(axis=1).sum()

for mat_cat in serv_mat_cols:
    # label is whatever follows the 'NUM_SRVS_' prefix
    mat_cat_short = mat_cat.split('NUM_SRVS_')[-1]
    one_mat_len = serv_mat_by_mat[[mat_cat, 'avg length (ft)']].product(axis=1).sum()
    one_mat_fract = one_mat_len / serv_len_total
    print(f"{mat_cat_short}: {round(100*one_mat_fract, 3)}%")

### Distribution pipelines misc (blowdowns, mishaps, releases)
* Use EPA GHGI 2020 estimates for emissions in these categories
* Read EPA GHGI Annex 3.6 spreadsheet and calculate annual total
* Divide by dry gas delivered to end customers, to get a leakage rate g CH4/Mcf delivered

In [None]:
if data_year > 2018:
    # stop here instead of printing and carrying on: the cells below need
    # epa_ghgi_2020_annex_gas and would otherwise fail or use a stale table
    raise ValueError(
        f"The chosen data_year ({data_year}) may not be covered by EPA GHGI file here. "
        "May need to download a new file."
    )

# units are kt (kilotons) CH4/y, which is same as Gg/y
epa_ghgi_2020_annex_gas = pd.read_excel(
    epa_ghgi_2020_annex_gas_path + epa_ghgi_2020_annex_gas_file,
    sheet_name='3.6-1',
    header=7)

In [None]:
# miscellaneous distribution sources to take from the EPA GHGI table
misc_categories = ['Pressure Relief Valve Releases', 'Pipeline Blowdown', 'Mishaps (Dig-ins)']

# total of those sources in the column for data_year
dist_misc_leak_gg_year = epa_ghgi_2020_annex_gas.loc[
    epa_ghgi_2020_annex_gas['Segment/Source'].isin(misc_categories),
    data_year,
].sum()

In [None]:
# gas volumes delivered to end consumers via LDCs
# (this is lower than total volumes delivered to all end consumers, which includes direct sales)
ldc_total_deliv_mcf = ldcs['total deliv NG Mcf'].sum()

# leakage rate: Gg CH4 per Mcf delivered (annual misc emissions in Gg / Mcf delivered)
dist_misc_leak_gg_per_mcf = dist_misc_leak_gg_year / ldc_total_deliv_mcf

# the same leakage rate is applied to all sectors
# note: tried to come up with a way of distinguishing between sectors,
# but that collapsed down to using the same leakage rate for each sector
misc_leak = ldcs.assign(**{
    'misc leak CH4 Gg': ldcs['total deliv NG Mcf'] * dist_misc_leak_gg_per_mcf,
})

### All distribution pipelines (mains, services, & misc)

In [None]:
def calc_gim_dist_pipe_leak(mains_mat, serv_mat, operations_year):
    """
    Combine mains, services and misc leakage into one distribution-pipeline total per company.

    The totals are not allocated to particular sectors. Only companies in the EIA-PHMSA
    crosswalk (rows with an 'EIA company ID') have values for leakage from mains & services.

    Besides its arguments, this function reads three module-level names: `misc_leak`
    (DataFrame with 'EIA company ID' and 'misc leak CH4 Gg'),
    `conversion_consumer_ng_mcf_to_ch4_gg` and `services_based_on_mains`.

    Parameters
    ----------
    mains_mat : pandas.DataFrame
        Must contain 'EIA company ID' and 'mains leak CH4 Gg/y'.
    serv_mat : pandas.DataFrame
        Must contain 'EIA company ID' and 'service leak CH4 Gg'.
    operations_year : pandas.DataFrame
        Must contain 'EIA company ID' and 'total deliv NG Mcf'.

    Returns
    -------
    pandas.DataFrame
        One row per 'EIA company ID' with the three leakage components, their sum
        ('GIM dist pipe leak CH4 Gg'), deliveries as CH4 mass ('total NG CH4 Gg') and
        the ratio of the two ('GIM dist pipe leak fract').
    """
    key = 'EIA company ID'
    leak_cols = ['mains leak CH4 Gg/y', 'service leak CH4 Gg', 'misc leak CH4 Gg']

    # mains and services: keep companies present in either table
    dist_pipe = pd.merge(
        mains_mat[[key, 'mains leak CH4 Gg/y']].dropna(subset=[key]),
        serv_mat[[key, 'service leak CH4 Gg']].dropna(subset=[key]),
        on=key, how='outer',
    )
    # misc leakage and deliveries are attached only to companies already in the table
    dist_pipe = pd.merge(
        dist_pipe,
        misc_leak[[key, 'misc leak CH4 Gg']].dropna(subset=[key]),
        on=key, how='left',
    )
    dist_pipe = pd.merge(
        dist_pipe,
        operations_year[[key, 'total deliv NG Mcf']],
        on=key, how='left',
    )

    # row-wise total; missing components are skipped by sum()
    dist_pipe['GIM dist pipe leak CH4 Gg'] = dist_pipe[leak_cols].sum(axis=1)
    dist_pipe['total NG CH4 Gg'] = dist_pipe['total deliv NG Mcf'] * conversion_consumer_ng_mcf_to_ch4_gg

    # NOTE(review): with fill_value=0 a company with leakage but no deliveries is divided by 0
    dist_pipe['GIM dist pipe leak fract'] = dist_pipe['GIM dist pipe leak CH4 Gg'].div(dist_pipe['total NG CH4 Gg'], fill_value=0)

    if services_based_on_mains==True:
        print("GIM calc based Weller for mains & services:")
    elif services_based_on_mains==False:
        print("GIM calc based Weller for mains & EPA for services:")
    print(f"mains leak (Gg): {round(dist_pipe['mains leak CH4 Gg/y'].sum(), 1)}")
    print(f"serv leak (Gg): {round(dist_pipe['service leak CH4 Gg'].sum(), 1)}")
    print(f"misc leak (Gg): {round(dist_pipe['misc leak CH4 Gg'].sum(), 1)}")
    # this total includes misc leakage, so the label says so
    print(f"mains + serv + misc leak (Gg): {round(dist_pipe['GIM dist pipe leak CH4 Gg'].sum(), 1)}")
    print()

    return dist_pipe

In [None]:
# sandbox:
# PHMSA rows with no EIA company ID are LDCs that the crosswalk did not connect to EIA;
# how much of the mains & services do those companies own?
phmsa_not_matched = phmsa[phmsa['EIA company ID'].isna()]

# mains: share of total mains miles
fract_mmi_not_matched = (
    phmsa_not_matched['MMILES_TOTAL'].sum()
    / phmsa['MMILES_TOTAL'].sum()
)
print(f"fraction of mains miles covered: {100*(1-fract_mmi_not_matched)}%")

# services: share of total service length (number of services * average length)
fract_smi_not_matched = (
    phmsa_not_matched[['NUM_SRVCS_TOTAL', 'avg length (ft)']].product(axis=1).sum()
    / phmsa[['NUM_SRVCS_TOTAL', 'avg length (ft)']].product(axis=1).sum()
)
print(f"fraction of services miles covered: {100*(1-fract_smi_not_matched)}%")

# so the companies in the crosswalk, connecting PHMSA and EIA, account for 98-99% of miles of mains & services--that's great!
# what about gas delivered? is it similar?

In [None]:
# sandbox:
# fraction of gas delivered that's accounted for by gas companies in crosswalk (connecting PHMSA & EIA):
# attach EIA deliveries to the PHMSA rows, then keep only rows that have an EIA company ID
df = (
    phmsa
    .merge(operations_year, on='EIA company ID', how='left')
    .dropna(subset=['EIA company ID'])
)

# print(df['Residential Total Volume (Mcf)'].sum()/operations_year['Residential Total Volume (Mcf)'].sum())
# print(df['Commercial Total Volume (Mcf)'].sum()/operations_year['Commercial Total Volume (Mcf)'].sum())
# print(df['Industrial Total Volume (Mcf)'].sum()/operations_year['Industrial Total Volume (Mcf)'].sum())
# print(df['Electric Total Volume (Mcf)'].sum()/operations_year['Electric Total Volume (Mcf)'].sum())
# print(df['Vehicle Fuel Total Volume (Mcf)'].sum()/operations_year['Vehicle Fuel Total Volume (Mcf)'].sum())
# print(df['total deliv NG Mcf'].sum()/operations_year['total deliv NG Mcf'].sum())

In [None]:
# total distribution pipeline leakage per company; the GIM estimate is used as the final value
dist_pipe = calc_gim_dist_pipe_leak(mains_mat, serv_mat, operations_year)
dist_pipe = dist_pipe.assign(**{
    'dist pipe leak final CH4 Gg': dist_pipe['GIM dist pipe leak CH4 Gg'],
    'dist pipe leak final fract': dist_pipe['GIM dist pipe leak fract'],
})

In [None]:
# # export
# save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
# df = dist_pipe.copy()
# df = pd.merge(df, operations_year[['EIA company ID', 'Company Name']], on='EIA company ID', how='left')
# df = df.sort_values('EPA/GIM', ascending=False)

# df.to_excel(
#     lca_path + f'distribution pipe leakage - GIM vs EPA for analysis year {data_year} {save_timestamp}.xlsx',
#     index=False
# )

### Distribution pipeline leakage by sector
* For companies in PHMSA, steps above calculate their leakage for their local distribution pipelines
* For companies that are also in crosswalk, steps above bring in EIA leakage values.
* Only have gas volumes for companies in crosswalk (since PHMSA doesn't report volumes)
* Assume all sales not accounted for by companies in crosswalk are direct sales, and assume these have 0 distribution leakage. For each state:
  * Take gas sales in EIA operations file for all companies NOT in crosswalk; assume these are direct sales
  * Sum these for each state
* Nationwide, calculate number of customers for companies in crosswalk vs not in crosswalk
  * If we were to estimate leakage from distribution lines to direct customers, would it add up to much?
* How to handle gas consumed by the gas industry (e.g., consumed by equipment on transmission lines)?
  * Transmission leakage is calculated relative to total gas consumed in each state, right?
  * If so, that would include end users (residential, etc.), but also gas industry consumption.

In [None]:
def allocate_dist_pipe_leak_by_sector_volume(
    dist_pipe, operations_year, 
    allocate_dist_pipe_leak_to_res_comm_only):    
    """
    Split each company's distribution pipeline leakage between sectors, by delivered volume.

    Also reads the module-level constant `conversion_consumer_ng_mcf_to_ch4_gg`
    (used only for the printed summary).

    Parameters
    ----------
    dist_pipe : pandas.DataFrame
        Must contain 'EIA company ID', 'total deliv NG Mcf' and 'dist pipe leak final CH4 Gg'.
    operations_year : pandas.DataFrame
        Must contain 'EIA company ID', 'total deliv NG Mcf' and the
        '<Sector> Total Volume (Mcf)' columns for Residential, Commercial,
        Vehicle Fuel, Industrial and Electric.
    allocate_dist_pipe_leak_to_res_comm_only : bool
        True: allocate leakage only between residential, commercial and vehicle fuel.
        False: allocate leakage between all five sectors.

    Returns
    -------
    pandas.DataFrame
        `dist_pipe` outer-merged with `operations_year`, plus one
        'dist pipe <sector> leak CH4 Gg' column per sector and a combined
        'dist pipe res-comm leak CH4 Gg' column.

    Raises
    ------
    ValueError
        If `allocate_dist_pipe_leak_to_res_comm_only` is neither True nor False.
    """
    # validate up front: previously an unexpected value only printed a message and the
    # function then failed further down with an unbound-variable error
    if allocate_dist_pipe_leak_to_res_comm_only not in (True, False):
        raise ValueError(
            f"Unexpected value for allocate_dist_pipe_leak_to_res_comm_only: {allocate_dist_pipe_leak_to_res_comm_only}")
    res_comm_only = bool(allocate_dist_pipe_leak_to_res_comm_only)

    res_col = 'Residential Total Volume (Mcf)'
    comm_col = 'Commercial Total Volume (Mcf)'
    veh_col = 'Vehicle Fuel Total Volume (Mcf)'
    ind_col = 'Industrial Total Volume (Mcf)'
    elec_col = 'Electric Total Volume (Mcf)'

    # deliveries come from operations_year, so drop the copy already carried by dist_pipe
    df = pd.merge(
        dist_pipe.drop('total deliv NG Mcf', axis=1), 
        operations_year,
        on='EIA company ID', how='outer')

    res_comm_veh_mcf_tot = df[[res_col, comm_col, veh_col]].sum(axis=1)

    if res_comm_only:
        # allocate distribution pipeline leakage only between residential & commercial
        # (and a tiny portion to vehicle fuel)
        res_fract = df[res_col]/res_comm_veh_mcf_tot
        comm_fract = df[comm_col]/res_comm_veh_mcf_tot
        vehicle_fract = df[veh_col]/res_comm_veh_mcf_tot
        ind_fract = 0
        elec_fract = 0
    else:
        # allocate distribution pipeline leakage between all sectors
        res_fract = df[res_col].div(df['total deliv NG Mcf'])
        comm_fract = df[comm_col].div(df['total deliv NG Mcf'])
        vehicle_fract = df[veh_col].div(df['total deliv NG Mcf'])
        ind_fract = df[ind_col].div(df['total deliv NG Mcf'])
        elec_fract = df[elec_col].div(df['total deliv NG Mcf'])

    dist_leak = df['dist pipe leak final CH4 Gg']
    df['dist pipe res leak CH4 Gg'] = dist_leak * res_fract
    df['dist pipe comm leak CH4 Gg'] = dist_leak * comm_fract
    df['dist pipe vehicle leak CH4 Gg'] = dist_leak * vehicle_fract
    df['dist pipe ind leak CH4 Gg'] = dist_leak * ind_fract
    df['dist pipe elec leak CH4 Gg'] = dist_leak * elec_fract

    df['dist pipe res-comm leak CH4 Gg'] = df[
        ['dist pipe res leak CH4 Gg', 'dist pipe comm leak CH4 Gg']].sum(axis=1, skipna=True)

    # show summary result: leaked CH4 converted back to a volume of gas, relative to deliveries
    gim_dist_pipe_leak_mcf = df['dist pipe leak final CH4 Gg'].sum() / conversion_consumer_ng_mcf_to_ch4_gg
    if res_comm_only:
        gim_dist_pipe_leak_fract = gim_dist_pipe_leak_mcf/res_comm_veh_mcf_tot.sum()
        print(f"GIM calculated leakage rate for distribution pipelines, relative to res-comm-veh NG deliv: {round(100*gim_dist_pipe_leak_fract, 3)}%")
    else:
        gim_dist_pipe_leak_fract = gim_dist_pipe_leak_mcf/dist_pipe['total deliv NG Mcf'].sum()
        print(f"GIM calculated leakage rate for distribution pipelines, relative to total NG deliv: {round(100*gim_dist_pipe_leak_fract, 3)}%")

    # TEST: the sectoral allocations should add back up to the total leakage
    # note: for test, keep only rows with any deliveries listed; should be only 3 companies excluded
    # steps above calculate leakage for them, even though they have no deliveries
    # (alternative filter, not used: df.loc[df['total deliv NG Mcf']>0])
    companies_without_deliveries = ['17611930TN', '17601881IL', '17614062AL']
    dist_leak_total = df.loc[
        ~df['EIA company ID'].isin(companies_without_deliveries)
    ]['dist pipe leak final CH4 Gg'].sum()
    dist_leak_sectoral_totals = df[[
        'dist pipe res leak CH4 Gg',
        'dist pipe comm leak CH4 Gg',
        'dist pipe vehicle leak CH4 Gg',
        'dist pipe ind leak CH4 Gg',
        'dist pipe elec leak CH4 Gg'
    ]].sum(axis=1).sum()
    diff = dist_leak_total - dist_leak_sectoral_totals
    if not abs(diff) < 1e-6:
        print()
        print("ERROR!" + " Mismatch in distribution leakage values")
        print(f"dist_leak_total: {round(dist_leak_total, 2)}")
        print(f"dist_leak_sectoral_totals: {round(dist_leak_sectoral_totals, 2)}")
    # END OF TEST

    dist_pipe_by_sector = df

    return dist_pipe_by_sector

In [None]:
# split each company's distribution pipeline leakage between sectors
dist_pipe_by_sector = allocate_dist_pipe_leak_by_sector_volume(
    dist_pipe=dist_pipe,
    operations_year=operations_year,
    allocate_dist_pipe_leak_to_res_comm_only=allocate_dist_pipe_leak_to_res_comm_only,
)

In [None]:
# total distribution pipeline leakage (Gg CH4) allocated to each sector, one line per sector
print("\n".join(
    f"{sector}: {round(dist_pipe_by_sector[leak_col].sum(), 0)}"
    for sector, leak_col in [
        ('residential', 'dist pipe res leak CH4 Gg'),
        ('commercial', 'dist pipe comm leak CH4 Gg'),
        ('industrial', 'dist pipe ind leak CH4 Gg'),
        ('electric', 'dist pipe elec leak CH4 Gg'),
    ]
))

In [None]:
# share of all distribution pipeline leakage that is allocated to residential + commercial
res_comm_dist_leak_fract = (
    dist_pipe_by_sector['dist pipe res-comm leak CH4 Gg'].sum()
    / dist_pipe_by_sector['GIM dist pipe leak CH4 Gg'].sum()
)
print(f"Residential/commercial dist leak % of total dist leak: {round(res_comm_dist_leak_fract*100, 1)}%")

In [None]:
# leakage rate (g CH4/Mcf) for distribution pipelines allocated to each sector
# note that this calculates the distribution pipeline leakage for all companies in PHMSA
# if also in crosswalk, checks against EIA reported leakage
# then calculates sectoral share based on the sectoral total deliveries
# (even if some of the deliveries are direct and don't pass through local distribution)
# each entry: (label, allocated leakage column, sector deliveries in Mcf); 1e9 converts Gg to g
print("\n".join(
    f"{label} dist leak CH4 g/Mcf: {round(1e9*dist_pipe_by_sector[leak_col].sum()/deliv_mcf, 1)}"
    for label, leak_col, deliv_mcf in [
        ('res', 'dist pipe res leak CH4 Gg',
         operations_year['Residential Total Volume (Mcf)'].sum()),
        ('comm', 'dist pipe comm leak CH4 Gg',
         operations_year['Commercial Total Volume (Mcf)'].sum()),
        ('res-comm', 'dist pipe res-comm leak CH4 Gg',
         operations_year[['Residential Total Volume (Mcf)', 'Commercial Total Volume (Mcf)']].sum(axis=1).sum()),
        ('ind', 'dist pipe ind leak CH4 Gg',
         operations_year['Industrial Total Volume (Mcf)'].sum()),
        ('elec', 'dist pipe elec leak CH4 Gg',
         operations_year['Electric Total Volume (Mcf)'].sum()),
    ]
))

if allocate_dist_pipe_leak_to_res_comm_only==False:
    print(f"all sectors dist leak CH4 g/Mcf: {round(1e9*dist_pipe_by_sector['dist pipe leak final CH4 Gg'].sum()/operations_year['total deliv NG Mcf'].sum(), 1)}")

In [None]:
# display the sectoral allocation table as this cell's output
dist_pipe_by_sector

In [None]:
# electric-sector distribution leakage rate (g CH4/Mcf) by state;
# the state is taken from the last two characters of the EIA company ID
df = (
    dist_pipe_by_sector
    .assign(state=dist_pipe_by_sector['EIA company ID'].str[-2:])
    .groupby('state')[['dist pipe elec leak CH4 Gg', 'Electric Total Volume (Mcf)']]
    .sum()
)
df['dist pipe elec leak g CH4/Mcf'] = (
    df['dist pipe elec leak CH4 Gg'] * g_per_gg / df['Electric Total Volume (Mcf)']
)
ser = df['dist pipe elec leak g CH4/Mcf']
dist_pipe_elec_leak_g_ch4_per_mcf_by_state = ser

# TO DO: delete section below
# # export
# save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
# dist_pipe_elec_leak_g_ch4_per_mcf_by_state.to_csv(
#     lca_path + f'GIM distribution pipeline leakage electric sector by state {save_timestamp}.csv'
# )

# meter leakage
* based on Moore 2019. Classification of Methane Emissions from Industrial Meters, Vintage vs Modern Plastic Pipe, and Plastic-lined Steel and Cast-Iron Pipe. DOE Project Number DE-FE0029061, GTI Project Number 22070
* GTI 2019 says EPA estimates meter leakage using number of customers as a proxy for number of meters
* Industrial and commercial meters are lumped together, with a single EF applied to both

In [None]:
def assign_regions_for_utilities(df, state_col_name):
    """
    Assign each state to a region, based on regions assigned in GTI 2019 report 
    on commercial & industrial meter leakage (Moore C.)

    Note: The paper assigns AK & HI to Pacific region, however:
    * HI has no gas consumption.
    * AK has a very different environment and history, 
    and probably they didn't take any measurements there.
    so excluding it from the analysis
    (also AK has no measurements of production area leakage)

    Note: Not clear whether GTI 2019 assigned DC to Northeast or Southeast region;
    Northeast has much lower leakage rate than Southeast;
    so to be conservative, assigned DC to Northeast.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one state abbreviation per row; a 'region' column is added
        to this object in place (an existing 'region' column is overwritten).
    state_col_name : str
        Name of the column holding the two-letter state abbreviation.

    Returns
    -------
    pandas.DataFrame
        The same `df` object. Rows whose state is not in any region (for example
        AK, HI or a missing value) get an empty string as region, and an error
        message is printed for each such row.
    """
    region_states = {
        'Pacific': ['WA', 'OR', 'CA'],
        'Rocky': ['ID', 'MT', 'WY', 'NV', 'UT', 'CO'],
        'Southwest': ['AZ', 'NM', 'TX', 'OK'],
        'Midwest': ['ND', 'SD', 'NE', 'KS', 'MN', 'IA', 'MO', 'WI', 'IL', 'MI', 'IN', 'OH'],
        'Southeast': ['AR', 'LA', 'MS', 'AL', 'GA', 'FL', 'SC', 'NC', 'TN', 'KY', 'WV', 'VA'],
        'Northeast': ['ME', 'NH', 'VT', 'NY', 'MA', 'RI', 'CT', 'NJ', 'PA', 'MD', 'DE', 'DC'],
    }
    state_to_region = {
        state: region
        for region, states in region_states.items()
        for state in states
    }

    # look up all rows at once; states without a region come back as missing values
    regions = df[state_col_name].map(state_to_region).astype(object)
    unmatched = regions.isna()

    for state in df.loc[unmatched, state_col_name]:
        print("Error!" + f" state {state} wasn't in if-else statements")

    # unmatched rows keep the empty-string default instead of inheriting the
    # region of the previous row (or failing when the first row is unmatched)
    df['region'] = regions.where(~unmatched, '')

    return df

### commercial meters (based on regional percentage leakage)
* Calculate a percentage leakage from commercial meters that applies to each region
* Percentage leakage = ((total commercial meters in region) * (average leakage per meter in region)) / (total commercial sales in region), with sales expressed as the mass of CH4 delivered
* Where we assume (following EPA) that the number of meters is equal to the number of customers
* Then apply this percentage leakage to each utility, based on the regions defined by GTI 2019

In [None]:
def calc_average_comm_meter_leak_by_region(operations_year, gti_meter_leak_file):
    """
    Calculate the average commercial meter leakage fraction for each region.

    Regional leakage (Gg CH4/y) is the per-meter leakage rate read from the GTI file
    multiplied by the number of commercial customers in the region; the leakage
    fraction is that leakage divided by the CH4 content of the region's commercial volume.

    Parameters:
    * operations_year: DataFrame with columns 'EIA company ID', 'Company Name', 'State',
      'Commercial Total Volume (Mcf)', and 'Commercial Total Customers'
    * gti_meter_leak_file: Excel file name (appended to lca_path); its 'data' sheet must have
      columns 'region' and 'commercial meter leakage rate kg CH4/y'

    Returns tuple (deliv_cust_c, comm_meters_regional_dict):
    * deliv_cust_c: per-utility table with 'region' added and empty volume/customer values set to 0
    * comm_meters_regional_dict: leakage fraction for each region
      (note: despite the name, this is a pandas Series indexed by region)
    """
    volume_col = 'Commercial Total Volume (Mcf)'
    customers_col = 'Commercial Total Customers'

    utilities = operations_year.copy()
    utilities = utilities[['EIA company ID', 'Company Name', 'State', volume_col, customers_col]]
    utilities = assign_regions_for_utilities(utilities, 'State')
    for col in (volume_col, customers_col):
        utilities[col] = utilities[col].fillna(0)

    # per-utility table is returned unchanged by the regional calculations below
    deliv_cust_c = utilities.copy()

    # regional totals of commercial volume and customers
    regional = utilities.groupby(['region'])[[volume_col, customers_col]].sum().reset_index()

    # per-meter leakage rate (kg CH4/y) for each region, from the GTI file
    gti_meter_leak = pd.read_excel(lca_path + gti_meter_leak_file, sheet_name='data')
    kg_per_meter_by_region = gti_meter_leak.set_index('region')[
        'commercial meter leakage rate kg CH4/y'].to_dict()

    # regional leakage = leakage per meter * number of customers (kg -> Gg)
    regional['leakage per meter kg/y'] = regional['region'].replace(
        kg_per_meter_by_region).astype(float)
    regional['leakage comm Gg/y'] = (
        regional['leakage per meter kg/y'] * regional[customers_col] / 1e6)

    # commercial deliveries, CH4 content only; empty values set to 0
    regional['Commercial Total Volume (Gg CH4)'] = (
        regional[volume_col] * conversion_consumer_ng_mcf_to_ch4_gg).fillna(0)

    # average commercial meter leakage fraction for each region
    regional['leak fract'] = regional['leakage comm Gg/y'].div(
        regional['Commercial Total Volume (Gg CH4)'], fill_value=0)
    regional['leakage %'] = (regional['leak fract']*100).round(3).astype(str) + '%'

    # regional commercial meter leakage rates (fract), keyed by region
    leak_fract_by_region = regional[['region', 'leak fract']].set_index('region')
    comm_meters_regional_dict = leak_fract_by_region.to_dict('series')['leak fract']

    return (deliv_cust_c, comm_meters_regional_dict)

In [None]:
def calc_comm_meters_leak(deliv_cust_c, comm_meters_regional_dict):
    """
    Apply each region's commercial meter leakage fraction to the utilities in that region.

    Adds to a copy of deliv_cust_c the columns 'comm meter leak fract',
    'Commercial Total Volume (Gg CH4)', and 'comm meter leak CH4 Gg',
    prints the total leakage and the volume-weighted average leakage rate,
    and returns the expanded copy.
    """
    utilities = deliv_cust_c.copy()

    # look up the leakage fraction for each utility's region;
    # empty values (NaN) are set to zero, to avoid anomalies in calculations below
    fract_for_region = utilities['region'].replace(comm_meters_regional_dict).astype(float)
    utilities['comm meter leak fract'] = fract_for_region.fillna(0)

    # leakage = CH4 content of commercial volume * leakage fraction
    utilities['Commercial Total Volume (Gg CH4)'] = (
        utilities['Commercial Total Volume (Mcf)'] * conversion_consumer_ng_mcf_to_ch4_gg)
    utilities['comm meter leak CH4 Gg'] = (
        utilities['Commercial Total Volume (Gg CH4)'] * utilities['comm meter leak fract'])

    total_leak_gg = utilities['comm meter leak CH4 Gg'].sum()
    print(f"total leakage from commercial customer meters (Gg CH4): {round(total_leak_gg, 1)}")

    comm_meters_avg_leakage_fract = (
        total_leak_gg / utilities['Commercial Total Volume (Gg CH4)'].sum())
    print(f"commercial meters, average leakage rate across all utilities: {round(comm_meters_avg_leakage_fract*100, 3)}%")

    return utilities

In [None]:
# commercial meter leakage:
# first get regional average leakage fractions, then apply them to each utility
deliv_cust_c, comm_meters_regional_dict = calc_average_comm_meter_leak_by_region(
    operations_year=operations_year,
    gti_meter_leak_file=gti_meter_leak_file,
)
deliv_cust_c = calc_comm_meters_leak(
    deliv_cust_c=deliv_cust_c,
    comm_meters_regional_dict=comm_meters_regional_dict,
)

In [None]:
# commercial meter leakage rate (g CH4/Mcf):
# total leakage (Gg converted to g) divided by total commercial volume
(
    g_per_gg * deliv_cust_c['comm meter leak CH4 Gg'].sum()
) / deliv_cust_c['Commercial Total Volume (Mcf)'].sum()

In [None]:
# for documentation: commercial meter leakage summarized by region
df = (
    deliv_cust_c
    .groupby('region')[[
        'Commercial Total Customers',
        'comm meter leak CH4 Gg',
        'Commercial Total Volume (Mcf)',
    ]]
    .sum()
)
# leakage per meter (Gg converted to kg)
df['leak per meter CH4 kg/y'] = df['comm meter leak CH4 Gg'].mul(1e6).div(
    df['Commercial Total Customers'])
df['Commercial vol Gg CH4'] = (
    df['Commercial Total Volume (Mcf)'] * conversion_consumer_ng_mcf_to_ch4_gg)
# leakage as percentage of CH4 delivered, formatted as text
df['pct leak'] = (
    df['comm meter leak CH4 Gg'].div(df['Commercial vol Gg CH4']) * 100
).round(2).astype(str) + '%'

df = df.sort_values(by='leak per meter CH4 kg/y', ascending=True)
table_12_3_1 = df

table_12_3_1

### residential meters

In [None]:
def calculate_residential_meter_leak(
    operations_year, comm_meters_regional_dict, parameters_main):
    """
    Calculate leakage from residential customer gas meters, based on EPA GHGI 2020 emissions factor.

    For each utility, leakage (Gg CH4) is:
    (leakage per meter, from parameters file) * (number of residential customers)
    * (fraction of residential meters outdoors in the utility's state).

    Parameters:
    * operations_year: DataFrame with columns 'EIA company ID', 'Company Name', 'State',
      'Residential Total Volume (Mcf)', and 'Residential Total Customers'
    * comm_meters_regional_dict: not used in this function
    * parameters_main: Series containing 'res_meter_leak_kg_ch4_per_meter_pa'

    Prints the average leakage rate and total leakage across all utilities.

    Returns the per-utility table with leakage columns added;
    its 'region' column comes from the outdoor meters file (matched on US Census Division).
    """
    volume_col = 'Residential Total Volume (Mcf)'
    customers_col = 'Residential Total Customers'

    utilities = operations_year.copy()
    utilities = utilities[['EIA company ID', 'Company Name', 'State', volume_col, customers_col]]

    # set empty values (NaN) to zero, to avoid anomalies in calculations below
    for col in (volume_col, customers_col):
        utilities[col] = utilities[col].fillna(0)

    # share of meters outdoors for each region (from EPA documentation)
    outdoor_share = pd.read_excel(
        inputs_path + res_meters_outdoors_file, sheet_name='pct outdoor meters'
    ).rename(columns={
        'percent residential meters outdoors': 'fract res meters outdoors'})

    # expand the regional shares to one row per state
    census_states = pd.read_excel(
        inputs_path + res_meters_outdoors_file, sheet_name='states by Census Division')
    outdoor_share_by_state = outdoor_share.merge(
        census_states,
        left_on='region', right_on='US Census Division')

    # attach each utility's share of outdoor meters, based on its state
    utilities = utilities.merge(
        outdoor_share_by_state[['state', 'region', 'fract res meters outdoors']],
        left_on='State', right_on='state', how='left')

    # leakage per meter is read from parameters file; derives from EPA GHGI
    utilities['res meter leak kg/meter'] = parameters_main.at['res_meter_leak_kg_ch4_per_meter_pa']
    leak_factor_cols = [
        'res meter leak kg/meter',
        'Residential Total Customers',
        'fract res meters outdoors',
    ]
    # skipna=False: leakage stays empty (NaN) where any factor is missing
    utilities['res meter leak CH4 Gg'] = (
        utilities[leak_factor_cols].product(axis=1, skipna=False) * gg_per_kg)

    # average leakage rate across all utilities
    utilities['Residential Total Volume (Gg CH4)'] = (
        utilities[volume_col] * conversion_consumer_ng_mcf_to_ch4_gg)
    total_volume_gg = utilities['Residential Total Volume (Gg CH4)'].sum()
    res_meters_avg_leakage_fract = utilities['res meter leak CH4 Gg'].sum() / total_volume_gg
    print(f"residential meters, average leakage rate across all utilities: {round(res_meters_avg_leakage_fract*100, 3)}%")

    # total leakage from residential meters
    total_res_meter_leak = total_volume_gg * res_meters_avg_leakage_fract
    print(f'GIM US leakage from residential meters: {round(total_res_meter_leak, 1)} Gg/y')

    return utilities

In [None]:
# residential meter leakage for each utility
# (not used: deliv_cust_r = prepare_residential_sector_data(deliv, operations_year))
deliv_cust_r = calculate_residential_meter_leak(
    operations_year=operations_year,
    comm_meters_regional_dict=comm_meters_regional_dict,
    parameters_main=parameters_main,
)

In [None]:
# residential meter leakage rate (g CH4/Mcf):
# total leakage (Gg converted to g) divided by total residential volume
(
    g_per_gg * deliv_cust_r['res meter leak CH4 Gg'].sum()
) / deliv_cust_r['Residential Total Volume (Mcf)'].sum()

In [None]:
# sandbox
# leakage per meter (aka per customer) for residential, by region, in kg CH4/y
# (differences between regions are solely due to the share of meters assumed to be outdoors;
# the more outdoor meters, the more leakage on average across all meters)
test = deliv_cust_r.groupby('region')[[
    'res meter leak CH4 Gg',
    'Residential Total Customers',
]].sum()
test['res meter leak CH4 Gg'].mul(1e6).div(test['Residential Total Customers'])

### res-comm meters combined

In [None]:
# view: total meter leakage (Gg CH4), as (commercial, residential)
(
    deliv_cust_c['comm meter leak CH4 Gg'].sum(),
    deliv_cust_r['res meter leak CH4 Gg'].sum(),
)

In [None]:
def calc_meter_leakage_res_and_comm(deliv_cust_c, deliv_cust_r):
    """
    Combine commercial and residential meter leakage into one table, one row per 'EIA company ID'.

    Utilities present in only one of the two inputs are kept (outer merge);
    their missing values are ignored when summing across the two sectors.

    Adds columns:
    * 'res-comm meter leak CH4 Gg': commercial + residential meter leakage
    * 'res-comm vol (Mcf)': commercial + residential volume
    * 'res-comm CH4 Gg': commercial + residential volume, as CH4
    * 'res-comm meter leak CH4 kg/Mcf': combined leakage (Gg converted to kg) per Mcf
    """
    comm_cols = [
        'EIA company ID', 'comm meter leak CH4 Gg',
        'Commercial Total Volume (Mcf)', 'Commercial Total Volume (Gg CH4)',
    ]
    res_cols = [
        'EIA company ID', 'res meter leak CH4 Gg',
        'Residential Total Volume (Mcf)', 'Residential Total Volume (Gg CH4)',
        'Residential Total Customers',
    ]
    res_comm_meter = deliv_cust_c[comm_cols].merge(
        deliv_cust_r[res_cols], on='EIA company ID', how='outer')

    # volumes missing for one sector count as zero
    for volume_col in ('Commercial Total Volume (Mcf)', 'Residential Total Volume (Mcf)'):
        res_comm_meter[volume_col] = res_comm_meter[volume_col].fillna(0)

    # each combined column is the row-wise sum of its commercial and residential parts
    combined_from = {
        'res-comm meter leak CH4 Gg': ['comm meter leak CH4 Gg', 'res meter leak CH4 Gg'],
        'res-comm vol (Mcf)': ['Commercial Total Volume (Mcf)', 'Residential Total Volume (Mcf)'],
        'res-comm CH4 Gg': ['Commercial Total Volume (Gg CH4)', 'Residential Total Volume (Gg CH4)'],
    }
    for combined_col, sector_cols in combined_from.items():
        res_comm_meter[combined_col] = res_comm_meter[sector_cols].sum(axis=1)

    res_comm_meter['res-comm meter leak CH4 kg/Mcf'] = (
        res_comm_meter['res-comm meter leak CH4 Gg'].mul(1e6).div(res_comm_meter['res-comm vol (Mcf)']))

    return res_comm_meter

In [None]:
# combine commercial and residential meter leakage into one table, one row per utility
res_comm_meter = calc_meter_leakage_res_and_comm(deliv_cust_c, deliv_cust_r)

In [None]:
# TO DO: put into data compilation for US averages
# (will be only contiguous US normally; depends on parameter setting)
# rate = total res-comm meter leakage (Gg converted to g) / total res-comm volume (Mcf)
res_comm_meter_leak_rate = (
    res_comm_meter['res-comm meter leak CH4 Gg'].sum() * g_per_gg
) / res_comm_meter['res-comm vol (Mcf)'].sum()
print(f"res-comm meter leakage rate: {round(res_comm_meter_leak_rate, 1)} g/Mcf")

In [None]:
# compare average volume per meter (i.e., per customer), commercial vs residential
comm_sales_per_meter = (
    deliv_cust_c['Commercial Total Volume (Mcf)'].sum()
    / deliv_cust_c['Commercial Total Customers'].sum()
)
res_sales_per_meter = (
    deliv_cust_r['Residential Total Volume (Mcf)'].sum()
    / deliv_cust_r['Residential Total Customers'].sum()
)

print(f"commercial sales per meter: {round(comm_sales_per_meter, 1)} Mcf/meter")
print(f"residential sales per meter: {round(res_sales_per_meter, 1)} Mcf/meter")
print(f"ratio of commercial sales per meter / residential sales per meter: {round(comm_sales_per_meter/res_sales_per_meter, 1)}")

# since commercial meters have much larger average flows per meter than residential meters,
# it is not reasonable to use the same leakage per meter (kg/y) for residential as for commercial

In [None]:
# TO DO: if keeping, need to check this to make sure it's consistent about the year of analysis

# # then compare these against EPA GHGI; see revisions doc for distribution for 2016 GHGI
# # for data in 2013:
# ghgi_2016_scf_per_res_meter = 77.31
# ghgi_2016_num_res_meters = 52991569
# mcf_per_scf = 1/1000

# ghgi_gg_ch4_res_meters = ghgi_2016_num_res_meters * ghgi_2016_scf_per_res_meter * mcf_per_scf * conversion_consumer_ng_mcf_to_ch4_gg
# print(f"EPA GHGI 2020 res meters leak (Gg): {round(ghgi_gg_ch4_res_meters, 1)}")

# ghgi_gg_ch4_res_meters_leak_fract = ghgi_gg_ch4_res_meters / deliv_cust_r['Residential Total Volume (Gg CH4)'].sum()
# print(f"EPA GHGI residential meter leakage rate (%): {round(ghgi_gg_ch4_res_meters_leak_fract, 5)*100}%")

In [None]:
# # back-calculating pct leakage rate based on EPA's value for emissions
# ghgi_mt_co2e_res_meters = 1972656
# gg_per_mt = 1/1000
# epa_gwp_ch4 = 25 # approx.; have to check what EPA used; probably used AR3 or AR4 100-year
# back_ghgi_gg_ch4_res_meters = ghgi_mt_co2e_res_meters * gg_per_mt / epa_gwp_ch4

# print(f"EPA GHGI residential meter leakage, back calculated(Gg/y): {round(back_ghgi_gg_ch4_res_meters, 1)}")
# # note: this result is close to the value above, so it's clear how EPA is arriving at their result

### industrial meters
* uses values per meter from Moore 2019
* applies same approach as for commercial meters (calculate percentage leakage for each region)

In [None]:
def calc_average_indust_meter_leak_by_region(operations_year, gti_meter_leak_file):
    """
    Calculate the average industrial meter leakage fraction for each region.

    Regional leakage (Gg CH4/y) is the per-meter leakage rate read from the GTI file
    multiplied by the number of industrial customers in the region; the leakage
    fraction is that leakage divided by the CH4 content of the region's industrial volume.

    Parameters:
    * operations_year: DataFrame with columns 'EIA company ID', 'Company Name', 'State',
      'Industrial Total Volume (Mcf)', and 'Industrial Total Customers'
    * gti_meter_leak_file: Excel file name (appended to lca_path); its 'data' sheet must have
      columns 'region' and 'industrial meter leakage rate kg CH4/y'

    Returns tuple (deliv_cust_i, indust_meters, indust_meters_leak_fract_region_dict):
    * deliv_cust_i: per-utility table with 'region' added and empty volume/customer values set to 0
    * indust_meters: regional table (indexed by region) with all calculated columns
    * indust_meters_leak_fract_region_dict: leakage fraction for each region
      (note: despite the name, this is a pandas Series indexed by region)
    """
    volume_col = 'Industrial Total Volume (Mcf)'
    customers_col = 'Industrial Total Customers'

    utilities = operations_year.copy()
    utilities = utilities[['EIA company ID', 'Company Name', 'State', volume_col, customers_col]]
    utilities = assign_regions_for_utilities(utilities, 'State')
    for col in (volume_col, customers_col):
        utilities[col] = utilities[col].fillna(0)

    # per-utility table is returned unchanged by the regional calculations below
    deliv_cust_i = utilities.copy()

    # regional totals of industrial volume and customers
    regional = utilities.groupby(['region'])[[volume_col, customers_col]].sum().reset_index()

    # per-meter leakage rate (kg CH4/y) for each region, from the GTI file
    gti_meter_leak = pd.read_excel(lca_path + gti_meter_leak_file, sheet_name='data')
    kg_per_meter_by_region = gti_meter_leak.set_index('region')[
        'industrial meter leakage rate kg CH4/y'].to_dict()

    # regional leakage = leakage per meter * number of customers (kg -> Gg)
    regional['leakage per meter kg/y'] = regional['region'].replace(
        kg_per_meter_by_region).astype(float)
    regional['leakage indust Gg/y'] = (
        regional['leakage per meter kg/y'] * regional[customers_col] / 1e6)

    # industrial deliveries, CH4 content only; empty values set to 0
    regional['Industrial Total Volume (Gg CH4)'] = (
        regional[volume_col] * conversion_consumer_ng_mcf_to_ch4_gg).fillna(0)

    # average industrial meter leakage fraction for each region
    regional['leak fract'] = regional['leakage indust Gg/y'].div(
        regional['Industrial Total Volume (Gg CH4)'], fill_value=0)
    regional['leakage %'] = (regional['leak fract']*100).round(3).astype(str) + '%'

    # regional industrial meter leakage rates (fract), keyed by region
    indust_meters = regional.set_index('region')
    indust_meters_leak_fract_region_dict = (
        indust_meters[['leak fract']].to_dict('series')['leak fract'])

    return (deliv_cust_i, indust_meters, indust_meters_leak_fract_region_dict)

In [None]:
def calc_indust_meters_leak(deliv_cust_i, indust_meters_leak_fract_region_dict):
    """
    Apply each region's industrial meter leakage fraction to the utilities in that region.

    Adds to a copy of deliv_cust_i the columns 'indust meter leak fract',
    'Industrial Total Volume (Gg CH4)', and 'indust meter leak CH4 Gg',
    prints the total leakage and the volume-weighted average leakage rate,
    and returns the expanded copy.
    """
    utilities = deliv_cust_i.copy()

    # look up the leakage fraction for each utility's region;
    # empty values (NaN) are set to zero, to avoid anomalies in calculations below
    fract_for_region = utilities['region'].replace(
        indust_meters_leak_fract_region_dict).astype(float)
    utilities['indust meter leak fract'] = fract_for_region.fillna(0)

    # leakage = CH4 content of industrial volume * leakage fraction
    utilities['Industrial Total Volume (Gg CH4)'] = (
        utilities['Industrial Total Volume (Mcf)'] * conversion_consumer_ng_mcf_to_ch4_gg)
    utilities['indust meter leak CH4 Gg'] = (
        utilities['Industrial Total Volume (Gg CH4)'] * utilities['indust meter leak fract'])

    total_leak_gg = utilities['indust meter leak CH4 Gg'].sum()
    print(f"total leakage from industrial customer meters (Gg CH4): {round(total_leak_gg, 1)}")

    indust_meters_avg_leakage_fract = (
        total_leak_gg / utilities['Industrial Total Volume (Gg CH4)'].sum())
    print(f"industrial meters, average leakage rate across all utilities: {round(indust_meters_avg_leakage_fract*100, 3)}%")

    return utilities

In [None]:
# industrial meter leakage:
# first get regional average leakage fractions, then apply them to each utility
deliv_cust_i, indust_meters, indust_meters_leak_fract_region_dict = (
    calc_average_indust_meter_leak_by_region(
        operations_year=operations_year,
        gti_meter_leak_file=gti_meter_leak_file,
    )
)
deliv_cust_i = calc_indust_meters_leak(
    deliv_cust_i=deliv_cust_i,
    indust_meters_leak_fract_region_dict=indust_meters_leak_fract_region_dict,
)

In [None]:
# for documentation: industrial meter leakage summarized by region
df = (
    deliv_cust_i
    .groupby('region')[[
        'Industrial Total Customers',
        'indust meter leak CH4 Gg',
        'Industrial Total Volume (Mcf)',
    ]]
    .sum()
)
# leakage per meter (Gg converted to kg)
df['leak per meter CH4 kg/y'] = df['indust meter leak CH4 Gg'].mul(1e6).div(
    df['Industrial Total Customers'])
df['Industrial vol Gg CH4'] = (
    df['Industrial Total Volume (Mcf)'] * conversion_consumer_ng_mcf_to_ch4_gg)
# leakage as percentage of CH4 delivered, formatted as text
df['pct leak'] = (
    df['indust meter leak CH4 Gg'].div(df['Industrial vol Gg CH4']) * 100
).round(2).astype(str) + '%'

# order rows by leakage per meter, and put columns in order for the table
df = df.sort_values(by='leak per meter CH4 kg/y', ascending=True)[[
    'Industrial Total Volume (Mcf)',
    'Industrial vol Gg CH4',
    'Industrial Total Customers',
    'leak per meter CH4 kg/y',
    'indust meter leak CH4 Gg',
    'pct leak',
]]

table_12_3_i = df

table_12_3_i

In [None]:
# industrial meter leakage rate: total leakage (Gg converted to g) / total industrial volume (Mcf)
ind_meter_g_per_mcf = (
    g_per_gg * deliv_cust_i['indust meter leak CH4 Gg'].sum()
) / operations_year['Industrial Total Volume (Mcf)'].sum()
print(f"contiguous US total industrial meter leakage rate (g CH4/Mcf): {round(ind_meter_g_per_mcf, 1)}")

### electric meters
assume that electric meters have same percentage leakage rate as industrial meters

In [None]:
def compile_deliv_cust_electric(operations_year):
    """
    Compile the per-utility table for the electric sector.

    Keeps the company identifiers, state, and electric volume & customer columns
    from operations_year, assigns each utility to a region based on its state,
    and sets empty (NaN) volumes and customer counts to zero.
    """
    electric_cols = ['Electric Total Volume (Mcf)', 'Electric Total Customers']

    utilities = operations_year.copy()
    utilities = utilities[['EIA company ID', 'Company Name', 'State'] + electric_cols]
    utilities = assign_regions_for_utilities(utilities, 'State')
    for col in electric_cols:
        utilities[col] = utilities[col].fillna(0)

    return utilities.copy()

In [None]:
def calc_elec_meters_leak(deliv_cust_e, indust_meters_leak_fract_region_dict):
    """
    Calculate leakage from electric customer meters for each utility.

    Assumes electric meters have the same percentage leakage as industrial meters,
    so the regional leakage fractions for industrial meters are applied
    to each utility's electric volume, based on the utility's region.

    Adds to a copy of deliv_cust_e the columns 'elec meter leak fract',
    'Electric Total Volume (Gg CH4)', and 'elec meter leak CH4 Gg',
    prints the total leakage and the volume-weighted average leakage rate,
    and returns the expanded copy.
    """
    utilities = deliv_cust_e.copy()

    # look up the (industrial) leakage fraction for each utility's region;
    # empty values (NaN) are set to zero, to avoid anomalies in calculations below
    fract_for_region = utilities['region'].replace(
        indust_meters_leak_fract_region_dict).astype(float)
    utilities['elec meter leak fract'] = fract_for_region.fillna(0)

    # leakage = CH4 content of electric volume * leakage fraction
    utilities['Electric Total Volume (Gg CH4)'] = (
        utilities['Electric Total Volume (Mcf)'] * conversion_consumer_ng_mcf_to_ch4_gg)
    utilities['elec meter leak CH4 Gg'] = (
        utilities['Electric Total Volume (Gg CH4)'] * utilities['elec meter leak fract'])

    total_leak_gg = utilities['elec meter leak CH4 Gg'].sum()
    print(f"total leakage from electric customer meters (Gg CH4): {round(total_leak_gg, 1)}")

    elec_meters_avg_leakage_fract = (
        total_leak_gg / utilities['Electric Total Volume (Gg CH4)'].sum())
    print(f"electric meters, average leakage rate across all utilities: {round(elec_meters_avg_leakage_fract*100, 3)}%")

    return utilities

In [None]:
# electric meter leakage:
# compile the electric sector table, then apply the industrial regional leakage fractions
deliv_cust_e = calc_elec_meters_leak(
    compile_deliv_cust_electric(operations_year),
    indust_meters_leak_fract_region_dict,
)

In [None]:
# electric meter leakage fraction for each state (one row per distinct state/fraction pair)
state_fract_pairs = deliv_cust_e[['State', 'elec meter leak fract']].drop_duplicates()
elec_meter_leak_fract = state_fract_pairs.set_index('State')['elec meter leak fract']

# export, with timestamp in file name
save_timestamp = time.strftime('%Y-%m-%d_%H%M')
elec_meter_leak_fract.to_csv(lca_path + f'GIM elec_meter_leak_fract {save_timestamp}.csv')

In [None]:
# electric meter leakage rate: total leakage (Gg converted to g) / total electric volume (Mcf)
elec_meter_g_per_mcf = (
    g_per_gg * deliv_cust_e['elec meter leak CH4 Gg'].sum()
) / operations_year['Electric Total Volume (Mcf)'].sum()
print(f"contiguous US total electric meter leakage rate (g CH4/Mcf): {round(elec_meter_g_per_mcf, 1)}")

In [None]:
# for documentation: electric meter leakage summarized by region
df = (
    deliv_cust_e
    .groupby('region')[[
        'Electric Total Customers',
        'elec meter leak CH4 Gg',
        'Electric Total Volume (Mcf)',
    ]]
    .sum()
)
# leakage per meter (Gg converted to kg)
df['leak per meter CH4 kg/y'] = df['elec meter leak CH4 Gg'].mul(1e6).div(
    df['Electric Total Customers'])
df['Electric vol Gg CH4'] = (
    df['Electric Total Volume (Mcf)'] * conversion_consumer_ng_mcf_to_ch4_gg)
# leakage as percentage of CH4 delivered, formatted as text
df['pct leak'] = (
    df['elec meter leak CH4 Gg'].div(df['Electric vol Gg CH4']) * 100
).round(2).astype(str) + '%'

# order rows by leakage per meter, and put columns in order for the table
df = df.sort_values(by='leak per meter CH4 kg/y', ascending=True)[[
    'Electric Total Volume (Mcf)',
    'Electric vol Gg CH4',
    'Electric Total Customers',
    'leak per meter CH4 kg/y',
    'elec meter leak CH4 Gg',
    'pct leak',
]]

table_12_3_e = df

table_12_3_e

## combine state-level results for electric sector

In [None]:
# combine electricity sector outputs by state:
# * distribution pipeline leakage (dist_pipe_elec_leak_g_ch4_per_mcf_by_state)
# * customer meter leakage (calculated here from the leakage fraction)

# electricity sector customer meter leakage: convert fraction to g CH4/Mcf
elec_cust_meter_leak_g_ch4_per_mcf = (
    elec_meter_leak_fract * conversion_consumer_ng_mcf_to_ch4_gg * g_per_gg
).rename('cust meter elec leak g CH4/Mcf')

# one column for each leakage source
elec_sector_leak_rate_by_state = pd.concat(
    [
        dist_pipe_elec_leak_g_ch4_per_mcf_by_state,
        elec_cust_meter_leak_g_ch4_per_mcf,
    ],
    axis='columns',
)

# export, with timestamp in file name
save_timestamp = time.strftime('%Y-%m-%d_%H%M')
elec_sector_leak_rate_by_state.to_csv(
    inputs_path + f'GIM electric sector leakage rates dist & gas meters {save_timestamp}.csv')

## behind-the-meter leakage

In [None]:
def calc_behind_the_meter_leakage(deliv_cust_r, deliv_cust_c):
    """
    Calculate behind-the-meter leakage for residential and commercial customers of each utility.

    Leakage is the CH4 content of each sector's volume multiplied by the
    assumed behind-the-meter leakage rate for that sector
    (behind_meter_res, behind_meter_comm).

    Note: the behind-the-meter leakage columns are added to the deliv_cust_r and
    deliv_cust_c DataFrames that are passed in (they are modified in place).

    Returns a table with one row per 'EIA company ID' (outer merge of the two sectors),
    with leakage for each sector and their sum ('behind-the-meter res-comm CH4 Gg').
    """
    res_leak_col = 'behind-the-meter leak res CH4 Gg'
    comm_leak_col = 'behind-the-meter leak comm CH4 Gg'

    # residential: consumption * assumed residential behind-the-meter leakage rate
    deliv_cust_r[res_leak_col] = (
        deliv_cust_r['Residential Total Volume (Gg CH4)'] * behind_meter_res)

    # commercial: consumption * assumed commercial behind-the-meter leakage rate
    deliv_cust_c[comm_leak_col] = (
        deliv_cust_c['Commercial Total Volume (Gg CH4)'] * behind_meter_comm)

    behind_leak = deliv_cust_r[['EIA company ID', res_leak_col]].merge(
        deliv_cust_c[['EIA company ID', comm_leak_col]],
        on='EIA company ID', how='outer',
    )
    # row-wise sum; a sector that is missing for a utility is ignored
    behind_leak['behind-the-meter res-comm CH4 Gg'] = (
        behind_leak[[res_leak_col, comm_leak_col]].sum(axis=1))

    return behind_leak

In [None]:
# behind-the-meter leakage for each utility
# (the function also adds behind-the-meter columns to deliv_cust_r and deliv_cust_c)
behind_leak = calc_behind_the_meter_leakage(deliv_cust_r, deliv_cust_c)

In [None]:
# calculate behind-the-meter leakage rates (g CH4/Mcf)
g_per_gg = 1e9
behind_leak_g = g_per_gg * behind_leak['behind-the-meter res-comm CH4 Gg'].sum()

# relative to all deliveries
behind_leak_g_per_mcf_all = behind_leak_g / operations_year['total deliv NG Mcf'].sum()

# relative to res-comm deliveries only (total deliveries * res-comm fraction, for each utility)
behind_leak_g_per_mcf_res_comm = behind_leak_g / (
    operations_year['total deliv NG Mcf'] * operations_year['res-comm NG fract']
).sum()
behind_leak_g_per_mcf_res_comm

In [None]:
# view: total behind-the-meter leakage (Gg CH4), as (residential, commercial)
(
    behind_leak['behind-the-meter leak res CH4 Gg'].sum(),
    behind_leak['behind-the-meter leak comm CH4 Gg'].sum(),
)

# compile LDC res-comm leakage

* for each LDC
  * calculate leakage quantity for all residential and for all commercial, from distribution, service, meters, and behind-the-meter
  * then calculate leakage rate for res + comm

In [None]:
# distribution pipeline leakage attributed to res-comm, for each LDC
dist_pipe_res_comm = dist_pipe_by_sector[['EIA company ID', 'dist pipe res-comm leak CH4 Gg']]

# res-comm deliveries for each LDC = total deliveries * res-comm fraction
df = operations_year.copy()
df['res-comm NG Mcf'] = df['total deliv NG Mcf'] * df['res-comm NG fract']
operations_year_p = df[['Company Name', 'EIA company ID', 'total deliv NG Mcf', 'res-comm NG Mcf']]

# attach each source of res-comm leakage to the LDCs in operations_year
# (left merges: the set of LDCs is that of operations_year)
ldc_leak_rc = (
    operations_year_p.copy()
    .merge(
        dist_pipe_res_comm[['EIA company ID', 'dist pipe res-comm leak CH4 Gg']],
        on='EIA company ID', how='left')
    .merge(
        res_comm_meter[['EIA company ID', 'res-comm meter leak CH4 Gg']],
        on='EIA company ID', how='left')
    .merge(
        behind_leak[['EIA company ID', 'behind-the-meter res-comm CH4 Gg']],
        on='EIA company ID', how='left')
)

In [None]:
# total res-comm leakage for each LDC: pipelines + meters + behind-the-meter
# (row-wise sum; missing values are ignored)
ldc_leak_rc['LDC res-comm leak total CH4 Gg'] = ldc_leak_rc[
    [
        'dist pipe res-comm leak CH4 Gg',
        'res-comm meter leak CH4 Gg',
        'behind-the-meter res-comm CH4 Gg',
    ]
].sum(axis='columns')

In [None]:
# view: first rows of the res-comm leakage table
ldc_leak_rc.head()

In [None]:
# view: total distribution pipeline leakage attributed to res-comm (Gg CH4)
ldc_leak_rc['dist pipe res-comm leak CH4 Gg'].sum()

In [None]:
# distribution pipeline leakage rate for res-comm deliveries (g CH4/Mcf)
# numerator is the pipeline leakage attributed to res-comm, to match the res-comm
# denominator (and the meter & behind-the-meter rate calculations in the following cells);
# previously used dist_pipe_leak_gg, which sums leakage for all sectors
# and within this section is only defined in a later cell
res_comm_dist_leak_rate_g_per_mcf = (
    g_per_gg * ldc_leak_rc['dist pipe res-comm leak CH4 Gg'].sum()
) / ldc_leak_rc['res-comm NG Mcf'].sum()
res_comm_dist_leak_rate_g_per_mcf

In [None]:
# view: total res-comm customer meter leakage (Gg CH4)
ldc_leak_rc['res-comm meter leak CH4 Gg'].sum()

In [None]:
# customer meter leakage rate for res-comm deliveries (g CH4/Mcf)
res_comm_meter_leak_rate_g_per_mcf = (
    g_per_gg * ldc_leak_rc['res-comm meter leak CH4 Gg'].sum()
) / ldc_leak_rc['res-comm NG Mcf'].sum()
res_comm_meter_leak_rate_g_per_mcf

In [None]:
# behind-the-meter leakage rate for res-comm deliveries (g CH4/Mcf)
res_comm_behind_leak_rate_g_per_mcf = (
    g_per_gg * ldc_leak_rc['behind-the-meter res-comm CH4 Gg'].sum()
) / ldc_leak_rc['res-comm NG Mcf'].sum()
res_comm_behind_leak_rate_g_per_mcf

In [None]:
# TO DO: delete below, or double-check it; seems to be wrong to calculate it using total deliveries in denominator
# but it wasn't used for any subsequent calculations

# ldc_leak_rc_mean_rate = ldc_leak_rc['LDC res-comm leak total CH4 Gg'].sum() * g_per_gg / ldc_leak_rc['total deliv NG Mcf'].sum()

# # show mean leakage rate within cities:
# print(f"mean LDC res-comm leakage rate: {round(ldc_leak_rc_mean_rate, 1)} g CH4/Mcf delivered")

## compile LDC electric sector distribution emissions
* distribution pipeline leakage: dist_pipe_by_sector
* customer meter leakage: deliv_cust_e
* behind-the-meter leakage assumed to be 0

In [None]:
# electric sector leakage for each LDC: distribution pipelines + customer meters
dist_pipe_elec = dist_pipe_by_sector[[
    'EIA company ID', 'Electric Total Volume (Mcf)', 'dist pipe elec leak CH4 Gg']]
deliv_cust_e_p = deliv_cust_e[['EIA company ID', 'elec meter leak CH4 Gg']]

# outer merge keeps LDCs that appear in only one of the two tables
ldc_leak_elec = dist_pipe_elec.merge(deliv_cust_e_p, on='EIA company ID', how='outer')
# row-wise sum; missing values are ignored
ldc_leak_elec['LDC elec leak total CH4 Gg'] = ldc_leak_elec[
    ['dist pipe elec leak CH4 Gg', 'elec meter leak CH4 Gg']
].sum(axis='columns')

In [None]:
# total electric sector leakage across all LDCs (Gg CH4)
# (selecting with a list of columns, so the result is a one-element Series, not a scalar)
elec_leak_quant = ldc_leak_elec[['LDC elec leak total CH4 Gg']].sum()
elec_leak_quant

## compile LDC leakage for all sectors (res, comm, elec, ind)

In [None]:
# combine res-comm and electric sector leakage, one row per LDC
# (outer merge keeps LDCs that appear in only one of the two tables)
ldc_leak_rce = ldc_leak_rc.merge(ldc_leak_elec, on='EIA company ID', how='outer')

In [None]:
# add industrial sector leakage (distribution pipelines + customer meters) for each LDC
# (outer merges keep LDCs that appear in only one of the tables)
ldc_leak_rcei = ldc_leak_rce.copy()
ldc_leak_rcei = pd.merge(
    ldc_leak_rcei,
    dist_pipe_by_sector[['EIA company ID', 'dist pipe ind leak CH4 Gg']],
    on='EIA company ID',
    how='outer'
)
ldc_leak_rcei = pd.merge(
    ldc_leak_rcei,
    deliv_cust_i[['EIA company ID', 'Industrial Total Volume (Mcf)', 'indust meter leak CH4 Gg']],
    on='EIA company ID',
    how='outer'
)
# sum across the two columns for each row (axis=1) to get the total for each LDC;
# without axis=1, sum() returns one total per column (indexed by column name),
# which does not line up with the rows and so fills the new column with NaN
ldc_leak_rcei['LDC ind leak total CH4 Gg'] = ldc_leak_rcei[
    ['dist pipe ind leak CH4 Gg', 'indust meter leak CH4 Gg']].sum(axis=1)

In [None]:
# total leakage for each LDC across all sectors (Gg CH4)
# (row-wise sum; missing values are ignored)
total_leak_gg = ldc_leak_rcei[
    [
        'LDC res-comm leak total CH4 Gg',
        'LDC elec leak total CH4 Gg',
        'LDC ind leak total CH4 Gg',
    ]
].sum(axis='columns', skipna=True)

# TO DO: delete line below? seems to be causing problems in next spreadsheet; not sure it's correct
# ldc_leak_rcei['rcei city leak rate g CH4/Mcf'] = (total_leak_gg * g_per_gg) / ldc_leak_rcei['total deliv NG Mcf']

In [None]:
# total distribution pipeline leakage, all sectors and all LDCs (Gg CH4):
# sum across sectors for each LDC, then across LDCs
dist_pipe_leak_gg = ldc_leak_rcei[
    [
        'dist pipe res-comm leak CH4 Gg',
        'dist pipe elec leak CH4 Gg',
        'dist pipe ind leak CH4 Gg',
    ]
].sum(axis='columns').sum()
dist_pipe_leak_gg

In [None]:
# view: total distribution pipeline leakage attributed to the electric sector (Gg CH4)
ldc_leak_rcei['dist pipe elec leak CH4 Gg'].sum()

In [None]:
# view: column totals for the combined table
# numeric columns only; the table also holds text (Company Name), which cannot be
# meaningfully summed and can raise an error when it contains missing values
ldc_leak_rcei.sum(numeric_only=True)

In [None]:
# all meter leakage:
ldc_leak_rcei[[
    'res-comm meter leak CH4 Gg',
    'elec meter leak CH4 Gg',
    'indust meter leak CH4 Gg'
]].sum().sum()

In [None]:
# view: total electric meter leakage (Gg CH4)
ldc_leak_rcei['elec meter leak CH4 Gg'].sum()

In [None]:
# total behind-the-meter leakage for res-comm, across all LDCs (Gg CH4)
behind_leak_gg = ldc_leak_rcei['behind-the-meter res-comm CH4 Gg'].sum()
behind_leak_gg

## export all results (res-comm-elec-ind)

In [None]:
# export combined results for all sectors, with data year and timestamp in file name
save_timestamp = time.strftime('%Y-%m-%d_%H%M')
ldc_leak_rcei.to_csv(
    lca_path + f'GIM LDC res-comm-elec-ind leak for {data_year} {save_timestamp}.csv',
    index=False,
)

# END