In [None]:
import os
import time

# for haversine function for transmission distance calculations
from math import radians, cos, sin, asin, sqrt
from collections import Counter

import pandas as pd

In [None]:
# input parameters from previous steps

# for input files from gas flows sub-model
# (both values are interpolated into file/folder names below, so they must
# match the names of the gas-flows result files on disk)
num_iter = 5000
# prod_leak_timestamp = '2020-10-11_0954'
# prod_leak_timestamp = '2020-11-10_1856'
prod_leak_timestamp = '2020-11-12_0701'

# template for state files used for transmission distance calculations
# the literal text 'consuming_state' is a placeholder: the distance functions
# swap it for a full state name with str.replace before reading the file
state_df_file_template = f'US gas model - consuming_state results supp_fract_prod {num_iter} iterations ({prod_leak_timestamp}).csv'

In [None]:
# input files from external sources
# eia_release_date is interpolated into the workbook file name below
eia_release_date = '2020-10-30'
# national EIA consumption workbook; read later from inputs_path (sheet 'Data 1')
consump_file = f'EIA Natural Gas Consumption by End Use NG_CONS_SUM_DCU_NUS_A released {eia_release_date}.xls'

In [None]:
# other input files compiled by GEM
# cities_file = 'US utilities by city (PHMSA & EIA) 2020-10-31.xlsx'
# workbook listing the cities to process; read from inputs_path (sheet 'data')
# by calc_city_distances_gas_transmission
cities_file = 'GIM cities in index 2020-10-31.xlsx'

In [None]:
# file paths
# The root folder defaults to the original author's Dropbox location. Set the
# GIM_LCA_PATH environment variable to run the notebook on another machine.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations below (and in later cells) rely on.
lca_path = os.path.join(
    os.environ.get('GIM_LCA_PATH', '/Users/masoninman/Dropbox/GEM/LCA of natural gas use/'),
    '',
)
model_path = lca_path + 'US cities LCA of gas model/'
# model inputs live inside the model folder
inputs_path = model_path + 'US gas model inputs/'
eia_path = lca_path + 'EIA data for LCA of gas/'
# per-state EIA consumption workbooks (one file per state)
gas_consump_data_path = eia_path + 'EIA gas consumption data 2019 (released 2020-09-30)/'
# workbooks read from inputs_path by read_state_coords_and_abbrev
state_coords_file = 'US Census - center of population by state (2010 census) and FIPS Code.xlsx'
state_abbrev_file = 'US states and abbreviations.xlsx'

In [None]:
# conversions
gg_per_kg = 1e-6  # gigagrams per kilogram
g_per_ton = 1e6  # grams per ton (value implies a metric ton)

# model parameters are kept in a workbook; turn the sheet into a Series
# that maps 'parameter name' -> 'parameter value'
df = pd.read_excel(inputs_path + 'GIM parameters file.xlsx', sheet_name='main parameters')
parameters_main = df.set_index('parameter name')['parameter value']

data_year = int(parameters_main.at['data_year'])
contiguous_us_only = parameters_main.at['contiguous_us_only']
trans_leak_fract = parameters_main.at['trans_leak_fract']
trans_distance_avg_km = parameters_main.at['trans_distance_avg_km']
ch4_kg_per_mcf = parameters_main.at['ch4_kg_per_mcf']
ch4_fract_in_ng_consumer_grade = parameters_main.at['ch4_fract_in_ng_consumer_grade']

# Gg of CH4 per Mcf of consumer-grade gas: CH4 fraction x kg CH4/Mcf x Gg/kg
conversion_consumer_ng_mcf_to_ch4_gg = ch4_fract_in_ng_consumer_grade * ch4_kg_per_mcf * gg_per_kg
# leaked fraction per km of transmission: national leak fraction spread over
# the national average transmission distance
trans_leak_per_km = trans_leak_fract / trans_distance_avg_km

## transmission leakage
* calculate distance traveled for gas to each state or city
* apply leakage factor set at start of notebook

In [None]:
# calculate total transmission leakage, and rate of leakage (g/Mcf)
# NOTE: `df` is reused as a scratch name throughout the notebook; the Alvarez
# comparison cell below reads this table again, so run the cells in order
df = pd.read_excel(inputs_path + consump_file, sheet_name='Data 1', header=2)
# index the table by calendar year so a single year can be looked up with .at
df['year'] = df['Date'].dt.year
df = df.drop('Date', axis=1)
df = df.set_index('year')

mcf_per_mmcf = 1000
tot_mcf_consump_year = df.at[data_year, 'U.S. Natural Gas Total Consumption (MMcf)'] * mcf_per_mmcf

# total leakage (Gg CH4) = consumption (Mcf) x leaked fraction x Gg CH4 per Mcf
trans_leak_mcf_ch4_gg = tot_mcf_consump_year * trans_leak_fract * conversion_consumer_ng_mcf_to_ch4_gg
print(f"transmission leakage: {round(trans_leak_mcf_ch4_gg, 1)} Gg")
# 1e9 converts Gg to g; dividing by consumption gives g CH4 per Mcf consumed
# (consumption cancels, so this equals trans_leak_fract x conversion x 1e9)
trans_leak_g_ch4_per_mcf = trans_leak_mcf_ch4_gg * 1e9 / tot_mcf_consump_year
print(f"transmission leakage rate: {round(trans_leak_g_ch4_per_mcf, 1)} g/Mcf")

In [None]:
# alternative method, based on transmission leakage %
# (gives same answer)
# leaked fraction x CH4 fraction x kg CH4/Mcf, then x 1000 to go from kg to g;
# the bare expression is displayed as the cell output
trans_leak_fract * ch4_fract_in_ng_consumer_grade * ch4_kg_per_mcf * 1000

In [None]:
# for Alvarez 2018 (analysis for 2015)
# presumably the transmission-leakage estimate from that paper, in Gg CH4
# (going by the variable name) - TODO confirm against the source
alvarez_trans_leak_gg = 1800
# relies on `df` still holding the national EIA consumption table (indexed by
# year) loaded in the transmission-leakage cell above
tot_mcf_2015 = df.at[2015, 'U.S. Natural Gas Total Consumption (MMcf)'] * mcf_per_mmcf
# 1e9 converts Gg to g, giving g CH4 per Mcf consumed in 2015
alvarez_trans_leak_g_ch4_per_mcf = alvarez_trans_leak_gg * 1e9 / tot_mcf_2015
alvarez_trans_leak_g_ch4_per_mcf

## functions to calculate transmission leakage by city

In [None]:
def read_state_coords_and_abbrev(state_coords_file, state_abbrev_file):
    """
    Load state coordinates and the state-name/abbreviation lookups.

    Both workbooks are read from the module-level `inputs_path`.

    Parameters
    ----------
    state_coords_file : file name of the coordinates workbook (sheet 'data').
    state_abbrev_file : file name of the workbook with 'state' and 'abbrev' columns.

    Returns
    -------
    (state_coords, states_dict_abbrev_to_full, states_dict_full_to_abbrev)
        state_coords is indexed by state name, excludes Puerto Rico, and has
        extra rows for 'Canada', 'GOM (federal)' and 'Mexico'; the two dicts
        map abbreviation -> full name and full name -> abbreviation.
    """
    column_names = {
        'LATITUDE': 'Latitude',
        'LONGITUDE': 'Longitude',
        'STNAME': 'State',
        'STATEFP': 'State FIPS Code',
    }
    coords_raw = pd.read_excel(inputs_path + state_coords_file, sheet_name='data')
    coords_named = coords_raw.rename(columns=column_names)

    is_not_puerto_rico = coords_named['State'] != 'Puerto Rico'
    state_coords = coords_named.loc[is_not_puerto_rico].set_index('State')

    # single representative points for the non-state jurisdictions:
    # (label, latitude, longitude)
    extra_points = (
        ('Canada', 56.031769, -114.781056),  # center of Alberta
        ('GOM (federal)', 28.748915, -89.065917),  # federal Gulf of Mexico
        ('Mexico', 19.944744, -97.26833),
    )
    for label, latitude, longitude in extra_points:
        state_coords.at[label, 'Latitude'] = latitude
        state_coords.at[label, 'Longitude'] = longitude

    # -------

    abbrev_table = pd.read_excel(inputs_path + state_abbrev_file)
    abbrev_to_full = abbrev_table.set_index('abbrev')['state'].to_dict()
    full_to_abbrev = abbrev_table.set_index('state')['abbrev'].to_dict()

    return (state_coords, abbrev_to_full, full_to_abbrev)

In [None]:
def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance in kilometers between two points on the earth.

    All four arguments are decimal degrees; note the (longitude, latitude)
    argument order for each point.
    """
    earth_radius_km = 6371  # use 3956 for miles

    # work in radians from here on
    lon1, lat1, lon2, lat2 = (radians(degrees) for degrees in (lon1, lat1, lon2, lat2))

    # haversine formula
    sin_half_dlat = sin((lat2 - lat1) / 2)
    sin_half_dlon = sin((lon2 - lon1) / 2)
    a = sin_half_dlat**2 + cos(lat1) * cos(lat2) * sin_half_dlon**2
    central_angle = 2 * asin(sqrt(a))

    return central_angle * earth_radius_km

In [None]:
def create_clean_paths(state_df):
    """
    Create simpler ("clean") paths for imports, removing loops and other repeats of states.

    Each value in the 'path' column is a '-'-separated string; its first
    element is treated as the consuming state and its last as the producing
    state. The cleaned path is stored as `str(list)` in a 'clean path' column.
    `state_df` is modified in place and also returned.

    Uses three subfunctions:
    cut_middle_loops
    cut_producing_loops
    cut_consuming_loops
    """
    for row in state_df.index:
        path_list = state_df.at[row, 'path'].split('-')

        # str.split always returns at least one element, so the only two cases
        # are a single-state path (nothing to clean) and a multi-state path
        if len(path_list) == 1:
            path_list_clean = path_list
        else:
            consum_state = path_list[0]
            prod_state = path_list[-1]
            middle_states = path_list[1:-1]

            path_list_clean = cut_middle_loops(consum_state, prod_state, middle_states)
            path_list_clean = cut_producing_loops(path_list_clean, prod_state)
            path_list_clean = cut_consuming_loops(path_list_clean, consum_state)

        # stored as text; the distance functions parse it back into a list
        state_df.at[row, 'clean path'] = str(path_list_clean)

    return state_df

In [None]:
def cut_middle_loops(consum_state, prod_state, middle_states):
    """
    Collapse loops among the intermediate states of a path.

    For every state that occurs more than once in `middle_states`, everything
    after its first occurrence up to and including its last occurrence is
    removed. `middle_states` is edited in place.

    Returns a new list: [consum_state] + cleaned middle states + [prod_state].
    """
    # counted once up front; not refreshed as entries are removed
    occurrences = Counter(middle_states)

    # explicit index walk, because the list shrinks while it is being scanned
    position = 0
    while position < len(middle_states):
        state = middle_states[position]
        if occurrences[state] > 1:
            # there's a loop; find every place this state currently appears
            hits = [idx for idx, name in enumerate(middle_states) if name == state]
            first_hit, last_hit = hits[0], hits[-1]
            # keep the first occurrence, drop the rest of the loop
            del middle_states[first_hit + 1:last_hit + 1]
        position += 1

    return [consum_state] + middle_states + [prod_state]

In [None]:
def cut_producing_loops(path_list, prod_state):
    """
    Remove repeats of the producing state so it appears once, at the end.

    Every occurrence of `prod_state` is dropped from `path_list` and a single
    `prod_state` is appended. The list is edited in place and also returned.

    The previous implementation deleted entries while indexing with a position
    derived from the shrinking list's length. That could skip entries and index
    from the end of the list, and when nothing matched before the final element
    it left the producing state duplicated at the end of the path.
    """
    # slice-assign so callers holding a reference to the same list see the change
    path_list[:] = [state for state in path_list if state != prod_state]

    # add back prod_state
    path_list.append(prod_state)

    return path_list

In [None]:
def cut_consuming_loops(path_list, consum_state):
    """
    Trim the path so it starts at the last occurrence of the consuming state.

    Returns a new list (a slice of `path_list`). Raises IndexError if
    `consum_state` does not occur in `path_list`.
    """
    # positions of every entry equal to the consuming state
    hits = [position for position, state in enumerate(path_list) if state == consum_state]

    # everything before the last match is dropped
    last_hit = hits[-1]
    return path_list[last_hit:]

In [None]:
def process_one_path_list(path_list, state_coords):
    """
    Total great-circle length (km) of a path of states.

    Parameters
    ----------
    path_list : list of state names, in path order.
    state_coords : DataFrame indexed by state name with 'Latitude' and
        'Longitude' columns.

    Returns
    -------
    Sum of the haversine distances between each successive pair of states.
    A pair contributes 0 when either state has no row in `state_coords`.
    Paths with fewer than two states return 0.
    """
    # sanity check: an empty coordinates table means the input was not read correctly
    if len(state_coords) == 0:
        print('Error!' + f" DataFrame state_coords was not read correctly; len: {len(state_coords)}")

    path_dist = 0  # initialize

    # walk the path one leg (successive pair of states) at a time
    for state_1, state_2 in zip(path_list, path_list[1:]):
        try:
            # get the coordinates of each state
            state_1_lat = state_coords.at[state_1, 'Latitude']
            state_1_lon = state_coords.at[state_1, 'Longitude']
            state_2_lat = state_coords.at[state_2, 'Latitude']
            state_2_lon = state_coords.at[state_2, 'Longitude']
        except KeyError:
            # a jurisdiction without coordinates: assume 0 distance for this leg.
            # Only the missing-label case is tolerated; any other error is a
            # real problem and is allowed to surface.
            continue

        path_dist += haversine(
            state_1_lon, state_1_lat,
            state_2_lon, state_2_lat)

    return path_dist

In [None]:
def process_one_coord_list(coord_list):
    """
    Total great-circle length (km) of a path given as coordinates.

    Parameters
    ----------
    coord_list : list of (latitude, longitude) tuples, in path order.

    Returns
    -------
    Sum of the haversine distances between each successive pair of points.

    Raises
    ------
    ValueError
        If `coord_list` has fewer than two entries. Previously this case
        printed a message and then failed with UnboundLocalError, because
        the running total was never initialized.
    """
    if len(coord_list) < 2:
        raise ValueError(
            "Error!" + " All coord_lists should have more than 1 tuple."
            + f" coord_list: {coord_list}")

    path_dist = 0  # initialize

    # walk the list one leg (successive pair of coordinates) at a time
    for coord_1, coord_2 in zip(coord_list, coord_list[1:]):
        # tuples are (lat, lon) but haversine takes (lon, lat)
        pair_dist = haversine(
            coord_1[1], coord_1[0],
            coord_2[1], coord_2[0])

        path_dist = path_dist + pair_dist

    return path_dist

In [None]:
def calc_distance_gas_travels_to_each_state(
    contig_48_states_dc, state_df_file_template, prod_leak_timestamp):
    """
    Share-weighted average transmission distance of gas to each consuming state.

    For each state, reads that state's gas-flows results CSV from
    `inputs_path + 'GIM gas flows results {prod_leak_timestamp}/'`, cleans
    each path, measures it, and averages the path distances weighted by the
    'share scaled' column.

    Reads the module-level `inputs_path` and `state_coords`.

    Parameters
    ----------
    contig_48_states_dc : iterable of full state names to process.
    state_df_file_template : file name containing the placeholder text
        'consuming_state', which is replaced by each state name.
    prod_leak_timestamp : timestamp that identifies the results folder.

    Returns
    -------
    DataFrame with columns 'state' and 'avg distance (km)'.

    Raises
    ------
    FileNotFoundError
        If a state's results file is missing. Previously the error was
        swallowed and the loop carried on with the previous state's data
        (or failed with NameError on the first state).
    """
    state_distances_list = []  # initialize

    for consuming_state in contig_48_states_dc:
        print(f"processing consuming_state: {consuming_state}") # for UI

        # create name of file using template, and inserting a particular state
        state_df_file = state_df_file_template.replace('consuming_state', consuming_state)

        try:
            state_df = pd.read_csv(inputs_path + f'GIM gas flows results {prod_leak_timestamp}/' + state_df_file)
        except FileNotFoundError:
            print("Error!" + " File not found. It may be that the state files need to be moved to the GIM inputs folder.")
            print(f"File name: {state_df_file}")
            raise

        state_df = create_clean_paths(state_df)

        for row in state_df.index:
            # 'clean path' holds str(list); turn it back into a list of names
            path_str = state_df.at[row, 'clean path']
            path_list = path_str.strip('[]').replace("'", "").split(', ')

            state_df.at[row, 'path dist'] = process_one_path_list(path_list, state_coords)

        # after iterating through all rows (for all the paths),
        # calculate the weighted average distance for each consuming state
        state_df_weighted = state_df['share scaled'] * state_df['path dist']
        state_avg_dist = state_df_weighted.sum()/state_df['share scaled'].sum()

        # add result to list (to become df) for all states
        state_distances_list.append((consuming_state, state_avg_dist))

    state_distances = pd.DataFrame(state_distances_list, columns=['state', 'avg distance (km)'])

    return state_distances

In [None]:
# jurisdictions whose consumption the leakage is attributed to:
# the contiguous 48 states + DC (full names, alphabetical)
contig_48_states_dc = [
    'Alabama', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia',
    'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
    'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
    'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
    'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
    'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia',
    'Washington', 'West Virginia', 'Wisconsin', 'Wyoming',
]

In [None]:
# load the coordinates table and name/abbreviation lookups; state_coords is
# also read as a module-level name inside the distance functions
(state_coords, states_dict_abbrev_to_full, states_dict_full_to_abbrev) = read_state_coords_and_abbrev(
    state_coords_file, state_abbrev_file)

# share-weighted average transmission distance (km) for each consuming state
state_distances = calc_distance_gas_travels_to_each_state(
    contig_48_states_dc, state_df_file_template, prod_leak_timestamp)

In [None]:
# calculate nationwide average leakage fraction per distance
trans_leak_fract_per_km = trans_leak_fract / trans_distance_avg_km

# convert from leakage fraction to g CH4/Mcf, per km travelled (1e9 = g per Gg).
# This gets its own name: it is a per-km factor, and storing it in
# trans_leak_g_ch4_per_mcf would overwrite the national g CH4/Mcf rate
# computed earlier with a value in different units.
trans_leak_g_ch4_per_mcf_per_km = trans_leak_fract_per_km * conversion_consumer_ng_mcf_to_ch4_gg * 1e9

# per-state rate = per-km factor x that state's average transmission distance
trans_leak_state = state_distances.copy()
trans_leak_state['trans leak g CH4/Mcf'] = trans_leak_state['avg distance (km)'] * trans_leak_g_ch4_per_mcf_per_km

In [None]:
# export
# per-state average distance and leakage rate, written to lca_path;
# the file name carries the analysis year and a save timestamp (local time)
save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
trans_leak_state.to_csv(
    lca_path + f'GIM average transmission distance for gas by consuming state for analysis year {data_year} {save_timestamp}.csv', 
    index=False,
)

In [None]:
def read_states_df_create_list(state_abbrev_file):
    """
    Build the list of states and the list of all jurisdictions.

    Reads the states workbook from the module-level `inputs_path`. When the
    module-level `contiguous_us_only` equals True, Alaska and Hawaii are
    left out.

    Returns
    -------
    (states_list, all_jurisdictions_list)
        states_list holds the values of the 'state' column;
        all_jurisdictions_list is states_list plus 'Canada', 'Mexico'
        and 'overseas'.
    """
    states_df = pd.read_excel(inputs_path + state_abbrev_file)

    if contiguous_us_only == True:
        is_non_contiguous = states_df['state'].isin(['Alaska', 'Hawaii'])
        states_df = states_df.loc[~is_non_contiguous]

    states_list = states_df['state'].tolist()
    non_state_jurisdictions = ['Canada', 'Mexico', 'overseas']

    return (states_list, states_list + non_state_jurisdictions)

In [None]:
def read_eia_consump_data(states_list, states_dict_full_to_abbrev):
    """
    Import EIA data on gas consumption by state. Units are MMcf (million cubic feet).

    One workbook per state is read from the module-level
    `gas_consump_data_path`; only the row for the module-level `data_year`
    and the '<state> Natural Gas Total Consumption (MMcf)' column are kept.

    Parameters
    ----------
    states_list : list of full state names to load.
    states_dict_full_to_abbrev : dict mapping full state name to the
        abbreviation used in the EIA file names.

    Returns
    -------
    DataFrame indexed by state name with one column, 'consump MMcf'.
    """
    per_state_totals = []  # one single-column frame per state

    for state in states_list:
        state_abbrev = states_dict_full_to_abbrev[state]
        consump_juris_df = pd.read_excel(
            gas_consump_data_path +
            f'NG_CONS_SUM_DCU_S{state_abbrev}_A.xls',
            sheet_name='Data 1',
            header=2,
        )

        # the year is the text before the first '-' of the date
        consump_juris_df['year'] = consump_juris_df['Date'].astype(str).str.split('-').str[0].astype(int)

        consump_juris_df = consump_juris_df.set_index('year')
        consump_juris_df = consump_juris_df.drop('Date', axis=1)

        # keep only data_year
        consump_juris_df = consump_juris_df.loc[consump_juris_df.index==data_year]

        # pull out total consumption series, labelled with the state name
        total_column = f'{state} Natural Gas Total Consumption (MMcf)'
        per_state_totals.append(
            consump_juris_df[[total_column]].rename(columns={total_column: state}))

    # concatenate once, rather than growing a frame inside the loop
    if per_state_totals:
        df = pd.concat(per_state_totals, axis=1)
    else:
        df = pd.DataFrame()

    # states become rows; the single data_year column becomes 'consump MMcf'
    consump_juris_tot_mmcf = df.T.rename(columns={data_year: 'consump MMcf'})
    # drop the 'year' label that the transposed index would leave on the columns
    consump_juris_tot_mmcf.columns.name = None

    return consump_juris_tot_mmcf

In [None]:
# states to load consumption data for (Alaska and Hawaii are excluded when
# contiguous_us_only is True), plus the wider jurisdiction list
(states_list, all_jurisdictions_list) = read_states_df_create_list(state_abbrev_file)

In [None]:
consump_juris_tot_mmcf = read_eia_consump_data(states_list, states_dict_full_to_abbrev)

# outer merge keeps jurisdictions that appear in only one of the two tables
df = pd.merge(state_distances, consump_juris_tot_mmcf, left_on='state', right_index=True, how='outer')

# leaked gas (MMcf) = leak fraction per km x distance x consumption.
# The columns are multiplied directly so that a jurisdiction missing either
# value comes out as NaN (and is then left out of the sums below).
# DataFrame.product(axis=1) skips NaN, which would silently treat a missing
# distance as 1 km.
df['trans leak NG MMcf'] = trans_leak_per_km * (df['avg distance (km)'] * df['consump MMcf'])

# calculate total transmission leakage, based on transport of gas from state to state
# (MMcf -> Mcf, then Mcf of consumer-grade gas -> Gg CH4)
df['trans leak CH4 Gg'] = df['trans leak NG MMcf'] * mcf_per_mmcf * conversion_consumer_ng_mcf_to_ch4_gg

if contiguous_us_only==True:
    print(f"transmission leakage for contiguous US (Gg): {round(df['trans leak CH4 Gg'].sum(), 1)}")
elif contiguous_us_only == False:
    print(f"transmission leakage for all US (Gg): {round(df['trans leak CH4 Gg'].sum(), 1)}")

# `df` is still read by the national-rate cell below; this is the named result
trans_leak_state_level = df

In [None]:
# export:
# state-level leakage table built in the previous cell. The earlier code
# referenced `trans_leak`, a name that is never defined in this notebook,
# so the cell failed with NameError on a fresh run.
save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
trans_leak_state_level.to_excel(
    lca_path + 
    f'GIM results - transmission leakage by consuming state {save_timestamp}.xlsx', 
    index=False)

In [None]:
# calculate transmission leakage rate g/Mcf
# assume all gas consumed by end-users is transported over the average distance calculated

# note that the state-level leakage ('trans leak CH4 Gg') calculated above is based on total consumption, 
# which includes "lease and plant fuel consumption"

# to be more conservative, could assume zero distance for lease and plant fuel consumption
# from EIA definitions
# https://www.eia.gov/dnav/ng/TblDefs/ng_cons_sum_tbldef2.asp
# Lease use: Natural gas used in well, field, and lease operations, such as gas used in drilling operations, heaters, dehydrators, and field compressors
# Plant fuel: Natural gas used as fuel in natural gas processing plants

# numerator: summed leakage, Gg -> g (x 1e9); denominator: summed consumption, MMcf -> Mcf (x 1000)
# relies on `df` still being the state-level leakage table from the cell above;
# this reassigns trans_leak_g_ch4_per_mcf
trans_leak_g_ch4_per_mcf = (df['trans leak CH4 Gg'].sum()*1e9)/(consump_juris_tot_mmcf['consump MMcf'].sum()*1000)

print(f"trans_leak_g_ch4_per_mcf: {round(trans_leak_g_ch4_per_mcf, 1)}")

### Transmission: for each city, calculate distance & emissions

In [None]:
def calc_city_distances_gas_transmission(
    cities_file, states_dict_abbrev_to_full,
    state_df_file_template, prod_leak_timestamp):
    """
    Share-weighted average transmission distance of gas to each city.

    For every unique city row, the gas-flows results for the city's state are
    read and cleaned, and each path is measured with the city's own
    coordinates standing in for the consuming state's coordinates.

    Reads the module-level `inputs_path` and `state_coords`.

    Parameters
    ----------
    cities_file : workbook (sheet 'data') with the columns 'metro area',
        'metro area state', 'Census urban area name', 'latitude',
        'longitude' and 'notes'.
    states_dict_abbrev_to_full : dict mapping state abbreviation to full name.
    state_df_file_template : file name containing the placeholder text
        'consuming_state', which is replaced by the full state name.
    prod_leak_timestamp : timestamp that identifies the results folder.

    Returns
    -------
    DataFrame with columns 'metro area', 'metro area state', 'urban name'
    and 'avg distance (km)'. A city whose shares do not sum to 1 (within
    1e-3) is reported and left out.
    """
    # import data on cities
    cities = pd.read_excel(inputs_path + cities_file, sheet_name='data')
    cities = cities.drop(['notes'], axis=1)

    city_distances_list = []  # initialize

    city_coords = cities[['metro area', 'metro area state', 'Census urban area name',
                          'latitude', 'longitude']].drop_duplicates()
    city_coords = city_coords.set_index(['metro area', 'metro area state'])

    for city_st in city_coords.index:
        print(f"processing {city_st}")
        city_coords_tuple = (city_coords.loc[city_st, 'latitude'], city_coords.loc[city_st, 'longitude'])

        metro_area = city_st[0]
        consuming_state_abbrev = city_st[1]
        urban_name = city_coords.at[city_st, 'Census urban area name']
        consuming_state = states_dict_abbrev_to_full[consuming_state_abbrev]

        # create name of file using template, and inserting a particular state
        state_df_file = state_df_file_template.replace('consuming_state', consuming_state)
        state_df = pd.read_csv(inputs_path + f'GIM gas flows results {prod_leak_timestamp}/' + state_df_file)
        state_df = create_clean_paths(state_df)

        # TEST: check that sum of shares is 1 (within rounding error)
        shares_sum = state_df['share scaled'].sum()
        if abs(1 - shares_sum) >= 1e-6:
            print("Error!" + f" Shares sum is not 1 (within rounding error); it's: {shares_sum}")
        # END OF TEST

        city_df = state_df.copy()

        for row in state_df.index:
            # 'clean path' holds str(list); turn it back into a list of names
            path_str = state_df.at[row, 'clean path']
            path_list = path_str.strip('[]').replace("'", "").split(', ')

            coord_list = []  # initialize

            path_coords_tuple_prev = ()  # initialize
            for path_element in path_list:
                try:
                    path_coords_tuple = (state_coords.at[path_element, 'Latitude'], state_coords.at[path_element, 'Longitude'])
                except KeyError:
                    # jurisdiction without coordinates: reuse the previous
                    # point, so this leg adds no distance. Only the
                    # missing-label case is tolerated.
                    path_coords_tuple = path_coords_tuple_prev

                path_coords_tuple_prev = path_coords_tuple

                coord_list.append(path_coords_tuple)

            if len(path_list) == 1:
                # if the list has len == 1, it is for consumption that comes from production in the same state
                # put the city coordinates at the start of the list, to calculate distance from city to center of state
                coord_list = [city_coords_tuple] + coord_list
            else:
                # replace the first tuple of coordinates (the consuming state) with the city's coordinates
                coord_list = [city_coords_tuple] + coord_list[1:]

            city_df.at[row, 'path dist'] = process_one_coord_list(coord_list)

        # after iterating through all rows (for all the paths),
        # calculate the weighted average distance for each consuming state
        if abs(1 - city_df['share scaled'].sum()) < 1e-3:
            # then all fractional shares of imports add up to 1
            # calculate average distance by multiplying fractional shares
            # by the path distance for that particular share
            city_avg_dist = (city_df['share scaled'] * city_df['path dist']).sum()

            # add result to list (to become df) for all cities
            city_distances_list.append([metro_area, consuming_state_abbrev, urban_name, city_avg_dist])
        else:
            print("Error!" + f" For {city_st}, the fractional shares didn't sum to 1: {city_df['share scaled'].sum()}")

    # after iterating through all cities, assemble into df
    city_distances = pd.DataFrame(city_distances_list, columns=[
        'metro area', 'metro area state', 'urban name', 'avg distance (km)'])

    return city_distances

In [None]:
# share-weighted average transmission distance (km) for each city in cities_file
city_distances = calc_city_distances_gas_transmission(
    cities_file, states_dict_abbrev_to_full, 
    state_df_file_template, prod_leak_timestamp)

In [None]:
# show cities ordered from shortest to longest average distance
city_distances.sort_values(by='avg distance (km)')

In [None]:
# summary: unweighted mean across cities, and spread between the extremes
print(f"mean distance gas traveled to each city: {round(city_distances['avg distance (km)'].mean(), 1)} km")
# NOTE(review): a city with an average distance of 0 would make this ratio infinite
ratio = city_distances['avg distance (km)'].max()/city_distances['avg distance (km)'].min()

print(f"ratio of max/min transmission distance: {round(ratio, 1)}")

## transmission leakage by city
(adjustment to production and transmission leakage rates)

In [None]:
def calc_trans_leak_by_city(city_distances):
    """
    Transmission leakage rate (g CH4 per Mcf delivered) for each city.

    The leak fraction per km (module-level `trans_leak_fract` divided by
    `trans_distance_avg_km`) is scaled by each city's average transmission
    distance and converted to a mass of methane using the module-level
    `ch4_fract_in_ng_consumer_grade`, `ch4_kg_per_mcf` and `g_per_ton`.

    Parameters
    ----------
    city_distances : DataFrame with the columns 'metro area',
        'metro area state', 'urban name' and 'avg distance (km)'.

    Returns
    -------
    DataFrame with those four columns plus 'trans leak g CH4/Mcf'.
    The input frame is not modified.
    """
    trans_leak_per_km = trans_leak_fract / trans_distance_avg_km

    df = city_distances.copy()
    df['trans leakage fract'] = trans_leak_per_km * df['avg distance (km)']

    # convert to mass methane per volume of natural gas delivered
    # convert vol NG to vol CH4, then convert vol CH4 to mass (kg -> tons), then tons to g
    df['trans leak g CH4/Mcf'] = df['trans leakage fract'] * ch4_fract_in_ng_consumer_grade * (ch4_kg_per_mcf/1000) * g_per_ton

    # keep only the reported columns (the intermediate fraction is dropped)
    trans_leak_by_city = df[[
        'metro area',
        'metro area state',
        'urban name',
        'avg distance (km)',
        'trans leak g CH4/Mcf',
    ]]

    return trans_leak_by_city

In [None]:
# per-city transmission leakage rate, derived from the city distances above
trans_leak_by_city = calc_trans_leak_by_city(city_distances)

In [None]:
# show cities ordered from lowest to highest transmission leakage rate
trans_leak_by_city.sort_values(by='trans leak g CH4/Mcf')

In [None]:
# export
# NOTE(review): unlike the other exports in this notebook, this file name has
# no lca_path prefix, so the CSV is written relative to the current working
# directory - confirm that this is intended
save_timestamp = time.strftime('%Y-%m-%d_%H%M', time.localtime())
trans_leak_by_city.to_csv(
    f'GIM trans leak by city for {data_year} {save_timestamp}.csv', 
    index=False)

# END