## Overview

The goal of this notebook is to gather data from the National Transit Database (NTD) and parse it into something usable for our transit carbon calculations. There are two datasets we need to import and parse: the [NTD Annual Data - Fuel and Energy](https://data.transportation.gov/Public-Transit/2022-NTD-Annual-Data-Fuel-and-Energy/8ehq-7his/data) dataset and the [NTD Service](https://www.transit.dot.gov/ntd/data-product/2022-service) dataset.

<details>
<summary>NTD Fuel and Energy data format</summary>

</details>

<details>
<summary>NTD Service data format</summary>

</details>

#### To Maintain

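To update the data each year, add an entry for the new year to the `ntd_urls` dictionary below, containing the most recent download links for the Fuel and Energy and Service datasets. The suffix of each key (`_csv`, `_xlsx`, `_xlsm` or `_zip`) tells the loader how to read the file. Keep the newest year as the first entry: the NTD ID to UACE code mapping is built from the first Fuel and Energy dataset that is loaded, so that dataset must contain a `UACE Code` column.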

In [1]:
import json
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

import pandas as pd

# Download links for every NTD reporting year, newest year first.
# Each year maps "<dataset>_<format>" keys to a URL, where <dataset> is
# "fuel_energy" or "service" and <format> is "csv", "xlsx", "xlsm" or "zip".
# The comment above each link is the NTD product page it was taken from.
ntd_urls = {
    2022: {
        # Product page: https://www.transit.dot.gov/ntd/data-product/2022-fuel-and-energy
        "fuel_energy_csv": "https://data.transportation.gov/api/views/8ehq-7his/rows.csv?date=20231027&accessType=DOWNLOAD&bom=true&format=true",
        # Product page: https://www.transit.dot.gov/ntd/data-product/2022-service
        "service_csv": "https://data.transportation.gov/api/views/4fir-qbim/rows.csv?date=20231102&accessType=DOWNLOAD&bom=true&format=true",
    },
    2021: {
        # Product page: https://www.transit.dot.gov/ntd/data-product/2021-fuel-and-energy
        "fuel_energy_xlsx": "https://www.transit.dot.gov/sites/fta.dot.gov/files/2023-12/2021%20Fuel%20and%20Energy_1-1_0.xlsx",
        # Product page: https://www.transit.dot.gov/ntd/data-product/2021-service
        "service_xlsx": "https://www.transit.dot.gov/sites/fta.dot.gov/files/2022-10/2021%20Service_static.xlsx",
    },
    2020: {
        # Product page: https://www.transit.dot.gov/ntd/data-product/2020-fuel-and-energy
        "fuel_energy_xlsx": "https://www.transit.dot.gov/sites/fta.dot.gov/files/2023-12/2020-Fuel%20and%20Energy_1-1_1.xlsx",
        # Product page: https://www.transit.dot.gov/ntd/data-product/2020-service
        "service_zip": "https://www.transit.dot.gov/sites/fta.dot.gov/files/2020-Service.zip",
    },
    2019: {
        # Product page: https://www.transit.dot.gov/ntd/data-product/2019-fuel-and-energy
        "fuel_energy_zip": "https://www.transit.dot.gov/sites/fta.dot.gov/files/Fuel%20and%20Energy.zip",
        # Product page: https://www.transit.dot.gov/ntd/data-product/2019-service
        "service_zip": "https://www.transit.dot.gov/sites/fta.dot.gov/files/Service.zip",
    },
    2018: {
        # Product page: https://www.transit.dot.gov/ntd/data-product/2018-fuel-and-energy
        "fuel_energy_xlsm": "https://www.transit.dot.gov/sites/fta.dot.gov/files/Fuel%20and%20Energy_3.xlsm",
        # Product page: https://www.transit.dot.gov/ntd/data-product/2018-service
        "service_xlsm": "https://www.transit.dot.gov/sites/fta.dot.gov/files/Service_4.xlsm",
    },
}

### Unify the NTD Fuel and Energy and Service data into one JSON file

The NTD data is published in a different file format and structure from year to year, so we need to unify it into one cohesive JSON file. We also need to refactor the data a bit: converting the rows, which are represented by arrays, into dictionaries; removing extraneous values; and adding additional fields where necessary.

In [27]:
# Module-level state shared by the functions and the processing loop below.
# Rows grouped by UACE code, keyed by year (written by refactor_fuel_energy_data).
ntd_fuel_energy_data = dict()
# NTD ID -> UACE code lookup (written by map_ntd_to_uace).
ntd_to_uace = dict()
# Merged records keyed by year; this is what gets written to the JSON file.
output = dict()

def refactor_fuel_energy_data(url_entry):
    '''
    Load one year of NTD Fuel and Energy data and store it grouped by UACE code.

    Steps:
    1) Load in the data from the file
    2) Convert the rows into dictionaries
    3) Group all the rows by UACE code
    4) Store the grouped rows in ntd_fuel_energy_data under the entry's year

    url_entry: dictionary with a "year" value plus the Fuel and Energy download
        link under a "fuel_energy_<format>" key, as expected by load_dataframe.

    Nothing is returned; the result is stored in the module-level
    ntd_fuel_energy_data dictionary. group_by_uace reads row["UACE Code"], so
    a dataset without that column raises a KeyError.
    '''
    year = url_entry["year"]
    # Load in data (load_dataframe needs the key prefix AND the sheet name)
    df = load_dataframe(url_entry, "fuel_energy", "Fuel and Energy")
    # Ensure there are no NaN values
    df = df.fillna(0)
    # Convert every row into a {column name: value} dictionary
    converted_rows = df.to_dict(orient="records")
    # Group all rows by UACE code and add them to the data
    ntd_fuel_energy_data[year] = group_by_uace(converted_rows)
    # str() keeps this working when the year is stored as an int
    print("Fuel and Energy data year " + str(year) + " is finished!")


def load_dataframe(urls, db_name, sheet_name):
    '''
    Return a dataframe for the given urls, database name and sheet name.

    urls: dictionary mapping "<db_name>_<format>" keys to download links, where
        <format> is one of "csv", "xlsx", "xlsm" or "zip".
    db_name: key prefix of the dataset to load, e.g. "fuel_energy" or "service".
    sheet_name: worksheet to read from Excel sources (not used for CSV sources).

    Formats are tried in the order csv, xlsx, xlsm, zip and the first key that
    is present wins. For a zip source the first member of the archive is read
    as an Excel workbook.

    Raises ValueError if urls holds no supported key for db_name, or if a zip
    archive is empty (previously this returned None or raised IndexError).
    '''
    csv_key = db_name + "_csv"
    xlsx_key = db_name + "_xlsx"
    xlsm_key = db_name + "_xlsm"
    zip_key = db_name + "_zip"

    if csv_key in urls:
        return pd.read_csv(urls[csv_key])
    if xlsx_key in urls:
        return pd.read_excel(urls[xlsx_key], sheet_name=sheet_name)
    if xlsm_key in urls:
        # Download the workbook ourselves and hand pandas an in-memory buffer
        with urlopen(urls[xlsm_key]) as response:
            return pd.read_excel(BytesIO(response.read()), sheet_name=sheet_name)
    if zip_key in urls:
        with urlopen(urls[zip_key]) as response:
            with ZipFile(BytesIO(response.read())) as archive:
                members = archive.namelist()
                if not members:
                    raise ValueError("Zip archive for '" + zip_key + "' contains no files")
                # Buffer the first member so pandas gets a seekable file object
                workbook = BytesIO(archive.read(members[0]))
                return pd.read_excel(workbook, sheet_name=sheet_name)

    raise ValueError(
        "No download link for '" + db_name + "': expected one of the keys "
        + ", ".join([csv_key, xlsx_key, xlsm_key, zip_key])
    )


# def convert_arrays_dictionary(df):
#     '''
#     Takes in a dataframe and converts all of the rows into a dictionary.
#     '''
#     for _, row in df.iterrows():
#         print(row.to_dict().items())
#     converted_rows = [{k: v for k, v in row.to_dict().items()} for _, row in df.iterrows()]
#     print(converted_rows)
#     return converted_rows

def group_by_uace(converted_rows):
    '''
    Organize the data into UACE codes.

    converted_rows: iterable of row dictionaries, each with a "UACE Code" key.

    Returns a dictionary mapping every UACE code to the list of rows that
    carry it, in their original order. Raises KeyError if a row has no
    "UACE Code" key.

    Example data:
    {
        "uace_code": [
            {
                "field1": 1,
                ...
            },
            {
                "field1": 1,
                ...
            }
        ]
    }
    '''
    # A plain dict with setdefault is used so the function does not depend on
    # collections.defaultdict, which this notebook never imported.
    aggregate_data = {}
    for row in converted_rows:
        aggregate_data.setdefault(row["UACE Code"], []).append(row)
    return aggregate_data

def map_ntd_to_uace(fuel_energy_df, mapping=None):
    '''
    Use 2022 data to create a mapping between NTD ids and UACE codes because older versions of the data
    2018-2021 don't have an UACE field.

    fuel_energy_df: dataframe with "NTD ID" and "UACE Code" columns.
    mapping: dictionary to fill in; defaults to the module-level ntd_to_uace.

    Returns the dictionary that was updated. When an NTD ID appears on several
    rows, the value from the last row wins.
    '''
    if mapping is None:
        mapping = ntd_to_uace
    # Pair the two columns directly instead of iterating with iterrows():
    # iterrows() builds a Series per row, which can upcast integer IDs to
    # float when the frame's columns are all numeric.
    ntd_ids = fuel_energy_df["NTD ID"].tolist()
    uace_codes = fuel_energy_df["UACE Code"].tolist()
    mapping.update(zip(ntd_ids, uace_codes))
    return mapping

# Column renames applied to every Fuel and Energy dataset: columns named after
# a bare fuel type get a " (miles)" suffix, and the lower-case "kwh" unit
# spelling is normalised to "kWh".
fuel_energy_col_renames = {
    fuel_name: fuel_name + " (miles)"
    for fuel_name in (
        "Diesel",
        "Gasoline",
        "Liquefied Petroleum Gas",
        "Compressed Natural Gas",
        "Bio-Diesel",
        "Hydrogen",
        "Other Fuel",
        "Electric Propulsion",
        "Electric Battery",
    )
}
fuel_energy_col_renames.update({
    "Electric Propulsion (kwh)": "Electric Propulsion (kWh)",
    "Electric Battery (kwh)": "Electric Battery (kWh)",
})

# Column renames applied to every Service dataset.
service_col_renames = dict([
    ("Passenger Miles", "Passenger Miles Traveled"),
    ("Unlinked Passenger Trips (UPT)", "Unlinked Passenger Trips"),
    ("Actual Vehicle/Passenger Car Miles", "Vehicle Miles"),
])

# Every row is identified by its agency (NTD ID) and transit mode.
KEY_COLUMNS = ['NTD ID', 'Mode']

# Columns kept from each Fuel and Energy dataset (after renaming).
# NOTE(review): 'Bio-Diesel (miles)' is left out of both this selection and
# the 'All Fuels (miles)' total below - confirm that this is intentional.
FUEL_ENERGY_COLUMNS = KEY_COLUMNS + [
    'Diesel (gal)',
    'Gasoline (gal)',
    'Liquefied Petroleum Gas (gal equivalent)',
    'Compressed Natural Gas (gal equivalent)',
    'Bio-Diesel (gal)',
    'Hydrogen (kg)',
    'Other Fuel (gal/gal equivalent)',
    'Electric Propulsion (kWh)',
    'Electric Battery (kWh)',
    'Diesel (miles)',
    'Gasoline (miles)',
    'Liquefied Petroleum Gas (miles)',
    'Compressed Natural Gas (miles)',
    'Hydrogen (miles)',
    'Other Fuel (miles)',
    'Electric Propulsion (miles)',
    'Electric Battery (miles)',
]

# The mileage columns that are added up into 'All Fuels (miles)'.
FUEL_MILES_COLUMNS = [col for col in FUEL_ENERGY_COLUMNS if col.endswith('(miles)')]

# Columns kept from each Service dataset (after renaming).
SERVICE_COLUMNS = KEY_COLUMNS + [
    'Passenger Miles Traveled',
    'Unlinked Passenger Trips',
    'Vehicle Miles',
    'Train Miles',
]


def _to_numeric_where_possible(column):
    '''
    Return the column converted to a numeric dtype, or unchanged if any value
    cannot be parsed. Replaces the deprecated pd.to_numeric(errors='ignore').
    '''
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def _clean_and_sum(df, columns):
    '''
    Keep only the given columns, strip thousands separators, convert to
    numeric where possible and sum the rows that share an NTD ID and Mode.
    '''
    cleaned = df[columns]
    # remove commas, convert to numeric, and sort
    cleaned = cleaned.replace({',': ''}, regex=True)
    cleaned = cleaned.apply(_to_numeric_where_possible)
    cleaned = cleaned.sort_values(by=KEY_COLUMNS)
    # sum rows with the same NTD ID and Mode
    return cleaned.groupby(KEY_COLUMNS, as_index=False).sum()


def _nan_to_none(record):
    '''
    Return a copy of the record with missing values (NaN) replaced by None,
    so that they are written as JSON null instead of the invalid token NaN.
    '''
    return {key: (None if pd.isna(value) else value) for key, value in record.items()}


# Reset the lookup so that re-running this cell rebuilds it from scratch.
ntd_to_uace = {}
for year, urls in ntd_urls.items():
    fuel_energy_df = load_dataframe(urls, 'fuel_energy', 'Fuel and Energy')
    # The lookup is built once, from the first dataset that is loaded
    if not ntd_to_uace:
        map_ntd_to_uace(fuel_energy_df)
    fuel_energy_df = fuel_energy_df.rename(columns=fuel_energy_col_renames)
    # if Hydrogen cols are missing, add them with all 0 values
    for hydrogen_col in ('Hydrogen (miles)', 'Hydrogen (kg)'):
        if hydrogen_col not in fuel_energy_df.columns:
            fuel_energy_df[hydrogen_col] = 0
    fuel_energy_df = _clean_and_sum(fuel_energy_df, FUEL_ENERGY_COLUMNS)

    # Add the mileage columns one by one (same result as chaining them with +)
    fuel_energy_df['All Fuels (miles)'] = sum(
        (fuel_energy_df[col] for col in FUEL_MILES_COLUMNS[1:]),
        fuel_energy_df[FUEL_MILES_COLUMNS[0]],
    )

    service_df = load_dataframe(urls, 'service', 'Annual Service Data By Mode')
    service_df = service_df.rename(columns=service_col_renames)
    service_df = _clean_and_sum(service_df, SERVICE_COLUMNS)

    # remove rows that have "Passenger Miles Traveled" = 0
    service_df = service_df[service_df['Passenger Miles Traveled'] != 0]

    print(fuel_energy_df.dtypes)
    display(fuel_energy_df)
    print(service_df.dtypes)
    display(service_df)

    # Inner join: only agency/mode pairs present in both datasets are kept
    df = fuel_energy_df.merge(service_df, on=KEY_COLUMNS)
    # remove rows that have "All Fuels (miles)" = 0
    # (.copy() so the column assignment below does not target a slice)
    df = df[df['All Fuels (miles)'] != 0].copy()

    # add UACE code (NaN where the NTD ID is not in the lookup)
    df['UACE Code'] = df['NTD ID'].map(ntd_to_uace)

    print(df.dtypes)
    display(df)

    output[year] = [_nan_to_none(record) for record in df.to_dict(orient='records')]

# Write to file
with open('ntd_data.json', 'w') as f:
    json.dump(output, f, indent=2)

  fuel_energy_df = fuel_energy_df.apply(pd.to_numeric, errors='ignore')


NTD ID                                       int64
Mode                                        object
Diesel (gal)                                 int64
Gasoline (gal)                               int64
Liquefied Petroleum Gas (gal equivalent)     int64
Compressed Natural Gas (gal equivalent)      int64
Bio-Diesel (gal)                             int64
Hydrogen (kg)                                int64
Other Fuel (gal/gal equivalent)              int64
Electric Propulsion (kWh)                    int64
Electric Battery (kWh)                       int64
Diesel (miles)                               int64
Gasoline (miles)                             int64
Liquefied Petroleum Gas (miles)              int64
Compressed Natural Gas (miles)               int64
Hydrogen (miles)                             int64
Other Fuel (miles)                           int64
Electric Propulsion (miles)                  int64
Electric Battery (miles)                     int64
All Fuels (miles)              

  service_df = service_df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),Electric Battery (kWh),Diesel (miles),Gasoline (miles),Liquefied Petroleum Gas (miles),Compressed Natural Gas (miles),Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles)
0,1,DR,48943,695954,274670,0,0,0,0,0,0,516803,4964066,1370803,0,0,1370803,0,0,8222475
1,1,FB,18944,0,0,0,194128,0,0,0,0,52574,0,0,0,0,0,0,0,52574
2,1,MB,7241546,139985,0,0,381262,0,0,0,970902,35273756,1486573,0,0,0,0,0,411760,37172089
3,1,SR,0,0,0,0,0,0,0,1871516,0,0,0,0,0,0,0,186566,0,186566
4,1,TB,0,0,0,0,0,0,0,14936108,0,0,0,0,0,0,0,2817710,0,2817710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,99423,DR,0,13600,0,0,0,0,0,0,0,0,127685,0,0,0,0,0,0,127685
1148,99423,MB,0,0,0,354288,0,0,0,0,0,0,0,0,920187,0,0,0,0,920187
1149,99424,DR,0,31512,0,0,0,0,0,0,0,0,165476,0,0,0,0,0,0,165476
1150,99424,MB,0,0,0,258854,0,0,0,0,0,0,0,0,880599,0,0,0,0,880599


NTD ID                       int64
Mode                        object
Passenger Miles Traveled     int64
Unlinked Passenger Trips     int64
Vehicle Miles                int64
Train Miles                  int64
dtype: object


Unnamed: 0,NTD ID,Mode,Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles
0,1,DR,7159157,666004,6456147,0
1,1,FB,1361870,400407,51236,0
2,1,MB,213459017,53983641,37095313,0
3,1,SR,1269696,1117605,186566,186566
4,1,TB,17545751,9575042,2812243,0
...,...,...,...,...,...,...
3457,99423,DR,69757,17839,102368,0
3458,99423,MB,1127888,517073,919317,0
3459,99424,DR,125538,37434,161226,0
3460,99424,MB,1905376,1053208,797741,0


NTD ID                                       int64
Mode                                        object
Diesel (gal)                                 int64
Gasoline (gal)                               int64
Liquefied Petroleum Gas (gal equivalent)     int64
Compressed Natural Gas (gal equivalent)      int64
Bio-Diesel (gal)                             int64
Hydrogen (kg)                                int64
Other Fuel (gal/gal equivalent)              int64
Electric Propulsion (kWh)                    int64
Electric Battery (kWh)                       int64
Diesel (miles)                               int64
Gasoline (miles)                             int64
Liquefied Petroleum Gas (miles)              int64
Compressed Natural Gas (miles)               int64
Hydrogen (miles)                             int64
Other Fuel (miles)                           int64
Electric Propulsion (miles)                  int64
Electric Battery (miles)                     int64
All Fuels (miles)              

Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),...,Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles),Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles,UACE Code
0,1,DR,48943,695954,274670,0,0,0,0,0,...,0,1370803,0,0,8222475,7159157,666004,6456147,0,80389
1,1,FB,18944,0,0,0,194128,0,0,0,...,0,0,0,0,52574,1361870,400407,51236,0,80389
2,1,MB,7241546,139985,0,0,381262,0,0,0,...,0,0,0,411760,37172089,213459017,53983641,37095313,0,80389
3,1,SR,0,0,0,0,0,0,0,1871516,...,0,0,186566,0,186566,1269696,1117605,186566,186566,80389
4,1,TB,0,0,0,0,0,0,0,14936108,...,0,0,2817710,0,2817710,17545751,9575042,2812243,0,80389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1141,99422,VP,0,391661,0,0,0,0,0,0,...,0,0,0,0,5519061,37515600,724678,8391406,0,85087
1142,99423,DR,0,13600,0,0,0,0,0,0,...,0,0,0,0,127685,69757,17839,102368,0,51445
1143,99423,MB,0,0,0,354288,0,0,0,0,...,0,0,0,0,920187,1127888,517073,919317,0,51445
1144,99424,DR,0,31512,0,0,0,0,0,0,...,0,0,0,0,165476,125538,37434,161226,0,51445


  fuel_energy_df = fuel_energy_df.apply(pd.to_numeric, errors='ignore')


NTD ID                                       int64
Mode                                        object
Diesel (gal)                                 int64
Gasoline (gal)                               int64
Liquefied Petroleum Gas (gal equivalent)     int64
Compressed Natural Gas (gal equivalent)      int64
Bio-Diesel (gal)                             int64
Hydrogen (kg)                                int64
Other Fuel (gal/gal equivalent)              int64
Electric Propulsion (kWh)                    int64
Electric Battery (kWh)                       int64
Diesel (miles)                               int64
Gasoline (miles)                             int64
Liquefied Petroleum Gas (miles)              int64
Compressed Natural Gas (miles)               int64
Hydrogen (miles)                             int64
Other Fuel (miles)                           int64
Electric Propulsion (miles)                  int64
Electric Battery (miles)                     int64
All Fuels (miles)              

  service_df = service_df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),Electric Battery (kWh),Diesel (miles),Gasoline (miles),Liquefied Petroleum Gas (miles),Compressed Natural Gas (miles),Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles)
0,1,DR,80768,612884,349270,0,0,0,0,0,0,863077,4473638,0,0,0,1780680,0,0,7117395
1,1,FB,19736,0,0,0,181039,0,0,0,0,50395,0,0,0,0,0,0,0,50395
2,1,MB,7469540,83175,0,0,434148,0,0,0,80444,36789560,812167,0,0,0,0,0,4156,37605883
3,1,SR,0,0,0,0,0,0,0,2120085,0,0,0,0,0,0,0,193919,0,193919
4,1,TB,0,0,0,0,0,0,0,15561912,0,0,0,0,0,0,0,2871115,0,2871115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1148,99423,DR,0,14311,0,0,0,0,0,0,0,0,121220,0,0,0,0,0,0,121220
1149,99423,MB,0,0,0,310368,0,0,0,0,0,0,0,0,845139,0,0,0,0,845139
1150,99424,DR,0,34100,0,0,0,0,0,0,0,0,175162,0,0,0,0,0,0,175162
1151,99424,MB,0,0,0,253874,0,0,0,0,0,0,0,0,882993,0,0,0,0,882993


NTD ID                      object
Mode                        object
Passenger Miles Traveled     int64
Unlinked Passenger Trips     int64
Vehicle Miles                int64
Train Miles                  int64
dtype: object


Unnamed: 0,NTD ID,Mode,Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles
1,1,DR,5715608,557048,5910433,0
2,1,FB,984134,286843,49606,0
4,1,MB,175311235,42536203,38623369,0
5,1,SR,992847,829953,193919,193919
6,1,TB,14349255,7976186,2870606,0
...,...,...,...,...,...,...
2003,99423,DR,44095,14524,90625,0
2004,99423,MB,781940,357990,839248,0
2005,99424,DR,86453,29368,170684,0
2006,99424,MB,1759735,882456,771647,0


NTD ID                                       object
Mode                                         object
Diesel (gal)                                  int64
Gasoline (gal)                                int64
Liquefied Petroleum Gas (gal equivalent)      int64
Compressed Natural Gas (gal equivalent)       int64
Bio-Diesel (gal)                              int64
Hydrogen (kg)                                 int64
Other Fuel (gal/gal equivalent)               int64
Electric Propulsion (kWh)                     int64
Electric Battery (kWh)                        int64
Diesel (miles)                                int64
Gasoline (miles)                              int64
Liquefied Petroleum Gas (miles)               int64
Compressed Natural Gas (miles)                int64
Hydrogen (miles)                              int64
Other Fuel (miles)                            int64
Electric Propulsion (miles)                   int64
Electric Battery (miles)                      int64
All Fuels (m

Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),...,Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles),Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles,UACE Code
0,1,DR,80768,612884,349270,0,0,0,0,0,...,0,1780680,0,0,7117395,5715608,557048,5910433,0,80389.0
1,1,FB,19736,0,0,0,181039,0,0,0,...,0,0,0,0,50395,984134,286843,49606,0,80389.0
2,1,MB,7469540,83175,0,0,434148,0,0,0,...,0,0,0,4156,37605883,175311235,42536203,38623369,0,80389.0
3,1,SR,0,0,0,0,0,0,0,2120085,...,0,0,193919,0,193919,992847,829953,193919,193919,80389.0
4,1,TB,0,0,0,0,0,0,0,15561912,...,0,0,2871115,0,2871115,14349255,7976186,2870606,0,80389.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1129,99422,VP,0,268493,0,0,0,0,0,0,...,0,0,0,0,4454811,21373631,451172,5026281,0,85087.0
1130,99423,DR,0,14311,0,0,0,0,0,0,...,0,0,0,0,121220,44095,14524,90625,0,51445.0
1131,99423,MB,0,0,0,310368,0,0,0,0,...,0,0,0,0,845139,781940,357990,839248,0,51445.0
1132,99424,DR,0,34100,0,0,0,0,0,0,...,0,0,0,0,175162,86453,29368,170684,0,51445.0


  fuel_energy_df = fuel_energy_df.apply(pd.to_numeric, errors='ignore')


NTD ID                                       int64
Mode                                        object
Diesel (gal)                                 int64
Gasoline (gal)                               int64
Liquefied Petroleum Gas (gal equivalent)     int64
Compressed Natural Gas (gal equivalent)      int64
Bio-Diesel (gal)                             int64
Hydrogen (kg)                                int64
Other Fuel (gal/gal equivalent)              int64
Electric Propulsion (kWh)                    int64
Electric Battery (kWh)                       int64
Diesel (miles)                               int64
Gasoline (miles)                             int64
Liquefied Petroleum Gas (miles)              int64
Compressed Natural Gas (miles)               int64
Hydrogen (miles)                             int64
Other Fuel (miles)                           int64
Electric Propulsion (miles)                  int64
Electric Battery (miles)                     int64
All Fuels (miles)              

  service_df = service_df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),Electric Battery (kWh),Diesel (miles),Gasoline (miles),Liquefied Petroleum Gas (miles),Compressed Natural Gas (miles),Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles)
0,1,DR,128946,572171,398139,0,0,0,0,0,0,1256491,3955806,0,0,0,2019221,0,0,7231518
1,1,FB,7047,0,0,0,152414,0,0,0,0,36849,0,0,0,0,0,0,0,36849
2,1,MB,7860130,75268,0,0,455055,0,0,0,499478,36905562,523151,0,0,0,0,0,184987,37613700
3,1,SR,0,0,0,0,0,0,0,1992782,0,0,0,0,0,0,0,152125,0,152125
4,1,TB,0,0,0,0,0,0,0,13543196,0,0,0,0,0,0,0,1986145,0,1986145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1142,99423,DR,0,17360,0,0,0,0,0,0,0,0,146059,0,0,0,0,0,0,146059
1143,99423,MB,0,0,0,288956,0,0,0,0,0,0,0,0,767740,0,0,0,0,767740
1144,99424,DR,0,45980,0,0,0,0,0,0,0,0,198604,0,0,0,0,0,0,198604
1145,99424,MB,0,0,0,239446,0,0,0,0,0,0,0,0,869845,0,0,0,0,869845


NTD ID                      object
Mode                        object
Passenger Miles Traveled     int64
Unlinked Passenger Trips     int64
Vehicle Miles                int64
Train Miles                  int64
dtype: object


Unnamed: 0,NTD ID,Mode,Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles
0,1,DR,5555669,541851,6348763,0
1,1,FB,896100,146930,35948,0
2,1,MB,211868860,49257744,39772547,0
3,1,SR,932676,749443,152125,152125
4,1,TB,15704827,8385162,2530256,0
...,...,...,...,...,...,...
1946,99423,DR,177411,33158,128638,0
1947,99423,MB,2278590,1040034,760445,0
1948,99424,DR,235921,68517,237892,0
1949,99424,MB,2439762,1305756,777405,0


NTD ID                                       object
Mode                                         object
Diesel (gal)                                  int64
Gasoline (gal)                                int64
Liquefied Petroleum Gas (gal equivalent)      int64
Compressed Natural Gas (gal equivalent)       int64
Bio-Diesel (gal)                              int64
Hydrogen (kg)                                 int64
Other Fuel (gal/gal equivalent)               int64
Electric Propulsion (kWh)                     int64
Electric Battery (kWh)                        int64
Diesel (miles)                                int64
Gasoline (miles)                              int64
Liquefied Petroleum Gas (miles)               int64
Compressed Natural Gas (miles)                int64
Hydrogen (miles)                              int64
Other Fuel (miles)                            int64
Electric Propulsion (miles)                   int64
Electric Battery (miles)                      int64
All Fuels (m

Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),...,Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles),Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles,UACE Code
0,1,DR,128946,572171,398139,0,0,0,0,0,...,0,2019221,0,0,7231518,5555669,541851,6348763,0,80389.0
1,1,FB,7047,0,0,0,152414,0,0,0,...,0,0,0,0,36849,896100,146930,35948,0,80389.0
2,1,MB,7860130,75268,0,0,455055,0,0,0,...,0,0,0,184987,37613700,211868860,49257744,39772547,0,80389.0
3,1,SR,0,0,0,0,0,0,0,1992782,...,0,0,152125,0,152125,932676,749443,152125,152125,80389.0
4,1,TB,0,0,0,0,0,0,0,13543196,...,0,0,1986145,0,1986145,15704827,8385162,2530256,0,80389.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1139,99422,VP,0,140183,0,0,0,0,0,0,...,0,0,0,0,891458,11641897,295489,2175922,0,85087.0
1140,99423,DR,0,17360,0,0,0,0,0,0,...,0,0,0,0,146059,177411,33158,128638,0,51445.0
1141,99423,MB,0,0,0,288956,0,0,0,0,...,0,0,0,0,767740,2278590,1040034,760445,0,51445.0
1142,99424,DR,0,45980,0,0,0,0,0,0,...,0,0,0,0,198604,235921,68517,237892,0,51445.0


  fuel_energy_df = fuel_energy_df.apply(pd.to_numeric, errors='ignore')


NTD ID                                      float64
Mode                                         object
Diesel (gal)                                float64
Gasoline (gal)                              float64
Liquefied Petroleum Gas (gal equivalent)    float64
Compressed Natural Gas (gal equivalent)     float64
Bio-Diesel (gal)                            float64
Hydrogen (kg)                                 int64
Other Fuel (gal/gal equivalent)             float64
Electric Propulsion (kWh)                   float64
Electric Battery (kWh)                      float64
Diesel (miles)                              float64
Gasoline (miles)                            float64
Liquefied Petroleum Gas (miles)             float64
Compressed Natural Gas (miles)              float64
Hydrogen (miles)                              int64
Other Fuel (miles)                          float64
Electric Propulsion (miles)                 float64
Electric Battery (miles)                    float64
All Fuels (m

  service_df = service_df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),Electric Battery (kWh),Diesel (miles),Gasoline (miles),Liquefied Petroleum Gas (miles),Compressed Natural Gas (miles),Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles)
0,1.0,DR,348542.0,597679.0,443747.0,0.0,0.0,0,0.0,0.0,0.0,3438651.0,4595440.0,0.0,0.0,0,2320187.0,0.0,0.0,10354278.0
1,1.0,FB,40878.0,0.0,0.0,0.0,203058.0,0,0.0,0.0,0.0,51028.0,0.0,0.0,0.0,0,0.0,0.0,0.0,51028.0
2,1.0,MB,10226121.0,126250.0,0.0,0.0,524839.0,0,0.0,0.0,542519.0,43641566.0,1133451.0,0.0,0.0,0,0.0,0.0,290053.0,45065070.0
3,1.0,SR,0.0,0.0,0.0,0.0,0.0,0,0.0,2241000.0,0.0,0.0,0.0,0.0,0.0,0,0.0,209216.0,0.0,209216.0
4,1.0,TB,0.0,0.0,0.0,0.0,0.0,0,0.0,15300827.0,0.0,0.0,0.0,0.0,0.0,0,0.0,2777387.0,0.0,2777387.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,99423.0,DR,0.0,21425.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,187297.0,0.0,0.0,0,0.0,0.0,0.0,187297.0
1125,99423.0,MB,0.0,0.0,0.0,295187.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,773956.0,0,0.0,0.0,0.0,773956.0
1126,99424.0,DR,0.0,32147.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,288134.0,0.0,0.0,0,0.0,0.0,0.0,288134.0
1127,99424.0,MB,0.0,0.0,0.0,262257.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,866447.0,0,0.0,0.0,0.0,866447.0


NTD ID                      object
Mode                        object
Passenger Miles Traveled     int64
Unlinked Passenger Trips     int64
Vehicle Miles                int64
Train Miles                  int64
dtype: object


Unnamed: 0,NTD ID,Mode,Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles
0,1,DR,8887848,887915,10111164,0
1,1,DT,2398907,177791,0,0
2,1,FB,3464136,701608,52362,0
3,1,MB,476447911,104362252,46387111,0
4,1,SR,2027877,1863409,209216,209216
...,...,...,...,...,...,...
1990,99423,MB,3096640,1411254,760585,0
1991,99424,DR,338657,90865,281613,0
1992,99424,MB,2611781,1489376,771289,0
1993,99425,DR,412863,82553,303217,0


NTD ID                                       object
Mode                                         object
Diesel (gal)                                float64
Gasoline (gal)                              float64
Liquefied Petroleum Gas (gal equivalent)    float64
Compressed Natural Gas (gal equivalent)     float64
Bio-Diesel (gal)                            float64
Hydrogen (kg)                                 int64
Other Fuel (gal/gal equivalent)             float64
Electric Propulsion (kWh)                   float64
Electric Battery (kWh)                      float64
Diesel (miles)                              float64
Gasoline (miles)                            float64
Liquefied Petroleum Gas (miles)             float64
Compressed Natural Gas (miles)              float64
Hydrogen (miles)                              int64
Other Fuel (miles)                          float64
Electric Propulsion (miles)                 float64
Electric Battery (miles)                    float64
All Fuels (m

Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),...,Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles),Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles,UACE Code
0,1.0,DR,348542.0,597679.0,443747.0,0.0,0.0,0,0.0,0.0,...,0,2320187.0,0.0,0.0,10354278.0,8887848,887915,10111164,0,80389.0
1,1.0,FB,40878.0,0.0,0.0,0.0,203058.0,0,0.0,0.0,...,0,0.0,0.0,0.0,51028.0,3464136,701608,52362,0,80389.0
2,1.0,MB,10226121.0,126250.0,0.0,0.0,524839.0,0,0.0,0.0,...,0,0.0,0.0,290053.0,45065070.0,476447911,104362252,46387111,0,80389.0
3,1.0,SR,0.0,0.0,0.0,0.0,0.0,0,0.0,2241000.0,...,0,0.0,209216.0,0.0,209216.0,2027877,1863409,209216,209216,80389.0
4,1.0,TB,0.0,0.0,0.0,0.0,0.0,0,0.0,15300827.0,...,0,0.0,2777387.0,0.0,2777387.0,32207119,17373451,2777247,0,80389.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121,99422.0,VP,0.0,109427.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,1660042.0,7614197,205308,1482122,0,85087.0
1122,99423.0,DR,0.0,21425.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,187297.0,260602,46574,174336,0,51445.0
1123,99423.0,MB,0.0,0.0,0.0,295187.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,773956.0,3096640,1411254,760585,0,51445.0
1124,99424.0,DR,0.0,32147.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,288134.0,338657,90865,281613,0,51445.0


  fuel_energy_df = fuel_energy_df.apply(pd.to_numeric, errors='ignore')


NTD ID                                      float64
Mode                                         object
Diesel (gal)                                float64
Gasoline (gal)                              float64
Liquefied Petroleum Gas (gal equivalent)    float64
Compressed Natural Gas (gal equivalent)     float64
Bio-Diesel (gal)                            float64
Hydrogen (kg)                                 int64
Other Fuel (gal/gal equivalent)             float64
Electric Propulsion (kWh)                   float64
Electric Battery (kWh)                      float64
Diesel (miles)                              float64
Gasoline (miles)                            float64
Liquefied Petroleum Gas (miles)             float64
Compressed Natural Gas (miles)              float64
Hydrogen (miles)                              int64
Other Fuel (miles)                          float64
Electric Propulsion (miles)                 float64
Electric Battery (miles)                    float64
All Fuels (m

  service_df = service_df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),Electric Battery (kWh),Diesel (miles),Gasoline (miles),Liquefied Petroleum Gas (miles),Compressed Natural Gas (miles),Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles)
0,1.0,DR,394253.0,477232.0,345727.0,0.0,0.0,0,0.0,0.0,0.0,4106703.0,4159439.0,1773387.0,0.0,0,0.0,0.0,0.0,10039529.0
1,1.0,FB,23496.0,0.0,0.0,0.0,207457.0,0,0.0,0.0,0.0,53398.0,0.0,0.0,0.0,0,0.0,0.0,0.0,53398.0
2,1.0,MB,10123152.0,117340.0,0.0,0.0,0.0,0,0.0,0.0,395528.0,43087104.0,1082744.0,0.0,0.0,0,0.0,0.0,152729.0,44322577.0
3,1.0,SR,0.0,0.0,0.0,0.0,0.0,0,0.0,2449153.0,0.0,0.0,0.0,0.0,0.0,0,0.0,211570.0,0.0,211570.0
4,1.0,TB,0.0,0.0,0.0,0.0,0.0,0,0.0,20181683.0,0.0,0.0,0.0,0.0,0.0,0,0.0,3271603.0,0.0,3271603.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1136,99423.0,DR,0.0,24085.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,163185.0,0.0,0.0,0,0.0,0.0,0.0,163185.0
1137,99423.0,MB,0.0,0.0,0.0,301187.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,793336.0,0,0.0,0.0,0.0,793336.0
1138,99424.0,DR,0.0,31701.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,268560.0,0.0,0.0,0,0.0,0.0,0.0,268560.0
1139,99424.0,MB,0.0,0.0,0.0,31701.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,776835.0,0,0.0,0.0,0.0,776835.0


NTD ID                      object
Mode                        object
Passenger Miles Traveled     int64
Unlinked Passenger Trips     int64
Vehicle Miles                int64
Train Miles                  int64
dtype: object


Unnamed: 0,NTD ID,Mode,Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles
0,1,DR,8840480,883312,9876720,0
1,1,DT,1969214,143747,0,0
2,1,FB,3323914,664365,49706,0
3,1,MB,505978822,104261625,44809136,0
4,1,SR,1856308,1685668,211570,211570
...,...,...,...,...,...,...
2005,99423,MB,3300627,1504383,761251,0
2006,99424,DR,302794,80378,263560,0
2007,99424,MB,2749102,1542871,708993,0
2008,99425,DR,439946,89074,310634,0


NTD ID                                       object
Mode                                         object
Diesel (gal)                                float64
Gasoline (gal)                              float64
Liquefied Petroleum Gas (gal equivalent)    float64
Compressed Natural Gas (gal equivalent)     float64
Bio-Diesel (gal)                            float64
Hydrogen (kg)                                 int64
Other Fuel (gal/gal equivalent)             float64
Electric Propulsion (kWh)                   float64
Electric Battery (kWh)                      float64
Diesel (miles)                              float64
Gasoline (miles)                            float64
Liquefied Petroleum Gas (miles)             float64
Compressed Natural Gas (miles)              float64
Hydrogen (miles)                              int64
Other Fuel (miles)                          float64
Electric Propulsion (miles)                 float64
Electric Battery (miles)                    float64
All Fuels (m

Unnamed: 0,NTD ID,Mode,Diesel (gal),Gasoline (gal),Liquefied Petroleum Gas (gal equivalent),Compressed Natural Gas (gal equivalent),Bio-Diesel (gal),Hydrogen (kg),Other Fuel (gal/gal equivalent),Electric Propulsion (kWh),...,Hydrogen (miles),Other Fuel (miles),Electric Propulsion (miles),Electric Battery (miles),All Fuels (miles),Passenger Miles Traveled,Unlinked Passenger Trips,Vehicle Miles,Train Miles,UACE Code
0,1.0,DR,394253.0,477232.0,345727.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,10039529.0,8840480,883312,9876720,0,80389.0
1,1.0,FB,23496.0,0.0,0.0,0.0,207457.0,0,0.0,0.0,...,0,0.0,0.0,0.0,53398.0,3323914,664365,49706,0,80389.0
2,1.0,MB,10123152.0,117340.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,152729.0,44322577.0,505978822,104261625,44809136,0,80389.0
3,1.0,SR,0.0,0.0,0.0,0.0,0.0,0,0.0,2449153.0,...,0,0.0,211570.0,0.0,211570.0,1856308,1685668,211570,211570,80389.0
4,1.0,TB,0.0,0.0,0.0,0.0,0.0,0,0.0,20181683.0,...,0,0.0,3271603.0,0.0,3271603.0,34367474,17950742,3271438,0,80389.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1134,99422.0,VP,0.0,38574.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,766924.0,3839997,84645,699350,0,85087.0
1135,99423.0,DR,0.0,24085.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,163185.0,253741,45144,183626,0,51445.0
1136,99423.0,MB,0.0,0.0,0.0,301187.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,793336.0,3300627,1504383,761251,0,51445.0
1137,99424.0,DR,0.0,31701.0,0.0,0.0,0.0,0,0.0,0.0,...,0,0.0,0.0,0.0,268560.0,302794,80378,263560,0,51445.0


### Unify the NTD Service (PMT) Data

Now that the NTD Fuel and Energy data is unified into a JSON file, it is time to do the same for the Service (PMT) data. We trim the data quite a bit and store only the values important to us, because years 2018-2022 take up 50 MB when combined. The fields we will be saving are:

`["Total Vehicle Miles", "Vehicle Revenue Miles", "Vehicle Deadhead Miles", "Total Train Miles", "Train Revenue Miles", "PMT"]`

This step is important because in order to properly assess the amount of carbon emitted when a person uses public transit, we need to know how many people were using that service at the same time. It is not reasonable for us to ask users to report how many people were on at the same time as them, so the next best thing is an estimate. We can do this by taking the total passenger miles traveled (PMT) for each agency and then dividing that by the total miles of service the agency provided to get an average of how many passengers per mile they served. 

In [None]:
# Unified service (PMT) data; one entry per processed url entry, keyed by its "year" value
ntd_service_pmt_data = {}

def refactor_service_pmt_data(url_entry):
    '''
    Load the service data for one url entry, trim and regroup it, and store the result
    in ntd_service_pmt_data under the entry's year.

    url_entry: dictionary describing one dataset; only its "year" value is read here, the
               whole entry is handed to load_dataframe()
    '''
    year = url_entry["year"]
    # Load in dataframe (second argument is presumably the sheet name - confirm against load_dataframe)
    df = load_dataframe(url_entry, "Annual Service Data By Mode")
    # Get rid of all NaN values
    df = df.fillna(0)
    # Convert all array rows into dictionaries
    converted_rows = convert_and_filter_rows(year, df)
    # Group all of the rows by NTD ID, and then mode that they pertain to
    aggregate_data = group_by_ntd_mode(converted_rows)
    # Add to overall data
    ntd_service_pmt_data[year] = aggregate_data
    # f-string so the message works whether the year is stored as a str or as an int
    print(f"Service data year {year} is finished!")

def convert_and_filter_rows(year, df):
    '''
    Turn every row of the dataframe into a dictionary that only holds the service fields we keep,
    stored under the unified key names shared by all years.

    year: reporting year of the dataframe (anything int() accepts); years before 2022 use the older headers
    df: dataframe holding the service data of that year
    Returns a list with one dictionary per row of df.
    '''
    # Unified key names, listed in the same order as both header lists below
    unified_keys = ["NTD ID","Mode","Total Vehicle Miles","Vehicle Revenue Miles","Vehicle Deadhead Miles","Total Train Miles","Train Revenue Miles","PMT"]
    # Column headers of the datasets from 2022 onwards
    headers_2022 = ["NTD ID","Mode","Actual Vehicle/Passenger Car Miles","Actual Vehicle/Passenger Car Revenue Miles","Actual Vehicle/ Passenger Deadhead Miles","Train Miles","Train Revenue Miles","Passenger Miles Traveled"]
    # Column headers of the datasets before 2022
    headers_pre_2022 = ["NTD ID","Mode","Vehicle Miles","Vehicle Revenue Miles","Deadhead Miles","Train Miles","Train \nRevenue\nMiles","Passenger Miles"]

    # Pick the header set for this year and pair each header with its unified key
    source_headers = headers_pre_2022 if int(year) < 2022 else headers_2022
    rename = dict(zip(source_headers, unified_keys))

    rows = []
    for _, series in df.iterrows():
        record = {}
        for column, value in series.to_dict().items():
            # Columns that are not in the header list are dropped
            if column in rename:
                record[rename[column]] = value
        rows.append(record)

    return rows

def group_by_ntd_mode(converted_rows):
    '''
    Bucket the rows first by their NTD id and then by their mode. Every mode maps to a list of rows,
    since the same mode can show up more than once under a single NTD id.
    Example Structure:
    {
        "ntd_id": {
            "mode": [
                {
                    "field1": 1,
                    "field2": 2,
                    ...
                },
                {
                    "field1": 1,
                    "field2": 2,
                    ...
                }
            ]
        }
    }
    '''
    # Two-level defaultdict: missing ids and modes are created on first access
    grouped = defaultdict(lambda: defaultdict(list))
    for record in converted_rows:
        grouped[record["NTD ID"]][record["Mode"]].append(record)
    return grouped

# Process every configured service dataset
for url_entry in ntd_service_pmt_urls:
    refactor_service_pmt_data(url_entry)

# Add all the data to a file
# The data is serialized before the file is opened, so a serialization error cannot leave a truncated file behind
ntd_service_pmt_path = "ntd_service_pmt.json"
ntd_service_pmt_data_json = json.dumps(ntd_service_pmt_data, indent=2)
with open(ntd_service_pmt_path, 'w') as file:
    file.write(ntd_service_pmt_data_json)
# Report the path that was actually written
print(f"NTD Service (PMT) dataset can now be found at {ntd_service_pmt_path}")

# Delete all old files
delete_old_files("service-pmt", ntd_service_pmt_urls)

### Carbon calculations

In [None]:
# Temporary constants

# --- CO2 emitted per reported unit of each fuel ---
# https://www.epa.gov/energy/greenhouse-gases-equivalencies-calculator-calculations-and-references
KG_CO2_PER_GALLON_GASOLINE = 8.89
KG_CO2_PER_GALLON_DIESEL = 10.18
# https://afdc.energy.gov/fuels/biodiesel-benefits
KG_CO2_PER_GALLON_BIODIESEL = KG_CO2_PER_GALLON_DIESEL * 0.26
# https://www.eia.gov/environment/emissions/co2_vol_mass.php
KG_CO2_PER_GALLON_LPG = 5.75
# https://www.ctc-n.org/technology-library/vehicle-and-fuel-technologies/compressed-natural-gas-cng-fuel
KG_CO2_PER_GALLON_CNG = KG_CO2_PER_GALLON_GASOLINE * 1.22
KG_CO2_PER_KG_HYDROGEN = 0
# Figure out way to integrate this with eGrid work
KG_CO2_PER_KWH_ELECTRICITY = 0.5

# --- Energy content per reported unit of each fuel ---
# from energy.gov
# NOTE(review): DIESEL_GGE is not referenced anywhere else in this cell - confirm whether it is still needed
DIESEL_GGE = 1.136
# from the EPA, used as the basis for MPGe
KWH_PER_GALLON_GASOLINE = 33.7
KWH_PER_GALLON_DIESEL = KWH_PER_GALLON_GASOLINE * 1.14
# GGE constants found from https://epact.energy.gov/fuel-conversion-factors
KWH_PER_GALLON_BIODIESEL = KWH_PER_GALLON_GASOLINE * 1.05
KWH_PER_GALLON_LPG = KWH_PER_GALLON_GASOLINE * 0.74
KWH_PER_GALLON_CNG = KWH_PER_GALLON_GASOLINE * 0.26
KWH_PER_KG_HYDROGEN = KWH_PER_GALLON_GASOLINE * 1.0

# Import data (both JSON files must already exist in the working directory)
with open('ntd_fuel_energy.json', 'r') as file:
    fuel_energy_data = json.load(file)

with open("ntd_service_pmt.json", 'r') as file:
    service_pmt_data = json.load(file)

# Per-fuel conversion factors, keyed by the fuel name used in the fuel/energy field names.
# Electricity is already reported in kWh, so its energy factor is 1; "Other Fuel" reuses the gasoline factors.
factors = {
    fuel_name: {"kWh_per_unit": kwh_per_unit, "kg_CO2_per_unit": kg_co2_per_unit}
    for fuel_name, kwh_per_unit, kg_co2_per_unit in (
        ("Gasoline", KWH_PER_GALLON_GASOLINE, KG_CO2_PER_GALLON_GASOLINE),
        ("Diesel", KWH_PER_GALLON_DIESEL, KG_CO2_PER_GALLON_DIESEL),
        ("Bio-Diesel", KWH_PER_GALLON_BIODIESEL, KG_CO2_PER_GALLON_BIODIESEL),
        ("Liquefied Petroleum Gas", KWH_PER_GALLON_LPG, KG_CO2_PER_GALLON_LPG),
        ("Compressed Natural Gas", KWH_PER_GALLON_CNG, KG_CO2_PER_GALLON_CNG),
        ("Hydrogen", KWH_PER_KG_HYDROGEN, KG_CO2_PER_KG_HYDROGEN),
        ("Electric Propulsion", 1, KG_CO2_PER_KWH_ELECTRICITY),
        ("Electric Battery", 1, KG_CO2_PER_KWH_ELECTRICITY),
        ("Other Fuel", KWH_PER_GALLON_GASOLINE, KG_CO2_PER_GALLON_GASOLINE),
    )
}

# Mode codes grouped by vehicle type; the empty-string group holds no codes (presumably a placeholder)
mode_conversion = {
    "Bus": ["CB", "MB", "RB", "TB"],
    "Train": ["LR", "CC", "SR", "TR", "CR", "HR", "MG", "YR"],
    "": [],
}

def get_ntd_ids_by_uace(code, year):
    '''
    Collect the NTD ids of every Fuel + Energy row stored under the given UACE code and year.
    The Service (PMT) data is keyed by NTD id only, so these ids are what links the two datasets.
    Returns the ids as a set of strings.
    '''
    return {str(entry["NTD ID"]) for entry in fuel_energy_data[year][code]}

def average_passengers(code, modes, year):
    '''
    Calculate the average number of passengers using public transit given the constraints of UACE code, modes, and year.
    To do this we do the following steps:

    1) Gather all NTD ids in a given UACE code.
       - This is necessary as the Service/PMT data uses NTD ids instead of UACE codes, so we must convert our UACE code into its corresponding NTD ids that make it up.
    2) Search through each NTD id (aka agency) in our UACE code and see if it has data on our modes we are searching for.
    3) If the agency has information on the modes we are looking for, add it to our aggregate_modes array.
    4) Sum up all the instances of "Vehicle Revenue Miles" + "Train Revenue Miles" and of "PMT" in all the mode data we found.

    Returns a tuple (average, aggregate_modes). aggregate_modes is a list holding, per matching agency and mode,
    the list of service rows found. The average is 0.0 when the matching rows add up to zero revenue miles
    (including when nothing matched), instead of raising ZeroDivisionError.
    '''
    km_per_mile = 1.60934

    def to_number(value):
        # Most values are strings such as "1,234", but cells that were NaN were replaced with a numeric 0
        # (df.fillna(0)) before the JSON was written, so numbers have to pass through untouched.
        if isinstance(value, str):
            return int(value.replace(",", ""))
        return value

    # Find all NTD id's by UACE code
    ntd_ids = get_ntd_ids_by_uace(code, year)
    # Goal is to collect all data about each mode
    aggregate_modes = []
    # Search through each ntd_id in a given UACE code
    for ntd_id in ntd_ids:
        # Get the agency based on the ntd_id, if it doesn't exist we can skip
        agency = service_pmt_data[year].get(ntd_id)
        if agency is None:
            continue
        # Given an agency with an ntd_id in our UACE code, search through the agency's modes for the ones we are looking for
        for mode in agency:
            # If we find a mode within the agency we are looking for, add to our aggregate data.
            if mode in modes:
                aggregate_modes.append(agency[mode])

    # Flatten once so both sums walk the same rows
    matching_rows = [row for mode_rows in aggregate_modes for row in mode_rows]
    # Sum up all the revenue miles
    total_miles = sum(
        to_number(row["Vehicle Revenue Miles"]) + to_number(row["Train Revenue Miles"])
        for row in matching_rows
    )
    # Sum up all the PMTs
    total_pmt = sum(to_number(row["PMT"]) for row in matching_rows)

    # Convert to km
    total_kms = total_miles * km_per_mile
    total_pkt = total_pmt * km_per_mile
    if total_kms == 0:
        # No service distance to average over
        return (0.0, aggregate_modes)
    avg = float(total_pkt) / float(total_kms)
    return (avg, aggregate_modes)

def aggregate_fuel_data(code, modes, year, fields, get_factor):
    '''
    Walk the fuel/energy entries of one year and area code, keep the ones whose mode is in modes,
    and total the requested fields.

    fields: names of the entry fields to read; their values are expected to be integer strings that may
            contain thousands separators (e.g. "1,234")
    get_factor: called with the fuel name (the field name up to " (") and returns the constant
                each value of that field is multiplied by before it is added to the totals

    Returns a tuple (totals, aggregate_data):
      - totals maps each fuel name to the sum of value * factor
      - aggregate_data holds one trimmed dictionary per kept entry with "NTD ID", "Mode", every requested
        field as an int, and "Total" (the unweighted sum of those fields)
    '''
    totals = defaultdict(int)
    trimmed_entries = []
    for record in fuel_energy_data[year][code]:
        # Skip the modes we were not asked about
        if record["Mode"] not in modes:
            continue
        trimmed = {"NTD ID": record["NTD ID"], "Mode": record["Mode"]}
        running_total = 0
        for column in fields:
            amount = int(record[column].replace(",", ""))
            # "Diesel (gal)" -> "Diesel"
            fuel_name = column.split(" (")[0]
            trimmed[column] = amount
            running_total += amount
            totals[fuel_name] += amount * get_factor(fuel_name)
        trimmed["Total"] = running_total
        trimmed_entries.append(trimmed)

    return (totals, trimmed_entries)

def aggregate_total_whs(code, modes, year):
    '''
    Totals the energy used per fuel in a given year, area code, and set of modes.
    Each amount is multiplied by the fuel's kWh_per_unit factor times 1000, so the totals are in Wh.
    Returns the (totals, trimmed entries) tuple produced by aggregate_fuel_data().
    '''
    energy_fields = [
        "Diesel (gal)",
        "Gasoline (gal)",
        "Liquefied Petroleum Gas (gal equivalent)",
        "Compressed Natural Gas (gal equivalent)",
        "Other Fuel (gal/gal equivalent)",
        "Electric Propulsion (kWh)",
        "Electric Battery (kWh)",
    ]
    # The hydrogen field is only read for 2022 and later
    if int(year) >= 2022:
        energy_fields = energy_fields + ["Hydrogen (kg)"]

    def wh_per_unit(fuel_name):
        return factors[fuel_name]["kWh_per_unit"] * 1000

    total_whs, aggregate_gallons_data = aggregate_fuel_data(code, modes, year, energy_fields, wh_per_unit)
    return (total_whs, aggregate_gallons_data)

def aggregate_total_kms(code, modes, year):
    '''
    Totals the distance driven per fuel in a given year, area code, and set of modes.
    Every "(miles)" amount is multiplied by 1.60934, so the totals are in kilometers.
    Returns the (totals, trimmed entries) tuple produced by aggregate_fuel_data().
    '''
    distance_fields = [
        "Diesel (miles)",
        "Gasoline (miles)",
        "Liquefied Petroleum Gas (miles)",
        "Compressed Natural Gas (miles)",
        "Other Fuel (miles)",
        "Electric Propulsion (miles)",
        "Electric Battery (miles)",
    ]
    # The hydrogen field is only read for 2022 and later
    if int(year) >= 2022:
        distance_fields = distance_fields + ["Hydrogen (miles)"]

    def km_per_mile(_fuel_name):
        # Same factor for every fuel
        return 1.60934

    total_kms, aggregate_miles_data = aggregate_fuel_data(code, modes, year, distance_fields, km_per_mile)
    return (total_kms, aggregate_miles_data)

def calculate_weights(aggregate_gallons_data, aggregate_modes):
    '''
    Calculate the weights of each fuel type. The calculations of how to get it are described in this github issue comment:
    https://github.com/JGreenlee/e-mission-common/pull/2#issuecomment-2252070684

    For every (agency, mode) fuel record, each fuel's share of the record's "Total" is multiplied by that
    agency/mode's share of all passenger kilometers traveled (PKT); the products are summed per fuel name.

    aggregate_gallons_data: list of trimmed fuel records with "NTD ID", "Mode", one key per fuel field and "Total"
    aggregate_modes: list of lists of service rows, each row holding "NTD ID", "Mode" and "PMT"

    Returns a defaultdict mapping fuel name to weight. A fuel record whose agency/mode has no service rows
    gets a weight of 0, and every weight is 0 when the total PKT is 0.
    '''
    km_per_mile = 1.60934
    # Keys of a fuel record that are bookkeeping rather than fuel amounts
    non_fuel_keys = ("NTD ID", "Mode", "Total")

    def to_number(value):
        # PMT is usually a string such as "1,234", but may already be numeric (NaN cells were filled with 0)
        if isinstance(value, str):
            return int(value.replace(",", ""))
        return value

    # Passenger kilometers per (agency, mode). Ids are compared as strings, the same way
    # get_ntd_ids_by_uace() matches the two datasets.
    pkt_by_service = defaultdict(float)
    for mode_rows in aggregate_modes:
        for row in mode_rows:
            service_key = (str(row["NTD ID"]), row["Mode"])
            pkt_by_service[service_key] += to_number(row["PMT"]) * km_per_mile
    total_passenger_km = sum(pkt_by_service.values())

    # One fuel record per (agency, mode); a later record for the same pair replaces the earlier one
    fuel_by_service = {
        (str(record["NTD ID"]), record["Mode"]): record
        for record in aggregate_gallons_data
    }

    fuel_type_weights = defaultdict(int)
    for service_key, record in fuel_by_service.items():
        # Weight of this agency & mode by its share of the passenger kilometers traveled
        if total_passenger_km:
            weight_by_pkt = pkt_by_service.get(service_key, 0) / total_passenger_km
        else:
            weight_by_pkt = 0
        total_amount = record["Total"]
        for field, amount in record.items():
            if field in non_fuel_keys:
                continue
            # "Diesel (gal)" -> "Diesel"
            fuel_name = field.split(" (")[0]
            if amount != 0 and total_amount != 0:
                # Share of this fuel within the record, scaled by the record's PKT weight
                fuel_type_weights[fuel_name] += (amount / total_amount) * weight_by_pkt
            else:
                # Still register the fuel so unused fuels show up with a weight of 0
                fuel_type_weights[fuel_name] += 0

    return fuel_type_weights

def calculate(trip, modes):
    '''
    Print the per-passenger energy intensity and the weight of every fuel for one trip.

    trip: dictionary with the "year" and "code" (UACE code) to look up; other keys are not read
    modes: list of mode codes to include

    Prints the average number of passengers, followed by a JSON object mapping each fuel name to
    {"wh_per_km": Wh per passenger kilometer, "weight": fuel weight}. "wh_per_km" is reported as 0
    when the fuel has no kilometers or when the average number of passengers is 0.
    '''
    year = trip["year"]
    code = trip["code"]

    # Only the totals are needed from the distance aggregation
    (total_kms, _) = aggregate_total_kms(code, modes, year)
    (total_whs, aggregate_gallons_data) = aggregate_total_whs(code, modes, year)
    (average_number_passengers, aggregate_modes) = average_passengers(code, modes, year)

    weights = calculate_weights(aggregate_gallons_data, aggregate_modes)

    data = {}
    for fuel in total_kms:
        wh_per_km_passenger = 0
        # Guard both divisions: a fuel may have no kilometers, and the average passenger count may be 0
        if total_kms[fuel] != 0 and average_number_passengers != 0:
            wh_per_km = total_whs[fuel] / total_kms[fuel]
            wh_per_km_passenger = wh_per_km / average_number_passengers

        data[fuel] = {
            "wh_per_km": wh_per_km_passenger,
            "weight": weights.get(fuel, 0)
        }

    print(average_number_passengers)

    print(json.dumps(data, indent=4))



# Sample trip used to try out calculate(); calculate() only reads "year" and "code"
fake_trip = dict(year="2022", distance=1000, code="9271")
# Mode codes to include in the calculation
modes = ["LR", "HR"]

calculate(fake_trip, modes)

### Running data analysis

In [None]:
# Didn't add Bio-Diesel b/c there is no Bio-Diesel (miles) field

# Some data is weird, 20169 in fuel_energy has 0 diesel gallons, but 1.2 million diesel miles

# How to do error handling