# Begin with Credentials and Connection to Trino

In [None]:
import os
import pathlib
from dotenv import load_dotenv

# Pull credentials from `credentials.env` when that file is present.  The directory
# searched is CREDENTIAL_DOTENV_DIR if set, else PWD, else /opt/app-root/src.
# A missing file is skipped silently so the variables can be supplied some other way.
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR')
if dotenv_dir is None:
    dotenv_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')
if os.path.exists(dotenv_path):
    # override=True: values from the file replace variables already in the environment
    load_dotenv(override=True, dotenv_path=dotenv_path)

Set the default catalog (`osc_datacommons_dev`) in the connection arguments so that queries can omit the catalog prefix and stay much more compact

In [None]:
import trino
from sqlalchemy.engine import create_engine

# Every connection setting is read from an environment variable named <prefix>_<FIELD>.
env_var_prefix = 'TRINO'

# SQLAlchemy URL; the fields are looked up in the order user, host, port.
sqlstring = 'trino://{user}@{host}:{port}/'.format(**{
    field: os.environ[f'{env_var_prefix}_{field.upper()}']
    for field in ('user', 'host', 'port')
})
# The JWT token is taken from <prefix>_PASSWD; queries default to the dev catalog.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ[f'{env_var_prefix}_PASSWD']),
    http_scheme='https',
    catalog='osc_datacommons_dev',
)
engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

import pandas as pd

In [None]:
from osc_ingest_trino import *

# Bucket handle passed to the drop/ingest helpers and used for the table's external_location
# further down.
# NOTE(review): attach_s3_bucket is not defined in this notebook; it presumably comes from the
# `osc_ingest_trino` star import, and "S3_DEV" presumably names a set of credentials — confirm.
trino_bucket = attach_s3_bucket("S3_DEV")

Leftmost in the diagram are the facilities.  Some facilities are owned by entities with known LEIs, others are not.

In [None]:
# Rows of (count, lei): number of 2020 parent_attribution rows per non-null LEI,
# largest count first.  The counts are summed later as the number of facilities with an LEI.
fac_lei_list = engine.execute("""
select count (*), lei from ghgrp_demo.parent_attribution as PA
where PA.lei is not null and PA.reporting_year=DATE('2020-01-01')
group by lei
order by count (*) desc""").fetchall()

# Rows of (count, parent_company_name): same breakdown for 2020 rows that have no LEI,
# grouped by the parent company's name instead.
fac_nolei_list = engine.execute("""
select count (*), parent_company_name from ghgrp_demo.parent_attribution as PA
where PA.lei is null and PA.reporting_year=DATE('2020-01-01')
group by parent_company_name
order by count (*) desc""").fetchall()

Companies reporting to the SEC DERA database list their primary SIC code.  `sic_nolist_lei_list` contains the SIC codes (with filing counts) reported to SEC DERA by companies whose LEI does not otherwise appear in the EPA GHGRP data.

In [None]:
# Rows of (count, sic): FY2020 10-K submissions per SIC code, restricted to LEIs that file a
# FY2020 10-K but do not appear (with a non-null LEI) in the 2020 GHGRP parent attribution.
# The inner `except` produces that LEI set; the outer join counts the matching submissions.
# Only len() of this result is used when seeding sankey_data.
sic_nolist_lei_list = engine.execute("""
select count (*), S.sic
from sec_dera.sub as S inner join
  (select S2.lei as lei
   from sec_dera.sub as S2
   where S2.fp='FY' and S2.fy=DATE('2020-01-01') and S2.form='10-K'
   except
   select distinct(PA.lei) from ghgrp_demo.parent_attribution as PA where PA.lei IS NOT NULL and reporting_year=DATE('2020-01-01')) as S2_lei 
on S.lei=S2_lei.lei
where S.fp='FY' and S.fy=DATE('2020-01-01') and S.form='10-K'
group by S.sic
order by count (*) desc
""").fetchall()

Facilities in the EPA GHGRP dataset are categorized by a NAICS (not SIC) code.  `naics_nolist_nolei_list` contains the list of NAICS codes that will not be overridden by the SIC code of a publicly reporting company.

In [None]:
# Rows of (count, fnc): NAICS codes of 2020 records with no LEI, minus every NAICS code that
# also appears on a 2020 record whose LEI has a matching FY 10-K in sec_dera.sub.
# `except` already yields distinct codes, so only len() of this result is used downstream.
# NOTE(review): the first branch reads epa_ghgrp.parent_company, whereas every other query in
# this notebook reads ghgrp_demo.parent_attribution — confirm that table (and its `lei`
# column) is the intended source.
naics_nolist_nolei_list = engine.execute("""
select count (*), fnc
from (select PA.facility_naics_code as fnc
      from epa_ghgrp.parent_company as PA
      where PA.reporting_year=DATE('2020-01-01') and PA.lei IS NULL
      except
      select distinct(PA2.facility_naics_code) as fnc
      from ghgrp_demo.parent_attribution as PA2 inner join sec_dera.sub as S
      on PA2.lei=S.lei and PA2.reporting_year=S.fy
      where PA2.reporting_year=DATE('2020-01-01') and S.form='10-K' and S.fp='FY')
group by fnc
order by count (*) desc
""").fetchall()

The initial left-hand side of the Sankey diagram:

In [None]:
# Seed the Sankey link list.  Every entry is [source, target, quantity]:
# totals are sums of the per-group counts, group counts are the number of result rows.
sankey_data = [
    ['SIC not listed', 'SIC', len(sic_nolist_lei_list)],
    ['GHGRP facilities', 'facilities w/ LEI', sum(rec[0] for rec in fac_lei_list)],
    ['facilities w/ LEI', 'GHGRP LEI', len(fac_lei_list)],
    ['GHGRP facilities', 'facilities w/o LEI', sum(rec[0] for rec in fac_nolei_list)],
    ['facilities w/o LEI', 'GHGRP no LEI', len(fac_nolei_list)],
    ['NAICS not listed', 'NAICS', len(naics_nolist_nolei_list)],
]
display(sankey_data)

In [None]:
# List the distinct table_source values present in parent_attribution for reporting year 2020.
engine.execute("select distinct(table_source) from ghgrp_demo.parent_attribution where reporting_year=DATE('2020-01-01')").fetchall()

# Note: PARENT_ATTRIBUTION does not yet have source data from SUPPLIERS, CO2_INJECTION, or
# GEOLOGIC_SEQUESTRATION_OF_CO2, which is why those three tables are commented out of the
# table list iterated when building the Sankey links.

Build the next stages of the Sankey Diagram:
* Facilities (LEI and no LEI) -> Parent Companies (LEI and no LEI)
* Parent Companies -> Emitter Categories (Direct Emitters, etc)
* Emitter Categories -> SIC/NAICS codes
* SIC/NAICS codes -> grouped SIC/NAICS codes

In [None]:
# Codes whose row count falls below these cutoffs are folded into the
# "SIC others" / "NAICS others" nodes instead of getting a node of their own.
# NOTE(review): the reason for 30 vs. 31 is not recorded here — confirm.
sic_others_cutoff = 30
naics_others_cutoff = 31

# SIC range lookup table; get_range_string reads its columns l, r, range and range_desc.
sic_df = pd.read_sql("select * from us_census.sic_ranges", engine)

def get_range_string(df, sic_code=None):
    """Return "<range>: <range_desc>" for the SIC range that contains a SIC code.

    Parameters
    ----------
    df : pandas.DataFrame
        Range table with columns ``l`` and ``r`` (inclusive lower/upper bounds),
        ``range`` and ``range_desc``.
    sic_code : optional
        The code to look up.  When omitted, the notebook-level variable ``sic`` is
        used, so existing ``get_range_string(sic_df)`` calls behave as before.

    Returns
    -------
    The joined string when exactly one range matches.  Otherwise the result is
    whatever ``squeeze()`` leaves of the non-scalar match set (not a string).
    """
    if sic_code is None:
        # Legacy calling convention: the caller assigns the global `sic` first.
        sic_code = sic
    in_range = (df.l <= sic_code) & (df.r >= sic_code)
    # Select from `df` itself: the original indexed the global `sic_df` with a mask
    # built from `df`, which only worked because callers always passed `sic_df`.
    return df.loc[in_range, ['range', 'range_desc']].agg(': '.join, axis=1).squeeze()

# For each GHGRP source table, add the links
#   parent companies -> emitter category -> SIC / NAICS code (-> SIC range),
# once for parents with an LEI (classified by SIC) and once for parents without
# an LEI (classified by facility NAICS code).
for tname in ['direct_emitters', 'onshore_oil_gas_prod', 'gathering_boosting',
             'transmission_pipelines', 'ldc_direct_emissions', 'sf6_from_elec_equip',
             # 'suppliers', 'co2_injection', 'geologic_sequestration_of_co2'
             ]:
    # Number of distinct LEIs that report under this source table.
    lei_2_epa_class_cnt = engine.execute(f"""
select count (*) from
  (select distinct(lei) from ghgrp_demo.parent_attribution as PA
   where PA.lei IS NOT NULL and table_source='{tname}' and reporting_year=DATE('2020-01-01'))
""").fetchall()[0][0]
    sankey_data.append(['GHGRP LEI', f'{tname} LEI', lei_2_epa_class_cnt])
    # (count, sic) per SIC code for this source table, joined to FY 10-K submissions.
    epa_lei_class_2_sic_list = engine.execute(f"""
select count (*), sic
from ghgrp_demo.parent_attribution as PA join sec_dera.sub as S
on PA.lei=S.lei and PA.reporting_year=S.fy
where PA.reporting_year=DATE('2020-01-01') and S.fp='FY' and S.form='10-K' and table_source='{tname}'
group by sic
order by count (*) desc
""").fetchall()
    for row in epa_lei_class_2_sic_list:
        if row[0]==0:
            continue
        # get_range_string reads `sic` as a notebook-level variable, so it must be set first.
        sic = row[1]
        range_string = get_range_string(sic_df)
        if row[0] < sic_others_cutoff:
            # Small SIC codes are routed through the shared "SIC others" node.
            sankey_data.append([f"SIC others", f"SIC RANGE {range_string}", row[0]])
            continue
        sankey_data.append([f'{tname} LEI', f"SIC {sic}", row[0]])
        sankey_data.append([f"SIC {sic}", f"SIC RANGE {range_string}", row[0]])
    sankey_data.append([f'{tname} LEI', f"SIC others", sum([x[0] for x in epa_lei_class_2_sic_list if x[0] < sic_others_cutoff])])

    # Number of distinct parent company names without an LEI under this source table.
    nolei_2_epa_class_cnt = engine.execute(f"""
select count (*) from
  (select distinct(parent_company_name) from ghgrp_demo.parent_attribution as PA
   where PA.lei IS NULL and table_source='{tname}' and reporting_year=DATE('2020-01-01'))
""").fetchall()[0][0]
    # Fix: this link previously reused lei_2_epa_class_cnt, so the "no LEI" flow was sized
    # by the LEI count and the value queried just above was never used.
    sankey_data.append(['GHGRP no LEI', f'{tname} no LEI', nolei_2_epa_class_cnt])
    # (count, facility_naics_code) per NAICS code for no-LEI rows of this source table.
    epa_nolei_class_2_naics_list = engine.execute(f"""
select count (*), facility_naics_code
from ghgrp_demo.parent_attribution as PA
where PA.reporting_year=DATE('2020-01-01') and PA.lei IS NULL and table_source='{tname}'
group by facility_naics_code
order by count (*) desc
""").fetchall()
    for row in epa_nolei_class_2_naics_list:
        if row[0] < naics_others_cutoff:
            continue
        naics = row[1]
        sankey_data.append([f'{tname} no LEI', f"NAICS {naics}", row[0]])
    # Everything below the cutoff is lumped into a single "NAICS others" link.
    sankey_data.append([f'{tname} no LEI', f"NAICS others", sum([x[0] for x in epa_nolei_class_2_naics_list if x[0] < naics_others_cutoff])])

Build the right-hand side of the diagram: SIC groups to Revenues and CO2e (arbitrarily scaled)

In [None]:
# Arbitrary divisors applied to revenue and CO2e before they are used as link quantities.
revenue_scale_factor = 1e3
co2e_scale_factor = 7e5

# One row per (lei, sic): summed fractional emissions and the largest matching
# full-year (qtrs=4) revenue value for 2020, divided by 1,000,000 in SQL.
qres = engine.execute("""
select PA.lei, sic, sum(fractional_emissions) as tot_co2e, round(max(value)/1000000,3) as tot_revenue
from ghgrp_demo.parent_attribution as PA join sec_dera.sub as S on PA.lei=S.lei and PA.reporting_year=S.fy
     join sec_dera.num as N on S.adsh=N.adsh
where PA.reporting_year=DATE('2020-01-01')
and S.fp='FY' and S.form='10-K'
and ddate>=DATE('2020-01-01') and ddate<DATE('2021-01-01')
and coreg is NULL
and (N.tag='Revenues'
     or N.tag='RevenueFromContractWithCustomerIncludingAssessedTax'
     or N.tag='RevenueFromContractWithCustomerExcludingAssessedTax'
     or N.tag='RevenuesNetOfInterestExpense'
     or N.tag='RegulatedAndUnregulatedOperatingRevenue'
     or N.tag='RegulatedOperatingRevenuePipelines')
and N.qtrs=4
group by PA.lei, sic
""").fetchall()

# `sic` is deliberately the loop variable: get_range_string reads it as a notebook-level name.
for _lei, sic, tot_co2e, tot_revenue in qres:
    scaled_co2e = tot_co2e/co2e_scale_factor
    scaled_revenue = tot_revenue/revenue_scale_factor
    range_node = f"SIC RANGE {get_range_string(sic_df)}"
    sankey_data.extend([
        [range_node, "CO2e", scaled_co2e],
        [range_node, "Revenue", scaled_revenue],
    ])

Build the right-hand side of the diagram: NAICS codes to CO2e (arbitrarily scaled).  We don't have revenue information for these emissions.

In [None]:
# One row per facility NAICS code among 2020 rows without an LEI:
# (row count, NAICS code, summed fractional emissions).
qres = engine.execute("""
select count (*), PA.facility_naics_code, sum(PA.fractional_emissions) as tot_co2e
from ghgrp_demo.parent_attribution as PA
where PA.lei IS NULL and PA.reporting_year=DATE('2020-01-01')
group by PA.facility_naics_code
""").fetchall()

for n_rows, naics_code, tot_co2e in qres:
    scaled_co2e = tot_co2e/co2e_scale_factor
    # Codes below the cutoff feed CO2e from the shared "NAICS others" node.
    source_node = "NAICS others" if n_rows < naics_others_cutoff else f"NAICS {naics_code}"
    sankey_data.append([source_node, "CO2e", scaled_co2e])

In [None]:
# Turn the link list into a frame, stamp every row with the reporting year,
# then let pandas choose the best available dtypes for each column.
df = (
    pd.DataFrame(data=sankey_data, columns=('source', 'target', 'qty'))
    .assign(year=pd.to_datetime("2020-01-01"))
    .convert_dtypes()
)

In [None]:
def unmanaged_parquet_tabledef(df, catalog, schema, table, bucket, partition_columns = [], verbose = False):
    """Build the ``create table if not exists`` statement for an external parquet table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns define the table schema (via ``create_table_schema_pairs``,
        which is not defined in this notebook).
    catalog, schema, table : str
        Parts of the fully qualified table name.
    bucket :
        Object exposing ``.name``; the table's external location is
        ``s3a://<bucket.name>/trino/<schema>/<table>/``.
    partition_columns : list
        Column names for ``partitioned_by``; the clause is omitted when the list is empty.
    verbose : bool
        Print the generated statement when true.

    Returns
    -------
    str
        The DDL text.

    Raises
    ------
    ValueError
        If ``df`` is not a DataFrame or ``partition_columns`` is not a list.
    """
    if not isinstance(df, pd.DataFrame):
        raise ValueError("df must be a pandas DataFrame")
    if not isinstance(partition_columns, list):
        raise ValueError("partition_columns must be list of column names")

    columnschema = create_table_schema_pairs(df, typemap={'datetime64[ns]':'timestamp(3)'})

    # Collect the statement fragments in order and join them once at the end.
    pieces = [
        f"create table if not exists {catalog}.{schema}.{table} (\n",
        f"{columnschema}\n",
        ") with (\n    format = 'parquet',\n",
    ]
    if partition_columns:
        # The Python list repr is used directly as the array literal, e.g. array['year']
        pieces.append(f"    partitioned_by = array{partition_columns},\n")
    pieces.append(f"    external_location = 's3a://{bucket.name}/trino/{schema}/{table}/'\n)")
    tabledef = "".join(pieces)

    if verbose:
        print(tabledef)
    return tabledef

In [None]:
# Rebuild ghgrp_demo.epa_sankey from `df`.  The steps below are order-dependent.
# NOTE(review): drop_unmanaged_table, drop_unmanaged_data and ingest_unmanaged_parquet are not
# defined in this notebook (presumably from osc_ingest_trino); the step descriptions are
# inferred from their names and arguments — confirm.

# List the tables in ghgrp_demo.  By default only a cell's last expression is displayed,
# so this fetchall() result is not shown.
qres = engine.execute('show tables in ghgrp_demo')
qres.fetchall()

# Presumably removes any existing table definition ...
drop_unmanaged_table("osc_datacommons_dev", "ghgrp_demo", "epa_sankey", engine, trino_bucket, verbose=True)

# ... and any data previously stored for it in the bucket.
drop_unmanaged_data("ghgrp_demo", "epa_sankey", trino_bucket, verbose=True)

# Presumably writes `df` as parquet to the bucket (no partitioning, not appending).
ingest_unmanaged_parquet(df, "ghgrp_demo", "epa_sankey", trino_bucket, partition_columns=[],
                         append=False, workdir='/tmp', verbose=True)

# Generate and run the `create table if not exists` statement pointing at the bucket location.
sql = unmanaged_parquet_tabledef(df, "osc_datacommons_dev", "ghgrp_demo", "epa_sankey", trino_bucket, partition_columns = [],
                                 verbose=True)
qres = engine.execute(sql)
display(qres.fetchall())

# Read the table back; as the cell's last expression this result is displayed.
qres = engine.execute("select * from ghgrp_demo.epa_sankey")
qres.fetchall()

In [None]:
# Sanity check: total number of Sankey links built.
len(sankey_data)

In [None]:
# Spot check: every link whose target label contains NAICS code 562212.
[x for x in sankey_data if '562212' in x[1]]

In [None]:
# Cross-check against the source table: 2020 direct_emitters rows with NAICS 562212 and no LEI.
engine.execute("select count(*) from ghgrp_demo.parent_attribution PA where reporting_year=DATE('2020-01-01') and facility_naics_code='562212' and lei is null and table_source='direct_emitters'").fetchall()