In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Load some standard environment variables from a dot-env file, if it exists.
# If no such file can be found, does not fail, and so allows these environment vars to
# be populated in some other way
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

Set session variable CATALOG to make query terms much more compact

In [2]:
import trino
from sqlalchemy.engine import create_engine

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ['TRINO_USER'],
    host = os.environ['TRINO_HOST'],
    port = os.environ['TRINO_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

import pandas as pd

The EPA's GHG Reporting Program (GHGRP) seems to be a gold standard in terms of creating a bottoms-up list that's good enough to play a major role in tops-down estimates.

In [3]:
qres = engine.execute(f"show tables in epa_ghgrp")
display(qres.fetchall())

[('co2_injection',),
 ('direct_emitters',),
 ('gathering_boosting',),
 ('geologic_sequestration_of_co2',),
 ('ldc_direct_emissions',),
 ('onshore_oil_gas_prod',),
 ('parent_company',),
 ('sf6_from_elec_equip',),
 ('suppliers',),
 ('transmission_pipelines',)]

Direct Emitters are the lion's share of CO2 emissions.  There are other activities and industries with non-trivial emissions, but nothing compares to this.

In [4]:
qres = engine.execute(f"describe epa_ghgrp.direct_emitters")
display(qres.fetchall())

[('facility_id', 'bigint', '', ''),
 ('frs_id', 'varchar', '', ''),
 ('facility_name', 'varchar', '', ''),
 ('city', 'varchar', '', ''),
 ('state', 'varchar', '', ''),
 ('zip_code', 'varchar', '', ''),
 ('address', 'varchar', '', ''),
 ('county', 'varchar', '', ''),
 ('latitude', 'double', '', ''),
 ('longitude', 'double', '', ''),
 ('primary_naics_code', 'varchar', '', ''),
 ('latest_reported_industry_type_subparts', 'varchar', '', ''),
 ('latest_reported_industry_type_sectors', 'varchar', '', ''),
 ('total_reported_direct_emissions', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

In [5]:
qres = engine.execute(f"select year, sum(total_reported_direct_emissions) from epa_ghgrp.direct_emitters group by year order by year desc")
display(qres.fetchall())

[('2020-01-01 00:00:00.000', 2400335017.1144834),
 ('2019-01-01 00:00:00.000', 2626532098.5932565),
 ('2018-01-01 00:00:00.000', 2779470637.179425),
 ('2017-01-01 00:00:00.000', 2735840754.8973103),
 ('2016-01-01 00:00:00.000', 2805104747.5443416),
 ('2015-01-01 00:00:00.000', 2939443583.3242407),
 ('2014-01-01 00:00:00.000', 3084068793.216346),
 ('2013-01-01 00:00:00.000', 3073213896.2733335),
 ('2012-01-01 00:00:00.000', 3058075792.5988026),
 ('2011-01-01 00:00:00.000', 3207582994.0028186)]

Here's a look at how they stack up (from a Database perspective--we should also look at this in Super Set).

In [6]:
qres = engine.execute(f"""select count (*), latest_reported_industry_type_sectors, sum(total_reported_direct_emissions)
from epa_ghgrp.direct_emitters
where year>=DATE('2019-01-01') and year<DATE('2020-01-01')
group by latest_reported_industry_type_sectors
order by sum(total_reported_direct_emissions) desc
""")
display(qres.fetchall())

[(1338, 'Power Plants', 1593302651.1495945),
 (371, 'Minerals', 112044493.16399996),
 (328, 'Chemicals', 106404264.83600003),
 (1066, 'Other', 101353022.50774214),
 (1187, 'Petroleum and Natural Gas Systems', 101032012.85843231),
 (1204, 'Waste', 100624446.79977302),
 (281, 'Metals', 69494113.83299993),
 (69, 'Petroleum Product Suppliers,Refineries', 65079994.47800001),
 (41, 'Chemicals,Petroleum Product Suppliers,Refineries', 64019038.494),
 (29, 'Chemicals,Suppliers of CO2', 27435824.12000001),
 (84, 'Pulp and Paper,Waste', 24953431.80600001),
 (7, 'Power Plants,Waste', 24875284.23),
 (11, 'Chemicals,Waste', 17522490.718),
 (10, 'Other,Power Plants', 17315702.882),
 (112, 'Other,Waste', 17100924.523999985),
 (3, 'Power Plants,Suppliers of CO2', 14904814.286),
 (17, 'Metals,Waste', 14615880.466000002),
 (4, 'Chemicals,Refineries,Suppliers of CO2', 13389032.404000001),
 (132, 'Pulp and Paper', 13028976.956000002),
 (97, 'Natural Gas and Natural Gas Liquids Suppliers,Petroleum and Natur

This looks at the `Metals` industry.  We see that the top emitters have multiple facility locations.

In [7]:
qres = engine.execute(f"""select count (*), parent_company_name, sum(total_reported_direct_emissions)
from epa_ghgrp.direct_emitters, epa_ghgrp.parent_company
where year>=DATE('2019-01-01') and year<DATE('2020-01-01') and year=reporting_year
      and latest_reported_industry_type_sectors='Minerals'
      and epa_ghgrp.direct_emitters.facility_id=epa_ghgrp.parent_company.ghgrp_facility_id
group by parent_company_name
order by sum(total_reported_direct_emissions) desc
limit 20
""")
display(qres.fetchall())

[(12, 'HOLCIM PARTICIPATIONS (US) INC', 10775100.462),
 (9, 'CEMEX INC', 7837412.011999999),
 (12, 'LEHIGH HANSON INC', 7170874.248),
 (8, 'RC LONESTAR INC', 6670842.868),
 (11, 'LHOIST NORTH AMERICA INC', 6095893.518),
 (11, 'CARMEUSE LIME INC', 4957669.602),
 (9, 'GRAYMONT INC', 4145991.5659999996),
 (8, 'CRH AMERICAS INC', 4038046.516),
 (4, 'MARTIN MARIETTA MATERIALS INC', 3939577.8740000003),
 (4, 'ARGOS USA LLC', 3571057.906),
 (11, 'EAGLE MATERIALS INC', 3460129.4600000004),
 (3, 'MISSISSIPPI LIME CO', 3386972.96),
 (3, 'TAIHEIYO CEMENT USA INC', 3268649.0960000004),
 (5, 'GCC OF AMERICA INC', 2311521.1900000004),
 (2, 'TITAN AMERICA LLC', 2230059.0100000002),
 (1, 'GENESIS ENERGY LP', 1824295.1),
 (1, 'LAFARGEHOLCIM NORTH AMERICA INC', 1792093.716),
 (2, 'NATIONAL CEMENT', 1633243.352),
 (2, 'SUMMIT MATERIALS INC', 1555637.794),
 (1, 'TATA CHEMICALS (SODA ASH) PARTNERS NORTH AMERICA', 1536763.272)]

A look at some other table shapes...

In [8]:
qres = engine.execute(f"describe epa_ghgrp.onshore_oil_gas_prod")
display(qres.fetchall())

[('facility_id', 'bigint', '', ''),
 ('frs_id', 'varchar', '', ''),
 ('facility_name', 'varchar', '', ''),
 ('basin', 'varchar', '', ''),
 ('reported_city', 'varchar', '', ''),
 ('reported_state', 'varchar', '', ''),
 ('reported_zip_code', 'varchar', '', ''),
 ('reported_address', 'varchar', '', ''),
 ('reported_county', 'varchar', '', ''),
 ('reported_latitude', 'double', '', ''),
 ('reported_longitude', 'double', '', ''),
 ('primary_naics_code', 'varchar', '', ''),
 ('latest_reported_industry_type_subparts', 'varchar', '', ''),
 ('total_reported_emissions_from_onshore_oil_gas_production', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

In [9]:
qres = engine.execute(f"describe epa_ghgrp.suppliers")
display(qres.fetchall())

[('facility_id', 'bigint', '', ''),
 ('frs_id', 'varchar', '', ''),
 ('facility_name', 'varchar', '', ''),
 ('city', 'varchar', '', ''),
 ('state', 'varchar', '', ''),
 ('zip_code', 'varchar', '', ''),
 ('address', 'varchar', '', ''),
 ('county', 'varchar', '', ''),
 ('latitude', 'double', '', ''),
 ('longitude', 'double', '', ''),
 ('primary_naics_code', 'varchar', '', ''),
 ('latest_reported_industry_type_subparts', 'varchar', '', ''),
 ('coal_based_liquid_fuel_production_ghg', 'double', '', ''),
 ('petroleum_products_produced_ghg', 'double', '', ''),
 ('petroleum_products_imported_ghg', 'double', '', ''),
 ('petroleum_products_exported_ghg', 'double', '', ''),
 ('natural_gas_supply_ghg', 'double', '', ''),
 ('natural_gas_liquids_supply_ghg', 'double', '', ''),
 ('co2_supply_ghg', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

A quick summary of how many rows of data we have in `epa_ghgrp`.

68k rows in `direct_emitters`: lots of facilities  
103k rows in `parent_company`: lots of facility/owner relationships

In [10]:
qres = engine.execute('show tables in osc_datacommons_dev.epa_ghgrp')
l = qres.fetchall()
totalrows = 0
for e in l:
    s = f'select count (*) from osc_datacommons_dev.epa_ghgrp.{e[0]}'
    print(s)
    qres = engine.execute(s)
    rowcount = qres.fetchall()[0][0]
    totalrows += rowcount
    print(rowcount)
    
print(f'total rows = {totalrows}')

select count (*) from osc_datacommons_dev.epa_ghgrp.co2_injection
954
select count (*) from osc_datacommons_dev.epa_ghgrp.direct_emitters
68472
select count (*) from osc_datacommons_dev.epa_ghgrp.gathering_boosting
1703
select count (*) from osc_datacommons_dev.epa_ghgrp.geologic_sequestration_of_co2
20
select count (*) from osc_datacommons_dev.epa_ghgrp.ldc_direct_emissions
1730
select count (*) from osc_datacommons_dev.epa_ghgrp.onshore_oil_gas_prod
5068
select count (*) from osc_datacommons_dev.epa_ghgrp.parent_company
103043
select count (*) from osc_datacommons_dev.epa_ghgrp.sf6_from_elec_equip
1012
select count (*) from osc_datacommons_dev.epa_ghgrp.suppliers
8539
select count (*) from osc_datacommons_dev.epa_ghgrp.transmission_pipelines
780
total rows = 191321


We have very specifically shaped tables, which is needed to prepare this presetnation in Pandas.

The key metric is total_emissions (in metric tons of CO2e), but the name of the metric depends on the source/process.  Nevertheless, we know that `year` is our last metric and that the CO2e metric is 2nd-to-last (hence the `-2` index).

We also know that when building our final summary table, the sums feeding into it are all only one row per year.  We use `iat[0,1` to access the 0th row and the 1st column (which will be named specifically to the source/process).  By using `iat`, we get a scalar value we can sum, instead of a Series object we'd have to `squeeze`.

In [11]:
import pandas as pd

emission_tables = ['direct_emitters', 'onshore_oil_gas_prod', 'gathering_boosting', 'transmission_pipelines', 'ldc_direct_emissions', 'sf6_from_elec_equip']
q_dict = {}

# A function that excludes terms using SQL to say "and X!=Y"
def excl_text(excl):
    return ' and '.join([f"latest_reported_industry_type_sectors!='{e}'" for e in excl])

# A function that includes text that matches; SQL that says "or X like '%Y%'"
def incl_text(excl):
    return ' or '.join([f"latest_reported_industry_type_sectors like '%{e}%'" for e in excl])

t = 'direct_emitters'
qres = engine.execute(f"describe epa_ghgrp.{t}")
t_cols = qres.fetchall()
total_emission_cname = t_cols[-2][0]

incl = [ 'Power', 'Petroleum']
qres = engine.execute(f"""
select year, sum({total_emission_cname}) from epa_ghgrp.{t}
where {incl_text(incl)}
group by year
""")
q_dict[t + f" (incl {','.join(incl)})"] = pd.DataFrame(qres.fetchall(), columns=['year', total_emission_cname + f" (matching {','.join(incl)})"])

excl = [ 'Minerals', 'Other', 'Waste', 'Chemicals', 'Pulp and Paper,Waste', 'Metals,Waste', 'Pulp and Paper']
qres = engine.execute(f"""
select year, sum({total_emission_cname}) from epa_ghgrp.{t}
where {excl_text(excl)}
group by year
""")
q_dict[t + f" (excl {','.join(excl)})"] = pd.DataFrame(qres.fetchall(), columns=['year', total_emission_cname + f" (excl {','.join(excl)})"])

for t in emission_tables:
    qres = engine.execute(f"describe epa_ghgrp.{t}")
    tr = qres.fetchall()
    total_emission_cname = tr[-2][0]
    qres = engine.execute(f"select year, sum({total_emission_cname}) from epa_ghgrp.{t} group by year")
    q_dict[t] = pd.DataFrame(qres.fetchall(), columns=['year', total_emission_cname])

grand_total = {}

for year in q_dict['direct_emitters'].year:
    grand_total[year] = sum([q_dict[t][q_dict[t].year==year].iat[0,1] for t in emission_tables if year in q_dict[t].year.values])

df = pd.DataFrame.from_dict(grand_total, orient='index', columns=['total_co2e']).reset_index()
df.rename(columns={'index':'year'}, inplace=True)
q_dict['grand_total'] = df

This gem comes from https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes

In [12]:
from functools import reduce

df_merged = reduce(lambda left,right: pd.merge(left,right,on=['year'], how='outer'), q_dict.values()).fillna(0)

A summary table showing (1) direct emitters that match "Power" or "Petroleum", (2) direct emitters that are not the top other industrie, and then (3..N): the totals from the GHGRP.

In [13]:
df_merged.rename(columns={v:v.replace('_', ' ') for v in df_merged.columns.values}).sort_values(by='year', ascending=False)

Unnamed: 0,year,"total reported direct emissions (matching Power,Petroleum)","total reported direct emissions (excl Minerals,Other,Waste,Chemicals,Pulp and Paper,Waste,Metals,Waste,Pulp and Paper)",total reported direct emissions,total reported emissions from onshore oil gas production,total reported emissions from gathering boosting,total reported direct emissions from transmission pipelines,total reported direct emissions from local dist companies,total reported direct emissions from electrical equipment use,total co2e
5,2020-01-01 00:00:00.000,1768353000.0,1947463000.0,2400335000.0,93488110.0,90028670.0,3497590.0,12641100.0,2004836.0,2601995000.0
3,2019-01-01 00:00:00.000,1958369000.0,2153508000.0,2626532000.0,120174300.0,92765660.0,2859475.0,12847020.0,2510832.0,2857689000.0
0,2018-01-01 00:00:00.000,2099221000.0,2304343000.0,2779471000.0,111958800.0,83325600.0,3050315.0,13236260.0,2270228.0,2993312000.0
7,2017-01-01 00:00:00.000,2070082000.0,2270354000.0,2735841000.0,96241460.0,77830580.0,2699047.0,13670430.0,2555766.0,2928838000.0
2,2016-01-01 00:00:00.000,2144348000.0,2337524000.0,2805105000.0,86898250.0,82597010.0,3183982.0,14002290.0,2930497.0,2994717000.0
4,2015-01-01 00:00:00.000,2261725000.0,2452045000.0,2939444000.0,101748500.0,0.0,0.0,14558310.0,2472281.0,3058223000.0
8,2014-01-01 00:00:00.000,2392070000.0,2592859000.0,3084069000.0,101951700.0,0.0,0.0,14771850.0,3220287.0,3204013000.0
1,2013-01-01 00:00:00.000,2392082000.0,2593335000.0,3073214000.0,97959460.0,0.0,0.0,15161470.0,3258298.0,3189593000.0
6,2012-01-01 00:00:00.000,2376107000.0,2579361000.0,3058076000.0,92539660.0,0.0,0.0,15412350.0,3236291.0,3169264000.0
9,2011-01-01 00:00:00.000,2509918000.0,2728839000.0,3207583000.0,91190570.0,0.0,0.0,15667940.0,3920547.0,3318362000.0


Cross-check with ESSD tops-down dataset

In [14]:
qres = engine.execute('describe essd.ghg_data')
qres.fetchall()

[('iso', 'varchar', '', ''),
 ('country', 'varchar', '', ''),
 ('region_ar6_6', 'varchar', '', ''),
 ('region_ar6_10', 'varchar', '', ''),
 ('region_ar6_22', 'varchar', '', ''),
 ('region_ar6_dev', 'varchar', '', ''),
 ('sector_title', 'varchar', '', ''),
 ('subsector_title', 'varchar', '', ''),
 ('gas', 'varchar', '', ''),
 ('gwp100_ar5', 'integer', '', ''),
 ('value', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

A quick look at *just* CO2.  We'll look at CO2e in the next set of cells.

In [15]:
qres = engine.execute("select year, sum(value), gas from essd.ghg_data where sector_title='Energy systems' and gas='CO2' and year>DATE('2010-01-01') and ISO='USA' group by year, gas order by year desc")
qres.fetchall()

[('2020-01-01 00:00:00.000', 1751743410.33274, 'CO2'),
 ('2019-01-01 00:00:00.000', 1987569148.7149098, 'CO2'),
 ('2018-01-01 00:00:00.000', 2130195623.42906, 'CO2'),
 ('2017-01-01 00:00:00.000', 2106892474.32796, 'CO2'),
 ('2016-01-01 00:00:00.000', 2180627611.48211, 'CO2'),
 ('2015-01-01 00:00:00.000', 2275835961.04433, 'CO2'),
 ('2014-01-01 00:00:00.000', 2411676648.3933206, 'CO2'),
 ('2013-01-01 00:00:00.000', 2410360932.0616, 'CO2'),
 ('2012-01-01 00:00:00.000', 2390450900.39202, 'CO2'),
 ('2011-01-01 00:00:00.000', 2514780563.17824, 'CO2')]

In [16]:
qres = engine.execute('describe essd.gwp100_data')
qres.fetchall()

[('iso', 'varchar', '', ''),
 ('country', 'varchar', '', ''),
 ('region_ar6_6', 'varchar', '', ''),
 ('region_ar6_10', 'varchar', '', ''),
 ('region_ar6_22', 'varchar', '', ''),
 ('region_ar6_dev', 'varchar', '', ''),
 ('sector_title', 'varchar', '', ''),
 ('subsector_title', 'varchar', '', ''),
 ('co2', 'double', '', ''),
 ('ch4', 'double', '', ''),
 ('n2o', 'double', '', ''),
 ('fgas', 'double', '', ''),
 ('ghg', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

A look at CO2e (presuming that's what GHG gives us from the GWP100 table).

In [17]:
qres = engine.execute("select year, sum(GHG) from essd.gwp100_data where sector_title='Energy systems' and year>DATE('2010-01-01') and ISO='USA' group by year order by year desc")
qres.fetchall()

[('2020-01-01 00:00:00.000', 1751743410.33274),
 ('2019-01-01 00:00:00.000', 2349291999.2192674),
 ('2018-01-01 00:00:00.000', 2477543798.400163),
 ('2017-01-01 00:00:00.000', 2451547006.5038967),
 ('2016-01-01 00:00:00.000', 2514167924.00994),
 ('2015-01-01 00:00:00.000', 2634213163.051593),
 ('2014-01-01 00:00:00.000', 2784719011.4105005),
 ('2013-01-01 00:00:00.000', 2779046573.6036997),
 ('2012-01-01 00:00:00.000', 2759253824.43615),
 ('2011-01-01 00:00:00.000', 2890949806.5313897)]

In [18]:
qres = engine.execute("describe us_census.all_sector_survey_2017")
display(qres.fetchall())
qres = engine.execute("select * from us_census.all_sector_survey_2017 where naics2012='221112'")
display(qres.fetchall())


[('geo_id', 'varchar', '', ''),
 ('name', 'varchar', '', ''),
 ('geo_id_f', 'varchar', '', ''),
 ('naics2012', 'varchar', '', ''),
 ('naics2012_f', 'varchar', '', ''),
 ('naics2012_label', 'varchar', '', ''),
 ('year', 'varchar', '', ''),
 ('estab', 'varchar', '', ''),
 ('rcptot', 'varchar', '', ''),
 ('payann', 'varchar', '', ''),
 ('emp', 'varchar', '', '')]

[('0100000US', 'United States', None, '221112', None, 'Fossil fuel electric power generation', '2012', '1416', '81473633', '7997908', '82071'),
 ('0100000US', 'United States', None, '221112', None, 'Fossil fuel electric power generation', '2017', '1711', '75455040', '8192622', '76058')]

Exercise the connection to NAICS and sector information provided by US Department of Commerce (US_CENSUS)

In [19]:
# qres = engine.execute("select count (*), epa_ghgrp.direct_emitters.year, primary_naics_code from epa_ghgrp.direct_emitters where primary_naics_code='221112' group by epa_ghgrp.direct_emitters.year, primary_naics_code order by count (*) desc limit 20")
# display(qres.fetchall())

qres = engine.execute(f"""
select count (*), epa_ghgrp.direct_emitters.year, primary_naics_code, naics2012_label
from epa_ghgrp.direct_emitters, us_census.all_sector_survey_2017
where primary_naics_code=naics2012 and us_census.all_sector_survey_2017.year='2017'
group by epa_ghgrp.direct_emitters.year, primary_naics_code, naics2012_label order by count (*) desc limit 20
""")
display(qres.fetchall())

[(1483, '2012-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1467, '2011-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1454, '2013-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1425, '2014-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1368, '2015-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1304, '2016-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1294, '2018-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1281, '2017-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1274, '2019-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1247, '2012-01-01 00:00:00.000', '562212', 'Solid waste landfill'),
 (1244, '2020-01-01 00:00:00.000', '221112', 'Fossil fuel electric power generation'),
 (1235, '2011-01-01 00:00:00.000', '562212', 'Solid waste la

See how many `PARENT_COMPANY` records have LEIs we know.  Note that there are about 5x facilities for each known LEI.  There are fewer average facilities for parent copmanies with unknown LEIs.

In [20]:
qres = engine.execute(f"select count (*), reporting_year from (select lei, reporting_year from epa_ghgrp.parent_company where LEI is not null group by lei, reporting_year) group by reporting_year order by reporting_year desc")
qres.fetchall()

[(1542, '2020-01-01 00:00:00.000'),
 (1559, '2019-01-01 00:00:00.000'),
 (1626, '2018-01-01 00:00:00.000'),
 (1562, '2017-01-01 00:00:00.000'),
 (1560, '2016-01-01 00:00:00.000'),
 (1541, '2015-01-01 00:00:00.000'),
 (1712, '2014-01-01 00:00:00.000'),
 (1695, '2013-01-01 00:00:00.000'),
 (1657, '2012-01-01 00:00:00.000'),
 (1578, '2011-01-01 00:00:00.000'),
 (1309, '2010-01-01 00:00:00.000')]

In [21]:
qres = engine.execute(f"describe sec_dera.sub")
qres.fetchall()

[('adsh', 'varchar', '', ''),
 ('cik', 'integer', '', ''),
 ('name', 'varchar', '', ''),
 ('lei', 'varchar', '', ''),
 ('sic', 'integer', '', ''),
 ('countryba', 'varchar', '', ''),
 ('stprba', 'varchar', '', ''),
 ('cityba', 'varchar', '', ''),
 ('zipba', 'varchar', '', ''),
 ('bas1', 'varchar', '', ''),
 ('bas2', 'varchar', '', ''),
 ('baph', 'varchar', '', ''),
 ('countryma', 'varchar', '', ''),
 ('stprma', 'varchar', '', ''),
 ('cityma', 'varchar', '', ''),
 ('zipma', 'varchar', '', ''),
 ('mas1', 'varchar', '', ''),
 ('mas2', 'varchar', '', ''),
 ('countryinc', 'varchar', '', ''),
 ('stprinc', 'varchar', '', ''),
 ('ein', 'bigint', '', ''),
 ('former', 'varchar', '', ''),
 ('changed', 'varchar', '', ''),
 ('afs', 'varchar', '', ''),
 ('wksi', 'boolean', '', ''),
 ('fye', 'varchar', '', ''),
 ('form', 'varchar', '', ''),
 ('period', 'timestamp(3)', '', ''),
 ('fy', 'timestamp(3)', '', ''),
 ('fp', 'varchar', '', ''),
 ('filed', 'timestamp(3)', '', ''),
 ('accepted', 'timestamp(3)',

In [22]:
qres = engine.execute(f"describe sec_dera.num")
qres.fetchall()

[('adsh', 'varchar', '', ''),
 ('tag', 'varchar', '', ''),
 ('version', 'varchar', '', ''),
 ('coreg', 'varchar', '', ''),
 ('ddate', 'timestamp(3)', '', ''),
 ('qtrs', 'integer', '', ''),
 ('uom', 'varchar', '', ''),
 ('value', 'double', '', ''),
 ('footnote', 'varchar', '', ''),
 ('uuid', 'varchar', '', ''),
 ('quarter', 'varchar', '', '')]

In [23]:
qres = engine.execute("select count (*) from sfi_geoasset.cement")
display(qres.fetchall())
qres = engine.execute("select count (*) from sfi_geoasset.steel")
display(qres.fetchall())

[(3117,)]

[(1598,)]

In [24]:
from osc_ingest_trino import *
import pyarrow as pa
import pyarrow.parquet as pq
import json


df = pd.read_sql("select facility_id, year, latest_reported_industry_type_sectors, total_reported_direct_emissions from epa_ghgrp.direct_emitters", engine)
df.dropna(how='any', inplace=True)
df.facility_id = df.facility_id.astype('int64')
df.year = df.year.astype('datetime64[ns, UTC]')
df.total_reported_direct_emissions = df.total_reported_direct_emissions.astype('float64')

sector_groupings = [(s[0], len(s)-1, s[1:]) for s in df.latest_reported_industry_type_sectors.str.split(',') if type(s)==list]
from collections import defaultdict
d = defaultdict(list)
for s1, s2, s3 in sector_groupings:
    if s2:
        d[f'{s1} ({s2+1})']  = ','.join([s1] + s3)
    else:
        d[s1] = s1

dinv_map = {v: k for k, v in d.items()}
df['sector_groupings'] = df.latest_reported_industry_type_sectors.map(dinv_map)

In [25]:
for sl in df.latest_reported_industry_type_sectors.str.split(','):
    if type(sl)!=list:
        continue
    # Ensure all primary (and if listed, secondary) sectors are represented
    if f's_{sl[0]}' not in df.columns:
        df[f's_{sl[0]}'] = 0.0
    if len(sl)>1 and f's_{sl[1]}' not in df.columns:
        df[f's_{sl[1]}'] = 0.0

In [26]:
attribution_vector = [ pd.Series([1.0]),
                       pd.Series([2.0/3.0, 1.0/3.0]),
                       pd.Series([0.5, 0.3, 0.2]),
                       pd.Series([0.4, 0.3, 0.2, 0.1]),
                       pd.Series([0.30, 0.25, 0.20, 0.15, 0.10]),
                       pd.Series([0.30, 0.24, 0.19, 0.14, 0.09, 0.04])]

def apply_attribution(x):
    sl = x.latest_reported_industry_type_sectors.split(',')
    # Tertiary sectors not previously mentioned are silently converted to Other, keeping our attribution columns from exploding
    appropriate_columns = set([f's_{s}' if f's_{s}' in x else 's_Other' for s in sl])
    x[ appropriate_columns ] = x.total_reported_direct_emissions * attribution_vector[len(appropriate_columns)-1].values
    return x

df_emitters = df.apply(apply_attribution, axis=1)
enforce_sql_column_names(df_emitters, inplace=True)

In [27]:
df_emitters[df_emitters.latest_reported_industry_type_sectors.str.contains(',')]

Unnamed: 0,facility_id,year,latest_reported_industry_type_sectors,total_reported_direct_emissions,sector_groupings,s_waste,s_power_plants,s_petroleum_and_natural_gas_systems,s_minerals,s_other,...,s_industrial_gas_suppliers,s_metals,s_suppliers_of_co2,s_pulp_and_paper,s_petroleum_product_suppliers,s_refineries,s_injection_of_co2,s_natural_gas_and_natural_gas_liquids_suppliers,s_import_and_export_of_equipment_containing_fluorintaed_ghgs,s_coal_based_liquid_fuel_supply
8,1004206,2020-01-01 00:00:00+00:00,"Chemicals,Industrial Gas Suppliers",4.401626e+04,Chemicals (2),0.000,0.000000e+00,0.000000,0.000000,0.00,...,14672.088008,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
9,1006665,2020-01-01 00:00:00+00:00,"Chemicals,Industrial Gas Suppliers",6.578659e+05,Chemicals (2),0.000,0.000000e+00,0.000000,0.000000,0.00,...,219288.632467,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
10,1004836,2020-01-01 00:00:00+00:00,"Chemicals,Industrial Gas Suppliers,Minerals",4.954027e+04,,0.000,0.000000e+00,0.000000,14862.081897,0.00,...,9908.054598,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
30,1002627,2020-01-01 00:00:00+00:00,"Other,Waste",1.425194e+05,Other (2),47506.450,0.000000e+00,0.000000,0.000000,95012.90,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
33,1004761,2020-01-01 00:00:00+00:00,"Other,Suppliers of CO2",6.930084e+04,,0.000,0.000000e+00,0.000000,0.000000,46200.56,...,0.000000,0.0,23100.280000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68335,1001822,2011-01-01 00:00:00+00:00,"Minerals,Waste",1.017244e+05,Minerals (2),33908.144,0.000000e+00,0.000000,67816.288000,0.00,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
68373,1000912,2011-01-01 00:00:00+00:00,"Petroleum and Natural Gas Systems,Power Plants",2.450750e+06,,0.000,1.633833e+06,816916.640667,0.000000,0.00,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
68412,1004143,2011-01-01 00:00:00+00:00,"Petroleum and Natural Gas Systems,Suppliers of...",5.946525e+04,Petroleum and Natural Gas Systems (2),0.000,0.000000e+00,39643.500000,0.000000,0.00,...,0.000000,0.0,19821.750000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
68414,1004137,2011-01-01 00:00:00+00:00,"Petroleum and Natural Gas Systems,Suppliers of...",5.236050e+04,Petroleum and Natural Gas Systems (2),0.000,0.000000e+00,34906.997333,0.000000,0.00,...,0.000000,0.0,17453.498667,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [28]:
import uuid
ingest_uuid = str(uuid.uuid4())

custom_meta_key_fields = 'metafields'
custom_meta_key = 'metaset'

from datetime import *
ingest_schema = 'ghgrp_demo'
timestamp = str(datetime.now())

qres = engine.execute(f'create schema if not exists {ingest_schema}')
qres.fetchall()

[(True,)]

In [29]:
def create_trino_pipeline (s3, schemaname, tablename, timestamp, df, meta_fields, meta_content):
    global ingest_uuid
    global custom_meta_key_fields, custom_meta_key
    
    # First convert dataframe to pyarrow for type conversion and basic metadata
    table = pa.Table.from_pandas(df)
    # Second, since pyarrow tables are immutable, create a new table with additional combined metadata
    if meta_fields or meta_content:
        meta_json_fields = json.dumps(meta_fields)
        meta_json = json.dumps(meta_content)
        existing_meta = table.schema.metadata
        combined_meta = {
            custom_meta_key_fields.encode(): meta_json_fields.encode(),
            custom_meta_key.encode(): meta_json.encode(),
            **existing_meta
        }
        table = table.replace_schema_metadata(combined_meta)
    # Third, convert table to parquet format (which cannot be written directly to s3)
    pq.write_table(table, f'/tmp/{schemaname}.{tablename}.{ingest_uuid}.{timestamp}.parquet')
    # df.to_parquet(f'/tmp/{schemaname}.{tablename}.{uuid}.parquet', index=False)
    # Fourth, put the parquet-ified data into our S3 bucket for trino.  We cannot compute parquet format directly to S3 but we can copy it once computed
    s3.upload_file(
        Bucket=os.environ['S3_DEV_BUCKET'],
        Key=f'trino/{schemaname}/{tablename}/{ingest_uuid}/{timestamp}/data.parquet',
        Filename=f'/tmp/{schemaname}.{tablename}.{ingest_uuid}.{timestamp}.parquet'
    )
    # Finally, create the trino table backed by our parquet files enhanced by our metadata
    qres = engine.execute(f'drop table if exists {schemaname}.{tablename}')
    print(f'dropping table: {tablename}')
    qres.fetchall()
    
    columnschema = create_table_schema_pairs(df)

    tabledef = f"""create table if not exists {schemaname}.{tablename}(
{columnschema}
) with (
    format = 'parquet',
    external_location = 's3a://{os.environ['S3_DEV_BUCKET']}/trino/{schemaname}/{tablename}/{ingest_uuid}/{timestamp}'
)"""
    print(tabledef)

    # tables created externally may not show up immediately in cloud-beaver
    qres = engine.execute(tabledef)
    qres.fetchall()

In [30]:
import boto3

# Create an S3 client.  We will user later when we write out data and metadata
s3_trino = boto3.client(
    service_name="s3",
    endpoint_url=os.environ['S3_DEV_ENDPOINT'],
    aws_access_key_id=os.environ['S3_DEV_ACCESS_KEY'],
    aws_secret_access_key=os.environ['S3_DEV_SECRET_KEY'],
)

In [31]:
custom_meta_content = { 'foo':'bar'}
custom_meta_fields = { 'GHGRP FACILITY ID':'GHGRP FACILITY ID'}

tablename = 'direct_emitters_attributed'
create_trino_pipeline (s3_trino, ingest_schema, tablename, timestamp, df_emitters, custom_meta_fields, custom_meta_content)

dropping table: direct_emitters_attributed
create table if not exists ghgrp_demo.direct_emitters_attributed(
    facility_id bigint,
    year timestamp,
    latest_reported_industry_type_sectors varchar,
    total_reported_direct_emissions double,
    sector_groupings varchar,
    s_waste double,
    s_power_plants double,
    s_petroleum_and_natural_gas_systems double,
    s_minerals double,
    s_other double,
    s_chemicals double,
    s_industrial_gas_suppliers double,
    s_metals double,
    s_suppliers_of_co2 double,
    s_pulp_and_paper double,
    s_petroleum_product_suppliers double,
    s_refineries double,
    s_injection_of_co2 double,
    s_natural_gas_and_natural_gas_liquids_suppliers double,
    s_import_and_export_of_equipment_containing_fluorintaed_ghgs double,
    s_coal_based_liquid_fuel_supply double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-os-demo-s3/trino/ghgrp_demo/direct_emitters_attributed/023dcd39-5a85-4617-8a30-7ddcb1fc8820/20

`direct_emitters_oc` gives us an "owner count (oc)" so we can pick facilities that make joins and math easier.

### When we join tables, it is important to select not only where oc=1 but also where parent_company.lsi is not null (and year=reporting_year)

In [32]:
# Count how many rows of data we have from EPA GHGRP data.  The original datset has about 68K rows for all 10 years.

qres = engine.execute('drop table if exists ghgrp_demo.direct_emitters_oc')
qres.fetchall()
qres = engine.execute(f"""
create table ghgrp_demo.direct_emitters_oc as select facility_id, year, count (ghgrp_facility_id) as oc
from ghgrp_demo.direct_emitters_attributed, epa_ghgrp.parent_company
where ghgrp_demo.direct_emitters_attributed.facility_id=epa_ghgrp.parent_company.ghgrp_facility_id
and ghgrp_demo.direct_emitters_attributed.year=parent_company.reporting_year
and parent_company.lei is not null
group by facility_id, year
""")
qres.fetchall()

[(52074,)]

Here's an example of a facility with many owners...

In [33]:
qres = engine.execute("select * from epa_ghgrp.parent_company where reporting_year=DATE('2020-01-01') and ghgrp_facility_id=1005071 order by lei")
qres.fetchall()

[(1005071, '110000702730', '2549000NXAL5JJHJYT18', '2020-01-01 00:00:00.000', 'North Terrebonne Gas Plant', '449 Shell E&P Court', 'Gibson', 'LA', '70358', 'TERREBONNE PARISH', 'ENERGY RESOURCES TECHNOLOGY LAND INC', '400 N Sam Houston Parkway East, Suite 400', 'Houston', 'TX', '77060', 0.95, '211130'),
 (1005071, '110000702730', '54930000S35EESPK1C27', '2020-01-01 00:00:00.000', 'North Terrebonne Gas Plant', '449 Shell E&P Court', 'Gibson', 'LA', '70358', 'TERREBONNE PARISH', 'BYRON ENERGY LLC', '100 Rue Iberville #110', 'Lafayette', 'LA', '70508', 0.64, '211130'),
 (1005071, '110000702730', '5493003QENHHS261UR94', '2020-01-01 00:00:00.000', 'North Terrebonne Gas Plant', '449 Shell E&P Court', 'Gibson', 'LA', '70358', 'TERREBONNE PARISH', 'TARGA RESOURCES CORP', '1000 Louisiana', 'Houston', 'TX', '77002', 4.75, '211130'),
 (1005071, '110000702730', '5493005Y7TJPYWLDEO18', '2020-01-01 00:00:00.000', 'North Terrebonne Gas Plant', '449 Shell E&P Court', 'Gibson', 'LA', '70358', 'TERREBON

...meaning 10 rows of data that's outside our easy-to-aggregate data

In [34]:
qres = engine.execute('select * from epa_ghgrp.direct_emitters where facility_id=1005071 order by year')
qres.fetchall()

[(1005071, '110000702730', 'North Terrebonne Gas Plant', 'Gibson', 'LA', '70358', '449 Shell E&P Court', 'TERREBONNE PARISH', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', 383446.646, '2011-01-01 00:00:00.000'),
 (1005071, '110000702730', 'North Terrebonne Gas Plant', 'Gibson', 'LA', '70358', '449 Shell E&P Court', 'TERREBONNE PARISH', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', 339163.524, '2012-01-01 00:00:00.000'),
 (1005071, '110000702730', 'North Terrebonne Gas Plant', 'Gibson', 'LA', '70358', '449 Shell E&P Court', 'TERREBONNE PARISH', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', 313640.418, '2013-01-01 00:00:00.000'),
 (1005071, '110000702730', 'North Terrebonne Gas Plant', 'Gibson', 'LA', '70358', '449 Shell E&P Court', 'TERREBONNE PARISH', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', 312585.924, '2014-01-01 00:00:00.000'),
 (1005071, '110000702730

Of the 52K rows of emissions we have, how many **_facilities_** are present in CY2020 data with OC=1?

In [35]:
qres = engine.execute(f"""select count (*), ghgrp_demo.direct_emitters_oc.year
from epa_ghgrp.parent_company, ghgrp_demo.direct_emitters_oc
where epa_ghgrp.parent_company.ghgrp_facility_id=ghgrp_demo.direct_emitters_oc.facility_id
and epa_ghgrp.parent_company.reporting_year=ghgrp_demo.direct_emitters_oc.year
and ghgrp_demo.direct_emitters_oc.oc=1
and reporting_year >= DATE('2020-01-01')
group by ghgrp_demo.direct_emitters_oc.year
""")
qres.fetchall()

[(4823, '2020-01-01 00:00:00.000')]

Of the 4823 rows of CY2020 emissions data we have, how many **_facilities owned by public companies_** match to corporate reports we can see using the SEC's DERA dataset?

In [36]:
qres = engine.execute(f"""select count (*), ghgrp_demo.direct_emitters_oc.year
from epa_ghgrp.parent_company, ghgrp_demo.direct_emitters_oc, sec_dera.sub
where form='10-K'
and epa_ghgrp.parent_company.ghgrp_facility_id=ghgrp_demo.direct_emitters_oc.facility_id
and epa_ghgrp.parent_company.reporting_year=ghgrp_demo.direct_emitters_oc.year
and ghgrp_demo.direct_emitters_oc.oc=1
and reporting_year >= DATE('2020-01-01') and filed >= DATE('2020-01-01') and filed < DATE('2021-01-01')
and epa_ghgrp.parent_company.lei=sec_dera.sub.lei and epa_ghgrp.parent_company.lei is not null
group by ghgrp_demo.direct_emitters_oc.year
order by ghgrp_demo.direct_emitters_oc.year desc
""")
qres.fetchall()

[(2651, '2020-01-01 00:00:00.000')]

We can tie these companies to ticker symbols...

In [37]:
qres = engine.execute(f"""select * from sec_dera.ticker limit 10""")
qres.fetchall()

[(320193, 'aapl'),
 (789019, 'msft'),
 (1652044, 'goog'),
 (1018724, 'amzn'),
 (1318605, 'tsla'),
 (1326801, 'fb'),
 (40545, 'ge'),
 (1067983, 'brk-a'),
 (1046179, 'tsm'),
 (1293451, 'tcehy')]

How many distinct companies own these facilities (and what are their ticker symbols)?

In [38]:
qres = engine.execute(f"""
with leis as (select DISTINCT(sec_dera.sub.lei), if(tname IS NULL, '<private>', tname) as ticker
              from epa_ghgrp.parent_company,sec_dera.sub,sec_dera.ticker
              where epa_ghgrp.parent_company.lei=sec_dera.sub.lei and form='10-K'
              and sec_dera.sub.cik=sec_dera.ticker.cik
              and period>=DATE('2020-01-01')
              and period<DATE('2021-01-01'))
select count (*), ticker, leis.lei, parent_company_name, reporting_year
from epa_ghgrp.parent_company, ghgrp_demo.direct_emitters_oc, leis
where ghgrp_demo.direct_emitters_oc.oc=1
and reporting_year >= DATE('2020-01-01') and year=reporting_year
and epa_ghgrp.parent_company.lei=leis.lei
and epa_ghgrp.parent_company.ghgrp_facility_id=ghgrp_demo.direct_emitters_oc.facility_id
group by leis.ticker, leis.lei, parent_company_name, reporting_year
order by count(*) desc
-- limit 10
""")
ticker_list = qres.fetchall()
print(len(ticker_list))

365


Note that some comapnies have more than one ticker symbol!

In [39]:
ticker_list[0:50]

[(228, 'wm', '549300YX8JIID70NFS41', 'WASTE MANAGEMENT INC', '2020-01-01 00:00:00.000'),
 (162, 'rsg', 'NKNQHM6BLECKVOQP7O46', 'REPUBLIC SERVICES INC', '2020-01-01 00:00:00.000'),
 (160, 'ep-pc', '549300WR7IX8XE0TBO16', 'KINDER MORGAN INC', '2020-01-01 00:00:00.000'),
 (160, 'kmi', '549300WR7IX8XE0TBO16', 'KINDER MORGAN INC', '2020-01-01 00:00:00.000'),
 (108, 'et-pc', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020-01-01 00:00:00.000'),
 (108, 'et-pe', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020-01-01 00:00:00.000'),
 (108, 'et', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020-01-01 00:00:00.000'),
 (108, 'et-pd', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020-01-01 00:00:00.000'),
 (82, 'brk-b', '5493000C01ZX7D35SD85', 'BERKSHIRE HATHAWAY INC', '2020-01-01 00:00:00.000'),
 (82, 'brk-a', '5493000C01ZX7D35SD85', 'BERKSHIRE HATHAWAY INC', '2020-01-01 00:00:00.000'),
 (55, 'wmb', 'D71FAKCBLFS2O0RBPG08', 'THE WILLIAMS COS INC', '2020-01-01 00:00:00.000'),
 (50, 'epd',

We can try to add up all the faciltiies for all the tickers, but that leads to counting duplicates for companies that have multiple ticker symbols...(should be 2651, not 3935)

In [40]:
sum([te[0] for te in ticker_list])

3935

Compute intensity in metric tons per million dollars

In [41]:
qres = engine.execute(f"""
select DISTINCT(sec_dera.sub.lei), ghgrp_facility_id, adsh
              from epa_ghgrp.parent_company,ghgrp_demo.direct_emitters_oc,sec_dera.sub
              where epa_ghgrp.parent_company.ghgrp_facility_id=ghgrp_demo.direct_emitters_oc.facility_id
              and epa_ghgrp.parent_company.reporting_year=ghgrp_demo.direct_emitters_oc.year
              and epa_ghgrp.parent_company.reporting_year>=DATE('2020-01-01')
              and ghgrp_demo.direct_emitters_oc.oc=1
              and epa_ghgrp.parent_company.lei=sec_dera.sub.lei and form='10-K'
              and period>=DATE('2020-01-01') and period<DATE('2021-01-01')""")
l = qres.fetchall()
print(len(l))
# display(l)

2700


In [42]:
qres = engine.execute(f"""
with leis as (select DISTINCT(sec_dera.sub.lei), ghgrp_facility_id, adsh
              from epa_ghgrp.parent_company,ghgrp_demo.direct_emitters_oc,sec_dera.sub
              where epa_ghgrp.parent_company.ghgrp_facility_id=ghgrp_demo.direct_emitters_oc.facility_id
              and epa_ghgrp.parent_company.reporting_year=ghgrp_demo.direct_emitters_oc.year
              and epa_ghgrp.parent_company.reporting_year>=DATE('2020-01-01')
              and ghgrp_demo.direct_emitters_oc.oc=1
              and epa_ghgrp.parent_company.lei=sec_dera.sub.lei and form='10-K'
              and period>=DATE('2020-01-01') and period<DATE('2021-01-01'))
select leis.lei as lei, parent_company_name, sum(total_reported_direct_emissions) as tot_co2e, reporting_year, uom, sum(value) as tot_revenue, 1000000*sum(total_reported_direct_emissions)/sum(value) as intensity, 'tCO2e / $MM' as intensity_metric
from leis, epa_ghgrp.parent_company, ghgrp_demo.direct_emitters_oc, ghgrp_demo.direct_emitters_attributed, sec_dera.num
where ghgrp_demo.direct_emitters_oc.oc=1
and epa_ghgrp.parent_company.ghgrp_facility_id=ghgrp_demo.direct_emitters_oc.facility_id
and epa_ghgrp.parent_company.ghgrp_facility_id=ghgrp_demo.direct_emitters_attributed.facility_id
and epa_ghgrp.parent_company.ghgrp_facility_id=leis.ghgrp_facility_id
and epa_ghgrp.parent_company.lei=leis.lei
and reporting_year >= DATE('2020-01-01')
and reporting_year=ghgrp_demo.direct_emitters_oc.year
and reporting_year=ghgrp_demo.direct_emitters_attributed.year
and leis.adsh=sec_dera.num.adsh
and sec_dera.num.tag = 'Revenues'
and sec_dera.num.qtrs=4
and total_reported_direct_emissions>0
group by leis.lei, parent_company_name, reporting_year, uom
order by intensity desc
-- limit 100
""")
list = qres.fetchall()
print(len(list))
display(list)

161


[('549300O4B5CVWMKUES27', 'MIDWEST ENERGY INC', 59963.176, '2020-01-01 00:00:00.000', 'USD', 19575475.0, 3063.1785946445743, 'tCO2e / $MM'),
 ('529900EECJ7CSX1S1S58', 'NATURAL RESOURCE PARTNERS LP', 4878785.088, '2020-01-01 00:00:00.000', 'USD', 2050080000.0, 2379.8022945446032, 'tCO2e / $MM'),
 ('5493003JOBJGLZSDDQ28', 'PNM RESOURCES INC', 90250765.12799995, '2020-01-01 00:00:00.000', 'USD', 61961417000.0, 1456.5639311960854, 'tCO2e / $MM'),
 ('254900GKEQRHOI2SSC19', 'HALLADOR ENERGY CO', 2396625.0, '2020-01-01 00:00:00.000', 'USD', 1706271000.0, 1404.5980972541877, 'tCO2e / $MM'),
 ('9N3UAJSNOUXFKQLF3V18', 'PPL CORP', 403635742.11000025, '2020-01-01 00:00:00.000', 'USD', 295770000000.0, 1364.6946685262205, 'tCO2e / $MM'),
 ('WS423EPRKJIIJUITXD73', 'Tampa Electric CO', 24036405.102, '2020-01-01 00:00:00.000', 'USD', 21600000000.0, 1112.7965325, 'tCO2e / $MM'),
 ('5493002H80P81B3HXL31', 'CLECO CORPORATE HOLDINGS LLC', 60698341.05599997, '2020-01-01 00:00:00.000', 'USD', 54677371000.0, 