# Begin with Credentials and Connection to Trino

In [1]:
import os
import pathlib
from dotenv import load_dotenv

# Load some standard environment variables from a dot-env file, if it exists.
# If no such file can be found, does not fail, and so allows these environment vars to
# be populated in some other way
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', os.environ.get('PWD', '/opt/app-root/src'))
dotenv_path = pathlib.Path(dotenv_dir) / 'credentials.env'
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path,override=True)

Set session variable CATALOG to make query terms much more compact

In [2]:
import trino
from sqlalchemy.engine import create_engine

sqlstring = 'trino://{user}@{host}:{port}/'.format(
    user = os.environ['TRINO_USER'],
    host = os.environ['TRINO_HOST'],
    port = os.environ['TRINO_PORT']
)
sqlargs = {
    'auth': trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    'http_scheme': 'https',
    'catalog': 'osc_datacommons_dev'
}
engine = create_engine(sqlstring, connect_args = sqlargs)
connection = engine.connect()

import pandas as pd

In [3]:
cleanup = False

if cleanup:
    qres = engine.execute(f'show tables in ghgrp_demo')
    l = qres.fetchall()

    for schema in [ 'ghgrp_demo' ]:
        print(schema)
        qres = engine.execute(f'show tables in {schema}')
        l = qres.fetchall()

        for table in l:
            qres = engine.execute(f'drop table {schema}.{table[0]}')
            display(qres.fetchall())

        qres = engine.execute(f'show tables in {schema}')
        display(qres.fetchall())

        qres = engine.execute(f'drop schema {schema}')
        display(qres.fetchall())


    qres = engine.execute('show schemas')
    qres.fetchall()
    
ingest_catalog = 'osc_datacommons_dev'
ingest_schema = 'ghgrp_demo'



# Introduction to EPA GHG Reporting Program data (EPA_GHGRP)

The EPA's GHG Reporting Program (GHGRP) seems to be a gold standard in terms of creating a bottoms-up list that's good enough to play a major role in tops-down estimates.

In [4]:
qres = engine.execute(f"show tables in epa_ghgrp")
display(qres.fetchall())

[('co2_injection',),
 ('direct_emitters',),
 ('gathering_boosting',),
 ('geologic_sequestration_of_co2',),
 ('ldc_direct_emissions',),
 ('onshore_oil_gas_prod',),
 ('parent_company',),
 ('sf6_from_elec_equip',),
 ('suppliers',),
 ('transmission_pipelines',)]

`Direct_Emitters` are the lion's share of CO2 _emissions_.  `Suppliers` tracks fuels and products which, when used as intended, will create GHG emissions (by direct emitters or others).

In [5]:
qres = engine.execute(f"describe epa_ghgrp.direct_emitters")
display(qres.fetchall())

[('facility_id', 'bigint', '', ''),
 ('frs_id', 'varchar', '', ''),
 ('facility_name', 'varchar', '', ''),
 ('city', 'varchar', '', ''),
 ('state', 'varchar', '', ''),
 ('zip_code', 'varchar', '', ''),
 ('address', 'varchar', '', ''),
 ('county', 'varchar', '', ''),
 ('latitude', 'double', '', ''),
 ('longitude', 'double', '', ''),
 ('primary_naics_code', 'varchar', '', ''),
 ('latest_reported_industry_type_subparts', 'varchar', '', ''),
 ('latest_reported_industry_type_sectors', 'varchar', '', ''),
 ('total_reported_direct_emissions', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

In [6]:
qres = engine.execute(f"""
select format('%tY', year), format('%,.2f', sum(total_reported_direct_emissions)/1000000000) || ' GtCO2e'
from epa_ghgrp.direct_emitters group by year order by year desc""")
display(qres.fetchall())

[('2020', '2.40 GtCO2e'),
 ('2019', '2.63 GtCO2e'),
 ('2018', '2.78 GtCO2e'),
 ('2017', '2.74 GtCO2e'),
 ('2016', '2.81 GtCO2e'),
 ('2015', '2.94 GtCO2e'),
 ('2014', '3.08 GtCO2e'),
 ('2013', '3.07 GtCO2e'),
 ('2012', '3.06 GtCO2e'),
 ('2011', '3.21 GtCO2e')]

Here's a look at how they stack up (from a Database perspective--we should also look at this in Super Set).

In [7]:
qres = engine.execute(f"""
select count (*), latest_reported_industry_type_sectors, format('%,.2f', sum(total_reported_direct_emissions)/1000000) || ' MtCO2e' as MtCO2e
from epa_ghgrp.direct_emitters
where year>=DATE('2019-01-01') and year<DATE('2020-01-01')
group by latest_reported_industry_type_sectors
order by MtCO2e desc
""")
display(qres.fetchall())

[(4, 'Chemicals,Petroleum Product Suppliers,Refineries,Suppliers of CO2', '8.58 MtCO2e'),
 (1, 'Petroleum Product Suppliers,Power Plants,Refineries', '7.66 MtCO2e'),
 (281, 'Metals', '69.49 MtCO2e'),
 (69, 'Petroleum Product Suppliers,Refineries', '65.08 MtCO2e'),
 (41, 'Chemicals,Petroleum Product Suppliers,Refineries', '64.02 MtCO2e'),
 (1, 'Metals,Minerals', '6.20 MtCO2e'),
 (1, 'Chemicals,Petroleum Product Suppliers,Power Plants,Refineries', '5.92 MtCO2e'),
 (7, 'Refineries', '5.71 MtCO2e'),
 (15, 'Chemicals,Industrial Gas Suppliers', '5.32 MtCO2e'),
 (1, 'Metals,Power Plants', '5.24 MtCO2e'),
 (1, 'Chemicals,Other,Petroleum Product Suppliers,Power Plants,Refineries', '4.98 MtCO2e'),
 (1, 'Chemicals,Refineries,Waste', '4.85 MtCO2e'),
 (1, 'Injection of CO2,Other,Suppliers of CO2,Waste', '4.40 MtCO2e'),
 (1, 'Chemicals,Petroleum Product Suppliers,Refineries,Suppliers of CO2,Waste', '4.36 MtCO2e'),
 (1, 'Chemicals,Other,Petroleum and Natural Gas Systems,Waste', '4.26 MtCO2e'),
 (1, '

This looks at the `Minerals` industry (which includes cement).  We see that the top emitters have multiple facility locations.

In [8]:
qres = engine.execute(f"""
select count (*), parent_company_name, format('%5.2f', sum(total_reported_direct_emissions)/1000000) || ' MtCO2e' as MtCO2e
from epa_ghgrp.direct_emitters, epa_ghgrp.parent_company
where year>=DATE('2019-01-01') and year<DATE('2020-01-01') and year=reporting_year
      and latest_reported_industry_type_sectors='Minerals'
      and epa_ghgrp.direct_emitters.facility_id=epa_ghgrp.parent_company.ghgrp_facility_id
group by parent_company_name
order by MtCO2e desc
limit 20
""")
display(qres.fetchall())

[(12, 'HOLCIM PARTICIPATIONS (US) INC', '10.78 MtCO2e'),
 (9, 'CEMEX INC', ' 7.84 MtCO2e'),
 (12, 'LEHIGH HANSON INC', ' 7.17 MtCO2e'),
 (8, 'RC LONESTAR INC', ' 6.67 MtCO2e'),
 (11, 'LHOIST NORTH AMERICA INC', ' 6.10 MtCO2e'),
 (11, 'CARMEUSE LIME INC', ' 4.96 MtCO2e'),
 (9, 'GRAYMONT INC', ' 4.15 MtCO2e'),
 (8, 'CRH AMERICAS INC', ' 4.04 MtCO2e'),
 (4, 'MARTIN MARIETTA MATERIALS INC', ' 3.94 MtCO2e'),
 (4, 'ARGOS USA LLC', ' 3.57 MtCO2e'),
 (11, 'EAGLE MATERIALS INC', ' 3.46 MtCO2e'),
 (3, 'MISSISSIPPI LIME CO', ' 3.39 MtCO2e'),
 (3, 'TAIHEIYO CEMENT USA INC', ' 3.27 MtCO2e'),
 (5, 'GCC OF AMERICA INC', ' 2.31 MtCO2e'),
 (2, 'TITAN AMERICA LLC', ' 2.23 MtCO2e'),
 (1, 'GENESIS ENERGY LP', ' 1.82 MtCO2e'),
 (1, 'LAFARGEHOLCIM NORTH AMERICA INC', ' 1.79 MtCO2e'),
 (2, 'NATIONAL CEMENT', ' 1.63 MtCO2e'),
 (2, 'SUMMIT MATERIALS INC', ' 1.56 MtCO2e'),
 (1, 'TATA CHEMICALS (SODA ASH) PARTNERS NORTH AMERICA', ' 1.54 MtCO2e')]

`Suppliers` are those who buy and sell GHG-emitting products, but they do not, themselves, cause the emissions.  They merely enable others to emit.

In [9]:
qres = engine.execute(f"describe epa_ghgrp.suppliers")
display(qres.fetchall())

[('facility_id', 'bigint', '', ''),
 ('frs_id', 'varchar', '', ''),
 ('facility_name', 'varchar', '', ''),
 ('city', 'varchar', '', ''),
 ('state', 'varchar', '', ''),
 ('zip_code', 'varchar', '', ''),
 ('address', 'varchar', '', ''),
 ('county', 'varchar', '', ''),
 ('latitude', 'double', '', ''),
 ('longitude', 'double', '', ''),
 ('primary_naics_code', 'varchar', '', ''),
 ('latest_reported_industry_type_subparts', 'varchar', '', ''),
 ('coal_based_liquid_fuel_production_ghg', 'double', '', ''),
 ('petroleum_products_produced_ghg', 'double', '', ''),
 ('petroleum_products_imported_ghg', 'double', '', ''),
 ('petroleum_products_exported_ghg', 'double', '', ''),
 ('natural_gas_supply_ghg', 'double', '', ''),
 ('natural_gas_liquids_supply_ghg', 'double', '', ''),
 ('co2_supply_ghg', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

A quick summary of how many rows of data we have in `epa_ghgrp`.

68k rows in `direct_emitters`: lots of facilities  
103k rows in `parent_company`: lots of facility/owner relationships

In [10]:
qres = engine.execute('show tables in epa_ghgrp')
l = qres.fetchall()
totalrows = 0
for e in l:
    s = f'select count (*) from epa_ghgrp.{e[0]}'
    qres = engine.execute(s)
    rowcount = qres.fetchall()[0][0]
    totalrows += rowcount
    print(f"{rowcount:>6} <- {s})")

print(f'{totalrows} <- total rows')

   954 <- select count (*) from epa_ghgrp.co2_injection)
 68472 <- select count (*) from epa_ghgrp.direct_emitters)
  1703 <- select count (*) from epa_ghgrp.gathering_boosting)
    20 <- select count (*) from epa_ghgrp.geologic_sequestration_of_co2)
  1730 <- select count (*) from epa_ghgrp.ldc_direct_emissions)
  5068 <- select count (*) from epa_ghgrp.onshore_oil_gas_prod)
103043 <- select count (*) from epa_ghgrp.parent_company)
  1012 <- select count (*) from epa_ghgrp.sf6_from_elec_equip)
  8539 <- select count (*) from epa_ghgrp.suppliers)
   780 <- select count (*) from epa_ghgrp.transmission_pipelines)
191321 <- total rows


# Reshaping tables to make them easier to chart

The key metric is total_emissions (in metric tons of CO2e), but the name of the metric depends on the source/process.  Nevertheless, we know that `year` is our last metric and that the CO2e metric is 2nd-to-last (hence the `-2` index).

We also know that when building our final summary table, the sums feeding into it are all only one row per year.  We use `iat[0,1` to access the 0th row and the 1st column (which will be named specifically to the source/process).  By using `iat`, we get a scalar value we can sum, instead of a Series object we'd have to `squeeze`.

In [11]:
import pandas as pd

emission_tables = ['direct_emitters', 'onshore_oil_gas_prod', 'gathering_boosting',
                   'transmission_pipelines', 'ldc_direct_emissions', 'sf6_from_elec_equip']
tot_em_columns = []

q_dict = {}

for t in emission_tables:
    qres = engine.execute(f"describe epa_ghgrp.{t}")
    tr = qres.fetchall()
    total_emission_cname = tr[-2][0]
    tot_em_columns.append(total_emission_cname)
    qres = engine.execute(f"select year, sum({total_emission_cname}) from epa_ghgrp.{t} group by year")
    q_dict[t] = pd.DataFrame(qres.fetchall(), columns=['year', total_emission_cname])

# A function that excludes terms using SQL to say "and X!=Y"
def excl_text(excl):
    return ' and '.join([f"latest_reported_industry_type_sectors!='{e}'" for e in excl])

# A function that includes text that matches; SQL that says "or X like '%Y%'"
def incl_text(excl):
    return ' or '.join([f"latest_reported_industry_type_sectors like '%{e}%'" for e in excl])

t = 'direct_emitters'
qres = engine.execute(f"describe epa_ghgrp.{t}")
t_cols = qres.fetchall()
total_emission_cname = t_cols[-2][0]

incl = [ 'Power', 'Petroleum']
qres = engine.execute(f"""
select year, sum({total_emission_cname}) from epa_ghgrp.{t}
where {incl_text(incl)}
group by year
""")
q_dict[t + f" (incl {','.join(incl)})"] = pd.DataFrame(qres.fetchall(), columns=['year', total_emission_cname + f" (matching {','.join(incl)})"])

excl = [ 'Minerals', 'Other', 'Waste', 'Chemicals', 'Pulp and Paper,Waste',
        'Metals,Waste', 'Pulp and Paper']
qres = engine.execute(f"""
select year, sum({total_emission_cname}) from epa_ghgrp.{t}
where {excl_text(excl)}
group by year
""")
q_dict[t + f" (excl {','.join(excl)})"] = pd.DataFrame(qres.fetchall(), columns=['year', total_emission_cname + f" (excl {','.join(excl)})"])

for t in emission_tables:
    qres = engine.execute(f"describe epa_ghgrp.{t}")
    tr = qres.fetchall()
    total_emission_cname = tr[-2][0]
    qres = engine.execute(f"select year, sum({total_emission_cname}) from epa_ghgrp.{t} group by year")
    q_dict[t] = pd.DataFrame(qres.fetchall(), columns=['year', total_emission_cname])

grand_total = {}

for year in q_dict['direct_emitters'].year:
    grand_total[year] = sum([q_dict[t][q_dict[t].year==year].iat[0,1] for t in emission_tables if year in q_dict[t].year.values])

df = pd.DataFrame.from_dict(grand_total, orient='index', columns=['total_co2e']).reset_index()
df.rename(columns={'index':'year'}, inplace=True)
q_dict['grand_total'] = df

This gem comes from https://stackoverflow.com/questions/44327999/python-pandas-merge-multiple-dataframes

In [12]:
from functools import reduce

df_merged = reduce(lambda left,right: pd.merge(left,right,on=['year'], how='outer'), q_dict.values()).fillna(0)
df_merged.sort_values(by='year', ascending=False, inplace=True)
df_merged.index = pd.RangeIndex(len(df_merged.index))

A summary table consolidating the totals from the GHGRP, plus three additional columns:
1. direct emitters that match "Power" or "Petroleum"
2. direct emitters that are not the top other industries
3. total co2e

In [13]:
df_merged.rename(columns={v:v.replace('_', ' ') for v in df_merged.columns.values})

Unnamed: 0,year,total reported direct emissions,total reported emissions from onshore oil gas production,total reported emissions from gathering boosting,total reported direct emissions from transmission pipelines,total reported direct emissions from local dist companies,total reported direct emissions from electrical equipment use,"total reported direct emissions (matching Power,Petroleum)","total reported direct emissions (excl Minerals,Other,Waste,Chemicals,Pulp and Paper,Waste,Metals,Waste,Pulp and Paper)",total co2e
0,2020-01-01 00:00:00.000,2400335000.0,93488110.0,90028670.0,3497590.0,12641100.0,2004836.0,1768353000.0,1947463000.0,2601995000.0
1,2019-01-01 00:00:00.000,2626532000.0,120174300.0,92765660.0,2859475.0,12847020.0,2510832.0,1958369000.0,2153508000.0,2857689000.0
2,2018-01-01 00:00:00.000,2779471000.0,111958800.0,83325600.0,3050315.0,13236260.0,2270228.0,2099221000.0,2304343000.0,2993312000.0
3,2017-01-01 00:00:00.000,2735841000.0,96241460.0,77830580.0,2699047.0,13670430.0,2555766.0,2070082000.0,2270354000.0,2928838000.0
4,2016-01-01 00:00:00.000,2805105000.0,86898250.0,82597010.0,3183982.0,14002290.0,2930497.0,2144348000.0,2337524000.0,2994717000.0
5,2015-01-01 00:00:00.000,2939444000.0,101748500.0,0.0,0.0,14558310.0,2472281.0,2261725000.0,2452045000.0,3058223000.0
6,2014-01-01 00:00:00.000,3084069000.0,101951700.0,0.0,0.0,14771850.0,3220287.0,2392070000.0,2592859000.0,3204013000.0
7,2013-01-01 00:00:00.000,3073214000.0,97959460.0,0.0,0.0,15161470.0,3258298.0,2392082000.0,2593335000.0,3189593000.0
8,2012-01-01 00:00:00.000,3058076000.0,92539660.0,0.0,0.0,15412350.0,3236291.0,2376107000.0,2579361000.0,3169264000.0
9,2011-01-01 00:00:00.000,3207583000.0,91190570.0,0.0,0.0,15667940.0,3920547.0,2509918000.0,2728839000.0,3318362000.0


# Cross-check with ESSD tops-down dataset

A quick look at *just* CO2.  We'll look at CO2e in the next set of cells.

In [14]:
qres = engine.execute("""
select format('%tY', year), sector_title, format('%,.2f', sum(value)/1000000000) || ' GtCO2' as GtCO2 from essd.ghg_data
where sector_title='Energy systems' and gas='CO2' and year>DATE('2010-01-01') and year<DATE('2020-01-01') and ISO='USA'
group by year, sector_title, gas order by year desc""")
qres.fetchall()

[('2019', 'Energy systems', '1.99 GtCO2'),
 ('2018', 'Energy systems', '2.13 GtCO2'),
 ('2017', 'Energy systems', '2.11 GtCO2'),
 ('2016', 'Energy systems', '2.18 GtCO2'),
 ('2015', 'Energy systems', '2.28 GtCO2'),
 ('2014', 'Energy systems', '2.41 GtCO2'),
 ('2013', 'Energy systems', '2.41 GtCO2'),
 ('2012', 'Energy systems', '2.39 GtCO2'),
 ('2011', 'Energy systems', '2.51 GtCO2')]

In [15]:
qres = engine.execute('describe essd.ghg_data')
qres.fetchall()

[('iso', 'varchar', '', ''),
 ('country', 'varchar', '', ''),
 ('region_ar6_6', 'varchar', '', ''),
 ('region_ar6_10', 'varchar', '', ''),
 ('region_ar6_22', 'varchar', '', ''),
 ('region_ar6_dev', 'varchar', '', ''),
 ('sector_title', 'varchar', '', ''),
 ('subsector_title', 'varchar', '', ''),
 ('gas', 'varchar', '', ''),
 ('gwp100_ar5', 'integer', '', ''),
 ('value', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

In [16]:
qres = engine.execute('describe essd.gwp100_data')
qres.fetchall()

[('iso', 'varchar', '', ''),
 ('country', 'varchar', '', ''),
 ('region_ar6_6', 'varchar', '', ''),
 ('region_ar6_10', 'varchar', '', ''),
 ('region_ar6_22', 'varchar', '', ''),
 ('region_ar6_dev', 'varchar', '', ''),
 ('sector_title', 'varchar', '', ''),
 ('subsector_title', 'varchar', '', ''),
 ('co2', 'double', '', ''),
 ('ch4', 'double', '', ''),
 ('n2o', 'double', '', ''),
 ('fgas', 'double', '', ''),
 ('ghg', 'double', '', ''),
 ('year', 'timestamp(3)', '', '')]

A look at CO2e (presuming that's what GHG gives us from the GWP100 table) for the category `Energy Systems`.

In [17]:
qres = engine.execute("""
select format('%tY', year), sector_title, format('%,.2f', sum(GHG)/1000000000) || ' GtCO2' as GtCO2 from essd.gwp100_data
where sector_title='Energy systems' and year>DATE('2010-01-01') and year<DATE('2020-01-01') and ISO='USA'
group by year, sector_title order by year desc""")
qres.fetchall()

[('2019', 'Energy systems', '2.35 GtCO2'),
 ('2018', 'Energy systems', '2.48 GtCO2'),
 ('2017', 'Energy systems', '2.45 GtCO2'),
 ('2016', 'Energy systems', '2.51 GtCO2'),
 ('2015', 'Energy systems', '2.63 GtCO2'),
 ('2014', 'Energy systems', '2.78 GtCO2'),
 ('2013', 'Energy systems', '2.78 GtCO2'),
 ('2012', 'Energy systems', '2.76 GtCO2'),
 ('2011', 'Energy systems', '2.89 GtCO2')]

# Connect with economic data provided by US CENSUS All-sector Survey (2017)

In [18]:
qres = engine.execute("describe us_census.all_sector_survey_2017")
display(qres.fetchall())
qres = engine.execute("select * from us_census.all_sector_survey_2017 where naics2012='221112'")
display(qres.fetchall())


[('geo_id', 'varchar', '', ''),
 ('name', 'varchar', '', ''),
 ('geo_id_f', 'bigint', '', ''),
 ('naics2012', 'varchar', '', ''),
 ('naics2012_f', 'varchar', '', ''),
 ('naics2012_label', 'varchar', '', ''),
 ('year', 'varchar', '', ''),
 ('estab', 'varchar', '', ''),
 ('rcptot', 'varchar', '', ''),
 ('payann', 'varchar', '', ''),
 ('emp', 'varchar', '', '')]

[('0100000US', 'United States', None, '221112', None, 'Fossil fuel electric power generation', '2012', '1416', '81473633', '7997908', '82071'),
 ('0100000US', 'United States', None, '221112', None, 'Fossil fuel electric power generation', '2017', '1711', '75455040', '8192622', '76058')]

Exercise the connection to NAICS and sector information provided by US Department of Commerce (US_CENSUS)

In [19]:
# Show how many facilities are tagged with what primary NAICS codes

qres = engine.execute(f"""
select count (*), format('%tY', epa_ghgrp.direct_emitters.year), primary_naics_code, naics2012_label
from epa_ghgrp.direct_emitters, us_census.all_sector_survey_2017
where primary_naics_code=naics2012
      and us_census.all_sector_survey_2017.year='2017' and epa_ghgrp.direct_emitters.year=DATE('2017-01-01')
group by epa_ghgrp.direct_emitters.year, primary_naics_code, naics2012_label
order by count (*) desc limit 20
""")
display(qres.fetchall())

[(1281, '2017', '221112', 'Fossil fuel electric power generation'),
 (1134, '2017', '562212', 'Solid waste landfill'),
 (585, '2017', '486210', 'Pipeline transportation of natural gas'),
 (173, '2017', '325193', 'Ethyl alcohol manufacturing'),
 (141, '2017', '324110', 'Petroleum refineries'),
 (120, '2017', '331110', 'Iron and steel mills and ferroalloy manufacturing'),
 (114, '2017', '322121', 'Paper (except newsprint) mills'),
 (100, '2017', '325199', 'All other basic organic chemical manufacturing'),
 (93, '2017', '327310', 'Cement manufacturing'),
 (79, '2017', '212112', 'Bituminous coal underground mining'),
 (77, '2017', '322130', 'Paperboard mills'),
 (75, '2017', '325211', 'Plastics material and resin manufacturing'),
 (69, '2017', '325120', 'Industrial gas manufacturing'),
 (65, '2017', '562213', 'Solid waste combustors and incinerators'),
 (59, '2017', '221330', 'Steam and air-conditioning supply'),
 (59, '2017', '325180', 'Other basic inorganic chemical manufacturing'),
 (55

# More table reshaping: attribution estimation

In [20]:
df = pd.read_sql("""
select facility_id, year, latitude, longitude, latest_reported_industry_type_sectors, total_reported_direct_emissions
from epa_ghgrp.direct_emitters""", engine)
df.facility_id = df.facility_id.astype('int64')
df.year = df.year.astype('datetime64[ns, UTC]')
df.total_reported_direct_emissions = df.total_reported_direct_emissions.astype('float64')
df.latest_reported_industry_type_sectors.fillna('Other', inplace=True)

df['sector_groupings'] = pd.Series([f"{s[0]} ({len(s)+1})" if len(s)>1 else s[0] for s in df.latest_reported_industry_type_sectors.str.split(',')])

In [21]:
for sl in df.latest_reported_industry_type_sectors.str.split(','):
    # Ensure all primary (and if listed, secondary) sectors are represented
    if f's_{sl[0]}' not in df.columns:
        df[f's_{sl[0]}'] = 0.0
    if len(sl)>1 and f's_{sl[1]}' not in df.columns:
        df[f's_{sl[1]}'] = 0.0

In [22]:
attribution_vector = [ pd.Series([1.0]),
                       pd.Series([2.0/3.0, 1.0/3.0]),
                       pd.Series([0.5, 0.3, 0.2]),
                       pd.Series([0.4, 0.3, 0.2, 0.1]),
                       pd.Series([0.30, 0.25, 0.20, 0.15, 0.10]),
                       pd.Series([0.30, 0.24, 0.19, 0.14, 0.09, 0.04])]

def apply_attribution(x):
    sl = x.latest_reported_industry_type_sectors.split(',')
    # Tertiary sectors not previously mentioned are silently converted to Other, keeping our attribution columns from exploding
    appropriate_columns = set([f's_{s}' if f's_{s}' in x else 's_Other' for s in sl])
    x[ appropriate_columns ] = x.total_reported_direct_emissions * attribution_vector[len(appropriate_columns)-1].values
    return x

df_emitters = df.apply(apply_attribution, axis=1)

In [23]:
df_emitters[df_emitters.latest_reported_industry_type_sectors.str.contains(',')]

Unnamed: 0,facility_id,year,latitude,longitude,latest_reported_industry_type_sectors,total_reported_direct_emissions,sector_groupings,s_Waste,s_Power Plants,s_Other,...,s_Petroleum and Natural Gas Systems,s_Metals,s_Suppliers of CO2,s_Pulp and Paper,s_Petroleum Product Suppliers,s_Refineries,s_Injection of CO2,s_Natural Gas and Natural Gas Liquids Suppliers,s_Import and Export of Equipment Containing Fluorintaed GHGs,s_Coal-based Liquid Fuel Supply
4,1004206,2020-01-01 00:00:00+00:00,34.641667,-87.038611,"Chemicals,Industrial Gas Suppliers",4.401626e+04,Chemicals (3),0.000,0.000000e+00,0.00,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
5,1006665,2020-01-01 00:00:00+00:00,41.755000,-90.284167,"Chemicals,Industrial Gas Suppliers",6.578659e+05,Chemicals (3),0.000,0.000000e+00,0.00,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
6,1004836,2020-01-01 00:00:00+00:00,44.789444,-92.908333,"Chemicals,Industrial Gas Suppliers,Minerals",4.954027e+04,Chemicals (4),0.000,0.000000e+00,0.00,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
30,1002627,2020-01-01 00:00:00+00:00,43.499510,-92.917090,"Other,Waste",1.425194e+05,Other (3),47506.450,0.000000e+00,95012.90,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
33,1004761,2020-01-01 00:00:00+00:00,44.958900,-90.960800,"Other,Suppliers of CO2",6.930084e+04,Other (3),0.000,0.000000e+00,23100.28,...,0.000000,0.0,46200.560000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68335,1001822,2011-01-01 00:00:00+00:00,42.047900,-104.189000,"Minerals,Waste",1.017244e+05,Minerals (3),33908.144,0.000000e+00,0.00,...,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
68373,1000912,2011-01-01 00:00:00+00:00,33.016300,-79.928500,"Petroleum and Natural Gas Systems,Power Plants",2.450750e+06,Petroleum and Natural Gas Systems (3),0.000,1.633833e+06,0.00,...,816916.640667,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
68412,1004143,2011-01-01 00:00:00+00:00,39.551680,-110.815840,"Petroleum and Natural Gas Systems,Suppliers of...",5.946525e+04,Petroleum and Natural Gas Systems (3),0.000,0.000000e+00,0.00,...,39643.500000,0.0,19821.750000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
68414,1004137,2011-01-01 00:00:00+00:00,32.907723,-97.465856,"Petroleum and Natural Gas Systems,Suppliers of...",5.236050e+04,Petroleum and Natural Gas Systems (3),0.000,0.000000e+00,0.00,...,34906.997333,0.0,17453.498667,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [24]:
df_emitters[df_emitters.latest_reported_industry_type_sectors.str.count(',')>1]

Unnamed: 0,facility_id,year,latitude,longitude,latest_reported_industry_type_sectors,total_reported_direct_emissions,sector_groupings,s_Waste,s_Power Plants,s_Other,...,s_Petroleum and Natural Gas Systems,s_Metals,s_Suppliers of CO2,s_Pulp and Paper,s_Petroleum Product Suppliers,s_Refineries,s_Injection of CO2,s_Natural Gas and Natural Gas Liquids Suppliers,s_Import and Export of Equipment Containing Fluorintaed GHGs,s_Coal-based Liquid Fuel Supply
6,1004836,2020-01-01 00:00:00+00:00,44.789444,-92.908333,"Chemicals,Industrial Gas Suppliers,Minerals",4.954027e+04,Chemicals (4),0.0000,0.0,0.0000,...,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.000,0.0,0.0,0.0
232,1001945,2020-01-01 00:00:00+00:00,44.474074,-95.783525,"Other,Suppliers of CO2,Waste",1.727498e+05,Other (4),34549.9604,0.0,51824.9406,...,0.0000,0.0,86374.9010,0.0000,0.0000,0.0000,0.000,0.0,0.0,0.0
253,1002515,2020-01-01 00:00:00+00:00,32.756400,-104.211100,"Injection of CO2,Petroleum and Natural Gas Sys...",9.549250e+04,Injection of CO2 (4),0.0000,0.0,0.0000,...,28647.7488,0.0,19098.4992,0.0000,0.0000,0.0000,47746.248,0.0,0.0,0.0
378,1004861,2020-01-01 00:00:00+00:00,40.556501,-89.666235,"Other,Suppliers of CO2,Waste",1.709771e+05,Other (4),34195.4216,0.0,51293.1324,...,0.0000,0.0,85488.5540,0.0000,0.0000,0.0000,0.000,0.0,0.0,0.0
379,1000413,2020-01-01 00:00:00+00:00,40.555000,-89.662800,"Other,Suppliers of CO2,Waste",2.660592e+05,Other (4),53211.8456,0.0,79817.7684,...,0.0000,0.0,133029.6140,0.0000,0.0000,0.0000,0.000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68217,1006095,2011-01-01 00:00:00+00:00,44.404000,-89.825500,"Pulp and Paper,Suppliers of CO2,Waste",4.333954e+05,Pulp and Paper (4),130018.6188,0.0,0.0000,...,0.0000,0.0,216697.6980,86679.0792,0.0000,0.0000,0.000,0.0,0.0,0.0
68218,1004874,2011-01-01 00:00:00+00:00,44.398600,-89.826300,"Pulp and Paper,Suppliers of CO2,Waste",1.701224e+04,Pulp and Paper (4),5103.6708,0.0,0.0000,...,0.0000,0.0,8506.1180,3402.4472,0.0000,0.0000,0.000,0.0,0.0,0.0
68232,1007518,2011-01-01 00:00:00+00:00,38.837500,-90.067500,"Chemicals,Petroleum Product Suppliers,Refineri...",3.399126e+06,Chemicals (6),509868.9351,0.0,0.0000,...,0.0000,0.0,339912.6234,0.0000,849781.5585,679825.2468,0.000,0.0,0.0,0.0
68239,1002310,2011-01-01 00:00:00+00:00,34.628900,-97.168500,"Chemicals,Petroleum Product Suppliers,Refineries",3.655539e+05,Chemicals (4),0.0000,0.0,0.0000,...,0.0000,0.0,0.0000,0.0000,109666.1754,73110.7836,0.000,0.0,0.0,0.0


# Working with Materialized Views

Here's an example of a facility with many owners...

In [25]:
qres = engine.execute("""select ghgrp_facility_id,frs_id_facility,lei,format('%tY', reporting_year),facility_name,facility_city,facility_state,parent_company_name,facility_naics_code
from epa_ghgrp.parent_company where reporting_year=DATE('2020-01-01') and ghgrp_facility_id=1005071 order by lei""")
qres.fetchall()

[(1005071, '110000702730', '2549000NXAL5JJHJYT18', '2020', 'North Terrebonne Gas Plant', 'Gibson', 'LA', 'ENERGY RESOURCES TECHNOLOGY LAND INC', '211130'),
 (1005071, '110000702730', '54930000S35EESPK1C27', '2020', 'North Terrebonne Gas Plant', 'Gibson', 'LA', 'BYRON ENERGY LLC', '211130'),
 (1005071, '110000702730', '5493003QENHHS261UR94', '2020', 'North Terrebonne Gas Plant', 'Gibson', 'LA', 'TARGA RESOURCES CORP', '211130'),
 (1005071, '110000702730', '5493005Y7TJPYWLDEO18', '2020', 'North Terrebonne Gas Plant', 'Gibson', 'LA', 'ARENA ENERGY LP', '211130'),
 (1005071, '110000702730', '5493007VQUSLFRDRBT52', '2020', 'North Terrebonne Gas Plant', 'Gibson', 'LA', 'SUPERIOR NATURAL GAS CORP', '211130'),
 (1005071, '110000702730', '549300HX0ISXOOEMR657', '2020', 'North Terrebonne Gas Plant', 'Gibson', 'LA', 'BLACK ELK ENERGY OFFSHORE OPERATIONS LLC', '211130'),
 (1005071, '110000702730', '549300IRDTHJQ1PVET45', '2020', 'North Terrebonne Gas Plant', 'Gibson', 'LA', 'FREEPORT-MCMORAN INC',

...meaning 10 rows of data that's outside our easy-to-aggregate data

In [26]:
qres = engine.execute("""
select facility_id,facility_name,total_reported_direct_emissions,city,state,latitude,longitude,primary_naics_code,
       latest_reported_industry_type_subparts,latest_reported_industry_type_sectors,format('%tY', year)
from epa_ghgrp.direct_emitters where facility_id=1005071 order by year""")
qres.fetchall()

[(1005071, 'North Terrebonne Gas Plant', 383446.646, 'Gibson', 'LA', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', '2011'),
 (1005071, 'North Terrebonne Gas Plant', 339163.524, 'Gibson', 'LA', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', '2012'),
 (1005071, 'North Terrebonne Gas Plant', 313640.418, 'Gibson', 'LA', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', '2013'),
 (1005071, 'North Terrebonne Gas Plant', 312585.924, 'Gibson', 'LA', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', '2014'),
 (1005071, 'North Terrebonne Gas Plant', 292713.398, 'Gibson', 'LA', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', '2015'),
 (1005071, 'North Terrebonne Gas Plant', 216519.086, 'Gibson', 'LA', 29.6257, -90.9289, '211130', 'C,W-PROC', 'Petroleum and Natural Gas Systems', '2016'),
 (1005071, 'North Terrebonne Gas Plant', 194438.318, 'Gibson', '

Create actual materialized view from a large concatenation operation

In [27]:
import osc_ingest_trino as osc

engine.execute("create schema if not exists ghgrp_demo")

# display([(x, y) for x, y in zip(emission_tables,tot_em_columns)])

emission_selects = [ f"""
select ghgrp_facility_id, reporting_year, lei, '{e_tbl}' as table_source,
         primary_naics_code, parent_co_percent_ownership * 0.01 * {e_col} as fractional_emissions,
         facility_naics_code, parent_company_name
    from epa_ghgrp.parent_company as PC, epa_ghgrp.{e_tbl} as ET
    where PC.ghgrp_facility_id=ET.facility_id
          and PC.reporting_year=ET.year
""" for e_tbl, e_col in zip(emission_tables,tot_em_columns) ]

qres = engine.execute("drop table if exists ghgrp_demo.parent_attribution")
qres.fetchall()

sql = f"""
create table ghgrp_demo.parent_attribution as {' union all '.join(emission_selects)}
"""

print(sql)

qres = engine.execute(sql)
qres.fetchall()


create table ghgrp_demo.parent_attribution as 
select ghgrp_facility_id, reporting_year, lei, 'direct_emitters' as table_source,
         primary_naics_code, parent_co_percent_ownership * 0.01 * total_reported_direct_emissions as fractional_emissions,
         facility_naics_code, parent_company_name
    from epa_ghgrp.parent_company as PC, epa_ghgrp.direct_emitters as ET
    where PC.ghgrp_facility_id=ET.facility_id
          and PC.reporting_year=ET.year
 union all 
select ghgrp_facility_id, reporting_year, lei, 'onshore_oil_gas_prod' as table_source,
         primary_naics_code, parent_co_percent_ownership * 0.01 * total_reported_emissions_from_onshore_oil_gas_production as fractional_emissions,
         facility_naics_code, parent_company_name
    from epa_ghgrp.parent_company as PC, epa_ghgrp.onshore_oil_gas_prod as ET
    where PC.ghgrp_facility_id=ET.facility_id
          and PC.reporting_year=ET.year
 union all 
select ghgrp_facility_id, reporting_year, lei, 'gathering_boostin

[(89879,)]

In [28]:
qres = engine.execute('describe ghgrp_demo.parent_attribution')
display(qres.fetchall())

qres = engine.execute("""
select ghgrp_facility_id, format('%tY', reporting_year), lei, table_source, format('%,.2f', fractional_emissions) || ' t CO2e' as metric
from ghgrp_demo.parent_attribution""")
qres.fetchall()[::2000]

[('ghgrp_facility_id', 'bigint', '', ''),
 ('reporting_year', 'timestamp(3)', '', ''),
 ('lei', 'varchar', '', ''),
 ('table_source', 'varchar(22)', '', ''),
 ('primary_naics_code', 'varchar', '', ''),
 ('fractional_emissions', 'double', '', ''),
 ('facility_naics_code', 'varchar', '', ''),
 ('parent_company_name', 'varchar', '', '')]

[(1012281, '2020', '549300WR7IX8XE0TBO16', 'transmission_pipelines', '163,414.52 t CO2e'),
 (1010945, '2016', 'G6D9JBJ3B5USYVP7TB32', 'onshore_oil_gas_prod', '141,314.70 t CO2e'),
 (1004654, '2011', 'V8HA8TAJG10VL0G7B254', 'ldc_direct_emissions', '36,806.60 t CO2e'),
 (1005117, '2020', '549300C97P3BW8QROE16', 'direct_emitters', '108,560.65 t CO2e'),
 (1010719, '2015', '549300VVH8HHFMWL8536', 'direct_emitters', '8,301.94 t CO2e'),
 (1009728, '2018', '6X88M1NLGKDIRGWMPT94', 'onshore_oil_gas_prod', '342,959.95 t CO2e'),
 (1012486, '2020', 'ILUL7B6Z54MRYCF6H308', 'gathering_boosting', '75,600.69 t CO2e'),
 (1009624, '2017', '84BUTVXV5ODI6BXNMH43', 'onshore_oil_gas_prod', '65,739.69 t CO2e'),
 (1010046, '2015', '5493007LJHV6UCZP0Z13', 'direct_emitters', '35,044.65 t CO2e'),
 (1002838, '2020', 'MTLVN9N7JE8MIBIJ1H73', 'direct_emitters', '14,510.48 t CO2e'),
 (1006941, '2019', '765LHXWGK1KXCLTFYQ30', 'direct_emitters', '97,111.87 t CO2e'),
 (1001988, '2019', '549300WR7IX8XE0TBO16', 'direct_emi

How many **_facilities owned by public companies_** match to corporate reports we can see using the SEC's DERA dataset?

See how many `PARENT_COMPANY` records have LEIs we know.  Note that there are about 8400 total facilities, so 4 facilities not covered by LEI for each that is.
There are 3K-4K distinctly named entities, so average entity owns (at least partially) approx 2-3 facilities.  It also means we know the LEIs of approximately half of the parent copmanies.

In [29]:
qres = engine.execute(f"""select count (*), format('%tY', reporting_year)
from (select lei, reporting_year from epa_ghgrp.parent_company where LEI is not null group by lei, reporting_year)
group by reporting_year order by reporting_year desc""")
qres.fetchall()

[(1542, '2020'),
 (1559, '2019'),
 (1626, '2018'),
 (1562, '2017'),
 (1560, '2016'),
 (1541, '2015'),
 (1712, '2014'),
 (1695, '2013'),
 (1657, '2012'),
 (1578, '2011'),
 (1309, '2010')]

In [30]:
qres = engine.execute(f"describe sec_dera.sub")
qres.fetchall()

[('adsh', 'varchar', '', ''),
 ('cik', 'integer', '', ''),
 ('name', 'varchar', '', ''),
 ('lei', 'varchar', '', ''),
 ('sic', 'integer', '', ''),
 ('countryba', 'varchar', '', ''),
 ('stprba', 'varchar', '', ''),
 ('cityba', 'varchar', '', ''),
 ('zipba', 'varchar', '', ''),
 ('bas1', 'varchar', '', ''),
 ('bas2', 'varchar', '', ''),
 ('baph', 'varchar', '', ''),
 ('countryma', 'varchar', '', ''),
 ('stprma', 'varchar', '', ''),
 ('cityma', 'varchar', '', ''),
 ('zipma', 'varchar', '', ''),
 ('mas1', 'varchar', '', ''),
 ('mas2', 'varchar', '', ''),
 ('countryinc', 'varchar', '', ''),
 ('stprinc', 'varchar', '', ''),
 ('ein', 'bigint', '', ''),
 ('former', 'varchar', '', ''),
 ('changed', 'varchar', '', ''),
 ('afs', 'varchar', '', ''),
 ('wksi', 'boolean', '', ''),
 ('fye', 'varchar', '', ''),
 ('form', 'varchar', '', ''),
 ('period', 'timestamp(3)', '', ''),
 ('fy', 'timestamp(3)', '', ''),
 ('fp', 'varchar', '', ''),
 ('filed', 'timestamp(3)', '', ''),
 ('accepted', 'timestamp(3)',

In [31]:
qres = engine.execute(f"describe sec_dera.num")
qres.fetchall()

[('adsh', 'varchar', '', ''),
 ('tag', 'varchar', '', ''),
 ('version', 'varchar', '', ''),
 ('coreg', 'varchar', '', ''),
 ('ddate', 'timestamp(3)', '', ''),
 ('qtrs', 'integer', '', ''),
 ('uom', 'varchar', '', ''),
 ('value', 'double', '', ''),
 ('footnote', 'varchar', '', ''),
 ('uuid', 'varchar', '', ''),
 ('quarter', 'varchar', '', '')]

In [32]:
qres = engine.execute(f"""select count (*), format('%tY', reporting_year)
from ghgrp_demo.parent_attribution, sec_dera.sub
where form='10-K'
and reporting_year >= DATE('2020-01-01') and filed >= DATE('2020-01-01') and filed < DATE('2021-01-01')
and ghgrp_demo.parent_attribution.lei=sec_dera.sub.lei and ghgrp_demo.parent_attribution.lei is not null
group by ghgrp_demo.parent_attribution.reporting_year
order by ghgrp_demo.parent_attribution.reporting_year
""")
qres.fetchall()

[(3699, '2020')]

We can tie these companies to ticker symbols...

In [33]:
qres = engine.execute(f"""select * from sec_dera.ticker limit 10""")
qres.fetchall()

[(320193, 'aapl'),
 (789019, 'msft'),
 (1652044, 'goog'),
 (1018724, 'amzn'),
 (1318605, 'tsla'),
 (1326801, 'fb'),
 (40545, 'ge'),
 (1067983, 'brk-a'),
 (1046179, 'tsm'),
 (1293451, 'tcehy')]

How many distinct companies own these facilities (and what are their ticker symbols)?

In [34]:
qres = engine.execute(f"""
with leis as (select DISTINCT(sec_dera.sub.lei), name, if(tname IS NULL, '<private>', tname) as ticker
              from ghgrp_demo.parent_attribution,sec_dera.sub,sec_dera.ticker
              where ghgrp_demo.parent_attribution.lei=sec_dera.sub.lei and form='10-K'
              and sec_dera.sub.cik=sec_dera.ticker.cik
              and period>=DATE('2020-01-01')
              and period<DATE('2021-01-01'))
select count (*), ticker, leis.lei, name, format('%tY', reporting_year)
from ghgrp_demo.parent_attribution, leis
where reporting_year >= DATE('2020-01-01')
and ghgrp_demo.parent_attribution.lei=leis.lei
group by leis.ticker, leis.lei, name, reporting_year
order by count(*) desc
-- limit 10
""")
ticker_list = qres.fetchall()
print(len(ticker_list))

417


Note that some comapnies have more than one ticker symbol!

In [35]:
ticker_list[0:50]

[(259, 'kmi', '549300WR7IX8XE0TBO16', 'KINDER MORGAN, INC.', '2020'),
 (259, 'ep-pc', '549300WR7IX8XE0TBO16', 'KINDER MORGAN, INC.', '2020'),
 (228, 'wm', '549300YX8JIID70NFS41', 'WASTE MANAGEMENT INC', '2020'),
 (162, 'rsg', 'NKNQHM6BLECKVOQP7O46', 'REPUBLIC SERVICES, INC.', '2020'),
 (158, 'et-pd', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020'),
 (158, 'et', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020'),
 (158, 'et-pe', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020'),
 (158, 'et-pc', 'MTLVN9N7JE8MIBIJ1H73', 'ENERGY TRANSFER LP', '2020'),
 (114, 'brk-b', '5493000C01ZX7D35SD85', 'BERKSHIRE HATHAWAY INC', '2020'),
 (114, 'brk-a', '5493000C01ZX7D35SD85', 'BERKSHIRE HATHAWAY INC', '2020'),
 (89, 'xom', 'J3WHBG0MTS7O8ZVMDC91', 'EXXON MOBIL CORP', '2020'),
 (87, 'wmb', 'D71FAKCBLFS2O0RBPG08', 'WILLIAMS COMPANIES, INC.', '2020'),
 (65, 'epd', 'K4CDIF4M54DJZ6TB4Q48', 'ENTERPRISE PRODUCTS PARTNERS L.P.', '2020'),
 (61, 'soje', '549300FC3G3YU2FBZD92', 'SOUTHERN CO', '2020')

We can try to add up all the faciltiies for all the tickers, but that leads to counting duplicates for companies that have multiple ticker symbols...(should be 2651, not 5746)

In [36]:
sum([te[0] for te in ticker_list])

5746

Sample data to cross-check LEI, Facility ID and EDGAR submission data

In [37]:
qres = engine.execute(f"""
select DISTINCT(sec_dera.sub.lei), ghgrp_facility_id, adsh
              from ghgrp_demo.parent_attribution,sec_dera.sub
              where reporting_year>=DATE('2020-01-01')
              and ghgrp_demo.parent_attribution.lei=sec_dera.sub.lei and form='10-K'
              and period>=DATE('2020-01-01') and period<DATE('2021-01-01')
              order by sec_dera.sub.lei desc""")
l = qres.fetchall()
print(len(l))
display(l[::100])

3623


[('ZW1LRE7C3H17O2ZN9B45', 1004730, '0001628280-21-003434'),
 ('WD6L6041MNRW1JE49D58', 1000016, '0000100493-20-000132'),
 ('UMI46YPGBLUE4VGNNT48', 1007471, '0000753308-21-000014'),
 ('R8V1FN4M5ITGZOG7BS19', 1003915, '0001140361-21-003906'),
 ('NKNQHM6BLECKVOQP7O46', 1007054, '0001060391-21-000014'),
 ('NKNQHM6BLECKVOQP7O46', 1003411, '0001060391-21-000014'),
 ('MTLVN9N7JE8MIBIJ1H73', 1003221, '0001276187-21-000034'),
 ('MP3J6QPYPGN75NVW2S34', 1002179, '0000055785-21-000016'),
 ('K4CDIF4M54DJZ6TB4Q48', 1005070, '0001061219-21-000009'),
 ('J3WHBG0MTS7O8ZVMDC91', 1006846, '0000034088-21-000012'),
 ('IM7X0T3ECJW4C1T7ON55', 1005858, '0000797468-21-000009'),
 ('I1BZKREC126H0VB1BL91', 1001049, '0001326160-21-000063'),
 ('ENYF8GB5SMQZ25S06U51', 1003718, '0000045012-21-000009'),
 ('CE5OG6JPOZMDSA0LAQ19', 1000824, '0001021635-21-000026'),
 ('824LMFJDH41EY779Q875', 1006760, '0000051434-21-000012'),
 ('549300YX8JIID70NFS41', 1007683, '0001558370-21-001348'),
 ('549300YX8JIID70NFS41', 1007812, '0001

Compute intensity in metric tons of CO2e per million dollars

In [38]:
qres = engine.execute(f"""
select PA.lei, sic, floor(sic/100) as sic_2digit, format('%1$tY-%1$tm-%1$td', reporting_year),
       name, sum(fractional_emissions) as tot_co2e,
       uom || ' $M', round(max(value)/1000000,3) as tot_revenue,
       format('%7.2f', 1000000*sum(fractional_emissions)/sum(value)) || ' tCO2e/$M' as intensity
from ghgrp_demo.parent_attribution as PA join sec_dera.sub as S on PA.lei=S.lei
     join sec_dera.num as N on S.adsh=N.adsh
where reporting_year>=DATE('2020-01-01')
and form='10-K'
and period>=DATE('2020-01-01') and period<DATE('2021-01-01')
and ddate>=DATE('2020-01-01') and ddate<DATE('2021-01-01')
and coreg is NULL
and (N.tag='Revenues'
     or N.tag='RevenueFromContractWithCustomerIncludingAssessedTax'
     or N.tag='RevenueFromContractWithCustomerExcludingAssessedTax'
     or N.tag='RevenuesNetOfInterestExpense'
     or N.tag='RegulatedAndUnregulatedOperatingRevenue'
     or N.tag='RegulatedOperatingRevenuePipelines')
and N.qtrs=4
group by PA.lei, PA.reporting_year, sic, name, uom
order by intensity desc
-- limit 100
""")
list = qres.fetchall()
print(len(list))
display(list[::5])

332


[('549300O4B5CVWMKUES27', 3829, 38, '2020-01-01', 'MIDWEST ENERGY EMISSIONS CORP.', 29981.588, 'USD $M', 8.158, '3674.91 tCO2e/$M'),
 ('5493001H215A0HG0MI71', 2810, 28, '2020-01-01', 'LSB INDUSTRIES INC', 2280742.614, 'USD $M', 351.316, '1623.00 tCO2e/$M'),
 ('549300F0PA3NW6D2OO63', 1311, 13, '2020-01-01', 'HIGHPOINT RESOURCES CORP', 286194.984, 'USD $M', 250.347, '1143.19 tCO2e/$M'),
 ('549300VDHNFNPADSSV98', 4911, 49, '2020-01-01', 'TRI-STATE GENERATION & TRANSMISSION ASSOCIATION, INC.', 9549351.709253304, 'USD $M', 1352.295, ' 882.70 tCO2e/$M'),
 ('529900WYQZSMWFXLA845', 2860, 28, '2020-01-01', 'FUTUREFUEL CORP.', 165972.266, 'USD $M', 204.505, ' 811.58 tCO2e/$M'),
 ('549300JK3KH8PWM3B226', 1220, 12, '2020-01-01', 'CONSOL ENERGY INC.', 13134700.456, 'USD $M', 1021.643, ' 690.90 tCO2e/$M'),
 ('GJOUP9M7C39GLSK9R870', 4911, 49, '2020-01-01', 'PORTLAND GENERAL ELECTRIC CO', 17886164.30464775, 'USD $M', 2151.0, ' 587.80 tCO2e/$M'),
 ('1KF1J2NXQE2PI0QOB943', 3312, 33, '2020-01-01', 'SUNCO

# A Deep Dive into outlier data

In [39]:
qreg=engine.execute("""select DISTINCT(sec_dera.sub.lei), ghgrp_facility_id, name, adsh
              from ghgrp_demo.parent_attribution,sec_dera.sub
              where reporting_year>=DATE('2020-01-01') and sec_dera.sub.lei='549300O4B5CVWMKUES27'
              and ghgrp_demo.parent_attribution.lei=sec_dera.sub.lei and form='10-K'
              and period>=DATE('2020-01-01') and period<DATE('2021-01-01')""")
qreg.fetchall()

[('549300O4B5CVWMKUES27', 1012016, 'MIDWEST ENERGY EMISSIONS CORP.', '0001477932-21-002039')]

In [40]:
qreg=engine.execute("""select reporting_year, format ('%,10.2f', sum(fractional_emissions)) || ' t CO2e' as metric
              from ghgrp_demo.parent_attribution
              where lei='549300O4B5CVWMKUES27'
              group by reporting_year
""")
l = qreg.fetchall()
l

[('2018-01-01 00:00:00.000', '100,039.15 t CO2e'),
 ('2015-01-01 00:00:00.000', ' 46,877.45 t CO2e'),
 ('2013-01-01 00:00:00.000', ' 24,847.40 t CO2e'),
 ('2016-01-01 00:00:00.000', ' 98,413.61 t CO2e'),
 ('2020-01-01 00:00:00.000', ' 29,981.59 t CO2e'),
 ('2017-01-01 00:00:00.000', ' 71,698.95 t CO2e'),
 ('2012-01-01 00:00:00.000', ' 24,854.15 t CO2e'),
 ('2011-01-01 00:00:00.000', ' 25,032.10 t CO2e'),
 ('2014-01-01 00:00:00.000', ' 21,395.30 t CO2e'),
 ('2019-01-01 00:00:00.000', ' 69,246.51 t CO2e')]

# GHGRP Direct Emitters include Cement and Steel Plans (which we can connect to SFI data)

In [41]:
qres = engine.execute("describe sfi_geoasset.cement")
display(qres.fetchall())

[('uid', 'varchar', '', ''),
 ('city', 'varchar', '', ''),
 ('state', 'varchar', '', ''),
 ('country', 'varchar', '', ''),
 ('iso3', 'varchar', '', ''),
 ('country_code', 'double', '', ''),
 ('region', 'varchar', '', ''),
 ('sub_region', 'varchar', '', ''),
 ('latitude', 'double', '', ''),
 ('longitude', 'double', '', ''),
 ('accuracy', 'varchar', '', ''),
 ('status', 'varchar', '', ''),
 ('plant_type', 'varchar', '', ''),
 ('production_type', 'varchar', '', ''),
 ('capacity', 'double', '', ''),
 ('capacity_source', 'varchar', '', ''),
 ('year', 'timestamp(3)', '', ''),
 ('owner_permid', 'double', '', ''),
 ('owner_name', 'varchar', '', ''),
 ('owner_source', 'varchar', '', ''),
 ('parent_permid', 'double', '', ''),
 ('parent_name', 'varchar', '', ''),
 ('ownership_stake', 'double', '', ''),
 ('parent_lei', 'varchar', '', ''),
 ('parent_holding_status', 'varchar', '', ''),
 ('parent_ticker', 'varchar', '', ''),
 ('parent_exchange', 'varchar', '', ''),
 ('parent_permid_2', 'double', '',

In [42]:
qres = engine.execute("select count (*) from sfi_geoasset.cement")
display(qres.fetchall())
qres = engine.execute("select count (*) from sfi_geoasset.steel")
display(qres.fetchall())

# There are 105 US-located cement plants listed in the SFI report with parent LEIs
qres = engine.execute("select count (*), iso3 from sfi_geoasset.cement where iso3='USA' group by iso3")
display(qres.fetchall())

qres = engine.execute("""
select owner_name, parent_name, lei, parent_lei, facility_id
from sfi_geoasset.cement, epa_ghgrp.direct_emitters, epa_ghgrp.parent_company
where ghgrp_facility_id=facility_id
and reporting_year=epa_ghgrp.direct_emitters.year
and reporting_year>=DATE('2019-01-01') and reporting_year<DATE('2020-01-01')
and sfi_geoasset.cement.iso3='USA'
and abs(sfi_geoasset.cement.latitude-epa_ghgrp.direct_emitters.latitude)<0.01
and abs(sfi_geoasset.cement.longitude-epa_ghgrp.direct_emitters.longitude)<0.01
""")
l = qres.fetchall()
print(f"{len(l)}: facilities/parent relationships matched in USA using lat/lon")

[(3117,)]

[(1598,)]

[(105, 'USA')]

108: facilities/parent relationships matched in USA using lat/lon


In [43]:
l[3::2]

[('Argos USA Corp', 'Grupo Argos SA', '2549000NKLSHNQQBTJ24', '254900HANAO95XIAE681', 1002750),
 ('Cemex Inc', 'CEMEX SAB de CV', '549300JHGUF0VVA38719', '549300RIG2CXWN6IV731', 1003303),
 ('Lehigh Hanson Inc', 'HeidelbergCement AG', '40XIFLS8XDQGGHGPGC04', 'LZ2C6E0W5W7LQMX5ZI37', 1000362),
 ('Continental Cement Company LLC', 'Summit Materials Inc', '549300C97P3BW8QROE16', None, 1005987),
 ('Buzzi Unicem USA Inc', 'Buzzi Unicem SpA', None, '5299003DX5YLKSVJ6K59', 1006450),
 ('Lehigh Hanson Inc', 'HeidelbergCement AG', 'LZ2C6E0W5W7LQMX5ZI37', 'LZ2C6E0W5W7LQMX5ZI37', 1002431),
 ('Argos USA Corp', 'Grupo Argos SA', '549300ZTJKRI8F2QY512', '254900HANAO95XIAE681', 1003479),
 ('Lehigh Hanson Inc', 'HeidelbergCement AG', 'LZ2C6E0W5W7LQMX5ZI37', 'LZ2C6E0W5W7LQMX5ZI37', 1002566),
 ('Lafarge North America Inc', 'LafargeHolcim Ltd', '5299007BR966QQRQTB48', '529900EHPFPYHV6IQO98', 1005639),
 ('Buzzi Unicem USA Inc', 'Buzzi Unicem SpA', None, '5299003DX5YLKSVJ6K59', 1003002),
 ('Roanoke Cement Comp