Create an SIC <-> ISIC Crosswalk by hand.  This gets wiped every time the main DERA ingestion runs, but can be run after that any number of times

In [1]:
import osc_ingest_trino as osc
# For now...fix when _do_sql is exposed as public interface
from osc_ingest_trino.trino_utils import _do_sql as osc_do_sql
import os

Load Credentials

In [2]:
# From the AWS Account page, copy the export scripts from the appropriate role using the "Command Line or Programmatic Access" link
# Paste the copied text into ~/credentials.env

# Load environment variables from credentials.env
osc.load_credentials_dotenv()

Open a Trino connection using JWT for authentication

In [3]:
iceberg_catalog = 'osc_datacommons_dev'
iceberg_schema = 'mdt_sandbox'

engine = osc.attach_trino_engine(verbose=True, catalog=iceberg_catalog, schema=iceberg_schema)

using connect string: trino://MichaelTiemannOSC@trino-secure-odh-trino.apps.odh-cl2.apps.os-climate.org:443/osc_datacommons_dev/mdt_sandbox


In [4]:
# bucket must be configured with credentials for trino, and accessible to the hive catalog
# You may need to use a different prefix here depending on how you name your credentials.env variables
hive_bucket = osc.attach_s3_bucket('S3_HIVE')

In [5]:
# Show available schemas to ensure trino connection is set correctly
schema_read = engine.execute(f'show schemas in {iceberg_catalog}')
for row in schema_read.fetchall():
    print(row)

('aicoe_osc_demo_results',)
('default',)
('demo_dv',)
('dera',)
('essd',)
('iceberg_demo',)
('information_schema',)
('ingest',)
('mdt_sandbox',)
('pcaf_sovereign_footprint',)
('sandbox',)
('wri_gppd',)


  schema_read = engine.execute(f'show schemas in {iceberg_catalog}')


Enter the Pandas!

In [6]:
import pandas as pd

For now, create SIC -> ISIC crosswalk by hand

In [7]:
sic_isic = {
    1400: 1410, # Mining
    2911: 1920, # Petroleum refining
    3714: 2910, # Motor Vehicle Manufacturing
    3829: 3190, # Measuring & Controlling Devices (Midwest Energy Emissions Corp (MEEC))
    3310: 2410, # STEEL WORKS, BLAST FURNACES & ROLLING & FINISHING MILLS
    3311: 2410, # ???
    3312: 2410, # STEEL WORKS, BLAST FURNACES & ROLLING MILLS (COKE OVENS)
    3313: 2410, # Iron and Steel Mills and Ferroalloy Mfg.
    3315: 2410, # Steel Wire Drawing
    3316: 2410, # Rolled Steel Shape Mfg.
    3317: 2410, # Iron and Steel Pipe and Tube Mfg. from Purchased Steel (should be relatively larger S3 emissions, lower S1+S2 emissions per ton)
    4911: 4010, # Electricity Generation
    4931: 4010, # Electricity Generation
    4932: 4010, # Electricity Generation
    4991: 4010, # Typo?
}

df = pd.DataFrame.from_dict(sic_isic, orient='index', columns=['isic'])
df.reset_index(inplace=True)
df.rename(columns={'index':'sic'}, inplace=True)
df

Unnamed: 0,sic,isic
0,1400,1410
1,2911,1920
2,3714,2910
3,3829,3190
4,3310,2410
5,3311,2410
6,3312,2410
7,3313,2410
8,3315,2410
9,3316,2410


In [8]:
iceberg_table = 'sic_isic'
drop_table = osc_do_sql(f"drop table if exists {iceberg_schema}.{iceberg_table}", engine, verbose=True)

columnschema = osc.create_table_schema_pairs(df)
tabledef = f"""
create table if not exists {iceberg_catalog}.{iceberg_schema}.{iceberg_table}(
{columnschema}
) with (
format = 'ORC'
)
"""
print(tabledef)
qres = osc_do_sql(tabledef, engine, verbose=True)

df.to_sql(iceberg_table,
          con=engine, schema=iceberg_schema, if_exists='append',
          index=False,
          method=osc.TrinoBatchInsert(batch_size=5000, verbose = True))

drop table if exists mdt_sandbox.sic_isic

create table if not exists osc_datacommons_dev.mdt_sandbox.sic_isic(
    sic bigint,
    isic bigint
) with (
format = 'ORC'
)


create table if not exists osc_datacommons_dev.mdt_sandbox.sic_isic(
    sic bigint,
    isic bigint
) with (
format = 'ORC'
)

constructed fully qualified table name as: "mdt_sandbox.sic_isic"
inserting 15 records
  (1400, 1410)
  (2911, 1920)
  (3714, 2910)
  ...
  (4991, 4010)
batch insert result: [(15,)]
