Create an SIC <-> ISIC crosswalk by hand. The resulting crosswalk table gets wiped every time the main DERA ingestion runs, but this notebook can be re-run after that any number of times to recreate it.

In [None]:
from dotenv import dotenv_values, load_dotenv
from osc_ingest_trino import *
import os
import pathlib

Load Environment Variables

In [None]:
# Pick the directory holding credentials.env: CREDENTIAL_DOTENV_DIR wins, then
# PWD, then a hard-coded default directory.
fallback_dir = os.environ.get("PWD", "/opt/app-root/src")
dotenv_dir = os.environ.get("CREDENTIAL_DOTENV_DIR", fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath("credentials.env")

# Load the file only when it is present; override=True lets its values replace
# variables that are already set in the environment.
credentials_present = os.path.exists(dotenv_path)
if credentials_present:
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [None]:
import trino
from sqlalchemy.engine import create_engine

# Every connection setting is read from an environment variable that starts
# with this prefix (<prefix>_USER, <prefix>_HOST, <prefix>_PORT, <prefix>_PASSWD).
env_var_prefix = "TRINO"

# Required settings: a missing variable raises KeyError right here.
trino_user = os.environ[f"{env_var_prefix}_USER"]
trino_host = os.environ[f"{env_var_prefix}_HOST"]
trino_port = os.environ[f"{env_var_prefix}_PORT"]
sqlstring = f"trino://{trino_user}@{trino_host}:{trino_port}/"

# The value of <prefix>_PASSWD is passed to JWTAuthentication as the token.
jwt_auth = trino.auth.JWTAuthentication(os.environ[f"{env_var_prefix}_PASSWD"])
sqlargs = dict(auth=jwt_auth, http_scheme="https", catalog="osc_datacommons_dev")

# Build the SQLAlchemy engine and open a connection straight away.
engine = create_engine(sqlstring, connect_args=sqlargs)
connection = engine.connect()

In [None]:
import osc_ingest_trino as osc

# Name handed to attach_s3_bucket to select the bucket.
# NOTE(review): presumably a prefix for S3 credential environment variables — confirm.
s3_bucket_key = "S3_DEV"
trino_bucket = osc.attach_s3_bucket(s3_bucket_key)

The Trino connection opened above uses JWT for authentication. Now set the catalog and schema to ingest into.

In [None]:
# Catalog and schema used by the cells below when listing schemas and
# (re)creating the crosswalk table.
ingest_catalog, ingest_schema = "osc_datacommons_dev", "sec_dera"

In [None]:
# Show available schemas to ensure trino connection is set correctly.
#
# Engine.execute() is deprecated in SQLAlchemy 1.4 and removed in 2.0, so the
# statement is run on an explicit connection with a text() construct instead;
# this form works on both 1.x and 2.x. The `with` block also closes the
# connection once the rows have been printed.
from sqlalchemy import text

with engine.connect() as schema_conn:
    schema_read = schema_conn.execute(text(f"show schemas in {ingest_catalog}"))
    for row in schema_read.fetchall():
        print(row)

Enter the Pandas!

In [None]:
import pandas as pd

For now, create the SIC -> ISIC crosswalk by hand.

In [None]:
# Hand-maintained crosswalk: each key is a SIC code, each value the ISIC code
# it is mapped to. The trailing comment names the industry (or flags a doubt).
sic_isic = {
    1400: 1410,  # Mining
    2911: 1920,  # Petroleum refining
    3714: 2910,  # Motor Vehicle Manufacturing
    3829: 3190,  # Measuring & Controlling Devices (Midwest Energy Emissions Corp (MEEC))
    3310: 2410,  # STEEL WORKS, BLAST FURNACES & ROLLING & FINISHING MILLS
    3311: 2410,  # ???
    3312: 2410,  # STEEL WORKS, BLAST FURNACES & ROLLING MILLS (COKE OVENS)
    3313: 2410,  # Iron and Steel Mills and Ferroalloy Mfg.
    3315: 2410,  # Steel Wire Drawing
    3316: 2410,  # Rolled Steel Shape Mfg.
    3317: 2410,  # Iron and Steel Pipe and Tube Mfg. from Purchased Steel (should be relatively larger S3 emissions, lower S1+S2 emissions per ton)
    4911: 4010,  # Electricity Generation
    4931: 4010,  # Electricity Generation
    4932: 4010,  # Electricity Generation
    4991: 4010,  # Typo? NOTE(review): may be Cogeneration Services & Small Power Producers — confirm
}

# Two-column frame ("sic", "isic") with one row per mapping, in dictionary
# order, on a default 0..n-1 index.
df = pd.DataFrame(
    {
        "sic": list(sic_isic.keys()),
        "isic": list(sic_isic.values()),
    }
)
df

In [None]:
# (Re)create the crosswalk table from `df`: drop any previous copy, write the
# frame out as parquet, then run the generated table definition.
# Every helper is called through the `osc` alias so the cell does not depend
# on names leaked by `from osc_ingest_trino import *`.
sic_isic_table = "sic_isic"

osc.drop_unmanaged_table(ingest_catalog, ingest_schema, sic_isic_table, engine, trino_bucket)
osc.ingest_unmanaged_parquet(df, ingest_schema, sic_isic_table, trino_bucket)
tabledef = osc.unmanaged_parquet_tabledef(df, ingest_catalog, ingest_schema, sic_isic_table, trino_bucket)

# NOTE(review): Engine.execute() is removed in SQLAlchemy 2.0; kept here because
# `tabledef` is opaque generated SQL — revisit when the SQLAlchemy pin moves.
qres = engine.execute(tabledef)
qres.fetchall()