## Turn the following cell back into 'code' to run dependency installations

Run these commands in a code cell if you need to install the dependencies into your notebook environment:
```
%%capture pipoutput
%pip install boto3 python-dotenv
%pip install trino sqlalchemy sqlalchemy-trino
%pip install pandas pyarrow fastparquet
%pip install anytree
%pip install osc-ingest-tools
```

In [1]:
from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Pick the directory that should contain credentials.env: CREDENTIAL_DOTENV_DIR
# if set, otherwise PWD, otherwise the fixed path '/opt/app-root/src'.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# A missing file is not an error: the load is skipped and the notebook relies
# on whatever is already in the process environment.
credentials_present = os.path.exists(dotenv_path)
if credentials_present:
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [2]:
import trino
from sqlalchemy.engine import create_engine

# Connection settings are read from the environment (TRINO_USER, TRINO_HOST,
# TRINO_PORT); a missing variable raises KeyError here.
trino_user = os.environ['TRINO_USER']
trino_host = os.environ['TRINO_HOST']
trino_port = os.environ['TRINO_PORT']
sqlstring = f'trino://{trino_user}@{trino_host}:{trino_port}/'

# TRINO_PASSWD is handed to JWTAuthentication, and the connection uses https.
sqlargs = dict(
    auth=trino.auth.JWTAuthentication(os.environ['TRINO_PASSWD']),
    http_scheme='https',
)

engine = create_engine(sqlstring, connect_args=sqlargs)
print(f"connecting with engine {engine!s}")
connection = engine.connect()

connecting with engine Engine(trino://erikerlandson@trino-secure-odh-trino.apps.odh-cl1.apps.os-climate.org:443/)


In [3]:
import boto3
# Map each boto3 keyword argument to the environment variable that supplies it.
# Lookups happen in this order, and a missing variable raises KeyError.
s3_env_vars = {
    "endpoint_url": "S3_DEV_ENDPOINT",
    "aws_access_key_id": "S3_DEV_ACCESS_KEY",
    "aws_secret_access_key": "S3_DEV_SECRET_KEY",
}
s3 = boto3.resource(
    service_name="s3",
    **{kwarg: os.environ[var] for kwarg, var in s3_env_vars.items()},
)

In [4]:
import pandas as pd
# Preview ten rows of the emissions table.
sql = """
select *
  from osc_datacommons_dev.urgentem.itr_emissions_1_v2
  limit 10
"""
# Run the query, then let pandas pick nullable extension dtypes for the columns.
df = pd.read_sql(sql, engine)
df = df.convert_dtypes()
df

Unnamed: 0,company_name,isin,target_type,scope,coverage_s1,coverage_s2,coverage_s3,reduction_ambition,base_year,end_year,start_year,base_year_ghg_emissions_s1_tco2e,base_year_ghg_emissions_s1s2_tco2e,base_year_ghg_emissions_s3_tco2e,achieved_reduction
0,3M CO,US88579Y1010,Absolute,S1+S2,1.0,1.0,,0.5,2002,2025,2015,,18300000.0,91500000.0,0.3
1,ADIDAS AG,DE000A1EWWW0,Absolute,S1+S2,0.9,0.9,,0.15,2015,2020,2015,,59132.0,295660.0,1.0
2,BARCLAYS PLC,GB0031348658,Absolute,S1+S2,1.0,1.0,,0.37,2018,2025,2018,,282593.0,1412965.0,0.0
3,DANONE,FR0000120644,Absolute,S1+S2,0.95,0.95,,0.3,2015,2030,2017,,1681235.0,8406175.0,0.68
4,EQUINOR ASA,NO0010096985,Absolute,S1,1.0,,,0.21,2016,2030,2017,9329201.0,,,0.06
5,GLAXOSMITHKLINE,GB0009252882,Absolute,S1+S2,1.0,1.0,,1.0,2017,2025,2017,,1495165.0,7475825.0,0.88
6,GLAXOSMITHKLINE,GB0009252882,Absolute,S3,,,1.0,0.16,2017,2030,2017,,,7475825.0,0.0
7,JAPAN TOBACCO,JP3726800000,Absolute,S1+S2,1.0,1.0,,0.2,2009,2020,2009,,908441.0,4542207.0,1.0
8,JAPAN TOBACCO,JP3726800000,Absolute,S1+S2,1.0,1.0,,0.32,2015,2030,2018,,763175.0,4434874.0,0.2
9,JAPAN TOBACCO,JP3726800000,Absolute,S3,,,1.0,0.23,2015,2030,2018,,,4434874.0,0.0


In [5]:
# Preview ten rows of the lei/isin table.
sql = """
select *
  from osc_datacommons_dev.gleif.gleif_isin_lei
  limit 10
"""
# Run the query, then let pandas pick nullable extension dtypes for the columns.
df = pd.read_sql(sql, engine)
df = df.convert_dtypes()
df

Unnamed: 0,lei,isin,time_stamp
0,001GPB6A9XPE8XJICC14,US3158052262,2021-11-12 08:00:00.000
1,00KLB2PFTM3060S2N216,US4138385749,2021-11-12 08:00:00.000
2,01ERPZV3DOLNXY2MLB90,US531554AA10,2021-11-12 08:00:00.000
3,01ERPZV3DOLNXY2MLB90,US531554AB92,2021-11-12 08:00:00.000
4,01ERPZV3DOLNXY2MLB90,US531554AC75,2021-11-12 08:00:00.000
5,00EHHQ2ZHDCFXJCPCL46,US92204Q1031,2021-11-12 08:00:00.000
6,00KLB2PFTM3060S2N216,US4138382027,2021-11-12 08:00:00.000
7,01ERPZV3DOLNXY2MLB90,US531554AD58,2021-11-12 08:00:00.000
8,01ERPZV3DOLNXY2MLB90,US531554AE32,2021-11-12 08:00:00.000
9,01ERPZV3DOLNXY2MLB90,US531554AF07,2021-11-12 08:00:00.000


In [6]:
# Preview ten rows of the direct-issuer / ultimate-parent-issuer table.
sql = """
select *
  from osc_datacommons_dev.gleif.gleif_direct_issuer_ultimate_issuer
  limit 10
"""
# Run the query, then let pandas pick nullable extension dtypes for the columns.
df = pd.read_sql(sql, engine)
df = df.convert_dtypes()
df

Unnamed: 0,direct_issuer_lei,ultimate_parent_issuer_lei,time_stamp
0,010CMKZ3VON21WF2ZD45,3C7474T6CDKPR9K6YT90,2021-11-12 08:00:00.000
1,010PWNH4K3BLIC3I7R03,549300B2Q47IR0CR5B54,2021-11-12 08:00:00.000
2,01J4SO3XTWZF4PP38209,5493000HPQ4D2RZ79739,2021-11-12 08:00:00.000
3,01TRDHWDCL69YP41S025,LORM1GNEU1DKEW527V90,2021-11-12 08:00:00.000
4,020BQJXAXCZNLKIN7326,549300PFEWKNHRG25N08,2021-11-12 08:00:00.000
5,0292001381F1R1IB5B85,029200388A7S7Z5I0H57,2021-11-12 08:00:00.000
6,0292001568C3M3WGI292,0292004558B0R5B7I133,2021-11-12 08:00:00.000
7,0292002717G4T0CH6E65,029200370B2O2ZA1C754,2021-11-12 08:00:00.000
8,029200302D4K9BC3E535,6SHGI4ZSSLCXXQSBB395,2021-11-12 08:00:00.000
9,0292003053E9T6XD4J69,029200268F8M5YI5I629,2021-11-12 08:00:00.000


In [7]:
# Join emissions rows to their LEI by ISIN (inner join) and then to the ultimate
# parent LEI (left join, so rows without a parent are kept), restricted to
# 'S1+S2' scope with an end year after 2021.
big_demo_query = """
select itr.isin,
       itr.company_name,
       glflei.lei,
       glfpar.ultimate_parent_issuer_lei as parent_lei,
       itr.achieved_reduction
  from osc_datacommons_dev.urgentem.itr_emissions_1_v2 as itr
  inner join osc_datacommons_dev.gleif.gleif_isin_lei as glflei
          on itr.isin = glflei.isin
  left join osc_datacommons_dev.gleif.gleif_direct_issuer_ultimate_issuer as glfpar
        on glflei.lei = glfpar.direct_issuer_lei
  where itr.scope = 'S1+S2' and itr.end_year > 2021
"""
# parent_lei is cast to 'string' explicitly after dtype inference.
# NOTE(review): presumably because an all-null column would not be inferred as string - confirm.
df = (
    pd.read_sql(big_demo_query, engine)
    .convert_dtypes()
    .astype({'parent_lei': 'string'})
)
df

Unnamed: 0,isin,company_name,lei,parent_lei,achieved_reduction
0,US88579Y1010,3M CO,LUZQVYP4VS22CLWDAR65,,0.3
1,JP3726800000,JAPAN TOBACCO,353800Z0ENYBQO0XRJ31,,0.2
2,CH0012005267,NOVARTIS AG-REG,5493007HIVTX6SY6XD66,,0.13
3,GB0031348658,BARCLAYS PLC,213800LBQA1Y9L22JB70,,0.0
4,FR0000120644,DANONE,969500KMUQ2B6CBAF162,,0.68
5,GB0009252882,GLAXOSMITHKLINE,5493000HZTVUYLO1D793,,0.88
6,FR0000121972,SCHNEIDER ELECTR,969500A1YF1XUYYXS284,,0.29


In [8]:
# Target schema for the demo table.
schemaname = 'demo'
# Table name can be overridden through ODSC_DEMO_TBL; otherwise use the default.
tablename = os.getenv('ODSC_DEMO_TBL', 'odsc_isin_reduction_notebook')

In [9]:
# Drop the demo table if a previous run left one behind.
sql = f"""
drop table if exists osc_datacommons_dev.{schemaname}.{tablename}
"""
qres = engine.execute(sql)

# The trino hive connector isn't managing the table's files for deletion, so
# remove every object under the table's prefix from the S3_DEV_BUCKET bucket.
demo_bucket = s3.Bucket(os.environ["S3_DEV_BUCKET"])
stale_objects = demo_bucket.objects.filter(Prefix=f'trino/{schemaname}/{tablename}/')
stale_objects.delete()

print(qres.fetchall())

[(True,)]


In [10]:
import osc_ingest_trino
# Build the column definitions for the new table from the query result in `df`.
columnschema = osc_ingest_trino.create_table_schema_pairs(df)

# Store the table's data in the bucket named by S3_DEV_BUCKET, the same bucket
# the drop/cleanup cell purges under 'trino/{schemaname}/{tablename}/'. A
# hard-coded bucket name here could diverge from that setting and leave stale
# parquet files behind a freshly "dropped" table.
bucket = os.environ["S3_DEV_BUCKET"]

tabledef = f"""create table if not exists osc_datacommons_dev.{schemaname}.{tablename}(
{columnschema}
) with (
    format = 'parquet',
    external_location = 's3a://{bucket}/trino/{schemaname}/{tablename}/'
)"""
# Show the generated DDL before running it so the schema can be checked.
print(tabledef)
qres = engine.execute(tabledef)
print(qres.fetchall())

create table if not exists osc_datacommons_dev.demo.odsc_isin_reduction_notebook(
    isin varchar,
    company_name varchar,
    lei varchar,
    parent_lei varchar,
    achieved_reduction double
) with (
    format = 'parquet',
    external_location = 's3a://ocp-odh-os-demo-s3/trino/demo/odsc_isin_reduction_notebook/'
)
[(True,)]


In [11]:
# Fill the demo table by running the join query as a subquery of an insert.
# The column list names the five columns the join query selects.
sql = f"""
insert into osc_datacommons_dev.{schemaname}.{tablename}
(isin, company_name, lei, parent_lei, achieved_reduction)
select * from ({big_demo_query})
"""
qres = engine.execute(sql)
insert_result = qres.fetchall()
print(insert_result)

[(7,)]


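Copy and paste the following template into a new code cell, replacing `xxx` with the name of the table you want to query: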
```python
sql = """
select * from osc_datacommons_dev.demo.xxx
"""
df = pd.read_sql(sql, engine).convert_dtypes()
df
```