# Simple matching case with S3 and Trino

In [1]:
# Add the directory two levels up (a relative path, so it is resolved against the
# current working directory) to the module search path.
# Presumably this is what makes the esg_matching package importable below - confirm
# that the notebook is started from its own folder.
import os
import sys
sys.path.append('../../')

## 1. Connecting with Trino using Esg-Matching

In [2]:
# Import the connector class used to connect to a Trino database
from esg_matching.engine.connectors.trino import TrinoConnector

In [3]:
# Read the Trino connection settings from environment variables so that no
# credentials are written in the notebook. A missing variable raises KeyError.
user_trino, pwd_trino, host_trino = (
    os.environ[name] for name in ('TRINO_USER', 'TRINO_PASSWD', 'TRINO_HOST')
)
# Environment values are strings, so the port is converted to an integer.
port_trino = int(os.environ['TRINO_PORT'])

In [4]:
# The database connector for Trino is represented by the class TrinoConnector
db_conn = TrinoConnector()

In [5]:
# Configure the TrinoConnector and then call connect() to establish the connection.
# The properties username, user_password, host_url and port_number receive the values
# read from the TRINO_* environment variables; catalog names the Trino catalog to use.
# The property show_sql_statement indicates if the SQL statements are echoed (or printed) in the default output channel.
db_conn.username = user_trino
db_conn.user_password = pwd_trino
db_conn.host_url = host_trino
db_conn.port_number = port_trino
db_conn.catalog = 'osc_datacommons_iceberg_dev'
db_conn.show_sql_statement = True
db_conn.connect()

2022-07-14 15:04:57,588 INFO sqlalchemy.engine.Engine SELECT version()
2022-07-14 15:04:57,593 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00483s] ()


In [6]:
# Check if the connection was established
db_conn.is_connected()

True

In [23]:
# Show available schemas to ensure the Trino connection is set correctly.
from sqlalchemy import text

# Plain string: the statement has no placeholders, so no f-string is needed.
schema_show_sql = """
show schemas in osc_datacommons_iceberg_dev
"""
# Engine.execute() with a raw string is legacy API (deprecated in SQLAlchemy 1.4,
# removed in 2.0). Run the statement on an explicit connection, wrapped in text(),
# and let the context manager release the connection once the rows are read.
with db_conn.engine.connect() as conn:
    schema_show = conn.execute(text(schema_show_sql))
    print(schema_show.fetchall())

2022-07-16 13:36:13,464 INFO sqlalchemy.engine.Engine 
show schemas in osc_datacommons_iceberg_dev

2022-07-16 13:36:13,465 INFO sqlalchemy.engine.Engine [raw sql] ()
[('aicoe_osc_demo',), ('company_data',), ('default',), ('defaultschema1',), ('demo',), ('eje_test_iceberg',), ('epa_frs',), ('epa_ghgrp',), ('epacems',), ('epacems_y95_al',), ('esg_matching',), ('essd',), ('ghgrp_demo',), ('gleif',), ('gleif_mdt',), ('iceberg_demo',), ('information_schema',), ('ingest_schema',), ('iso3166',), ('itr_mdt',), ('metastore',), ('metastore_iceberg',), ('osc_corp_data',), ('pcaf_sovereign_footprint',), ('physical_risk_project',), ('pudl',), ('rmi_20210929',), ('rmi_20211120',), ('rmi_20220119',), ('rmi_utility_transition_hub',), ('sec_dera',), ('sfi_geoasset',), ('team1',), ('team2',), ('testaccessschema1',), ('testdb',), ('urgentem',), ('us_census',), ('wri',), ('wri_demo',), ('wri_dev',), ('wri_gppd',), ('wri_gppd_md',), ('wri_new',), ('wri_test',)]


In [24]:
import sqlalchemy as sa
from sqlalchemy import MetaData, Table

In [25]:
# Metadata container whose default schema is "esg_matching".
meta = sa.MetaData(schema="esg_matching")

In [26]:
# Load the table definitions of the metadata's schema from the database.
meta.reflect(bind=db_conn.engine)

2022-07-16 13:36:19,684 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-16 13:36:19,685 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00162s] ('esg_matching',)
2022-07-16 13:36:21,058 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-16 13:36:21,058 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00085s] ('esg_matching', 'matching')
2022-07-16 13:36:22,230 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-16 13:36:22,231 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00088s] ('esg_matching', 'matching')
2022-07-16 13:36:23,594 INFO 

  meta.reflect(db_conn.engine)
  meta.reflect(db_conn.engine)
  meta.reflect(db_conn.engine)
  meta.reflect(db_conn.engine)


2022-07-16 13:36:27,272 INFO sqlalchemy.engine.Engine ROLLBACK


In [27]:
# Inspect the type of the metadata object
type(meta)

sqlalchemy.sql.schema.MetaData

In [28]:
# Table object for "matching", registered in meta and loaded through the engine.
table_obj = Table('matching', meta, autoload_with=db_conn.engine)

In [29]:
import pandas as pd

In [30]:
# Select every column of the matching table and load the result into a DataFrame.
# select() receives the table positionally: the list form select([...]) is the
# legacy calling style, deprecated in SQLAlchemy 1.4 and removed in 2.0.
query_table = sa.select(table_obj)
df_table = pd.read_sql_query(query_table, db_conn.engine)

2022-07-16 13:36:52,919 INFO sqlalchemy.engine.Engine SELECT esg_matching.matching.matching_id, esg_matching.matching.ref_name, esg_matching.matching.tgt_name, esg_matching.matching.matching_type, esg_matching.matching.matching_scope, esg_matching.matching.matching_rule, esg_matching.matching.ref_id, esg_matching.matching.ref_company, esg_matching.matching.ref_country, esg_matching.matching.tgt_id, esg_matching.matching.tgt_company, esg_matching.matching.tgt_country, esg_matching.matching.isin, esg_matching.matching.lei, esg_matching.matching.sedol 
FROM esg_matching.matching
2022-07-16 13:36:52,920 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00131s] ()


In [31]:
# Display the rows loaded from the matching table
df_table

Unnamed: 0,matching_id,ref_name,tgt_name,matching_type,matching_scope,matching_rule,ref_id,ref_company,ref_country,tgt_id,tgt_company,tgt_country,isin,lei,sedol
0,8,ds_tgt1,ds_tgt2,indirect,full,lei,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,1,HONEYDUKES LIMITED,,,097900BHK10000084115,7108899
1,4,ds_ref,ds_tgt2,direct,full,lei,4,STERLING COOPER,gb,6,STERLING COOPER,gb,GB00B1YW4409,213800KY4C9WU7WBW518,B1YW440
2,2,ds_ref,ds_tgt2,direct,full,lei,3,STARCOURT MALL SRLS.,it,2,STARCOURT MALL,it,NO0003058109,8156006CE62347C74658,10009110965
3,5,ds_ref,ds_tgt1,direct,residual,name+country,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,SK1120005824,097900BHK10000084115,
4,6,ds_ref,ds_tgt1,direct,residual,name+country,2,HONEYDUKES LIMITED,us,3,HONEYDUKES LIMITED,us,,254900B1P3S786KDAW57,
5,9,ds_tgt1,ds_tgt2,indirect,full,isin,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,7,CENTRAL PERK,sk,SK1120005824,,B1YW440
6,3,ds_ref,ds_tgt2,direct,full,lei,6,INGEN SOCIÉTÉ COOPÉRATIVE DE PRODUCTION,fr,4,INGEN,fr,FR0000072910,9695001UE8RNVNTE9L89,2019952
7,7,ds_ref,ds_tgt2,direct,residual,name+country,8,SPECTRE EMPRESA INDIVIDUAL DE RESPONSABILIDADE...,br,5,SPECTRE EMPRESA INDIVIDUAL DE RESPONSABILIDADE...,br,BRCIELACNOR3,,2046853
8,1,ds_ref,ds_tgt1,direct,full,lei,3,STARCOURT MALL SRLS.,it,4,STARCOURT MALL SOCIETÀ A RESPONSABILITÀ LIMITA...,it,,8156006CE62347C74658,
