# Demo - Exact Matching

In [2]:
# Sets up the location of the api relative to this notebook 
import os
import sys
sys.path.append('../../')

In [3]:
# Import the connector class used to connect to a SQLite database
from esg_matching.engine.connectors.sql_lite import SqlLiteConnector

In [4]:
# Import the modules for file management
from esg_matching.file_reader.file import File
from esg_matching.file_reader.csv_reader import FileReaderCsv

In [5]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esg_matching.processing.etl import EtlProcessing

## 1. Database setup

In [6]:
# Location of the database file, given relative to this jupyter notebook.
# The path points into the tests/data/demo folder, two levels above the notebook's folder.
path_db = '../../tests/data/demo/test_esg_matching_CLEAN.db'

In [7]:
# The database connector is represented by the class SqlLiteConnector 
db_conn = SqlLiteConnector()

In [8]:
# The connect() method of the SqlLiteConnector is used to establish a connection with the database if it exists,
# or to create a new one. The property path_db defines the location and name of the database.
# The property show_sql_statement indicates if the SQL statements are echoed (or printed) in the default output channel.
db_conn.path_db = path_db
db_conn.show_sql_statement = True
db_conn.connect()

In [9]:
# Check if the connection was established
db_conn.is_connected()

True

## 2. Load the data into database from csv files

In [10]:
# Create an ETL process object
etl_proc_obj = EtlProcessing(db_conn)

In [11]:
# Create a file reader object for csv files
csv_reader_obj = FileReaderCsv()  

In [12]:
# Referential Data Source
ref_settings = '../../tests/data/demo/test_ref_sqlite_CLEANED.json'
ref_obj = File(ref_settings)

In [13]:
# Load the data of the referential source into the database
db_ref = etl_proc_obj.load_file_to_db(ref_obj, csv_reader_obj)

2022-07-14 15:14:33,836 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("esg_match_ref")
2022-07-14 15:14:33,842 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:33,848 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("esg_match_ref")
2022-07-14 15:14:33,849 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:33,853 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2022-07-14 15:14:33,854 INFO sqlalchemy.engine.Engine [raw sql] ('esg_match_ref',)
2022-07-14 15:14:33,857 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("esg_match_ref")
2022-07-14 15:14:33,858 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:33,860 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("esg_match_ref")
2022-07-14 15:14:33,861 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:33,862 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT *

In [14]:
# Target Data Sources
tgt1_settings = '../../tests/data/demo/test_ds1_sqlite_CLEANED.json'
tgt1_obj = File(tgt1_settings)

In [15]:
# Load data from TARGET 1
db_tgt1 = etl_proc_obj.load_file_to_db(tgt1_obj, csv_reader_obj)

2022-07-14 15:14:34,238 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("esg_match_tgt1")
2022-07-14 15:14:34,239 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,244 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("esg_match_tgt1")
2022-07-14 15:14:34,245 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,248 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2022-07-14 15:14:34,249 INFO sqlalchemy.engine.Engine [raw sql] ('esg_match_tgt1',)
2022-07-14 15:14:34,252 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("esg_match_tgt1")
2022-07-14 15:14:34,253 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,255 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("esg_match_tgt1")
2022-07-14 15:14:34,256 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,258 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SEL

In [16]:
# Target Data Sources
tgt2_settings = '../../tests/data/demo/test_ds2_sqlite_CLEANED.json'
tgt2_obj = File(tgt2_settings)

In [17]:
# Load data from TARGET 2
db_tgt2 = etl_proc_obj.load_file_to_db(tgt2_obj, csv_reader_obj)

2022-07-14 15:14:34,923 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("esg_match_tgt2")
2022-07-14 15:14:34,924 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,929 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("esg_match_tgt2")
2022-07-14 15:14:34,930 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,933 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2022-07-14 15:14:34,934 INFO sqlalchemy.engine.Engine [raw sql] ('esg_match_tgt2',)
2022-07-14 15:14:34,937 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("esg_match_tgt2")
2022-07-14 15:14:34,938 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,940 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("esg_match_tgt2")
2022-07-14 15:14:34,941 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:34,943 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SEL

## 3. Creating matching/no-matching tables

In [18]:
match_settings = '../../tests/data/demo/test_matching_sqlite.json'
no_match_settings = '../../tests/data/demo/test_no_matching_sqlite.json'

In [19]:
file_match = File(match_settings)
file_no_match = File(no_match_settings)

In [20]:
# Call the create_data_source() method, passing the File object built from the matching settings
db_matching = etl_proc_obj.create_data_source(file_match)

2022-07-14 15:14:36,421 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("esg_matching")
2022-07-14 15:14:36,422 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,427 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("esg_matching")
2022-07-14 15:14:36,428 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,433 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2022-07-14 15:14:36,434 INFO sqlalchemy.engine.Engine [raw sql] ('esg_matching',)
2022-07-14 15:14:36,436 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("esg_matching")
2022-07-14 15:14:36,437 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,439 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("esg_matching")
2022-07-14 15:14:36,440 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,442 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM

In [21]:
# Call the create_data_source() method, passing the File object built from the no-matching settings
db_no_matching = etl_proc_obj.create_data_source(file_no_match)

2022-07-14 15:14:36,694 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("esg_no_matching")
2022-07-14 15:14:36,695 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,700 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("esg_no_matching")
2022-07-14 15:14:36,701 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,705 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2022-07-14 15:14:36,706 INFO sqlalchemy.engine.Engine [raw sql] ('esg_no_matching',)
2022-07-14 15:14:36,708 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("esg_no_matching")
2022-07-14 15:14:36,709 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,710 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("esg_no_matching")
2022-07-14 15:14:36,712 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-14 15:14:36,714 INFO sqlalchemy.engine.Engine SELECT sql FROM 

## 4. Checking matching policy and aliases

In [22]:
db_tgt1.get_policy_definition()

{'matching_with_ref1': {'dfm': {'lei': ['LEI']},
  'drm': {'name+country': ['COMPANY', 'COUNTRY']},
  'irm': {'lei': ['LEI'],
   'isin': ['ISIN'],
   'name+country': ['COMPANY', 'COUNTRY']}}}

In [23]:
db_tgt2.get_policy_definition()

{'matching_with_ref1': {'dfm': {'lei': ['LEI']},
  'drm': {'name+country': ['COMPANY', 'COUNTRY']},
  'irm': {'lei': ['LEI'],
   'isin': ['ISIN'],
   'name+country': ['COMPANY', 'COUNTRY']}}}

In [24]:
db_ref.get_mapping_to_alias()

{'LEI': Column('lei_ref', String(), table=<esg_match_ref>),
 'COMPANY': Column('name_ref', String(), table=<esg_match_ref>),
 'COUNTRY': Column('country_ref', String(), table=<esg_match_ref>)}

In [25]:
db_tgt1.get_mapping_to_alias()

{'ISIN': Column('isin_ds1', String(), table=<esg_match_tgt1>),
 'LEI': Column('lei_ds1', String(), table=<esg_match_tgt1>),
 'COMPANY': Column('name_ds1', String(), table=<esg_match_tgt1>),
 'COUNTRY': Column('country_ds1', String(), table=<esg_match_tgt1>)}

In [26]:
db_tgt2.get_mapping_to_alias()

{'ISIN': Column('isin_ds2', String(), table=<esg_match_tgt2>),
 'LEI': Column('lei_ds2', String(), table=<esg_match_tgt2>),
 'SEDOL': Column('sedol_ds2', String(), table=<esg_match_tgt2>),
 'COMPANY': Column('name_ds2', String(), table=<esg_match_tgt2>),
 'COUNTRY': Column('country_ds2', String(), table=<esg_match_tgt2>)}

## 5. Checking attribute mapping between sources and matching tables

In [27]:
db_ref.get_mapping_to_matching()

{'ref_id': Column('id_ref', Integer(), table=<esg_match_ref>, primary_key=True, nullable=False),
 'ref_company': Column('name_ref', String(), table=<esg_match_ref>),
 'ref_country': Column('country_ref', String(), table=<esg_match_ref>)}

In [28]:
db_tgt1.get_mapping_to_matching()

{'tgt_id': Column('id_ds1', Integer(), table=<esg_match_tgt1>, primary_key=True, nullable=False),
 'isin': Column('isin_ds1', String(), table=<esg_match_tgt1>),
 'lei': Column('lei_ds1', String(), table=<esg_match_tgt1>),
 'tgt_company': Column('name_ds1', String(), table=<esg_match_tgt1>),
 'tgt_country': Column('country_ds1', String(), table=<esg_match_tgt1>)}

In [29]:
db_tgt2.get_mapping_to_matching()

{'tgt_id': Column('id_ds2', Integer(), table=<esg_match_tgt2>, primary_key=True, nullable=False),
 'isin': Column('isin_ds2', String(), table=<esg_match_tgt2>),
 'lei': Column('lei_ds2', String(), table=<esg_match_tgt2>),
 'sedol': Column('sedol_ds2', String(), table=<esg_match_tgt2>),
 'tgt_company': Column('name_ds2', String(), table=<esg_match_tgt2>),
 'tgt_country': Column('country_ds2', String(), table=<esg_match_tgt2>)}

## 6. Create policies for matching each target data source with the referential

In [30]:
# Import policy module
from esg_matching.matcher.policy import MatchingPolicy

In [31]:
# Create the matching policy object for target 1, using the 'matching_with_ref1' policy
policy_match_tgt1 = MatchingPolicy(db_tgt1, 'matching_with_ref1')

In [32]:
# Set the referential and matching/no-matching sources
policy_match_tgt1.set_referential_source(db_ref)
policy_match_tgt1.set_matching_source(db_matching)
policy_match_tgt1.set_no_matching_source(db_no_matching)

In [33]:
# Create the matching policy object for target 2, using the 'matching_with_ref1' policy
policy_match_tgt2 = MatchingPolicy(db_tgt2, 'matching_with_ref1')

In [34]:
# Set the referential and matching/no-matching sources
policy_match_tgt2.set_referential_source(db_ref)
policy_match_tgt2.set_matching_source(db_matching)
policy_match_tgt2.set_no_matching_source(db_no_matching)

## 7. Perform direct full matching (DFM) for each data source

In [35]:
# Import DFM module
from esg_matching.matcher.dfm import DbMatcherDfm

In [36]:
# Create a matcher object for DFM
dfm_matcher_obj = DbMatcherDfm(db_conn)

In [37]:
# Perform DFM on target 1
dfm_matcher_obj.set_policy(policy_match_tgt1)
dfm_matcher_obj.execute_matching()

2022-07-14 15:14:47,853 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching (ref_name, tgt_name, matching_type, matching_scope, matching_rule, tgt_id, isin, lei, tgt_company, tgt_country, ref_id, ref_company, ref_country) SELECT 'ds_ref' AS ref_name, 'ds_tgt1' AS tgt_name, 'direct' AS matching_type, 'full' AS matching_scope, 'lei' AS matching_rule, esg_match_tgt1.id_ds1 AS tgt_id, esg_match_tgt1.isin_ds1 AS isin, esg_match_tgt1.lei_ds1 AS lei, esg_match_tgt1.name_ds1 AS tgt_company, esg_match_tgt1.country_ds1 AS tgt_country, esg_match_ref.id_ref AS ref_id, esg_match_ref.name_ref AS ref_company, esg_match_ref.country_ref AS ref_country 
FROM esg_match_tgt1 JOIN esg_match_ref ON esg_match_tgt1.lei_ds1 = esg_match_ref.lei_ref
2022-07-14 15:14:47,854 INFO sqlalchemy.engine.Engine [generated in 0.00108s] ()
2022-07-14 15:14:47,862 INFO sqlalchemy.engine.Engine COMMIT
2022-07-14 15:14:47,882 INFO sqlalchemy.engine.Engine INSERT INTO esg_no_matching (tgt_name, tgt_id, isin, lei, tgt_compan

In [38]:
# Perform DFM on target 2
dfm_matcher_obj.set_policy(policy_match_tgt2)
dfm_matcher_obj.execute_matching()

2022-07-14 15:14:48,055 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching (ref_name, tgt_name, matching_type, matching_scope, matching_rule, tgt_id, isin, lei, sedol, tgt_company, tgt_country, ref_id, ref_company, ref_country) SELECT 'ds_ref' AS ref_name, 'ds_tgt2' AS tgt_name, 'direct' AS matching_type, 'full' AS matching_scope, 'lei' AS matching_rule, esg_match_tgt2.id_ds2 AS tgt_id, esg_match_tgt2.isin_ds2 AS isin, esg_match_tgt2.lei_ds2 AS lei, esg_match_tgt2.sedol_ds2 AS sedol, esg_match_tgt2.name_ds2 AS tgt_company, esg_match_tgt2.country_ds2 AS tgt_country, esg_match_ref.id_ref AS ref_id, esg_match_ref.name_ref AS ref_company, esg_match_ref.country_ref AS ref_country 
FROM esg_match_tgt2 JOIN esg_match_ref ON esg_match_tgt2.lei_ds2 = esg_match_ref.lei_ref
2022-07-14 15:14:48,056 INFO sqlalchemy.engine.Engine [generated in 0.00120s] ()
2022-07-14 15:14:48,061 INFO sqlalchemy.engine.Engine COMMIT
2022-07-14 15:14:48,088 INFO sqlalchemy.engine.Engine INSERT INTO esg_no_matchin

## 8. Perform direct residual matching (DRM) for each data source

In [39]:
# Import DRM module
from esg_matching.matcher.drm import DbMatcherDrm

In [40]:
# Create a matcher object for DRM
drm_matcher_obj = DbMatcherDrm(db_conn)

In [41]:
# Perform DRM on target 1
drm_matcher_obj.set_policy(policy_match_tgt1)
drm_matcher_obj.execute_matching()

2022-07-14 15:14:50,876 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching (ref_name, tgt_name, matching_type, matching_scope, matching_rule, tgt_id, isin, lei, tgt_company, tgt_country, ref_id, ref_company, ref_country) SELECT 'ds_ref' AS ref_name, 'ds_tgt1' AS tgt_name, 'direct' AS matching_type, 'residual' AS matching_scope, 'name+country' AS matching_rule, esg_no_matching.tgt_id, esg_no_matching.isin, esg_no_matching.lei, esg_no_matching.tgt_company, esg_no_matching.tgt_country, esg_match_ref.id_ref AS ref_id, esg_match_ref.name_ref AS ref_company, esg_match_ref.country_ref AS ref_country 
FROM esg_no_matching JOIN esg_match_ref ON esg_no_matching.tgt_company = esg_match_ref.name_ref AND esg_no_matching.tgt_country = esg_match_ref.country_ref 
WHERE esg_no_matching.tgt_name = ?
2022-07-14 15:14:50,877 INFO sqlalchemy.engine.Engine [generated in 0.00105s] ('ds_tgt1',)
2022-07-14 15:14:50,882 INFO sqlalchemy.engine.Engine COMMIT
2022-07-14 15:14:50,903 INFO sqlalchemy.engine.Engi

In [42]:
# Perform DRM on target 2
drm_matcher_obj.set_policy(policy_match_tgt2)
drm_matcher_obj.execute_matching()

2022-07-14 15:14:51,060 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching (ref_name, tgt_name, matching_type, matching_scope, matching_rule, tgt_id, isin, lei, sedol, tgt_company, tgt_country, ref_id, ref_company, ref_country) SELECT 'ds_ref' AS ref_name, 'ds_tgt2' AS tgt_name, 'direct' AS matching_type, 'residual' AS matching_scope, 'name+country' AS matching_rule, esg_no_matching.tgt_id, esg_no_matching.isin, esg_no_matching.lei, esg_no_matching.sedol, esg_no_matching.tgt_company, esg_no_matching.tgt_country, esg_match_ref.id_ref AS ref_id, esg_match_ref.name_ref AS ref_company, esg_match_ref.country_ref AS ref_country 
FROM esg_no_matching JOIN esg_match_ref ON esg_no_matching.tgt_company = esg_match_ref.name_ref AND esg_no_matching.tgt_country = esg_match_ref.country_ref 
WHERE esg_no_matching.tgt_name = ?
2022-07-14 15:14:51,061 INFO sqlalchemy.engine.Engine [generated in 0.00108s] ('ds_tgt2',)
2022-07-14 15:14:51,066 INFO sqlalchemy.engine.Engine COMMIT
2022-07-14 15:14:51,0

## 9. Perform indirect full matching (IFM) for each data source

In [43]:
# Import IFM module
from esg_matching.matcher.irm import DbMatcherIrm

In [44]:
# Create a matcher object for IFM
irm_matcher_obj = DbMatcherIrm(db_conn)

In [45]:
# Perform IFM on target 1
irm_matcher_obj.set_policy(policy_match_tgt1)
irm_matcher_obj.execute_matching()

2022-07-14 15:14:53,160 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching (matching_type, matching_scope, matching_rule, tgt_name, tgt_id, tgt_company, tgt_country, isin, lei, sedol, ref_name, ref_id, ref_company, ref_country) SELECT 'indirect' AS matching_type, 'full' AS matching_scope, 'lei' AS matching_rule, esg_no_matching.tgt_name, esg_no_matching.tgt_id, esg_no_matching.tgt_company, esg_no_matching.tgt_country, esg_no_matching.isin, esg_no_matching.lei, esg_no_matching.sedol, esg_matching.tgt_name AS ref_name, esg_matching.tgt_id AS ref_id, esg_matching.tgt_company AS ref_company, esg_matching.tgt_country AS ref_country 
FROM esg_no_matching JOIN esg_matching ON esg_no_matching.lei = esg_matching.lei 
WHERE esg_no_matching.tgt_name = ? AND esg_matching.tgt_name != ?
2022-07-14 15:14:53,161 INFO sqlalchemy.engine.Engine [generated in 0.00144s] ('ds_tgt1', 'ds_tgt1')
2022-07-14 15:14:53,164 INFO sqlalchemy.engine.Engine COMMIT
2022-07-14 15:14:53,168 INFO sqlalchemy.engine.Eng

In [46]:
# Perform IFM on target 2
irm_matcher_obj.set_policy(policy_match_tgt2)
irm_matcher_obj.execute_matching()

2022-07-14 15:14:53,364 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching (matching_type, matching_scope, matching_rule, tgt_name, tgt_id, tgt_company, tgt_country, isin, lei, sedol, ref_name, ref_id, ref_company, ref_country) SELECT 'indirect' AS matching_type, 'full' AS matching_scope, 'lei' AS matching_rule, esg_no_matching.tgt_name, esg_no_matching.tgt_id, esg_no_matching.tgt_company, esg_no_matching.tgt_country, esg_no_matching.isin, esg_no_matching.lei, esg_no_matching.sedol, esg_matching.tgt_name AS ref_name, esg_matching.tgt_id AS ref_id, esg_matching.tgt_company AS ref_company, esg_matching.tgt_country AS ref_country 
FROM esg_no_matching JOIN esg_matching ON esg_no_matching.lei = esg_matching.lei 
WHERE esg_no_matching.tgt_name = ? AND esg_matching.tgt_name != ?
2022-07-14 15:14:53,365 INFO sqlalchemy.engine.Engine [cached since 0.2053s ago] ('ds_tgt2', 'ds_tgt2')
2022-07-14 15:14:53,370 INFO sqlalchemy.engine.Engine COMMIT
2022-07-14 15:14:53,404 INFO sqlalchemy.engine.

## 10. Saving the matching table as .csv file

In [47]:
import pandas as pd

In [48]:
df_matching = db_matching.get_data_as_df()

2022-07-14 15:14:55,496 INFO sqlalchemy.engine.Engine SELECT esg_matching.timestamp, esg_matching.matching_id, esg_matching.ref_name, esg_matching.tgt_name, esg_matching.matching_type, esg_matching.matching_scope, esg_matching.matching_rule, esg_matching.ref_id, esg_matching.ref_company, esg_matching.ref_country, esg_matching.tgt_id, esg_matching.tgt_company, esg_matching.tgt_country, esg_matching.isin, esg_matching.lei, esg_matching.sedol 
FROM esg_matching
2022-07-14 15:14:55,497 INFO sqlalchemy.engine.Engine [generated in 0.00128s] ()


In [49]:
df_matching

Unnamed: 0,timestamp,matching_id,ref_name,tgt_name,matching_type,matching_scope,matching_rule,ref_id,ref_company,ref_country,tgt_id,tgt_company,tgt_country,isin,lei,sedol
0,2022-07-14 14:14:47,1,ds_ref,ds_tgt1,direct,full,lei,3,STARCOURT MALL SRLS.,it,4,STARCOURT MALL SOCIETÀ A RESPONSABILITÀ LIMITA...,it,,8156006CE62347C74658,
1,2022-07-14 14:14:48,2,ds_ref,ds_tgt2,direct,full,lei,3,STARCOURT MALL SRLS.,it,2,STARCOURT MALL,it,NO0003058109,8156006CE62347C74658,10009110965
2,2022-07-14 14:14:48,3,ds_ref,ds_tgt2,direct,full,lei,6,INGEN SOCIÉTÉ COOPÉRATIVE DE PRODUCTION,fr,4,INGEN,fr,FR0000072910,9695001UE8RNVNTE9L89,2019952
3,2022-07-14 14:14:48,4,ds_ref,ds_tgt2,direct,full,lei,4,STERLING COOPER,gb,6,STERLING COOPER,gb,GB00B1YW4409,213800KY4C9WU7WBW518,B1YW440
4,2022-07-14 14:14:50,5,ds_ref,ds_tgt1,direct,residual,name+country,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,SK1120005824,097900BHK10000084115,
5,2022-07-14 14:14:50,6,ds_ref,ds_tgt1,direct,residual,name+country,2,HONEYDUKES LIMITED,us,3,HONEYDUKES LIMITED,us,,254900B1P3S786KDAW57,
6,2022-07-14 14:14:51,7,ds_ref,ds_tgt2,direct,residual,name+country,8,SPECTRE EMPRESA INDIVIDUAL DE RESPONSABILIDADE...,br,5,SPECTRE EMPRESA INDIVIDUAL DE RESPONSABILIDADE...,br,BRCIELACNOR3,,2046853
7,2022-07-14 14:14:53,8,ds_tgt1,ds_tgt2,indirect,full,lei,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,1,HONEYDUKES LIMITED,,,097900BHK10000084115,7108899
8,2022-07-14 14:14:53,9,ds_tgt1,ds_tgt2,indirect,full,isin,1,SALLMOOR AGRUPACION DE INTERES ECONOMICO,es,7,CENTRAL PERK,sk,SK1120005824,,B1YW440


In [50]:
df_matching.to_csv('my_matching.csv', index=False)

In [51]:
# Import the connector class used to connect to a Trino database
from esg_matching.engine.connectors.trino import TrinoConnector

In [52]:
import os

# Read the Trino credentials and host from environment variables
# (indexing os.environ raises KeyError when a variable is not set)
user_trino, pwd_trino, host_trino = (
    os.environ[name] for name in ('TRINO_USER', 'TRINO_PASSWD', 'TRINO_HOST')
)
# The port is read from the environment as a string and converted to an integer
port_trino = int(os.environ['TRINO_PORT'])

In [53]:
# The Trino database connector is represented by the class TrinoConnector.
# NOTE(review): this rebinds db_conn, which until now referred to the SqlLiteConnector
# created earlier; that SQLite connector is not disconnected before being replaced -
# confirm whether it should be closed first.
db_conn = TrinoConnector()

In [54]:
# Configure the Trino connector before connecting: the user name, password, host and port
# are taken from the user_trino, pwd_trino, host_trino and port_trino variables, and the
# catalog property is set to 'osc_datacommons_iceberg_dev'.
# The property show_sql_statement indicates if the SQL statements are echoed (or printed) in the default output channel.
# The connect() method of the TrinoConnector is then used to establish the connection.
db_conn.username = user_trino
db_conn.user_password = pwd_trino
db_conn.host_url = host_trino
db_conn.port_number = port_trino
db_conn.catalog = 'osc_datacommons_iceberg_dev'
db_conn.show_sql_statement = True
db_conn.connect()

2022-07-14 15:15:43,752 INFO sqlalchemy.engine.Engine SELECT version()
2022-07-14 15:15:43,753 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00187s] ()


In [55]:
# Check if the connection was stablished
db_conn.is_connected()

True

In [56]:
# Show available schemas to ensure the Trino connection is set correctly.
# The statement has no placeholders, so it is a plain string rather than an f-string.
schema_show_sql = """
show schemas in osc_datacommons_iceberg_dev
"""
# Run the statement through the connector's engine and print every returned row
schema_show = db_conn.engine.execute(schema_show_sql)
print(schema_show.fetchall())

2022-07-14 15:16:06,428 INFO sqlalchemy.engine.Engine 
show schemas in osc_datacommons_iceberg_dev

2022-07-14 15:16:06,429 INFO sqlalchemy.engine.Engine [raw sql] ()
[('aicoe_osc_demo',), ('company_data',), ('default',), ('defaultschema1',), ('demo',), ('eje_test_iceberg',), ('epa_frs',), ('epa_ghgrp',), ('epacems',), ('epacems_y95_al',), ('esg_matching',), ('essd',), ('ghgrp_demo',), ('gleif',), ('gleif_mdt',), ('iceberg_demo',), ('information_schema',), ('ingest_schema',), ('iso3166',), ('itr_mdt',), ('metastore',), ('metastore_iceberg',), ('osc_corp_data',), ('pcaf_sovereign_footprint',), ('physical_risk_project',), ('pudl',), ('rmi_20210929',), ('rmi_20211120',), ('rmi_20220119',), ('rmi_utility_transition_hub',), ('sec_dera',), ('sfi_geoasset',), ('team1',), ('team2',), ('testaccessschema1',), ('testdb',), ('urgentem',), ('us_census',), ('wri',), ('wri_demo',), ('wri_dev',), ('wri_gppd',), ('wri_gppd_md',), ('wri_new',), ('wri_test',)]


In [60]:
import sqlalchemy as sa

In [69]:
# Lookup table from pandas dtype names (the str() of a column dtype) to the
# SQLAlchemy type used for that column.
# Dtype names that are not listed here make pandas_type_to_sql() raise ValueError.
_p2smap = {
    'object': sa.String,
    'float64': sa.DECIMAL,
    'int64': sa.BigInteger,
    'datetime64[ns]': sa.DateTime
}

In [70]:
def pandas_type_to_sql(pt):
    """Return the SQLAlchemy type registered in _p2smap for a pandas dtype name.

    Args:
        pt: the pandas dtype name used as the lookup key (e.g. 'int64').

    Returns:
        The value stored in _p2smap under that key.

    Raises:
        ValueError: when _p2smap has no (non-None) entry for the given name.
    """
    sql_type = _p2smap.get(pt, None)
    # Guard clause: reject dtype names without a mapping
    if sql_type is None:
        raise ValueError("unexpected pandas column type '{pt}'".format(pt=pt))
    return sql_type

In [80]:
def generate_table_schema_pairs(df):
    """Map every column of a DataFrame to the SQL type matching its dtype.

    Args:
        df: the pandas DataFrame whose columns and dtypes are inspected.

    Returns:
        dict: column label -> value returned by pandas_type_to_sql() for the
        string name of that column's dtype, in column order.

    Raises:
        ValueError: propagated from pandas_type_to_sql() when a column dtype
        has no mapping.
    """
    # Pair each column label with its dtype and translate the dtype name in one pass
    return {
        column: pandas_type_to_sql(str(dtype))
        for column, dtype in zip(df.columns.to_list(), df.dtypes.to_list())
    }

In [81]:
db_types = generate_table_schema_pairs(df_matching)
db_types

{'timestamp': sqlalchemy.sql.sqltypes.DateTime,
 'matching_id': sqlalchemy.sql.sqltypes.BigInteger,
 'ref_name': sqlalchemy.sql.sqltypes.String,
 'tgt_name': sqlalchemy.sql.sqltypes.String,
 'matching_type': sqlalchemy.sql.sqltypes.String,
 'matching_scope': sqlalchemy.sql.sqltypes.String,
 'matching_rule': sqlalchemy.sql.sqltypes.String,
 'ref_id': sqlalchemy.sql.sqltypes.BigInteger,
 'ref_company': sqlalchemy.sql.sqltypes.String,
 'ref_country': sqlalchemy.sql.sqltypes.String,
 'tgt_id': sqlalchemy.sql.sqltypes.BigInteger,
 'tgt_company': sqlalchemy.sql.sqltypes.String,
 'tgt_country': sqlalchemy.sql.sqltypes.String,
 'isin': sqlalchemy.sql.sqltypes.String,
 'lei': sqlalchemy.sql.sqltypes.String,
 'sedol': sqlalchemy.sql.sqltypes.String}

In [85]:
# Upload a copy of the matching table without the 'timestamp' column, so that running the
# notebook top to bottom creates the same table as the recorded run (the CREATE TABLE
# statement in this cell's logged output has no timestamp column).
# The copy leaves df_matching untouched; errors='ignore' keeps the cell re-runnable when
# the column has already been removed.
df_matching_upload = df_matching.drop(columns=['timestamp'], errors='ignore')
df_matching_upload.to_sql(name='matching', schema='esg_matching', con=db_conn.engine, index=False, if_exists='replace', dtype=db_types, chunksize=1000)

2022-07-16 13:35:16,550 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-16 13:35:16,552 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00414s] ('esg_matching', 'matching')
2022-07-16 13:35:18,438 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-16 13:35:18,439 INFO sqlalchemy.engine.Engine 
CREATE TABLE esg_matching.matching (
	matching_id BIGINT, 
	ref_name VARCHAR, 
	tgt_name VARCHAR, 
	matching_type VARCHAR, 
	matching_scope VARCHAR, 
	matching_rule VARCHAR, 
	ref_id BIGINT, 
	ref_company VARCHAR, 
	ref_country VARCHAR, 
	tgt_id BIGINT, 
	tgt_company VARCHAR, 
	tgt_country VARCHAR, 
	isin VARCHAR, 
	lei VARCHAR, 
	sedol VARCHAR
)


2022-07-16 13:35:18,440 INFO sqlalchemy.engine.Engine [no key 0.00121s] ()
2022-07-16 13:35:19,544 INFO sqlalchemy.engine.Engine COMMIT
2022-07-16 13:35:19,545 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-16 13:35:19,5

-1

In [83]:
# Remove the 'timestamp' column by rebinding the name instead of mutating in place;
# errors='ignore' makes the cell safe to re-run once the column is gone
# (the in-place version raises KeyError on a second run).
df_matching = df_matching.drop(columns=['timestamp'], errors='ignore')

In [84]:
df_matching.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   matching_id     9 non-null      int64 
 1   ref_name        9 non-null      object
 2   tgt_name        9 non-null      object
 3   matching_type   9 non-null      object
 4   matching_scope  9 non-null      object
 5   matching_rule   9 non-null      object
 6   ref_id          9 non-null      int64 
 7   ref_company     9 non-null      object
 8   ref_country     9 non-null      object
 9   tgt_id          9 non-null      int64 
 10  tgt_company     9 non-null      object
 11  tgt_country     8 non-null      object
 12  isin            6 non-null      object
 13  lei             7 non-null      object
 14  sedol           6 non-null      object
dtypes: int64(3), object(12)
memory usage: 1.2+ KB


## 11. Close database connection

In [None]:
db_conn.disconnect()

In [None]:
db_conn.is_connected()