# How to... perform exact matching on a Trino database

In [1]:
# Sets up the location of the API relative to this notebook: the relative path
# '../../../' is appended to the module search path so that the esg_matching
# imports below can be resolved (assumes the notebook is run from its own folder).
import sys
sys.path.append('../../../')

In [2]:
# Import the module for connection to a Trino database
from esg_matching.engine.connectors.trino import TrinoConnector

In [3]:
# Import the modules for file management
from esg_matching.file_reader.file import File
from esg_matching.file_reader.csv_reader import FileReaderCsv

In [4]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esg_matching.processing.etl import EtlProcessing

## 1. Database setup

In [5]:
# Read the Trino connection settings from environment variables, so that no
# credentials are written in the notebook. A missing variable raises KeyError.
import os
user_trino = os.environ['TRINO_USER']
pwd_trino = os.environ['TRINO_PASSWD']
host_trino = os.environ['TRINO_HOST']
# Environment values are strings, so the port is converted to an integer
port_trino = int(os.environ['TRINO_PORT'])

In [6]:
# The database connector is represented by the class TrinoConnector.
# It is created without arguments; the connection settings are assigned in the next cell.
db_conn = TrinoConnector()

In [7]:
# The connect() method of the TrinoConnector is used to establish a connection with the database.
# Before connecting, the connection is configured through the connector properties:
# username, user_password, host_url, port_number and catalog.
# The property show_sql_statement indicates if the SQL statements are echoed (or printed) in the default output channel.
db_conn.username = user_trino
db_conn.user_password = pwd_trino
db_conn.host_url = host_trino
db_conn.port_number = port_trino
db_conn.catalog = 'osc_datacommons_iceberg_dev'
db_conn.show_sql_statement = True
db_conn.connect()

2022-07-17 18:46:40,683 INFO sqlalchemy.engine.Engine SELECT version()
2022-07-17 18:46:40,686 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00304s] ()


In [8]:
# Check if the connection was established
db_conn.is_connected()

True

## 2. Load the data into the database from CSV files

In [9]:
# Create an ETL process object that works on the database connection created above
etl_proc_obj = EtlProcessing(db_conn)

In [10]:
# Create a file reader object for csv files
csv_reader_obj = FileReaderCsv()  

In [11]:
# Referential data source: the File object is created from the path of its JSON settings file
ref_settings = '../../../tests/data/howto/trino/test_referential1_trino.json'
ref_obj = File(ref_settings)

In [12]:
# Load data from the REFERENTIAL file into the database.
# The returned object (db_ref) is used later to inspect the attribute mappings
# and as the referential source of the matching policies.
db_ref = etl_proc_obj.load_file_to_db(ref_obj, csv_reader_obj)

2022-07-17 18:46:45,097 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:46:45,098 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00153s] ('esg_matching',)
2022-07-17 18:46:46,466 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:46:46,467 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00121s] ('esg_matching',)
2022-07-17 18:46:47,847 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:46:47,848 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00105s] ('esg_matching', 'esg_match_tgt2')
2022-07-17 18:46:49,017 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "informatio

  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)


2022-07-17 18:46:53,146 INFO sqlalchemy.engine.Engine ROLLBACK
2022-07-17 18:46:53,147 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:46:53,148 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00068s] ('esg_matching', 'esg_match_tgt1')
2022-07-17 18:46:54,326 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:46:54,326 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00068s] ('esg_matching', 'esg_match_tgt1')
2022-07-17 18:46:55,705 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:46:55,706 INFO sqlalchemy.engine.Engine [dia

  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)


2022-07-17 18:46:58,602 INFO sqlalchemy.engine.Engine ROLLBACK
2022-07-17 18:46:58,605 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:46:58,606 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00107s] ('esg_matching', 'esg_match_ref')
2022-07-17 18:46:59,772 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:46:59,773 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00123s] ('esg_matching', 'esg_match_ref')
2022-07-17 18:47:01,159 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:01,160 INFO sqlalchemy.engine.Engine [diale

  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)


2022-07-17 18:47:04,002 INFO sqlalchemy.engine.Engine ROLLBACK
2022-07-17 18:47:04,003 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:04,004 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00070s] ('esg_matching', 'esg_matching')
2022-07-17 18:47:05,187 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:47:05,188 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00118s] ('esg_matching', 'esg_matching')
2022-07-17 18:47:06,561 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:06,562 INFO sqlalchemy.engine.Engine [dialect

  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)


2022-07-17 18:47:09,890 INFO sqlalchemy.engine.Engine ROLLBACK
2022-07-17 18:47:09,892 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:09,892 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00075s] ('esg_matching', 'esg_no_matching')
2022-07-17 18:47:11,101 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:47:11,101 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00090s] ('esg_matching', 'esg_no_matching')
2022-07-17 18:47:12,483 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:12,484 INFO sqlalchemy.engine.Engine [d

  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)


2022-07-17 18:47:15,342 INFO sqlalchemy.engine.Engine ROLLBACK
2022-07-17 18:47:15,343 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:15,344 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00115s] ('esg_matching', 'matching')
2022-07-17 18:47:16,518 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:47:16,519 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00076s] ('esg_matching', 'matching')
2022-07-17 18:47:17,885 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:17,886 INFO sqlalchemy.engine.Engine [dialect trino+r

  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)
  metadata.reflect(self._engine)


2022-07-17 18:47:20,719 INFO sqlalchemy.engine.Engine ROLLBACK
2022-07-17 18:47:20,719 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 18:47:20,721 INFO sqlalchemy.engine.Engine 
DROP TABLE esg_matching.esg_match_ref
2022-07-17 18:47:20,721 INFO sqlalchemy.engine.Engine [no key 0.00105s] ()
2022-07-17 18:47:23,472 INFO sqlalchemy.engine.Engine COMMIT
2022-07-17 18:47:23,474 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 18:47:23,476 INFO sqlalchemy.engine.Engine 
CREATE TABLE esg_matching.esg_match_ref (
	unique_id VARCHAR, 
	isin VARCHAR(12), 
	company VARCHAR(100), 
	country VARCHAR(100)
)


2022-07-17 18:47:23,477 INFO sqlalchemy.engine.Engine [no key 0.00119s] ()
2022-07-17 18:47:24,602 INFO sqlalchemy.engine.Engine COMMIT
{'unique_id': '1', 'isin': 'SK1120005824', 'company': 'CENTRAL PERK', 'country': 'SK'}
2022-07-17 18:47:24,605 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching.esg_match_ref (unique_id, isin, company, country) VALUES (?, ?, ?, ?)
2022-0

In [13]:
# Target data source 1: the File object is created from the path of its JSON settings file
tgt1_settings = '../../../tests/data/howto/trino/test_target1_trino.json'
tgt1_obj = File(tgt1_settings)

In [14]:
# Load data from TARGET 1 into the database; the returned object (db_tgt1) is used later
# for the attribute mappings and for the matching policy of this target
db_tgt1 = etl_proc_obj.load_file_to_db(tgt1_obj, csv_reader_obj)

2022-07-17 18:47:39,701 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:47:39,702 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00127s] ('esg_matching',)
2022-07-17 18:47:41,079 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:47:41,080 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00099s] ('esg_matching',)
2022-07-17 18:47:42,451 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:47:42,452 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00149s] ('esg_matching', 'esg_match_tgt2')
2022-07-17 18:47:43,620 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "informatio

2022-07-17 18:48:06,666 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:48:06,667 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00083s] ('esg_matching', 'esg_no_matching')
2022-07-17 18:48:07,835 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:48:07,835 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00087s] ('esg_matching', 'esg_no_matching$partitions')
2022-07-17 18:48:09,542 INFO sqlalchemy.engine.Engine SELECT "comment" FROM "esg_matching"."esg_no_matching$properties"
2022-07-17 18:48:09,543 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00123s] ()
2022-07-17 18:48:09,838 INFO sqla

In [15]:
# Show the matching policy definition of target 1: for each policy name, the rules
# listed per matching kind (dfm, drm, irm)
db_tgt1.get_policy_definition()

{'matching_with_ref1': {'dfm': {'isin': ['isin']},
  'drm': {'name+country': ['company', 'country']},
  'irm': {'isin': ['isin'],
   'lei': ['lei'],
   'name+country': ['company', 'country']}}}

In [16]:
# Target data source 2: the File object is created from the path of its JSON settings file
tgt2_settings = '../../../tests/data/howto/trino/test_target2_trino.json'
tgt2_obj = File(tgt2_settings)

In [17]:
# Load data from TARGET 2 into the database; the returned object (db_tgt2) is used later
# for the attribute mappings and for the matching policy of this target
db_tgt2 = etl_proc_obj.load_file_to_db(tgt2_obj, csv_reader_obj)

2022-07-17 18:48:32,267 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:48:32,269 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00165s] ('esg_matching',)
2022-07-17 18:48:33,648 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:48:33,649 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00099s] ('esg_matching',)
2022-07-17 18:48:35,013 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:48:35,014 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00121s] ('esg_matching', 'esg_match_tgt2')
2022-07-17 18:48:36,177 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "informatio

2022-07-17 18:48:58,988 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:48:58,989 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00236s] ('esg_matching', 'esg_no_matching')
2022-07-17 18:49:00,160 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:49:00,161 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00124s] ('esg_matching', 'esg_no_matching$partitions')
2022-07-17 18:49:01,532 INFO sqlalchemy.engine.Engine SELECT "comment" FROM "esg_matching"."esg_no_matching$properties"
2022-07-17 18:49:01,533 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00073s] ()
2022-07-17 18:49:01,824 INFO sqla

{'unique_id': '9', 'isin': 'GB00B1YW4409', 'lei': <sqlalchemy.sql.elements.Null object at 0x00000204E963EAC0>, 'sedol': 'B1YW440', 'company': 'Krusty Kr~ab', 'country': 'GBR'}
2022-07-17 18:49:24,214 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching.esg_match_tgt2 (unique_id, isin, lei, sedol, company, country) VALUES (?, ?, NULL, ?, ?, ?)
2022-07-17 18:49:24,215 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00080s] ('9', 'GB00B1YW4409', 'B1YW440', 'Krusty Kr~ab', 'GBR')
2022-07-17 18:49:25,864 INFO sqlalchemy.engine.Engine COMMIT


## 3. Creating matching/no-matching tables

In [18]:
# Paths of the JSON settings files that describe the matching and no-matching tables
match_settings = '../../../tests/data/howto/trino/test_matching_trino.json'
no_match_settings = '../../../tests/data/howto/trino/test_no_matching_trino.json'

In [19]:
# Create one File object per settings file
file_match = File(match_settings)
file_no_match = File(no_match_settings)

In [20]:
# Call the create_data_source() method by passing the File object of the matching table.
# Unlike load_file_to_db(), no file reader is passed here.
db_matching = etl_proc_obj.create_data_source(file_match)

2022-07-17 18:49:25,924 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:49:25,925 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00110s] ('esg_matching',)
2022-07-17 18:49:27,277 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:49:27,278 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00077s] ('esg_matching',)
2022-07-17 18:49:28,642 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:49:28,642 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00068s] ('esg_matching', 'esg_match_tgt2')
2022-07-17 18:49:29,808 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "informatio

2022-07-17 18:49:52,578 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:49:52,579 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00168s] ('esg_matching', 'esg_no_matching')
2022-07-17 18:49:53,753 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:49:53,754 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00070s] ('esg_matching', 'esg_no_matching$partitions')
2022-07-17 18:49:55,115 INFO sqlalchemy.engine.Engine SELECT "comment" FROM "esg_matching"."esg_no_matching$properties"
2022-07-17 18:49:55,116 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00113s] ()
2022-07-17 18:49:55,412 INFO sqla

In [21]:
# Call the create_data_source() method by passing the File object of the no-matching table.
# Unlike load_file_to_db(), no file reader is passed here.
db_no_matching = etl_proc_obj.create_data_source(file_no_match)

2022-07-17 18:50:02,462 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:50:02,463 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00188s] ('esg_matching',)
2022-07-17 18:50:03,849 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 18:50:03,850 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00114s] ('esg_matching',)
2022-07-17 18:50:05,227 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:50:05,228 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00108s] ('esg_matching', 'esg_match_tgt2')
2022-07-17 18:50:06,405 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "informatio

2022-07-17 18:50:29,352 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
  AND "table_name" = ?
2022-07-17 18:50:29,352 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00075s] ('esg_matching', 'esg_no_matching')
2022-07-17 18:50:30,514 INFO sqlalchemy.engine.Engine SELECT
    "column_name",
    "data_type",
    "column_default",
    UPPER("is_nullable") AS "is_nullable"
FROM "information_schema"."columns"
WHERE "table_schema" = ?
  AND "table_name" = ?
ORDER BY "ordinal_position" ASC
2022-07-17 18:50:30,515 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00109s] ('esg_matching', 'esg_no_matching$partitions')
2022-07-17 18:50:31,872 INFO sqlalchemy.engine.Engine SELECT "comment" FROM "esg_matching"."esg_no_matching$properties"
2022-07-17 18:50:31,872 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00108s] ()
2022-07-17 18:50:32,168 INFO sqla

## 4. Checking attribute mapping alias

In [22]:
# Show the alias mapping of the referential: each alias name points to a table column
db_ref.get_mapping_to_alias()

{'isin': Column('isin', String(length=12), table=<esg_match_ref>),
 'company': Column('company', String(length=100), table=<esg_match_ref>),
 'country': Column('country', String(length=100), table=<esg_match_ref>)}

In [23]:
# Show the alias mapping of target 1: each alias name points to a table column
db_tgt1.get_mapping_to_alias()

{'isin': Column('isin', String(length=12), table=<esg_match_tgt1>),
 'lei': Column('lei', String(length=20), table=<esg_match_tgt1>),
 'company': Column('company', String(length=255), table=<esg_match_tgt1>),
 'country': Column('country', String(length=255), table=<esg_match_tgt1>)}

In [24]:
# Show the alias mapping of target 2: each alias name points to a table column
db_tgt2.get_mapping_to_alias()

{'isin': Column('isin', String(length=12), table=<esg_match_tgt2>),
 'lei': Column('lei', String(length=20), table=<esg_match_tgt2>),
 'sedol': Column('sedol', String(length=20), table=<esg_match_tgt2>),
 'company': Column('company', String(length=100), table=<esg_match_tgt2>),
 'country': Column('country', String(length=100), table=<esg_match_tgt2>)}

## 5. Checking attribute mapping between sources and matching tables

In [25]:
# Show how the referential columns map to the attributes of the matching table
# (e.g. ref_id, ref_company, ref_country)
db_ref.get_mapping_to_matching()

{'ref_id': Column('unique_id', String(), table=<esg_match_ref>),
 'ref_company': Column('company', String(length=100), table=<esg_match_ref>),
 'ref_country': Column('country', String(length=100), table=<esg_match_ref>)}

In [26]:
# Show how the columns of target 1 map to the attributes of the matching table
# (e.g. tgt_id, tgt_company, tgt_country)
db_tgt1.get_mapping_to_matching()

{'tgt_id': Column('unique_id', String(), table=<esg_match_tgt1>),
 'isin': Column('isin', String(length=12), table=<esg_match_tgt1>),
 'lei': Column('lei', String(length=20), table=<esg_match_tgt1>),
 'tgt_company': Column('company', String(length=255), table=<esg_match_tgt1>),
 'tgt_country': Column('country', String(length=255), table=<esg_match_tgt1>)}

In [27]:
# Show how the columns of target 2 map to the attributes of the matching table
# (e.g. tgt_id, tgt_company, tgt_country)
db_tgt2.get_mapping_to_matching()

{'tgt_id': Column('unique_id', String(), table=<esg_match_tgt2>),
 'isin': Column('isin', String(length=12), table=<esg_match_tgt2>),
 'lei': Column('lei', String(length=20), table=<esg_match_tgt2>),
 'sedol': Column('sedol', String(length=20), table=<esg_match_tgt2>),
 'tgt_company': Column('company', String(length=100), table=<esg_match_tgt2>),
 'tgt_country': Column('country', String(length=100), table=<esg_match_tgt2>)}

## 6. Create policies for matching each target data source with the referential

In [28]:
# Import policy module
from esg_matching.matcher.policy import MatchingPolicy

In [29]:
# Create matching policy object for target1.
# 'matching_with_ref1' is the policy name shown in the policy definition of target 1.
policy_match_tgt1 = MatchingPolicy(db_tgt1, 'matching_with_ref1')

In [30]:
# Set the referential source and the matching/no-matching sources used by the policy of target 1
policy_match_tgt1.set_referential_source(db_ref)
policy_match_tgt1.set_matching_source(db_matching)
policy_match_tgt1.set_no_matching_source(db_no_matching)

In [31]:
# Create matching policy object for target2, using the same policy name as for target 1
policy_match_tgt2 = MatchingPolicy(db_tgt2, 'matching_with_ref1')

In [32]:
# Set the referential source and the matching/no-matching sources used by the policy of target 2
policy_match_tgt2.set_referential_source(db_ref)
policy_match_tgt2.set_matching_source(db_matching)
policy_match_tgt2.set_no_matching_source(db_no_matching)

## 7. Perform direct full matching (DFM) for each data source

In [33]:
# Import DFM module
from esg_matching.matcher.dfm import DbMatcherDfm

In [34]:
# Create a matcher object for DFM on the database connection
dfm_matcher_obj = DbMatcherDfm(db_conn)

In [35]:
# Perform DFM on target 1: assign the policy of target 1 to the matcher, then execute it
dfm_matcher_obj.set_policy(policy_match_tgt1)
dfm_matcher_obj.execute_matching()

2022-07-17 18:50:40,010 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching.esg_matching (ref_name, tgt_name, matching_type, matching_scope, matching_rule, tgt_id, isin, lei, tgt_company, tgt_country, ref_id, ref_company, ref_country) SELECT 'ds_ref' AS ref_name, 'ds_tgt1' AS tgt_name, 'direct' AS matching_type, 'full' AS matching_scope, 'isin' AS matching_rule, esg_matching.esg_match_tgt1.unique_id AS tgt_id, esg_matching.esg_match_tgt1.isin AS isin, esg_matching.esg_match_tgt1.lei AS lei, esg_matching.esg_match_tgt1.company AS tgt_company, esg_matching.esg_match_tgt1.country AS tgt_country, esg_matching.esg_match_ref.unique_id AS ref_id, esg_matching.esg_match_ref.company AS ref_company, esg_matching.esg_match_ref.country AS ref_country 
FROM esg_matching.esg_match_tgt1 JOIN esg_matching.esg_match_ref ON esg_matching.esg_match_tgt1.isin = esg_matching.esg_match_ref.isin
2022-07-17 18:50:40,011 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00183s] ()


In [36]:
# Perform DFM on target 2: the same matcher object is reused with the policy of target 2
dfm_matcher_obj.set_policy(policy_match_tgt2)
dfm_matcher_obj.execute_matching()

2022-07-17 18:50:46,258 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching.esg_matching (ref_name, tgt_name, matching_type, matching_scope, matching_rule, tgt_id, isin, lei, sedol, tgt_company, tgt_country, ref_id, ref_company, ref_country) SELECT 'ds_ref' AS ref_name, 'ds_tgt2' AS tgt_name, 'direct' AS matching_type, 'full' AS matching_scope, 'isin' AS matching_rule, esg_matching.esg_match_tgt2.unique_id AS tgt_id, esg_matching.esg_match_tgt2.isin AS isin, esg_matching.esg_match_tgt2.lei AS lei, esg_matching.esg_match_tgt2.sedol AS sedol, esg_matching.esg_match_tgt2.company AS tgt_company, esg_matching.esg_match_tgt2.country AS tgt_country, esg_matching.esg_match_ref.unique_id AS ref_id, esg_matching.esg_match_ref.company AS ref_company, esg_matching.esg_match_ref.country AS ref_country 
FROM esg_matching.esg_match_tgt2 JOIN esg_matching.esg_match_ref ON esg_matching.esg_match_tgt2.isin = esg_matching.esg_match_ref.isin
2022-07-17 18:50:46,260 INFO sqlalchemy.engine.Engine [dialec

## 8. Perform direct residual matching (DRM) for each data source

In [None]:
# Import DRM module
from esg_matching.matcher.drm import DbMatcherDrm

In [None]:
# Create a matcher object for DRM on the database connection
drm_matcher_obj = DbMatcherDrm(db_conn)

In [None]:
# Perform DRM on target 1: assign the policy of target 1 to the matcher, then execute it
drm_matcher_obj.set_policy(policy_match_tgt1)
drm_matcher_obj.execute_matching()

In [None]:
# Perform DRM on target 2: the same matcher object is reused with the policy of target 2
drm_matcher_obj.set_policy(policy_match_tgt2)
drm_matcher_obj.execute_matching()

## 9. Perform indirect residual matching (IRM) for each data source

In [None]:
# Import IRM module
from esg_matching.matcher.irm import DbMatcherIrm

In [None]:
# Create a matcher object for IRM
# (the variable keeps the "ifm" prefix, but the class used is DbMatcherIrm)
ifm_matcher_obj = DbMatcherIrm(db_conn)

In [None]:
# Perform IRM on target 1: assign the policy of target 1 to the matcher, then execute it
ifm_matcher_obj.set_policy(policy_match_tgt1)
ifm_matcher_obj.execute_matching()

In [None]:
# Perform IRM on target 2: the same matcher object is reused with the policy of target 2
ifm_matcher_obj.set_policy(policy_match_tgt2)
ifm_matcher_obj.execute_matching()

## 10. Retrieving the matching and no-matching tables

In [37]:
# Query the matching table and display the result
# (the last expression of the cell is rendered as the cell output)
df_matching = db_matching.get_data_as_df()
df_matching

2022-07-17 18:50:55,057 INFO sqlalchemy.engine.Engine SELECT esg_matching.esg_matching.matching_id, esg_matching.esg_matching.ref_name, esg_matching.esg_matching.tgt_name, esg_matching.esg_matching.matching_type, esg_matching.esg_matching.matching_scope, esg_matching.esg_matching.matching_rule, esg_matching.esg_matching.ref_id, esg_matching.esg_matching.ref_company, esg_matching.esg_matching.ref_country, esg_matching.esg_matching.tgt_id, esg_matching.esg_matching.tgt_company, esg_matching.esg_matching.tgt_country, esg_matching.esg_matching.isin, esg_matching.esg_matching.lei, esg_matching.esg_matching.sedol 
FROM esg_matching.esg_matching
2022-07-17 18:50:55,058 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00126s] ()


Unnamed: 0,matching_id,ref_name,tgt_name,matching_type,matching_scope,matching_rule,ref_id,ref_company,ref_country,tgt_id,tgt_company,tgt_country,isin,lei,sedol
0,,ds_ref,ds_tgt2,direct,full,isin,6,InGen,usa,4,InGen,usa,US0200021014,,2019952
1,,ds_ref,ds_tgt2,direct,full,isin,5,Bluth company,CHE,3,Bluth company,CHE,CH0012221716,,
2,,ds_ref,ds_tgt2,direct,full,isin,4,STERLING COOPER,GBR,6,STERLING COOPER,GBR,GB00B1YW4409,,B1YW440
3,,ds_ref,ds_tgt2,direct,full,isin,4,STERLING COOPER,GBR,9,Krusty Kr~ab,GBR,GB00B1YW4409,,B1YW440
4,,ds_ref,ds_tgt2,direct,full,isin,1,CENTRAL PERK,SK,7,CENTRAL PERK,SK,SK1120005824,,B1YW440
5,,ds_ref,ds_tgt2,direct,full,isin,9,SPECTRE 33 SUBSIDIARY,USA,5,SPECTRE,USA,US0126531013,,2046853
6,,ds_ref,ds_tgt2,direct,full,isin,8,SPECTRE,USA,5,SPECTRE,USA,US0126531013,,2046853
7,,ds_ref,ds_tgt1,direct,full,isin,4,STERLING COOPER,GBR,5,Krusty Krab,GB,GB00B1YW4409,,
8,,ds_ref,ds_tgt1,direct,full,isin,1,CENTRAL PERK,SK,1,CENTRAL PERK AND SONS,DE,SK1120005824,097900BHK10000084115,
9,,ds_ref,ds_tgt1,direct,full,isin,9,SPECTRE 33 SUBSIDIARY,USA,8,SPECTRE UNIVERSAL LIMITED,USA,US0126531013,HDBLS2Q6GV1LSKQPBS54,


In [None]:
# Query the no-matching table and display the result
# (the last expression of the cell is rendered as the cell output)
df_no_matching = db_no_matching.get_data_as_df()
df_no_matching

## 11. Close database connection

In [None]:
# Close the connection with the database
db_conn.disconnect()

In [None]:
# Check the connection status after disconnecting
db_conn.is_connected()