# How to... perform exact matching on a SQLite database

In [1]:
# Sets up the location of the api relative to this notebook.
# '../../../' is a relative path, so it is resolved against the kernel's working directory
# (normally the folder that contains this notebook); it must be added before the
# esg_matching imports in the following cells.
import sys
sys.path.append('../../../')

In [2]:
# Import the module for connection to a SQLite database
from esg_matching.engine.connectors.sql_lite import SqlLiteConnector

In [3]:
# Import the modules for file management
from esg_matching.file_reader.file import File
from esg_matching.file_reader.csv_reader import FileReaderCsv

In [4]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esg_matching.processing.etl import EtlProcessing

## 1. Database setup

In [5]:
# Location of the database file to be created, relative to this jupyter notebook.
# The path points to the tests/data/howto/sqlite folder, three levels above the notebook's folder.
path_db = '../../../tests/data/howto/sqlite/test_esg_matching.db'

In [6]:
# The database connector is represented by the class SqlLiteConnector.
# The object is created without arguments; the database location is assigned in the next step.
db_conn = SqlLiteConnector()

In [7]:
# The connect() method of the SqlLiteConnector is used to establish a connection with the database if it exists,
# or to create a new one. The property path_db defines the location and name of the database.
# The property show_sql_statement indicates whether the SQL statements are echoed (printed) to the default output channel.
db_conn.path_db = path_db
db_conn.show_sql_statement = False
db_conn.connect()

In [8]:
# Check if the connection was established (the cell output should be True)
db_conn.is_connected()

True

## 2. Load the data into the database from CSV files

In [9]:
# Create an ETL process object, handing it the database connector created above
etl_proc_obj = EtlProcessing(db_conn)

In [10]:
# Create a file reader object for csv files; the same reader is reused for every file loaded below
csv_reader_obj = FileReaderCsv()  

In [11]:
# Referential Data Source: build a File object from its JSON settings file
ref_settings = '../../../tests/data/howto/sqlite/test_referential1_sqlite.json'
ref_obj = File(ref_settings)

In [12]:
# Load data from the REFERENTIAL file, read with the csv reader.
# The returned object (db_ref) is used in later steps to inspect the attribute mappings
# and as the referential source of the matching policies.
db_ref = etl_proc_obj.load_file_to_db(ref_obj, csv_reader_obj)

{'unique_id': '1', 'isin': 'SK1120005824', 'company': 'CENTRAL PERK', 'country': 'SK'}
{'unique_id': '2', 'isin': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'company': 'HONEYDUKES', 'country': 'UNITED STATES OF AMERICA'}
{'unique_id': '3', 'isin': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'company': 'STARCOURT MALL', 'country': 'AUSTRIA'}
{'unique_id': '4', 'isin': 'GB00B1YW4409', 'company': 'STERLING COOPER', 'country': 'GBR'}
{'unique_id': '5', 'isin': 'CH0012221716', 'company': 'Bluth company', 'country': 'CHE'}
{'unique_id': '6', 'isin': 'US0200021014', 'company': 'InGen', 'country': 'usa'}
{'unique_id': '7', 'isin': 'US0231351067', 'company': 'Stark Industries', 'country': 'us'}
{'unique_id': '8', 'isin': 'US0126531013', 'company': 'SPECTRE', 'country': 'USA'}
{'unique_id': '9', 'isin': 'US0126531013', 'company': 'SPECTRE 33 SUBSIDIARY', 'country': 'USA'}


In [13]:
# Target Data Source 1: build a File object from its JSON settings file
tgt1_settings = '../../../tests/data/howto/sqlite/test_target1_sqlite.json'
tgt1_obj = File(tgt1_settings)

In [14]:
# Load data from TARGET 1, read with the csv reader.
# The returned object (db_tgt1) is used in later steps to inspect mappings and to build its matching policy.
db_tgt1 = etl_proc_obj.load_file_to_db(tgt1_obj, csv_reader_obj)

{'unique_id': '1', 'isin': 'SK1120005824', 'lei': '097900BHK10000084115', 'company': 'CENTRAL PERK AND SONS', 'country': 'DE'}
{'unique_id': '2', 'isin': 'DE0005545503', 'lei': '5299003VKVDCUPSS5X23', 'company': 'DUNDER MIFFLINS [12345]', 'country': 'de'}
{'unique_id': '3', 'isin': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'lei': '254900B1P3S786KDAW57', 'company': 'HONEYDUKES (adm@honeydukes.com)', 'country': 'UNITED STATES OF AMERICA'}
{'unique_id': '4', 'isin': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'lei': '529900MVZ2YHFZV3K546', 'company': 'STARCOURT MALL', 'country': 'AUSTRIA'}
{'unique_id': '5', 'isin': 'GB00B1YW4409', 'lei': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'company': 'Krusty Krab', 'country': 'GB'}
{'unique_id': '6', 'isin': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'lei': '3003007MR0P683GYR674', 'company': 'RR DINER', 'country': 'CHINA'}
{'unique_id': '7', 'isin': 'US0231351067', 'lei': 'ZXT

In [15]:
# Show the matching policy definition of target 1: for each policy name, the attributes
# used by each matching type (dfm, drm, irm)
db_tgt1.get_policy_definition()

{'matching_with_ref1': {'dfm': {'isin': ['isin']},
  'drm': {'name+country': ['company', 'country']},
  'irm': {'isin': ['isin'],
   'lei': ['lei'],
   'name+country': ['company', 'country']}}}

In [16]:
# Target Data Source 2: build a File object from its JSON settings file
tgt2_settings = '../../../tests/data/howto/sqlite/test_target2_sqlite.json'
tgt2_obj = File(tgt2_settings)

In [17]:
# Load data from TARGET 2, read with the csv reader.
# The returned object (db_tgt2) is used in later steps to inspect mappings and to build its matching policy.
db_tgt2 = etl_proc_obj.load_file_to_db(tgt2_obj, csv_reader_obj)

{'unique_id': '1', 'isin': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'lei': '097900BHK10000084115', 'sedol': '7108899', 'company': 'HONEYDUKES', 'country': 'UNITED STATES'}
{'unique_id': '2', 'isin': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'lei': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'sedol': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'company': 'STARCOURT MALL', 'country': 'AUSTRIA'}
{'unique_id': '3', 'isin': 'CH0012221716', 'lei': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'sedol': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'company': 'Bluth company', 'country': 'CHE'}
{'unique_id': '4', 'isin': 'US0200021014', 'lei': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'sedol': '2019952', 'company': 'InGen', 'country': 'usa'}
{'unique_id': '5', 'isin': 'US0126531013', 'lei': <sqlalchemy.sql.elements.Null object at 0x0000028E9456B4C0>, 'sedol': '2046853', 'com

## 3. Creating matching/no-matching tables

In [18]:
# Settings files (JSON) for the matching and the no-matching tables
match_settings = '../../../tests/data/howto/sqlite/test_matching_sqlite.json'
no_match_settings = '../../../tests/data/howto/sqlite/test_no_matching_sqlite.json'

In [19]:
# Build one File object per settings file, as done for the referential and target sources
file_match = File(match_settings)
file_no_match = File(no_match_settings)

In [20]:
# Call the create_data_source() method by passing the File built from the matching settings.
# The returned object (db_matching) is later set as the matching source of each policy.
db_matching = etl_proc_obj.create_data_source(file_match)

In [21]:
# Call the create_data_source() method by passing the File built from the no-matching settings.
# The returned object (db_no_matching) is later set as the no-matching source of each policy.
db_no_matching = etl_proc_obj.create_data_source(file_no_match)

## 4. Checking attribute mapping alias

In [22]:
# Show the alias -> table column mapping of the referential source
db_ref.get_mapping_to_alias()

{'isin': Column('isin', String(), table=<esg_match_ref>),
 'company': Column('company', String(), table=<esg_match_ref>),
 'country': Column('country', String(), table=<esg_match_ref>)}

In [23]:
# Show the alias -> table column mapping of target 1
db_tgt1.get_mapping_to_alias()

{'isin': Column('isin', String(), table=<esg_match_tgt1>),
 'lei': Column('lei', String(), table=<esg_match_tgt1>),
 'company': Column('company', String(), table=<esg_match_tgt1>),
 'country': Column('country', String(), table=<esg_match_tgt1>)}

In [24]:
# Show the alias -> table column mapping of target 2
db_tgt2.get_mapping_to_alias()

{'isin': Column('isin', String(), table=<esg_match_tgt2>),
 'lei': Column('lei', String(), table=<esg_match_tgt2>),
 'sedol': Column('sedol', String(), table=<esg_match_tgt2>),
 'company': Column('company', String(), table=<esg_match_tgt2>),
 'country': Column('country', String(), table=<esg_match_tgt2>)}

## 5. Checking attribute mapping between sources and matching tables

In [25]:
# Show which referential columns feed which attributes of the matching table (ref_id, ref_company, ref_country)
db_ref.get_mapping_to_matching()

{'ref_id': Column('unique_id', Integer(), table=<esg_match_ref>, primary_key=True, nullable=False),
 'ref_company': Column('company', String(), table=<esg_match_ref>),
 'ref_country': Column('country', String(), table=<esg_match_ref>)}

In [26]:
# Show which target 1 columns feed which attributes of the matching table (tgt_id, isin, lei, ...)
db_tgt1.get_mapping_to_matching()

{'tgt_id': Column('unique_id', Integer(), table=<esg_match_tgt1>, primary_key=True, nullable=False),
 'isin': Column('isin', String(), table=<esg_match_tgt1>),
 'lei': Column('lei', String(), table=<esg_match_tgt1>),
 'tgt_company': Column('company', String(), table=<esg_match_tgt1>),
 'tgt_country': Column('country', String(), table=<esg_match_tgt1>)}

In [27]:
# Show which target 2 columns feed which attributes of the matching table (tgt_id, isin, lei, sedol, ...)
db_tgt2.get_mapping_to_matching()

{'tgt_id': Column('unique_id', Integer(), table=<esg_match_tgt2>, primary_key=True, nullable=False),
 'isin': Column('isin', String(), table=<esg_match_tgt2>),
 'lei': Column('lei', String(), table=<esg_match_tgt2>),
 'sedol': Column('sedol', String(), table=<esg_match_tgt2>),
 'tgt_company': Column('company', String(), table=<esg_match_tgt2>),
 'tgt_country': Column('country', String(), table=<esg_match_tgt2>)}

## 6. Create policies for matching each target data source with the referential

In [28]:
# Import policy module
from esg_matching.matcher.policy import MatchingPolicy

In [29]:
# Create matching policy object for target1.
# 'matching_with_ref1' is the policy name that appears in the target's policy definition.
policy_match_tgt1 = MatchingPolicy(db_tgt1, 'matching_with_ref1')

In [30]:
# Set the referential and matching/no-matching sources for the target1 policy
policy_match_tgt1.set_referential_source(db_ref)
policy_match_tgt1.set_matching_source(db_matching)
policy_match_tgt1.set_no_matching_source(db_no_matching)

In [31]:
# Create matching policy object for target2.
# The same policy name ('matching_with_ref1') is used as for target1.
policy_match_tgt2 = MatchingPolicy(db_tgt2, 'matching_with_ref1')

In [32]:
# Set the referential and matching/no-matching sources for the target2 policy.
# Both policies share the same referential, matching and no-matching sources.
policy_match_tgt2.set_referential_source(db_ref)
policy_match_tgt2.set_matching_source(db_matching)
policy_match_tgt2.set_no_matching_source(db_no_matching)

## 7. Perform direct full matching (DFM) for each data source

In [33]:
# Import DFM module
from esg_matching.matcher.dfm import DbMatcherDfm

In [34]:
# Create a matcher object for DFM (direct full matching), handing it the database connector
dfm_matcher_obj = DbMatcherDfm(db_conn)

In [35]:
# Perform DFM on target 1: select its policy, then run the matching
dfm_matcher_obj.set_policy(policy_match_tgt1)
dfm_matcher_obj.execute_matching()

In [36]:
# Perform DFM on target 2: the same matcher object is reused with the target 2 policy
dfm_matcher_obj.set_policy(policy_match_tgt2)
dfm_matcher_obj.execute_matching()

## 8. Perform direct residual matching (DRM) for each data source

In [37]:
# Import DRM module
from esg_matching.matcher.drm import DbMatcherDrm

In [38]:
# Create a matcher object for DRM (direct residual matching), handing it the database connector
drm_matcher_obj = DbMatcherDrm(db_conn)

In [39]:
# Perform DRM on target 1: select its policy, then run the matching
drm_matcher_obj.set_policy(policy_match_tgt1)
drm_matcher_obj.execute_matching()

In [40]:
# Perform DRM on target 2: the same matcher object is reused with the target 2 policy
drm_matcher_obj.set_policy(policy_match_tgt2)
drm_matcher_obj.execute_matching()

## 9. Perform indirect residual matching (IRM) for each data source

In [41]:
# Import IRM module
from esg_matching.matcher.irm import DbMatcherIrm

In [42]:
# Create a matcher object for IRM (indirect residual matching).
# Note: the variable name says "ifm", but the class instantiated is DbMatcherIrm.
ifm_matcher_obj = DbMatcherIrm(db_conn)

In [43]:
# Perform IRM on target 1: select its policy, then run the matching
ifm_matcher_obj.set_policy(policy_match_tgt1)
ifm_matcher_obj.execute_matching()

In [44]:
# Perform IRM on target 2: the same matcher object is reused with the target 2 policy
ifm_matcher_obj.set_policy(policy_match_tgt2)
ifm_matcher_obj.execute_matching()

## 10. Retrieving the matching and no-matching tables

In [45]:
# Query the matching table and display the result
# (one row per matched referential/target pair, with the rule that produced the match)
df_matching = db_matching.get_data_as_df()
df_matching

Unnamed: 0,timestamp,matching_id,ref_name,tgt_name,matching_type,matching_scope,matching_rule,ref_id,ref_company,ref_country,tgt_id,tgt_company,tgt_country,isin,lei,sedol
0,2022-11-02 10:41:44,1,ds_ref,ds_tgt1,direct,full,isin,1,CENTRAL PERK,SK,1,CENTRAL PERK AND SONS,DE,SK1120005824,097900BHK10000084115,
1,2022-11-02 10:41:44,2,ds_ref,ds_tgt1,direct,full,isin,4,STERLING COOPER,GBR,5,Krusty Krab,GB,GB00B1YW4409,,
2,2022-11-02 10:41:44,3,ds_ref,ds_tgt1,direct,full,isin,7,Stark Industries,us,7,Starks Industries and Association,usa,US0231351067,ZXTILKJKG63JELOEG630,
3,2022-11-02 10:41:44,4,ds_ref,ds_tgt1,direct,full,isin,8,SPECTRE,USA,8,SPECTRE UNIVERSAL LIMITED,USA,US0126531013,HDBLS2Q6GV1LSKQPBS54,
4,2022-11-02 10:41:44,5,ds_ref,ds_tgt1,direct,full,isin,9,SPECTRE 33 SUBSIDIARY,USA,8,SPECTRE UNIVERSAL LIMITED,USA,US0126531013,HDBLS2Q6GV1LSKQPBS54,
5,2022-11-02 10:41:45,6,ds_ref,ds_tgt2,direct,full,isin,5,Bluth company,CHE,3,Bluth company,CHE,CH0012221716,,
6,2022-11-02 10:41:45,7,ds_ref,ds_tgt2,direct,full,isin,6,InGen,usa,4,InGen,usa,US0200021014,,2019952
7,2022-11-02 10:41:45,8,ds_ref,ds_tgt2,direct,full,isin,8,SPECTRE,USA,5,SPECTRE,USA,US0126531013,,2046853
8,2022-11-02 10:41:45,9,ds_ref,ds_tgt2,direct,full,isin,9,SPECTRE 33 SUBSIDIARY,USA,5,SPECTRE,USA,US0126531013,,2046853
9,2022-11-02 10:41:45,10,ds_ref,ds_tgt2,direct,full,isin,4,STERLING COOPER,GBR,6,STERLING COOPER,GBR,GB00B1YW4409,,B1YW440


In [46]:
# Query the no-matching table and display the result
# (target rows listed here carry no referential columns)
df_no_matching = db_no_matching.get_data_as_df()
df_no_matching

Unnamed: 0,timestamp,matching_id,tgt_name,tgt_id,tgt_company,tgt_country,isin,lei,sedol
0,2022-11-02 10:41:44,1,ds_tgt1,2,DUNDER MIFFLINS [12345],de,DE0005545503,5299003VKVDCUPSS5X23,
1,2022-11-02 10:41:44,2,ds_tgt1,3,HONEYDUKES (adm@honeydukes.com),UNITED STATES OF AMERICA,,254900B1P3S786KDAW57,
2,2022-11-02 10:41:44,4,ds_tgt1,6,RR DINER,CHINA,,3003007MR0P683GYR674,
3,2022-11-02 10:41:45,7,ds_tgt2,8,DUNDER MIFFLINS,Germany,DE0005545503,,5299003VKVDCUPSS5X23


## 11. Close database connection

In [47]:
# Close the connection to the database
db_conn.disconnect()

In [48]:
# Confirm that the connection is closed (the cell output should be False)
db_conn.is_connected()

False