# How to... perform exact matching on a SQLite database

In [1]:
# Make the esgmatching package importable from this notebook by appending the
# directory three levels up to the module search path.
# The path is relative, so it resolves against the kernel's working directory
# (assumed to be this notebook's folder - TODO confirm when running from elsewhere).
import sys
sys.path.append('../../../')

In [2]:
# Import the module for connecting to a SQLite database
from esgmatching.db_engine.engines.connector_sql_lite import SqlLiteConnector

In [3]:
# Import the modules for file management
from esgmatching.file_reader.file import File
from esgmatching.file_reader.file_reader_csv import FileReaderCsv

In [4]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esgmatching.processing.etl_processing import EtlProcessing

## 1. Database setup

In [5]:
# Location of the SQLite database file, expressed relative to this jupyter notebook.
# With the value below, the file is test_esg_matching.db in the tests/data/sqlite folder,
# three directory levels up from here.
path_db = '../../../tests/data/sqlite/test_esg_matching.db'

In [6]:
# The database connector is represented by the class SqlLiteConnector.
# Here it is only instantiated; its properties are set and connect() is called afterwards.
db_conn = SqlLiteConnector()

In [7]:
# The connect() method of the SqlLiteConnector is used to establish a connection with the
# database if it exists, or to create a new one.
# The property path_db defines the location and name of the database.
# The property show_sql_statement indicates whether the SQL statements are echoed (printed)
# in the default output channel; it is switched off here.
db_conn.path_db = path_db
db_conn.show_sql_statement = False
db_conn.connect()

In [8]:
# Check whether the connection was established (the recorded output below is True)
db_conn.is_connected()

True

## 2. Load the data into database from csv files

In [9]:
# Create an ETL process object, passing it the database connector
etl_proc_obj = EtlProcessing(db_conn)

In [10]:
# Create a file reader object for csv files; the same reader is reused for every load below
csv_reader_obj = FileReaderCsv()  

In [11]:
# Referential data source: build a File object from its JSON settings file
ref_settings = '../../../tests/data/sqlite/test_referential1_sqlite.json'
ref_obj = File(ref_settings)

In [12]:
# Load data from the REFERENTIAL file into the database.
# The returned object (db_ref) is what later cells use as the referential data source.
db_ref = etl_proc_obj.load_file_to_db(ref_obj, csv_reader_obj)

../../../tests/data/test_referential1.csv


In [13]:
# Target data source 1: build a File object from its JSON settings file
tgt1_settings = '../../../tests/data/sqlite/test_target1_sqlite.json'
tgt1_obj = File(tgt1_settings)

In [14]:
# Load data from TARGET 1 into the database.
# The returned object (db_tgt1) is what later cells use as the target-1 data source.
db_tgt1 = etl_proc_obj.load_file_to_db(tgt1_obj, csv_reader_obj)

../../../tests/data/test_target1.csv


In [15]:
# Display the policy definition attached to target 1.
# The recorded output holds one policy, 'matching_with_ref1', with rules under 'dfm', 'drm' and 'ifm'.
db_tgt1.get_policy_definition()

{'matching_with_ref1': {'dfm': {'isin': ['ISIN']},
  'drm': {'name+country': ['COMPANY', 'COUNTRY']},
  'ifm': {'isin': ['ISIN'], 'name+country': ['COMPANY', 'COUNTRY']}}}

In [16]:
# Target data source 2: build a File object from its JSON settings file
tgt2_settings = '../../../tests/data/sqlite/test_target2_sqlite.json'
tgt2_obj = File(tgt2_settings)

In [17]:
# Load data from TARGET 2 into the database.
# The returned object (db_tgt2) is what later cells use as the target-2 data source.
db_tgt2 = etl_proc_obj.load_file_to_db(tgt2_obj, csv_reader_obj)

../../../tests/data/test_target2.csv


## 3. Creating matching/no-matching tables

In [18]:
# Paths of the JSON settings files for the matching and the no-matching tables
match_settings = '../../../tests/data/sqlite/test_matching_sqlite.json'
no_match_settings = '../../../tests/data/sqlite/test_no_matching_sqlite.json'

In [19]:
# Build one File object per settings file
file_match = File(match_settings)
file_no_match = File(no_match_settings)

In [20]:
# Create the matching data source by calling create_data_source() with its File object
db_matching = etl_proc_obj.create_data_source(file_match)

In [21]:
# Create the no-matching data source by calling create_data_source() with its File object
db_no_matching = etl_proc_obj.create_data_source(file_no_match)

## 4. Checking attribute mapping alias

In [22]:
# Display the alias -> table column mapping of the referential source
db_ref.get_mapping_to_alias()

{'ISIN': Column('ISIN', String(), table=<ESG_MATCH_REF>),
 'COMPANY': Column('COMPANY', String(), table=<ESG_MATCH_REF>),
 'COUNTRY': Column('COUNTRY', String(), table=<ESG_MATCH_REF>)}

In [23]:
# Display the alias -> table column mapping of target 1
db_tgt1.get_mapping_to_alias()

{'ISIN': Column('ISIN', String(), table=<ESG_MATCH_TGT1>),
 'LEI': Column('LEI', String(), table=<ESG_MATCH_TGT1>),
 'COMPANY': Column('COMPANY', String(), table=<ESG_MATCH_TGT1>),
 'COUNTRY': Column('COUNTRY', String(), table=<ESG_MATCH_TGT1>)}

In [24]:
# Display the alias -> table column mapping of target 2
db_tgt2.get_mapping_to_alias()

{'ISIN': Column('ISIN', String(), table=<ESG_MATCH_TGT2>),
 'LEI': Column('LEI', String(), table=<ESG_MATCH_TGT2>),
 'SEDOL': Column('SEDOL', String(), table=<ESG_MATCH_TGT2>),
 'COMPANY': Column('COMPANY', String(), table=<ESG_MATCH_TGT2>),
 'COUNTRY': Column('COUNTRY', String(), table=<ESG_MATCH_TGT2>)}

## 5. Checking attribute mapping between sources and matching tables

In [25]:
# Display how the referential's columns map to matching-table names
# (e.g. REF_ID is taken from the UNIQUE_ID column in the recorded output)
db_ref.get_mapping_to_matching()

{'REF_ID': Column('UNIQUE_ID', Integer(), table=<ESG_MATCH_REF>, primary_key=True, nullable=False),
 'REF_COMPANY': Column('COMPANY', String(), table=<ESG_MATCH_REF>),
 'REF_COUNTRY': Column('COUNTRY', String(), table=<ESG_MATCH_REF>)}

In [26]:
# Display how target 1's columns map to matching-table names
# (e.g. TGT_ID is taken from the UNIQUE_ID column in the recorded output)
db_tgt1.get_mapping_to_matching()

{'TGT_ID': Column('UNIQUE_ID', Integer(), table=<ESG_MATCH_TGT1>, primary_key=True, nullable=False),
 'ISIN': Column('ISIN', String(), table=<ESG_MATCH_TGT1>),
 'LEI': Column('LEI', String(), table=<ESG_MATCH_TGT1>),
 'TGT_COMPANY': Column('COMPANY', String(), table=<ESG_MATCH_TGT1>),
 'TGT_COUNTRY': Column('COUNTRY', String(), table=<ESG_MATCH_TGT1>)}

In [27]:
# Display how target 2's columns map to matching-table names
# (e.g. TGT_ID is taken from the UNIQUE_ID column in the recorded output)
db_tgt2.get_mapping_to_matching()

{'TGT_ID': Column('UNIQUE_ID', Integer(), table=<ESG_MATCH_TGT2>, primary_key=True, nullable=False),
 'ISIN': Column('ISIN', String(), table=<ESG_MATCH_TGT2>),
 'LEI': Column('LEI', String(), table=<ESG_MATCH_TGT2>),
 'SEDOL': Column('SEDOL', String(), table=<ESG_MATCH_TGT2>),
 'TGT_COMPANY': Column('COMPANY', String(), table=<ESG_MATCH_TGT2>),
 'TGT_COUNTRY': Column('COUNTRY', String(), table=<ESG_MATCH_TGT2>)}

## 6. Create policies for matching each target data source with the referential

In [28]:
# Import policy module
from esgmatching.db_matcher.matching_policy import MatchingPolicy

In [29]:
# Create the matching policy object for target 1, selecting the policy named 'matching_with_ref1'
policy_match_tgt1 = MatchingPolicy(db_tgt1, 'matching_with_ref1')

In [30]:
# Attach the referential source and the matching / no-matching sources to the target-1 policy
policy_match_tgt1.set_referential_source(db_ref)
policy_match_tgt1.set_matching_source(db_matching)
policy_match_tgt1.set_no_matching_source(db_no_matching)

In [31]:
# Create the matching policy object for target 2, selecting the policy named 'matching_with_ref1'
policy_match_tgt2 = MatchingPolicy(db_tgt2, 'matching_with_ref1')

In [32]:
# Attach the referential source and the matching / no-matching sources to the target-2 policy.
# The same db_ref, db_matching and db_no_matching objects are shared with the target-1 policy.
policy_match_tgt2.set_referential_source(db_ref)
policy_match_tgt2.set_matching_source(db_matching)
policy_match_tgt2.set_no_matching_source(db_no_matching)

## 7. Perform direct full matching (DFM) for each data source

In [33]:
# Import DFM module
from esgmatching.db_matcher.db_matcher_dfm import DbMatcherDfm

In [34]:
# Create a matcher object for DFM (direct full matching), passing it the database connector
dfm_matcher_obj = DbMatcherDfm(db_conn)

In [35]:
# Perform DFM on target 1: select the target-1 policy, then run the matching
dfm_matcher_obj.set_policy(policy_match_tgt1)
dfm_matcher_obj.execute_matching()

In [36]:
# Perform DFM on target 2: switch the same matcher to the target-2 policy, then run the matching
dfm_matcher_obj.set_policy(policy_match_tgt2)
dfm_matcher_obj.execute_matching()

## 8. Perform direct residual matching (DRM) for each data source

In [37]:
# Import DRM module
from esgmatching.db_matcher.db_matcher_drm import DbMatcherDrm

In [38]:
# Create a matcher object for DRM (direct residual matching), passing it the database connector
drm_matcher_obj = DbMatcherDrm(db_conn)

In [39]:
# Perform DRM on target 1: select the target-1 policy, then run the matching
drm_matcher_obj.set_policy(policy_match_tgt1)
drm_matcher_obj.execute_matching()

In [40]:
# Perform DRM on target 2: switch the same matcher to the target-2 policy, then run the matching
drm_matcher_obj.set_policy(policy_match_tgt2)
drm_matcher_obj.execute_matching()

## 10. Perform indirect full matching (IFM) for each data source

In [41]:
# Import IFM module
from esgmatching.db_matcher.db_matcher_ifm import DbMatcherIfm

In [42]:
# Create a matcher object for IFM (indirect full matching), passing it the database connector
ifm_matcher_obj = DbMatcherIfm(db_conn)

In [43]:
# Perform IFM on target 1: select the target-1 policy, then run the matching.
# NOTE(review): in the recorded run this cell raised sqlite3.OperationalError
# ("no such column: ESG_MATCHING.ISIN"). The echoed SQL joins ESG_NO_MATCHING with
# ESG_MATCH_REF, but its ON clause compares against "ESG_MATCHING"."ISIN", a table that
# is not in the FROM/JOIN list. This looks like a defect in how the IFM join condition is
# built (or in the policy/settings used) - TODO confirm in the library before relying on IFM.
ifm_matcher_obj.set_policy(policy_match_tgt1)
ifm_matcher_obj.execute_matching()

OperationalError: (sqlite3.OperationalError) no such column: ESG_MATCHING.ISIN
[SQL: INSERT INTO "ESG_MATCHING" ("REF_NAME", "TGT_NAME", "MATCHING_TYPE", "MATCHING_SCOPE", "MATCHING_RULE", "TGT_ID", "ISIN", "LEI", "TGT_COMPANY", "TGT_COUNTRY", "REF_ID", "REF_COMPANY", "REF_COUNTRY") SELECT 'DS_REF' AS "REF_NAME", 'DS_TGT1' AS "TGT_NAME", 'indirect' AS "MATCHING_TYPE", 'full' AS "MATCHING_SCOPE", 'isin' AS "MATCHING_RULE", "ESG_NO_MATCHING"."TGT_ID", "ESG_NO_MATCHING"."ISIN", "ESG_NO_MATCHING"."LEI", "ESG_NO_MATCHING"."TGT_COMPANY", "ESG_NO_MATCHING"."TGT_COUNTRY", "ESG_MATCH_REF"."UNIQUE_ID" AS "REF_ID", "ESG_MATCH_REF"."COMPANY" AS "REF_COMPANY", "ESG_MATCH_REF"."COUNTRY" AS "REF_COUNTRY" 
FROM "ESG_NO_MATCHING" JOIN "ESG_MATCH_REF" ON "ESG_NO_MATCHING"."ISIN" = "ESG_MATCHING"."ISIN" 
WHERE "ESG_NO_MATCHING"."TGT_NAME" = ?]
[parameters: ('DS_TGT1',)]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [None]:
# Perform IFM on target 2: switch the same matcher to the target-2 policy, then run the matching.
# NOTE(review): this cell carries no execution count, so it was presumably not run in the
# recorded session - confirm it works once IFM on target 1 succeeds.
ifm_matcher_obj.set_policy(policy_match_tgt2)
ifm_matcher_obj.execute_matching()

## 11. Close database connection

In [None]:
# Close the database connection opened at the start of the notebook
db_conn.disconnect()

In [None]:
# Check the connection state after disconnecting
db_conn.is_connected()

In [None]:
# Scratch example (unrelated to the matching workflow): two name lists of different
# lengths and a lazy zip that pairs them position by position.
a = [
    "John",
    "Charles",
    "Mike",
]
b = [
    "Jenny",
    "Christy",
    "Monica",
    "Vicky",
]
# zip stops at the shorter input, so the fourth entry of b is never paired.
x = zip(a, b)

In [None]:
# Scratch example: print the first member of each (a, b) pair.
# The second member is unpacked but deliberately unused.
for first, _second in zip(a, b):
    print(first)