# How to...perform exact matching on trustable attributes

In [1]:
# Make the project root importable so the `esgmatching` package used in the
# following cells can be found. The path is relative to this notebook.
# The guard keeps the cell idempotent: re-running it does not append the
# same entry to sys.path a second time.
import sys

project_root = '../../../'
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Import the module for accessing a database
from esgmatching.dbmanager.SqlEngine import SqlEngine

In [3]:
# Import the module for reading csv files
from esgmatching.reader.FileReaderCsvToDB import FileReaderCsvToDB

In [4]:
# Import the module for performing exact matching on data stored in the database
from esgmatching.matcher.ExactMatcherDB import ExactMatcherDB

## 1. Settings

In [5]:
# Location of the database to be created, relative to this Jupyter notebook.
# The database will be created in the /data/database folder, under the project main folder (EntityMatching)
path_db = '../../../data/database/'

In [6]:
# Connection string used for SQLite. Other databases might require different information.
# In this example the connection string is a combination of [sqlite prefix] + [database path] + [database name].
str_connection = 'sqlite:///' + path_db + 'entitymatching.db'
# Echo the resulting connection string as the cell output
str_connection

'sqlite:///../../../data/database/entitymatching.db'

In [7]:
# The database engine object is created by passing it the connection string
sqlengine_obj = SqlEngine(str_connection)

In [2]:
# The connect() method of the SqlEngine is used to establish a connection with the database if it exists,
# or to create a new one otherwise. The parameter show_echo is False by default and indicates whether the SQL statements
# are echoed (printed) to the default output channel. Here we set show_echo=True to see the SQL statements.
# NOTE: this cell needs `sqlengine_obj` from the previous cell -- run the notebook from top to bottom,
# otherwise it fails with a NameError.
sqlengine_obj.connect(show_echo=True)

NameError: name 'sqlengine_obj' is not defined

In [1]:
# Check whether the connection was established
# (requires `sqlengine_obj` from the cells above, so run the notebook in order)
sqlengine_obj.is_connected()

NameError: name 'sqlengine_obj' is not defined

## 2. Read the csv files into the database

In [9]:
# Location of the test CSV files and their JSON mapping files,
# relative to this notebook
path_test = '../../../data/test/'

In [10]:
# CSV file for data source 1 (read below as the referential data source)
file1_path = path_test + 'test_data_source1.csv'
file1_path

'../../../data/test/test_data_source1.csv'

In [11]:
# JSON data-mapping file for data source 1
file1_map = path_test + 'test_data_source1.json'
file1_map

'../../../data/test/test_data_source1.json'

In [12]:
# Create the file reader (FileReaderCsvToDB) that will be used to read all the CSV files below
csvreader_obj = FileReaderCsvToDB()

In [13]:
# Hand the database engine to the file reader.
# NOTE(review): use_session=True presumably makes the reader work through a database session -- confirm in FileReaderCsvToDB.
csvreader_obj.set_database_engine(sqlengine_obj, use_session=True)

In [14]:
# Read 'test_data_source1.csv' (comma-delimited) together with its mapping file.
# The returned object is kept as the referential data source for the matching step.
# NOTE(review): chunk_size=10 presumably sets how many rows are processed per batch -- confirm in FileReaderCsvToDB.
ref_data_source = csvreader_obj.read_file(file1_path, file1_map, delimiter=',', chunk_size=10)

2021-09-09 10:53:40,438 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:53:40,442 INFO sqlalchemy.engine.Engine 
CREATE TABLE data_source_ref (
	idx INTEGER NOT NULL, 
	data_source VARCHAR, 
	isin VARCHAR, 
	isvalid_isin BOOLEAN, 
	company_name VARCHAR, 
	original_company_name VARCHAR, 
	country VARCHAR, 
	country_alpha2 VARCHAR, 
	country_alpha3 VARCHAR, 
	original_country VARCHAR, 
	PRIMARY KEY (idx)
)


2021-09-09 10:53:40,443 INFO sqlalchemy.engine.Engine [no key 0.00113s] ()
2021-09-09 10:53:40,463 INFO sqlalchemy.engine.Engine COMMIT
2021-09-09 10:53:40,658 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:53:40,660 INFO sqlalchemy.engine.Engine INSERT INTO data_source_ref (data_source, isin, isvalid_isin, company_name, original_company_name, country, country_alpha2, country_alpha3, original_country) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?), (?, NULL, NULL, ?, ?, ?, ?, ?, ?), (?, NULL, NULL, ?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?, ?, ?

In [15]:
# CSV file for data source 2 (read below as the first target data source)
file2_path = path_test + 'test_data_source2.csv'
file2_path

'../../../data/test/test_data_source2.csv'

In [16]:
# JSON data-mapping file for data source 2
file2_map = path_test + 'test_data_source2.json'
file2_map

'../../../data/test/test_data_source2.json'

In [17]:
# Read 'test_data_source2.csv' (comma-delimited) together with its mapping file.
# The returned object is kept as the first target data source for the matching step.
target_data_source = csvreader_obj.read_file(file2_path, file2_map, delimiter=',', chunk_size=10)

2021-09-09 10:55:16,206 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:55:16,208 INFO sqlalchemy.engine.Engine 
CREATE TABLE data_source_target1 (
	idx INTEGER NOT NULL, 
	data_source VARCHAR, 
	isin VARCHAR, 
	isvalid_isin BOOLEAN, 
	lei VARCHAR, 
	isvalid_lei BOOLEAN, 
	company_name VARCHAR, 
	original_company_name VARCHAR, 
	country VARCHAR, 
	country_alpha2 VARCHAR, 
	country_alpha3 VARCHAR, 
	original_country VARCHAR, 
	PRIMARY KEY (idx)
)


2021-09-09 10:55:16,209 INFO sqlalchemy.engine.Engine [no key 0.00079s] ()
2021-09-09 10:55:16,227 INFO sqlalchemy.engine.Engine COMMIT
2021-09-09 10:55:16,236 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:55:16,241 INFO sqlalchemy.engine.Engine INSERT INTO data_source_target1 (data_source, isin, isvalid_isin, lei, isvalid_lei, company_name, original_company_name, country, country_alpha2, country_alpha3, original_country) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?), (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?), (?, NULL, NULL, ?

In [18]:
# CSV file for data source 3 (read below as the second target data source)
file3_path = path_test + 'test_data_source3.csv'
file3_path

'../../../data/test/test_data_source3.csv'

In [19]:
# JSON data-mapping file for data source 3
file3_map = path_test + 'test_data_source3.json'
file3_map

'../../../data/test/test_data_source3.json'

In [20]:
# Read 'test_data_source3.csv' (comma-delimited) together with its mapping file.
# The returned object is kept as the second target data source for the matching step.
target_data_source2 = csvreader_obj.read_file(file3_path, file3_map, delimiter=',', chunk_size=10)

2021-09-09 10:55:21,552 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:55:21,554 INFO sqlalchemy.engine.Engine 
CREATE TABLE data_source_target2 (
	idx INTEGER NOT NULL, 
	data_source VARCHAR, 
	isin VARCHAR, 
	isvalid_isin BOOLEAN, 
	lei VARCHAR, 
	isvalid_lei BOOLEAN, 
	sedol VARCHAR, 
	isvalid_sedol BOOLEAN, 
	company_name VARCHAR, 
	original_company_name VARCHAR, 
	country VARCHAR, 
	country_alpha2 VARCHAR, 
	country_alpha3 VARCHAR, 
	original_country VARCHAR, 
	PRIMARY KEY (idx)
)


2021-09-09 10:55:21,555 INFO sqlalchemy.engine.Engine [no key 0.00115s] ()
2021-09-09 10:55:21,577 INFO sqlalchemy.engine.Engine COMMIT
2021-09-09 10:55:21,586 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:55:21,594 INFO sqlalchemy.engine.Engine INSERT INTO data_source_target2 (data_source, isin, isvalid_isin, lei, isvalid_lei, sedol, isvalid_sedol, company_name, original_company_name, country, country_alpha2, country_alpha3, original_country) VALUES (?, NULL, NULL, ?, ?, ?

## 3. Perform matching

In [21]:
# Create the exact matcher. The first argument is the name shown as "Report name"
# in the matching report; the second is the referential data source.
matcher_obj = ExactMatcherDB('Matching_DS1_DS2', ref_data_source)

In [22]:
# Register data source 2 as a target to be matched against the referential
matcher_obj.add_target_data_sources(target_data_source)

In [23]:
# Register data source 3 as a second target to be matched against the referential
matcher_obj.add_target_data_sources(target_data_source2)

In [24]:
# Hand the database engine to the matcher
matcher_obj.set_database_engine(sqlengine_obj)

In [25]:
# Run the matching. 'Matching_Table' is the name of the table created for the results
# (see the echoed CREATE TABLE statement). The returned report object is printed later on.
matching_report_obj = matcher_obj.execute_matching('Matching_Table')

2021-09-09 10:56:46,921 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2021-09-09 10:56:46,922 INFO sqlalchemy.engine.Engine [raw sql] ()
2021-09-09 10:56:46,928 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:56:46,930 INFO sqlalchemy.engine.Engine 
CREATE TABLE "Matching_Table" (
	idx INTEGER NOT NULL, 
	timestamp DATETIME DEFAULT (CURRENT_TIMESTAMP), 
	"match" INTEGER, 
	target_name VARCHAR, 
	ref_name VARCHAR, 
	isin VARCHAR, 
	lei VARCHAR, 
	company_name VARCHAR, 
	country VARCHAR, 
	sedol VARCHAR, 
	ref_isin VARCHAR, 
	ref_company_name VARCHAR, 
	ref_country VARCHAR, 
	PRIMARY KEY (idx)
)


2021-09-09 10:56:46,932 INFO sqlalchemy.engine.Engine [no key 0.00123s] ()
2021-09-09 10:56:46,954 INFO sqlalchemy.engine.Engine COMMIT
2021-09-09 10:56:46,957 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2021-09-09 10:56:46,959 INFO sqlalchemy.engine.Engine 
CREATE TABLE "No_Matching_Table" (
	idx INTEGER NOT NULL, 
	timestamp DATE

In [26]:
# Run the indirect matching step after the matching above.
# NOTE(review): the exact semantics are defined in ExactMatcherDB; the echoed SQL shows
# additional rows being inserted into "Matching_Table" -- confirm the intended behaviour there.
matcher_obj.execute_indirect_matching()

['target_name', 'isin', 'lei', 'company_name', 'country', 'sedol', 'ref_isin', 'ref_company_name', 'ref_country', 'ref_name']
{'target_name': Column('target_name', String(), table=<No_Matching_Table>), 'isin': Column('isin', String(), table=<No_Matching_Table>), 'lei': Column('lei', String(), table=<No_Matching_Table>), 'company_name': Column('company_name', String(), table=<No_Matching_Table>), 'country': Column('country', String(), table=<No_Matching_Table>), 'sedol': Column('sedol', String(), table=<No_Matching_Table>)}
{'ref_isin': Column('isin', String(), table=<Matching_Table>), 'ref_company_name': Column('company_name', String(), table=<Matching_Table>), 'ref_country': Column('country', String(), table=<Matching_Table>), 'ref_name': Column('target_name', String(), table=<Matching_Table>)}
['lei']
2021-09-09 11:00:01,875 INFO sqlalchemy.engine.Engine INSERT INTO "Matching_Table" (target_name, isin, lei, company_name, country, sedol, ref_isin, ref_company_name, ref_country, ref_na

## 4. Print the matching report

In [27]:
# Print the report returned by execute_matching(): coverage (%) of each target against the referential
matching_report_obj.print_report()

------------------------- REPORT ON MATCHING -------------------------
Report name: Matching_DS1_DS2
Description: 
Datetime:2021-09-09 10:56:46
----------------------------------------------------------------------
Referential          Target               Coverage (%)
----------------------------------------------------------------------
data_source_ref      data_source_target1       50.00
data_source_ref      data_source_target2       66.67
----------------------------------------------------------------------


## 5. Drop Tables

In [None]:
# Drop the table of the referential data source (data source 1)
sqlengine_obj.drop_table(ref_data_source.table_obj)

In [None]:
# Drop the table of the first target data source (data source 2)
sqlengine_obj.drop_table(target_data_source.table_obj)

In [None]:
# Drop the table of the second target data source (data source 3)
sqlengine_obj.drop_table(target_data_source2.table_obj)

## 6. Close database connection

In [None]:
# Close the connection to the database once all work is done
sqlengine_obj.disconnect()