# Exact Matching

The exact matching uses GLEIF as the referential dataset and CORPWATCH, WRI, WIKI and SPOTT as target datasets.

In [1]:
# Sets up the location of the api relative to this notebook
# NOTE: a relative sys.path entry is resolved against the current working directory,
# so this assumes the notebook is executed from the folder it lives in.
import sys
sys.path.append('../../../esg-matching/')

In [2]:
# Import the module for connection to a sqllite database
from esg_matching.engine.connectors.sql_lite import SqlLiteConnector

In [3]:
# Import the modules for file management
from esg_matching.file_reader.file import File
from esg_matching.file_reader.csv_reader import FileReaderCsv

In [4]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esg_matching.processing.etl import EtlProcessing

## 1. Database setup

In [5]:
# Location of the database to be created, relative to this jupyter notebook.
# The database will be created in the /data folder, under the project main folder.
# The same path is reused later as the local file uploaded to S3.
path_db = '../../data/matching/esg_matching.db'

In [6]:
# The database connector is represented by the class SqlLiteConnector.
# It is created without arguments; its path_db and show_sql_statement properties
# are assigned before connect() is called.
db_conn = SqlLiteConnector()

In [7]:
# The connect() method of the SqlLiteConnector is used to establish a connection with the database if it exists,
# or to create a new one. The property path_db defines the location and name of the database.
# The property show_sql_statement indicates if the SQL statements are echoed (or printed) in the default output channel.
db_conn.path_db = path_db
db_conn.show_sql_statement = False
db_conn.connect()

In [8]:
# Check if the connection was established (the recorded output of this cell is True)
db_conn.is_connected()

True

## 2. Load the data into database from csv files

In [9]:
# Create an ETL process object, passing it the database connector.
# This object is used for every load_file_to_db() and create_data_source() call in this section.
etl_proc_obj = EtlProcessing(db_conn)

In [10]:
# Create a file reader object for csv files.
# The same reader instance is passed to every load_file_to_db() call in this section.
csv_reader_obj = FileReaderCsv()

In [11]:
# Referential Data Source (GLEIF)
# File() receives the path of a JSON settings file rather than a csv file;
# presumably the settings describe where the csv is and how to map it - verify against File.
ref_obj = File('../../data/matching/settings/gleif.json')
# The returned db_ref is later registered as the referential source of every matching policy.
db_ref = etl_proc_obj.load_file_to_db(ref_obj, csv_reader_obj)

In [12]:
# Target Data Sources
# Each target follows the same two steps as the referential source: wrap the JSON settings
# file in a File object, then load it into the database with the shared csv reader.
# The returned db_* objects are the inputs of the matching policies created in section 3.
corpwatch_obj = File('../../data/matching/settings/corpwatch.json')
db_corpwatch = etl_proc_obj.load_file_to_db(corpwatch_obj, csv_reader_obj)

wiki_obj = File('../../data/matching/settings/wiki.json')
db_wiki = etl_proc_obj.load_file_to_db(wiki_obj, csv_reader_obj)

spott_obj = File('../../data/matching/settings/spott.json')
db_spott = etl_proc_obj.load_file_to_db(spott_obj, csv_reader_obj)

wri_obj = File('../../data/matching/settings/wri.json')
db_wri = etl_proc_obj.load_file_to_db(wri_obj, csv_reader_obj)

In [13]:
# Matching/No-matching tables
# These are built with create_data_source() instead of load_file_to_db(), so no csv reader is involved.
# Presumably they start empty and are filled by the matchers - TODO confirm against EtlProcessing.
file_match = File('../../data/matching/settings/matching.json')
db_matching = etl_proc_obj.create_data_source(file_match)

file_no_match = File('../../data/matching/settings/no_matching.json')
db_no_matching = etl_proc_obj.create_data_source(file_no_match)

## 3. Setup matching policies

In [14]:
# Import policy module
from esg_matching.matcher.policy import MatchingPolicy

In [15]:
# Create matching policy object for corpwatch
# The policy is built from the target data source and the policy name 'matching_with_ref'
# (the meaning of that name is defined by MatchingPolicy, not visible here), then given
# the referential source and the tables that receive matches and non-matches.
policy_match_corpwatch = MatchingPolicy(db_corpwatch, 'matching_with_ref')
policy_match_corpwatch.set_referential_source(db_ref)
policy_match_corpwatch.set_matching_source(db_matching)
policy_match_corpwatch.set_no_matching_source(db_no_matching)

In [16]:
# Create matching policy object for wiki
# Uses the same referential source and the same matching/no-matching tables as the other policies.
policy_match_wiki = MatchingPolicy(db_wiki, 'matching_with_ref')
policy_match_wiki.set_referential_source(db_ref)
policy_match_wiki.set_matching_source(db_matching)
policy_match_wiki.set_no_matching_source(db_no_matching)

In [17]:
# Create matching policy object for wri
# Uses the same referential source and the same matching/no-matching tables as the other policies.
policy_match_wri = MatchingPolicy(db_wri, 'matching_with_ref')
policy_match_wri.set_referential_source(db_ref)
policy_match_wri.set_matching_source(db_matching)
policy_match_wri.set_no_matching_source(db_no_matching)

In [18]:
# Create matching policy object for spott
# Uses the same referential source and the same matching/no-matching tables as the other policies.
policy_match_spott = MatchingPolicy(db_spott, 'matching_with_ref')
policy_match_spott.set_referential_source(db_ref)
policy_match_spott.set_matching_source(db_matching)
policy_match_spott.set_no_matching_source(db_no_matching)

## 4. Perform exact matching

In [19]:
# Import DFM module
from esg_matching.matcher.dfm import DbMatcherDfm

In [20]:
# Import DRM module
from esg_matching.matcher.drm import DbMatcherDrm

In [21]:
# Import IRM module
from esg_matching.matcher.irm import DbMatcherIrm

In [22]:
# Matching objects
# One matcher per matching step, all sharing the same database connector.
# Each matcher is reused for every target policy in the cells that follow.
dfm_matcher_obj = DbMatcherDfm(db_conn)
drm_matcher_obj = DbMatcherDrm(db_conn)
irm_matcher_obj = DbMatcherIrm(db_conn)

In [23]:
# Execute policies (direct matching)
# The single DFM matcher is pointed at each target policy in turn and executed,
# in this order: corpwatch, wiki, spott, wri.
for target_policy in (
    policy_match_corpwatch,
    policy_match_wiki,
    policy_match_spott,
    policy_match_wri,
):
    dfm_matcher_obj.set_policy(target_policy)
    dfm_matcher_obj.execute_matching()

In [24]:
# Execute policies (residual matching)
# The single DRM matcher is pointed at each target policy in turn and executed,
# in this order: corpwatch, wiki, spott, wri.
for target_policy in (
    policy_match_corpwatch,
    policy_match_wiki,
    policy_match_spott,
    policy_match_wri,
):
    drm_matcher_obj.set_policy(target_policy)
    drm_matcher_obj.execute_matching()

In [25]:
# Execute policies (indirect matching)
# The single IRM matcher is pointed at each target policy in turn and executed,
# in this order: corpwatch, wiki, spott, wri.
for target_policy in (
    policy_match_corpwatch,
    policy_match_wiki,
    policy_match_spott,
    policy_match_wri,
):
    irm_matcher_obj.set_policy(target_policy)
    irm_matcher_obj.execute_matching()

## 5. Saving matching data to S3

In [26]:
# From the AWS Account page, copy the export scripts from the appropriate role using the "Command Line or Programmatic Access" link
# Paste the copied text into ~/credentials.env
# NOTE(review): the code below does not look in the home directory as such; it looks in
# CREDENTIAL_DOTENV_DIR, then PWD, then '/opt/app-root/src' - confirm these agree in your environment.

from dotenv import dotenv_values, load_dotenv
import os
import pathlib

# Directory expected to hold credentials.env: CREDENTIAL_DOTENV_DIR when set,
# otherwise PWD, otherwise the hard-coded default.
fallback_dir = os.environ.get('PWD', '/opt/app-root/src')
dotenv_dir = os.environ.get('CREDENTIAL_DOTENV_DIR', fallback_dir)
dotenv_path = pathlib.Path(dotenv_dir).joinpath('credentials.env')

# A missing file is skipped silently; when the file is present, override=True
# lets its values replace variables that are already set in the environment.
if os.path.exists(dotenv_path):
    load_dotenv(dotenv_path=dotenv_path, override=True)

In [27]:
import boto3

# Connection settings for the S3 landing bucket, all read from environment variables.
# A missing variable raises KeyError here.
s3_settings = {
    "service_name": "s3",
    "endpoint_url": os.environ['S3_LANDING_ENDPOINT'],
    "aws_access_key_id": os.environ['S3_LANDING_ACCESS_KEY'],
    "aws_secret_access_key": os.environ['S3_LANDING_SECRET_KEY'],
}
s3_resource = boto3.resource(**s3_settings)

# Handle on the bucket named by S3_LANDING_BUCKET.
bucket_name = os.environ['S3_LANDING_BUCKET']
bucket = s3_resource.Bucket(bucket_name)

In [28]:
# Save final results to S3
# The whole SQLite database file at path_db is uploaded under the key below.
# NOTE(review): db_conn is still connected at this point (disconnect() is only called in
# section 7) - confirm that all matching results are committed to the file before uploading.
s3_filename = 'Entity-Matching/esg_matching.db'
s3_resource.meta.client.upload_file(Filename=path_db, Bucket=bucket_name, Key=s3_filename)

## 6. Saving to Trino

In [29]:
import pandas as pd

In [35]:
from esg_matching.engine.sql.dql import DqlManager

In [30]:
# Read the content of the matching data source into df_matching
df_matching = db_matching.get_data_as_df()

In [33]:
# Size of the first dimension of df_matching (the recorded output of this cell is 73459)
df_matching.shape[0]

73459

In [36]:
# Query manager on the same database connector; used below to run SQL statements directly
dqlmanager = DqlManager(db_conn)

In [42]:
# Count the rows of the esg_matching table per tgt_name (reported as 'provider'),
# and request the result as a pandas DataFrame.
sql_stm = "select tgt_name as 'provider', count(tgt_id) as 'total_matches' from esg_matching group by tgt_name" 
df_coverage = dqlmanager.query_by_sql_statement(sql_stm, as_pandas_df=True)

In [44]:
# Display the number of matches per provider
df_coverage

Unnamed: 0,provider,total_matches
0,corpwatch,46906
1,spott,43
2,wiki,25083
3,wri,1427


In [45]:
# Read the content of the no-matching data source into df_no_matching
df_no_matching = db_no_matching.get_data_as_df()

In [46]:
# Size of the first dimension of df_no_matching (the recorded output of this cell is 856820)
df_no_matching.shape[0]

856820

In [47]:
# Count the rows of the esg_no_matching table per tgt_name (reported as 'provider'),
# and request the result as a pandas DataFrame. Note that sql_stm is reassigned here.
sql_stm = "select tgt_name as 'provider', count(tgt_id) as 'total_no_matches' from esg_no_matching group by tgt_name" 
df_coverage_no_match = dqlmanager.query_by_sql_statement(sql_stm, as_pandas_df=True)

In [48]:
# Display the number of non-matches per provider
df_coverage_no_match

Unnamed: 0,provider,total_no_matches
0,corpwatch,674282
1,spott,202
2,wiki,173546
3,wri,8790


## 7. Close database connection

In [None]:
# Close the connection opened in section 1
db_conn.disconnect()

In [None]:
# Check the connection state after disconnect() (this cell has no recorded output in the notebook)
db_conn.is_connected()