# How to... read data from csv files and store it in a Trino database

In [1]:
# Make the esg_matching package importable from this notebook by adding the
# directory three levels up (presumably the repository root) to the module search path.
# NOTE: the path is relative, so it is resolved against the kernel's current working directory.
import sys
sys.path.append('../../../')

In [2]:
# Import the module for connection to a trino database
from esg_matching.engine.connectors.trino import TrinoConnector

In [3]:
# Import the modules for file management
from esg_matching.file_reader.file import File
from esg_matching.file_reader.csv_reader import FileReaderCsv

In [4]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esg_matching.processing.etl import EtlProcessing

## 1. Database setup

In [6]:
# Read the Trino connection settings from environment variables, so that no
# credentials are stored in the notebook itself.
# TRINO_USER, TRINO_PASSWD, TRINO_HOST and TRINO_PORT must all be set before
# running this cell: indexing os.environ raises a KeyError for a missing variable.
import os
user_trino = os.environ['TRINO_USER']
pwd_trino = os.environ['TRINO_PASSWD']
host_trino = os.environ['TRINO_HOST']
# Environment values are always strings, so the port is converted to an integer
port_trino = int(os.environ['TRINO_PORT'])

In [7]:
# The database connector is represented by the class TrinoConnector.
# No arguments are passed here; the connection settings are assigned as properties afterwards.
db_conn = TrinoConnector()

In [8]:
# Configure the connector and open the connection.
# The properties username, user_password, host_url and port_number receive the values
# read from the environment variables; catalog names the Trino catalog to be used
# (presumably the one where the tables are created - confirm for your deployment).
# The property show_sql_statement indicates if the SQL statements are echoed (or printed) in the default output channel.
# The connect() method of the TrinoConnector is used to establish the connection with the database.
db_conn.username = user_trino
db_conn.user_password = pwd_trino
db_conn.host_url = host_trino
db_conn.port_number = port_trino
db_conn.catalog = 'osc_datacommons_iceberg_dev'
db_conn.show_sql_statement = True
db_conn.connect()

2022-07-17 17:30:43,144 INFO sqlalchemy.engine.Engine SELECT version()
2022-07-17 17:30:43,147 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00305s] ()


In [9]:
# Check if the connection was established
db_conn.is_connected()

True

## 2. File setup

In [10]:
# Path to the JSON settings file for Referential 1
file1_settings = '../../../tests/data/howto/trino/test_referential1_trino.json'
# Display the path as the cell output
file1_settings

'../../../tests/data/howto/trino/test_referential1_trino.json'

In [11]:
# Create a File object from the path of the JSON settings file
file_obj = File(file1_settings)

In [12]:
# Checking some properties of the File object:
# filename is the path of the csv data file and filename_settings is the path
# of the JSON settings file that was passed to File()
print('Filename:{}'.format(file_obj.filename))
print('Json Settings:{}'.format(file_obj.filename_settings))

Filename:../../../tests/data/test_referential1.csv
Json Settings:../../../tests/data/howto/trino/test_referential1_trino.json


## 3. Read a csv file and load its content to the database

The Esg-Entity-Matching library provides a FileReaderCsv that understands the content of csv files. 
It also provides an EtlProcessing object that combines file, connector and reader in order to perform the complete pipeline of reading, transforming and loading data into a database.

In [13]:
# Create a file reader object for csv files
csv_reader_obj = FileReaderCsv()  

In [14]:
# Create an ETL process object, giving it the database connector it will load data through
etl_proc_obj = EtlProcessing(db_conn)

In [15]:
# Call the load_file_to_db() method by passing the File and FileReader objects
# (the database connector was already given to EtlProcessing when it was created).
# The ETL process returns a database source object
db_source = etl_proc_obj.load_file_to_db(file_obj, csv_reader_obj)

2022-07-17 17:30:44,027 INFO sqlalchemy.engine.Engine SELECT "table_name"
FROM "information_schema"."tables"
WHERE "table_schema" = ?
2022-07-17 17:30:44,033 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00604s] ('esg_matching',)
2022-07-17 17:30:45,400 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 17:30:45,401 INFO sqlalchemy.engine.Engine 
CREATE TABLE esg_matching.esg_match_ref (
	unique_id VARCHAR, 
	isin VARCHAR(12), 
	company VARCHAR(100), 
	country VARCHAR(100)
)


2022-07-17 17:30:45,402 INFO sqlalchemy.engine.Engine [no key 0.00109s] ()
2022-07-17 17:30:46,546 INFO sqlalchemy.engine.Engine COMMIT
{'unique_id': '1', 'isin': 'SK1120005824', 'company': 'CENTRAL PERK', 'country': 'SK'}
2022-07-17 17:30:46,549 INFO sqlalchemy.engine.Engine INSERT INTO esg_matching.esg_match_ref (unique_id, isin, company, country) VALUES (?, ?, ?, ?)
2022-07-17 17:30:46,550 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00106s]

## 4. Report on ETL Process

In [16]:
# Printing the ETL Processing Report
etl_proc_obj.print_report()

-------------------------------------- ETL PROCESSING REPORT ---------------------------------------
Description: Details of the ETL process performed on [ds_ref] data source.
Datetime:2022-07-17 17:31:02
----------------------------------------------------------------------------------------------------
File Name: ../../../tests/data/test_referential1.csv
Columns in the File: 4
Columns read from File: 4
Lines Extracted from File: 9


## 5. Checking the attribute names of DbDataSource

There are three methods to check the column or attribute names of the DbDataSource object:
1. Use get_original_attribute_names(): to retrieve the original attribute names of the columns in the csv file
2. Use get_attribute_names(): to retrieve the attribute names of the database table
3. Use get_primary_keys(): to retrieve the attribute names of the primary keys in the database table (not demonstrated in this notebook)

In [17]:
# Retrieve the original attribute names, i.e. the column names as read from the csv file
db_source.get_original_attribute_names()

['unique_id', 'isin', 'company', 'country']

In [18]:
# Retrieve the attribute names of the database table, i.e. the column names used in the database
db_source.get_attribute_names()

['unique_id', 'isin', 'company', 'country']

## 6. Checking the Data Source

In [19]:
# Show the name of the data source and the name of the database table associated with it
print('Data Source Name: {}, Table name: {}'.format(db_source.name, db_source.table_name))

Data Source Name: ds_ref, Table name: esg_match_ref


In [20]:
# Total entries of the table: counts every row (the echoed SQL is a SELECT count(*))
result = db_source.get_total_entries()
print('Total entries in table {} = {}'.format(db_source.table_name, result))

2022-07-17 17:31:02,156 INFO sqlalchemy.engine.Engine SELECT count(*) AS count_1 
FROM esg_matching.esg_match_ref
2022-07-17 17:31:02,157 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00167s] ()
Total entries in table esg_match_ref = 9


In [21]:
# Total entries of the table by a column name.
# The echoed SQL is a count over the column itself, so rows where the column
# is NULL are not counted and the result can be lower than the total number of rows.
result = db_source.get_total_entries_by_column('isin')
print('Total entries by ISIN = {}'.format(result))

2022-07-17 17:31:03,055 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 17:31:03,058 INFO sqlalchemy.engine.Engine SELECT count(esg_matching.esg_match_ref.isin) AS count_1 
FROM esg_matching.esg_match_ref
 LIMIT ?
2022-07-17 17:31:03,059 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00118s] (1,)
2022-07-17 17:31:04,758 INFO sqlalchemy.engine.Engine ROLLBACK
Total entries by ISIN = 7


In [22]:
# Total entries of the table by a column name with distinct values.
# With distinct_values=True the echoed SQL uses count(DISTINCT ...), so repeated
# values of the column are counted only once.
result = db_source.get_total_entries_by_column('isin', distinct_values=True)
print('Total entries by ISIN with distinct values = {}'.format(result))

2022-07-17 17:31:04,770 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 17:31:04,772 INFO sqlalchemy.engine.Engine SELECT count(DISTINCT esg_matching.esg_match_ref.isin) AS count_1 
FROM esg_matching.esg_match_ref
 LIMIT ?
2022-07-17 17:31:04,773 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00121s] (1,)
2022-07-17 17:31:06,485 INFO sqlalchemy.engine.Engine ROLLBACK
Total entries by ISIN with distinct values = 6


## 7. Checking the content of the DbDataSource

The get_data() method of the DbDataSource object performs a full select on the table, returning a list of tuples. Each item of the list is a row in the table and each element of the tuple is the value of one column.

In [23]:
# Query all the values of the table
# Equivalent to SELECT * FROM TABLE_NAME
# The echoed SQL has no ORDER BY clause, so the order of the returned rows may
# differ from one execution to the next.
lst_result = db_source.get_data()
# Display the list of row tuples as the cell output
lst_result

2022-07-17 17:31:06,515 INFO sqlalchemy.engine.Engine SELECT esg_matching.esg_match_ref.unique_id, esg_matching.esg_match_ref.isin, esg_matching.esg_match_ref.company, esg_matching.esg_match_ref.country 
FROM esg_matching.esg_match_ref
2022-07-17 17:31:06,517 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00194s] ()


[('2', None, 'HONEYDUKES', 'UNITED STATES OF AMERICA'),
 ('3', None, 'STARCOURT MALL', 'AUSTRIA'),
 ('5', 'CH0012221716', 'Bluth company', 'CHE'),
 ('9', 'US0126531013', 'SPECTRE 33 SUBSIDIARY', 'USA'),
 ('1', 'SK1120005824', 'CENTRAL PERK', 'SK'),
 ('4', 'GB00B1YW4409', 'STERLING COOPER', 'GBR'),
 ('8', 'US0126531013', 'SPECTRE', 'USA'),
 ('7', 'US0231351067', 'Stark Industries', 'us'),
 ('6', 'US0200021014', 'InGen', 'usa')]

The get_data_as_df() method of the DbDataSource also performs a select on the table, but returns the result as a pandas dataframe.

In [24]:
# Query the table and get the rows back as a pandas dataframe
df_result = db_source.get_data_as_df()
# Display the dataframe as the cell output
df_result

2022-07-17 17:31:07,545 INFO sqlalchemy.engine.Engine SELECT esg_matching.esg_match_ref.unique_id, esg_matching.esg_match_ref.isin, esg_matching.esg_match_ref.company, esg_matching.esg_match_ref.country 
FROM esg_matching.esg_match_ref
2022-07-17 17:31:07,546 INFO sqlalchemy.engine.Engine [dialect trino+rest does not support caching 0.00149s] ()


Unnamed: 0,unique_id,isin,company,country
0,9,US0126531013,SPECTRE 33 SUBSIDIARY,USA
1,5,CH0012221716,Bluth company,CHE
2,3,,STARCOURT MALL,AUSTRIA
3,6,US0200021014,InGen,usa
4,4,GB00B1YW4409,STERLING COOPER,GBR
5,2,,HONEYDUKES,UNITED STATES OF AMERICA
6,7,US0231351067,Stark Industries,us
7,8,US0126531013,SPECTRE,USA
8,1,SK1120005824,CENTRAL PERK,SK


## 8. Drop the table using DbDataSource object

In [25]:
# Remove the table that was created by the ETL process (issues a DROP TABLE on the database)
db_source.drop_table()

2022-07-17 17:31:08,560 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 17:31:08,561 INFO sqlalchemy.engine.Engine 
DROP TABLE esg_matching.esg_match_ref
2022-07-17 17:31:08,562 INFO sqlalchemy.engine.Engine [no key 0.00123s] ()
2022-07-17 17:31:11,146 INFO sqlalchemy.engine.Engine COMMIT


## 9. Close database connection

In [26]:
# Close the connection to the database
db_conn.disconnect()

In [27]:
# Confirm that the connection is closed (False is expected after disconnect())
db_conn.is_connected()

False