# How to... read data from CSV files and store them in a SQLite database

In [1]:
# Make the esg_matching package importable from this notebook by adding the
# folder three levels up to the module search path.
# NOTE: the relative path is resolved against the kernel's working directory,
# so this assumes the kernel was started in the notebook's own folder.
import sys
sys.path.append('../../../')

In [2]:
# Import the module for connection to a SQLite database
from esg_matching.engine.connectors.sql_lite import SqlLiteConnector

In [3]:
# Import the modules for file management
from esg_matching.file_reader.file import File
from esg_matching.file_reader.csv_reader import FileReaderCsv

In [4]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esg_matching.processing.etl import EtlProcessing

## 1. Database setup

In [5]:
# Location of the database file to be created, relative to this Jupyter notebook.
# The database will be created in the tests/data/howto/sqlite folder, under the project main folder
path_db = '../../../tests/data/howto/sqlite/test_esg_matching.db'

In [6]:
# Create the database connector object, an instance of the SqlLiteConnector class
db_conn = SqlLiteConnector()

In [7]:
# The connect() method of the SqlLiteConnector is used to establish a connection with the database if it exists,
# or to create a new one. The property path_db defines the location and name of the database.
# The property show_sql_statement indicates if the SQL statements are echoed (or printed) in the default output channel.
# Both properties are set before connect() is called.
db_conn.path_db = path_db
db_conn.show_sql_statement = True
db_conn.connect()

In [8]:
# Check if the connection was established (the cell output shows the returned boolean)
db_conn.is_connected()

True

## 2. File setup

In [9]:
# Path to the JSON settings file for Referential 1, relative to this notebook.
# The last line echoes the path as the cell output.
file1_settings = '../../../tests/data/howto/sqlite/test_referential1_sqlite.json'
file1_settings

'../../../tests/data/howto/sqlite/test_referential1_sqlite.json'

In [10]:
# Create a File object from the path of the JSON settings file
file_obj = File(file1_settings)

In [11]:
# Inspect the File object: the data file it points to and the
# JSON settings file it was created from
print(f'Filename:{file_obj.filename}')
print(f'Json Settings:{file_obj.filename_settings}')

Filename:../../../tests/data/test_referential1.csv
Json Settings:../../../tests/data/howto/sqlite/test_referential1_sqlite.json


## 3. Read a csv file and load its content to the database

The Esg-Entity-Matching library provides a FileReaderCsv that understands the content of csv files. 
It also provides an EtlProcessing object that combines file, connector and reader in order to perform the complete pipeline of reading, transforming and loading data into a database.

In [12]:
# Create a file reader object for csv files
csv_reader_obj = FileReaderCsv()

In [13]:
# Create an ETL process object, passing the database connector it will work with
etl_proc_obj = EtlProcessing(db_conn)

In [14]:
# Call the load_file_to_db() method by passing the File and FileReader objects
# (the SqlLiteConnector was already given to the EtlProcessing constructor).
# The ETL process returns a database source object
db_source = etl_proc_obj.load_file_to_db(file_obj, csv_reader_obj)

2022-07-17 16:38:02,326 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("esg_match_ref")
2022-07-17 16:38:02,327 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-07-17 16:38:02,330 INFO sqlalchemy.engine.Engine PRAGMA temp.table_info("esg_match_ref")
2022-07-17 16:38:02,331 INFO sqlalchemy.engine.Engine [raw sql] ()
[Column('unique_id', Integer(), table=None, primary_key=True, nullable=False), Column('isin', String(), table=None), Column('company', String(), table=None), Column('country', String(), table=None)]
schema=None
2022-07-17 16:38:02,338 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 16:38:02,339 INFO sqlalchemy.engine.Engine 
CREATE TABLE esg_match_ref (
	unique_id INTEGER NOT NULL, 
	isin VARCHAR, 
	company VARCHAR, 
	country VARCHAR, 
	PRIMARY KEY (unique_id)
)


2022-07-17 16:38:02,341 INFO sqlalchemy.engine.Engine [no key 0.00153s] ()
2022-07-17 16:38:02,380 INFO sqlalchemy.engine.Engine COMMIT
2022-07-17 16:38:02,387 INFO sqlalchemy.engine.Engine INSERT IN

## 4. Report on the ETL Process

In [15]:
# Print the ETL Processing Report
etl_proc_obj.print_report()

-------------------------------------- ETL PROCESSING REPORT ---------------------------------------
Description: Details of the ETL process performed on [ds_ref] data source.
Datetime:2022-07-17 16:38:02
----------------------------------------------------------------------------------------------------
File Name: ../../../tests/data/test_referential1.csv
Columns in the File: 4
Columns read from File: 4
Lines Extracted from File: 9


## 5. Checking the attribute names of DbDataSource

There are three methods to check the column or attribute names of the DbDataSource object:
1. Use get_original_attribute_names(): to retrieve the original attribute names of the columns in the csv file
2. Use get_attribute_names(): to retrieve the attribute names of the database table
3. Use get_primary_keys(): to retrieve the attribute names of the primary keys in the database table

In [16]:
# Retrieve the original attribute names (read from the csv file); the result is a list of names
db_source.get_original_attribute_names()

['unique_id', 'isin', 'company', 'country']

In [17]:
# Retrieve the attribute names of the database table; the result is a list of names
db_source.get_attribute_names()

['unique_id', 'isin', 'company', 'country']

In [18]:
# Retrieve the attribute names of the primary keys in the database table; the result is a list of names
db_source.get_primary_keys()

['unique_id']

## 6. Checking the Data Source

In [19]:
# Show the name of the data source and the name of its database table
print(f'Data Source Name: {db_source.name}, Table name: {db_source.table_name}')

Data Source Name: ds_ref, Table name: esg_match_ref


In [20]:
# Count all the rows stored in the table and report the total
result = db_source.get_total_entries()
print(f'Total entries in table {db_source.table_name} = {result}')

2022-07-17 16:38:02,913 INFO sqlalchemy.engine.Engine SELECT count(*) AS count_1 
FROM esg_match_ref
2022-07-17 16:38:02,914 INFO sqlalchemy.engine.Engine [generated in 0.00150s] ()
Total entries in table esg_match_ref = 9


In [21]:
# Count the entries of the table for a given column name.
# The echoed SQL uses count(<column>), so rows where the column is NULL are not counted.
result = db_source.get_total_entries_by_column('isin')
print(f'Total entries by ISIN = {result}')

2022-07-17 16:38:02,958 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 16:38:02,962 INFO sqlalchemy.engine.Engine SELECT count(esg_match_ref.isin) AS count_1 
FROM esg_match_ref
 LIMIT ? OFFSET ?
2022-07-17 16:38:02,963 INFO sqlalchemy.engine.Engine [generated in 0.00189s] (1, 0)
2022-07-17 16:38:02,972 INFO sqlalchemy.engine.Engine ROLLBACK
Total entries by ISIN = 7


In [22]:
# Count the entries of the table for a given column name, considering only distinct values.
# The echoed SQL uses count(DISTINCT <column>), so repeated values are counted once.
result = db_source.get_total_entries_by_column('isin', distinct_values=True)
print(f'Total entries by ISIN with distinct values = {result}')

2022-07-17 16:38:02,992 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 16:38:02,995 INFO sqlalchemy.engine.Engine SELECT count(DISTINCT esg_match_ref.isin) AS count_1 
FROM esg_match_ref
 LIMIT ? OFFSET ?
2022-07-17 16:38:02,996 INFO sqlalchemy.engine.Engine [generated in 0.00133s] (1, 0)
2022-07-17 16:38:03,005 INFO sqlalchemy.engine.Engine ROLLBACK
Total entries by ISIN with distinct values = 6


## 7. Checking the content of the DbDataSource

The get_data() method of the DbDataSource object performs a full select in the table, returning a list of tuples. Each item of the list is a row in the table and each element of the tuple is the value of one column.

In [23]:
# Query all the values of the table
# Equivalent to SELECT * FROM TABLE_NAME
# The result is a list with one tuple per row; the last line displays it as the cell output
lst_result = db_source.get_data()
lst_result

2022-07-17 16:38:03,046 INFO sqlalchemy.engine.Engine SELECT esg_match_ref.unique_id, esg_match_ref.isin, esg_match_ref.company, esg_match_ref.country 
FROM esg_match_ref
2022-07-17 16:38:03,053 INFO sqlalchemy.engine.Engine [generated in 0.00691s] ()


[(1, 'SK1120005824', 'CENTRAL PERK', 'SK'),
 (2, None, 'HONEYDUKES', 'UNITED STATES OF AMERICA'),
 (3, None, 'STARCOURT MALL', 'AUSTRIA'),
 (4, 'GB00B1YW4409', 'STERLING COOPER', 'GBR'),
 (5, 'CH0012221716', 'Bluth company', 'CHE'),
 (6, 'US0200021014', 'InGen', 'usa'),
 (7, 'US0231351067', 'Stark Industries', 'us'),
 (8, 'US0126531013', 'SPECTRE', 'USA'),
 (9, 'US0126531013', 'SPECTRE 33 SUBSIDIARY', 'USA')]

The get_data_as_df() method of the DbDataSource also performs a select in the table, but returns a pandas dataframe as result.

In [24]:
# Query the table and get the result as a pandas dataframe;
# the last line displays the dataframe as the cell output
df_result = db_source.get_data_as_df()
df_result

2022-07-17 16:38:03,089 INFO sqlalchemy.engine.Engine SELECT esg_match_ref.unique_id, esg_match_ref.isin, esg_match_ref.company, esg_match_ref.country 
FROM esg_match_ref
2022-07-17 16:38:03,090 INFO sqlalchemy.engine.Engine [cached since 0.04395s ago] ()


Unnamed: 0,unique_id,isin,company,country
0,1,SK1120005824,CENTRAL PERK,SK
1,2,,HONEYDUKES,UNITED STATES OF AMERICA
2,3,,STARCOURT MALL,AUSTRIA
3,4,GB00B1YW4409,STERLING COOPER,GBR
4,5,CH0012221716,Bluth company,CHE
5,6,US0200021014,InGen,usa
6,7,US0231351067,Stark Industries,us
7,8,US0126531013,SPECTRE,USA
8,9,US0126531013,SPECTRE 33 SUBSIDIARY,USA


## 8. Drop the table using DbDataSource object

In [25]:
# Drop the database table of this data source (a DROP TABLE statement is echoed in the output)
db_source.drop_table()

2022-07-17 16:38:03,129 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-07-17 16:38:03,135 INFO sqlalchemy.engine.Engine 
DROP TABLE esg_match_ref
2022-07-17 16:38:03,138 INFO sqlalchemy.engine.Engine [no key 0.00338s] ()
2022-07-17 16:38:03,156 INFO sqlalchemy.engine.Engine COMMIT


## 9. Close database connection

In [26]:
# Close the connection to the database
db_conn.disconnect()

In [27]:
# Check the connection state after disconnecting (False is expected)
db_conn.is_connected()

False