# How to... read data from csv files and store it in a SQLite database

In [1]:
# Append the directory three levels above the current working directory
# (presumably the project root) to the module search path, so that the
# esg_matching package imported in the following cells can be found
import sys
sys.path.append('../../../')

In [2]:
# Import the connector class used to connect to a SQLite database
from esg_matching.db_engine.engines.connector_sql_lite import SqlLiteConnector

In [3]:
# Import the modules for file management
from esg_matching.file_reader.file import File
from esg_matching.file_reader.file_reader_csv import FileReaderCsv

In [4]:
# Import the modules for the etl processing: reading, transformation and loading data to a database
from esg_matching.processing.etl_processing import EtlProcessing

## 1. Database setup

In [5]:
# Location of the SQLite database file, given relative to this jupyter notebook.
# The file lives in the tests/data/notebook/sqlite folder, three directory levels
# above the notebook (presumably the project main folder).
path_db = '../../../tests/data/notebook/sqlite/test_esg_matching.db'

In [6]:
# The database connector is represented by the class SqlLiteConnector.
# Only the connector object is created here; it is configured and connected
# in the next cell.
db_conn = SqlLiteConnector()

In [7]:
# The connect() method of the SqlLiteConnector is used to establish a connection
# with the database if it exists, or to create a new one.
# Both properties below are set before connecting:
#   - path_db: defines the location and name of the database
#   - show_sql_statement: indicates if the SQL statements are echoed (printed)
#     in the default output channel
db_conn.path_db = path_db
db_conn.show_sql_statement = True
db_conn.connect()

In [8]:
# Check if the connection was established (True means connected)
db_conn.is_connected()

True

## 2. File setup

In [9]:
# Path to the JSON settings file for Referential 1
# (the bare expression on the last line echoes the path as the cell output)
file1_settings = '../../../tests/data/notebook/sqlite/test_referential1_sqlite.json'
file1_settings

'../../../tests/data/notebook/sqlite/test_referential1_sqlite.json'

In [10]:
# Create a File object from the JSON settings file of Referential 1
file_obj = File(file1_settings)

In [11]:
# Check some properties of the File object: the name of the csv file it points to
# and the JSON settings file it was created from
print('Filename:{}, Json Settings:{}'.format(file_obj.filename, file_obj.filename_settings))

Filename:../../../tests/data/test_referential1.csv, Json Settings:../../../tests/data/notebook/sqlite/test_referential1_sqlite.json


## 3. Read a csv file and load its content to the database

The Esg-Entity-Matching library provides a FileReaderCsv that understands the content of csv files. 
It also provides an EtlProcessing object that combines file, connector and reader in order to perform the complete pipeline of reading, transforming and loading data into a database.

In [12]:
# Create a file reader object for csv files
csv_reader_obj = FileReaderCsv()

In [13]:
# Create an ETL process object; it receives the database connector it will work with
etl_proc_obj = EtlProcessing(db_conn)

In [14]:
# Call the load_file_to_db() method by passing the File and the FileReaderCsv objects.
# The SqlLiteConnector is not passed here: it was given to EtlProcessing at creation time.
# The ETL process returns a database source object
db_source = etl_proc_obj.load_file_to_db(file_obj, csv_reader_obj)

2022-01-28 09:59:22,993 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("ESG_MATCH_REF")
2022-01-28 09:59:22,994 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-01-28 09:59:23,002 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("ESG_MATCH_REF")
2022-01-28 09:59:23,003 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-01-28 09:59:23,009 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2022-01-28 09:59:23,010 INFO sqlalchemy.engine.Engine [raw sql] ('ESG_MATCH_REF',)
2022-01-28 09:59:23,013 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("ESG_MATCH_REF")
2022-01-28 09:59:23,014 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-01-28 09:59:23,016 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("ESG_MATCH_REF")
2022-01-28 09:59:23,017 INFO sqlalchemy.engine.Engine [raw sql] ()
2022-01-28 09:59:23,019 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT *

## 4. Report on the ETL Process

In [15]:
# Print the ETL processing report
etl_proc_obj.print_report()

-------------------------------------- ETL PROCESSING REPORT ---------------------------------------
Description: Details of the ETL process performed on [DS_REF] data source.
Datetime:2022-01-28 09:59:23
----------------------------------------------------------------------------------------------------
File Name: ../../../tests/data/test_referential1.csv
Columns in the File: 4
Columns read from File: 4
Lines Extracted from File: 9


## 5. Checking the attribute names of DbDataSource

There are three methods to check the column or attribute names of the DbDataSource object:
1. Use get_original_attribute_names(): to retrieve the original attribute names of the columns in the csv file
2. Use get_attribute_names(): to retrieve the attribute names of the database table
3. Use get_primary_keys(): to retrieve the attribute names of the primary keys in the database table

In [16]:
# Retrieve the original attribute names, i.e. the column names as read from the csv file
db_source.get_original_attribute_names()

['UNIQUE_ID', 'ISIN', 'COMPANY', 'COUNTRY']

In [17]:
# Retrieve the attribute (column) names of the database table
db_source.get_attribute_names()

['UNIQUE_ID', 'ISIN', 'COMPANY', 'COUNTRY']

In [18]:
# Retrieve the names of the attributes that form the primary key of the database table
db_source.get_primary_keys()

['UNIQUE_ID']

## 6. Checking the Data Source

In [19]:
# Show the name of the data source and the name of its database table
print('Data Source Name: {}, Table name: {}'.format(db_source.name, db_source.table_name))

Data Source Name: DS_REF, Table name: ESG_MATCH_REF


In [20]:
# Total number of entries (rows) in the table; the echoed SQL is a count(*) on the table
result = db_source.get_total_entries()
print('Total entries in table {} = {}'.format(db_source.table_name, result))

2022-01-28 09:59:23,568 INFO sqlalchemy.engine.Engine SELECT count(*) AS count_1 
FROM "ESG_MATCH_REF"
2022-01-28 09:59:23,570 INFO sqlalchemy.engine.Engine [generated in 0.00233s] ()
Total entries in table ESG_MATCH_REF = 9


In [21]:
# Total entries of the table counted on a given column name.
# As the echoed SQL shows, this is a count("ISIN"): rows whose ISIN is NULL
# are not counted, so the result can be lower than the total number of rows.
result = db_source.get_total_entries_by_column('ISIN')
print('Total entries by ISIN = {}'.format(result))

2022-01-28 09:59:23,597 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-01-28 09:59:23,601 INFO sqlalchemy.engine.Engine SELECT count("ESG_MATCH_REF"."ISIN") AS count_1 
FROM "ESG_MATCH_REF"
 LIMIT ? OFFSET ?
2022-01-28 09:59:23,604 INFO sqlalchemy.engine.Engine [generated in 0.00342s] (1, 0)
2022-01-28 09:59:23,611 INFO sqlalchemy.engine.Engine ROLLBACK
Total entries by ISIN = 7


In [22]:
# Total entries of the table counted on a given column name, distinct values only.
# As the echoed SQL shows, this is a count(DISTINCT "ISIN"): repeated ISIN values
# are counted once and NULL values are not counted.
result = db_source.get_total_entries_by_column('ISIN', distinct_values=True)
print('Total entries by ISIN with distinct values = {}'.format(result))

2022-01-28 09:59:23,636 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-01-28 09:59:23,639 INFO sqlalchemy.engine.Engine SELECT count(DISTINCT "ESG_MATCH_REF"."ISIN") AS count_1 
FROM "ESG_MATCH_REF"
 LIMIT ? OFFSET ?
2022-01-28 09:59:23,645 INFO sqlalchemy.engine.Engine [generated in 0.00546s] (1, 0)
2022-01-28 09:59:23,648 INFO sqlalchemy.engine.Engine ROLLBACK
Total entries by ISIN with distinct values = 6


## 7. Checking the content of the DbDataSource

The get_data() method of the DbDataSource object performs a full select on the table, returning a list of tuples. Each item of the list is a row of the table, and each element of a tuple is the value of one column.

In [23]:
# Query all the values of the table
# Equivalent to SELECT * FROM TABLE_NAME
# The result is a list with one tuple per row; the last line echoes it as the cell output
lst_result = db_source.get_data()
lst_result

2022-01-28 09:59:23,680 INFO sqlalchemy.engine.Engine SELECT "ESG_MATCH_REF"."UNIQUE_ID", "ESG_MATCH_REF"."ISIN", "ESG_MATCH_REF"."COMPANY", "ESG_MATCH_REF"."COUNTRY" 
FROM "ESG_MATCH_REF"
2022-01-28 09:59:23,681 INFO sqlalchemy.engine.Engine [generated in 0.00094s] ()


[(1, 'SK1120005824', 'CENTRAL PERK', 'SK'),
 (2, None, 'HONEYDUKES', 'UNITED STATES OF AMERICA'),
 (3, None, 'STARCOURT MALL', 'AUSTRIA'),
 (4, 'GB00B1YW4409', 'STERLING COOPER', 'GBR'),
 (5, 'CH0012221716', 'Bluth company', 'CHE'),
 (6, 'US0200021014', 'InGen', 'usa'),
 (7, 'US0231351067', 'Stark Industries', 'us'),
 (8, 'US0126531013', 'SPECTRE', 'USA'),
 (9, 'US0126531013', 'SPECTRE 33 SUBSIDIARY', 'USA')]

The get_data_as_df() method of the DbDataSource also performs a select in the table, but returns a pandas dataframe as result.

In [24]:
# Query the table and get the result as a pandas dataframe;
# the last line displays the dataframe as the cell output
df_result = db_source.get_data_as_df()
df_result

2022-01-28 09:59:23,701 INFO sqlalchemy.engine.Engine SELECT "ESG_MATCH_REF"."UNIQUE_ID", "ESG_MATCH_REF"."ISIN", "ESG_MATCH_REF"."COMPANY", "ESG_MATCH_REF"."COUNTRY" 
FROM "ESG_MATCH_REF"
2022-01-28 09:59:23,702 INFO sqlalchemy.engine.Engine [cached since 0.02201s ago] ()


Unnamed: 0,UNIQUE_ID,ISIN,COMPANY,COUNTRY
0,1,SK1120005824,CENTRAL PERK,SK
1,2,,HONEYDUKES,UNITED STATES OF AMERICA
2,3,,STARCOURT MALL,AUSTRIA
3,4,GB00B1YW4409,STERLING COOPER,GBR
4,5,CH0012221716,Bluth company,CHE
5,6,US0200021014,InGen,usa
6,7,US0231351067,Stark Industries,us
7,8,US0126531013,SPECTRE,USA
8,9,US0126531013,SPECTRE 33 SUBSIDIARY,USA


## 8. Drop the table using DbDataSource object

In [25]:
# Drop the database table of the data source
# (the echoed SQL shows a DROP TABLE followed by a COMMIT)
db_source.drop_table()

2022-01-28 09:59:23,747 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2022-01-28 09:59:23,750 INFO sqlalchemy.engine.Engine 
DROP TABLE "ESG_MATCH_REF"
2022-01-28 09:59:23,752 INFO sqlalchemy.engine.Engine [no key 0.00193s] ()
2022-01-28 09:59:23,776 INFO sqlalchemy.engine.Engine COMMIT


## 9. Close database connection

In [26]:
# Close the connection to the database
db_conn.disconnect()

In [27]:
# Confirm that the connection is closed (expected output: False)
db_conn.is_connected()

False