In [None]:
! pip install --upgrade --quiet python-dotenv llama-index-llms-openai llama-index llama-index-embeddings-openai

In [1]:
from llama_index.core.indices.struct_store import SQLTableRetrieverQueryEngine
from llama_index.core import VectorStoreIndex, SQLDatabase, Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.query_engine import NLSQLTableQueryEngine

from llama_index.core.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)

from sqlalchemy import (
    create_engine,
    text
)

import os
from dotenv import load_dotenv
load_dotenv()

True

### In this document
The first few cells go over the general SQL query engine provided by LlamaIndex. They demonstrate how to get started via SQLAlchemy and show the limitations of NLSQLTableQueryEngine.
You will see that, with a large number of tables and/or columns, you either exceed the model's context window or get a query that does not work because the model is confused by the size of the schema.

### The dataset used
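I chose [this database](https://www.kaggle.com/datasets/wyattowalsh/basketball) from Kaggle because it contained 16 tables with 682 columns. The dataset was last updated on Thurs, July 26th, 2023. Download the database and bring the .sqlite file into the directory. **Note:** the cells below connect to `olist.sqlite`, and their saved outputs list 11 tables, which does not match the description above — TODO: confirm which dataset this notebook is meant to use.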

### Reference Documentation
I'm following [this documentation](https://docs.llamaindex.ai/en/stable/examples/index_structs/struct_indices/duckdb_sql_query/) from LlamaIndex for reference.

In [2]:
# Location of the SQLite database file, relative to the notebook's working directory.
DB_PATH = "olist.sqlite"

# SQLite creates a brand-new, empty database when the target file is missing, which
# would only surface later as an empty table list. Fail fast with a clear message instead.
if not os.path.isfile(DB_PATH):
    raise FileNotFoundError(
        f"SQLite database not found at {DB_PATH!r}; download it and place it in the working directory."
    )

# SQLAlchemy connection URL and the engine shared by the cells below.
db_file = f"sqlite:///{DB_PATH}"
engine = create_engine(db_file)

In [3]:
# Collect the name of every user-defined table; these names populate the vector index
# built further down.
# In a LIKE pattern `_` matches any single character, so it is escaped here to exclude
# only names that really start with SQLite's reserved "sqlite_" prefix.
TABLE_NAMES_SQL = (
    "SELECT name FROM sqlite_master "
    "WHERE type='table' AND name NOT LIKE 'sqlite^_%' ESCAPE '^';"
)
with engine.connect() as con:
    # Each result row has a single column: the table name.
    all_table_names = [row[0] for row in con.execute(text(TABLE_NAMES_SQL))]
all_table_names

['product_category_name_translation',
 'sellers',
 'customers',
 'geolocation',
 'order_items',
 'order_payments',
 'order_reviews',
 'orders',
 'products',
 'leads_qualified',
 'leads_closed']

In [6]:
# Chat model and embedding model for the notebook. OPENAI_API_KEY is read from the
# environment, so make sure it is present in your .env file.
llm = OpenAI(
    model="gpt-4-turbo",
    temperature=0.1,
    api_key=os.getenv("OPENAI_API_KEY"),
)
embedding = OpenAIEmbedding(model="text-embedding-3-small")

# Register both models on the global LlamaIndex Settings object.
Settings.llm = llm
Settings.embed_model = embedding

In [7]:
# Wrap the SQLAlchemy engine for LlamaIndex, limited to the table names collected above.
sql_database = SQLDatabase(
    engine,
    include_tables=all_table_names,
)

In [8]:
# Mapping between SQL table schema objects and index nodes for this database.
table_node_mapping = SQLTableNodeMapping(sql_database)

# One schema object per table (no extra context string is supplied), echoed for inspection.
table_schema_objs = [SQLTableSchema(table_name=name) for name in all_table_names]
for schema in table_schema_objs:
    print(schema)

# Build an object index over the table schemas, backed by a vector store index.
obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
)

table_name='product_category_name_translation' context_str=None
table_name='sellers' context_str=None
table_name='customers' context_str=None
table_name='geolocation' context_str=None
table_name='order_items' context_str=None
table_name='order_payments' context_str=None
table_name='order_reviews' context_str=None
table_name='orders' context_str=None
table_name='products' context_str=None
table_name='leads_qualified' context_str=None
table_name='leads_closed' context_str=None


In [9]:
# Write the object index to the ./db directory so it can be reloaded later.
# NOTE(review): the saved output under this cell looks like a warning echoing this line —
# presumably the node mapping itself is not persisted; confirm against the library docs.
obj_index.persist('./db')

  obj_index.persist('./db')


In [10]:
# Rebuild the object index from what was persisted under ./db. The node mapping is
# passed in explicitly instead of being read back from disk.
reloaded_object_index = ObjectIndex.from_persist_dir(
    './db',
    object_node_mapping=table_node_mapping,
)

In [11]:
# Retriever over the reloaded table-schema index; similarity_top_k=1 asks it for a single
# best match per question.
table_retriever = reloaded_object_index.as_retriever(similarity_top_k=1)

# Query engine built from the SQL database and the table retriever.
updated_query_engine = SQLTableRetrieverQueryEngine(sql_database, table_retriever)

In [15]:
# Natural-language question to ask the database.
query_str = (
    "What is the best and worst reviewed product category "
    "based on order reviews?"
)

# Run the question through the retriever-backed SQL query engine.
obj_based_response = updated_query_engine.query(query_str)

In [16]:
# Report three parts of the response: the model's natural-language answer, the text of
# the first source node, and the SQL stored under metadata['sql_query'].
# Indexing source_nodes[0] assumes at least one source node was returned — TODO confirm
# this always holds for this query engine.
print(f"""
AI Response
{obj_based_response.response}

Query Output
----------------------
{obj_based_response.source_nodes[0].text}
      
Attempted Query
----------------------
{obj_based_response.metadata['sql_query']}

""")


AI Response
The best-reviewed product category based on order reviews is 'cds_dvds_musicais' with an average review score of 4.64. On the other hand, the worst-reviewed product category is 'seguros_e_servicos' with a significantly lower average review score of 2.5. These scores indicate a notable difference in customer satisfaction across different categories, highlighting areas where improvements could be beneficial.

Query Output
----------------------
[('cds_dvds_musicais', 4.642857142857143), ('fashion_roupa_infanto_juvenil', 4.5), ('livros_interesse_geral', 4.446265938069216), ('construcao_ferramentas_ferramentas', 4.444444444444445), ('flores', 4.419354838709677), ('livros_importados', 4.4), ('livros_tecnicos', 4.368421052631579), ('alimentos_bebidas', 4.315412186379929), ('malas_acessorios', 4.315257352941177), ('portateis_casa_forno_e_cafe', 4.302631578947368), ('fashion_esporte', 4.258064516129032), ('fashion_calcados', 4.233716475095785), ('alimentos', 4.218181818181818), ('

In [17]:
from sqlalchemy.exc import OperationalError

# Re-run the SQL recorded in the response metadata directly against the database and
# print every returned row, so the raw result can be compared with the model's answer.
generated_sql = text(obj_based_response.metadata['sql_query'])
try:
    with engine.connect() as con:
        rows = con.execute(generated_sql)
        for row in rows:
            print(row)
except OperationalError as err:
    # Report only the first argument of the database error instead of a full traceback.
    print(f"Error: {err.args[0]}")


('cds_dvds_musicais', 4.642857142857143)
('fashion_roupa_infanto_juvenil', 4.5)
('livros_interesse_geral', 4.446265938069216)
('construcao_ferramentas_ferramentas', 4.444444444444445)
('flores', 4.419354838709677)
('livros_importados', 4.4)
('livros_tecnicos', 4.368421052631579)
('alimentos_bebidas', 4.315412186379929)
('malas_acessorios', 4.315257352941177)
('portateis_casa_forno_e_cafe', 4.302631578947368)
('fashion_esporte', 4.258064516129032)
('fashion_calcados', 4.233716475095785)
('alimentos', 4.218181818181818)
('musica', 4.2105263157894735)
('cine_foto', 4.205479452054795)
('papelaria', 4.193857199840447)
('pet_shop', 4.185146982980918)
('pcs', 4.175)
('eletrodomesticos', 4.172456575682382)
('perfumaria', 4.161940952937737)
('brinquedos', 4.158640919090687)
('instrumentos_musicais', 4.152592592592592)
('eletroportateis', 4.149187592319055)
('cool_stuff', 4.146341463414634)
('fashion_bolsas_e_acessorios', 4.144678764100049)
('eletrodomesticos_2', 4.142857142857143)
('beleza_saud