In [9]:
import os
import openai
from IPython.display import Markdown, display
from llama_index import SQLDatabase, ServiceContext
from llama_index.llms import OpenAI
import sqlite3
from sqlalchemy import (
    create_engine,
    MetaData,
    Table,
    Column,
    String,
    Integer,
    select,
)
def inspect_database(db_path):
    """Print the schema and the first five rows of every table in a SQLite file.

    For each table listed in ``sqlite_master`` this prints a header line,
    the tuples returned by ``PRAGMA table_info`` (one per column) and then
    up to five data rows.

    Args:
        db_path: Path of the SQLite database file, passed to
            ``sqlite3.connect``.

    Raises:
        sqlite3.Error: Propagated unchanged if a query fails; the connection
            is closed before the error leaves this function.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Get all tables
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            print(f"\nTable: {table_name} in {db_path}")

            # Identifiers cannot be bound as query parameters, so quote the
            # name instead (doubling embedded quotes). This keeps tables whose
            # names contain spaces, quotes or reserved words working.
            quoted_name = '"' + table_name.replace('"', '""') + '"'

            # Get table schema
            print("Schema:")
            cursor.execute(f"PRAGMA table_info({quoted_name});")
            for column in cursor.fetchall():
                print(column)

            # Get the first five rows
            cursor.execute(f"SELECT * FROM {quoted_name} LIMIT 5;")
            for row in cursor.fetchall():
                print(row)
    finally:
        # Always release the connection (and its cursor), even when a query
        # above raised.
        conn.close()

# SQLite files to inspect (relative paths)
db_files = [
    'files/db/CCLEGisticCNDB.db',
    'files/db/CCLEMutDB.db',
    'files/db/CCLEVarDB.db',
]

# For every file: dump its tables, then build an engine and a fresh MetaData
for db_file in db_files:
    inspect_database(db_file)
    engine = create_engine(f'sqlite:///{db_file}')
    # Nothing is reflected into this object, so the print shows an empty MetaData()
    metadata_obj = MetaData()
    print(metadata_obj)





Table: ccle_cn_gistic in files/db/CCLEGisticCNDB.db
Schema:
(0, 'Hugo_Symbol', 'TEXT', 0, None, 0)
(1, 'CCLE_Name', 'TEXT', 0, None, 0)
(2, 'gistic_cn', 'REAL', 0, None, 0)
(3, 'DepMap_ID', 'TEXT', 0, None, 0)
('A1BG', 'DMS53_LUNG', 0.0, 'ACH-000698')
('A1BG', 'SW1116_LARGE_INTESTINE', 0.0, 'ACH-000489')
('A1BG', 'NCIH1694_LUNG', 0.0, 'ACH-000431')
('A1BG', 'P3HR1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 0.0, 'ACH-000707')
('A1BG', 'HUT78_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 0.0, 'ACH-000509')
MetaData()

Table: ccle_mutation in files/db/CCLEMutDB.db
Schema:
(0, 'Hugo_Symbol', 'TEXT', 0, None, 0)
(1, 'Entrez_Gene_Id', 'INTEGER', 0, None, 0)
(2, 'NCBI_Build', 'INTEGER', 0, None, 0)
(3, 'Chromosome', 'TEXT', 0, None, 0)
(4, 'Start_position', 'INTEGER', 0, None, 0)
(5, 'End_position', 'INTEGER', 0, None, 0)
(6, 'Strand', 'TEXT', 0, None, 0)
(7, 'Variant_Classification', 'TEXT', 0, None, 0)
(8, 'Variant_Type', 'TEXT', 0, None, 0)
(9, 'Reference_Allele', 'TEXT', 0, None, 0)
(10, 'Alternate_All

In [19]:
# SQLite files to inspect (relative paths)
db_files = [
    'files/db/CCLEGisticCNDB.db',
    'files/db/CCLEMutDB.db',
    'files/db/CCLEVarDB.db',
]

# Dump the schema and sample rows of every file
for db_file in db_files:
    inspect_database(db_file)

# SQLAlchemy engine for the CCLEGisticCNDB database, plus an empty MetaData
engine = create_engine('sqlite:///files/db/CCLEGisticCNDB.db')
metadata_obj = MetaData()
print(metadata_obj)

from llama_index.indices.struct_store.sql_query import (
    SQLTableRetrieverQueryEngine,
)
from llama_index.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)

from llama_index import VectorStoreIndex
# LLM configuration; wrapped in a ServiceContext so it can be handed to the
# query engine below.
# Tip: set logging to DEBUG for more detailed outputs.
llm = OpenAI(temperature=0.1, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=llm)
sql_database = SQLDatabase(engine)

# Build an object index over the table schemas so a retriever can select the
# table(s) relevant to a question.
table_node_mapping = SQLTableNodeMapping(sql_database)
table_schema_objs = [
    (SQLTableSchema(table_name="ccle_cn_gistic"))
]  # add a SQLTableSchema for each table

obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
)

# Pass the service context explicitly: without it the engine falls back to
# the library defaults and the `llm` configured above is never used.
query_engine = SQLTableRetrieverQueryEngine(
    sql_database,
    obj_index.as_retriever(similarity_top_k=1),
    service_context=service_context,
)
response = query_engine.query("How many genes are Diploid?")
display(Markdown(f"<b>{response}</b>"))


Table: ccle_cn_gistic in files/db/CCLEGisticCNDB.db
Schema:
(0, 'Hugo_Symbol', 'TEXT', 0, None, 0)
(1, 'CCLE_Name', 'TEXT', 0, None, 0)
(2, 'gistic_cn', 'REAL', 0, None, 0)
(3, 'DepMap_ID', 'TEXT', 0, None, 0)
('A1BG', 'DMS53_LUNG', 0.0, 'ACH-000698')
('A1BG', 'SW1116_LARGE_INTESTINE', 0.0, 'ACH-000489')
('A1BG', 'NCIH1694_LUNG', 0.0, 'ACH-000431')
('A1BG', 'P3HR1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 0.0, 'ACH-000707')
('A1BG', 'HUT78_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', 0.0, 'ACH-000509')

Table: ccle_mutation in files/db/CCLEMutDB.db
Schema:
(0, 'Hugo_Symbol', 'TEXT', 0, None, 0)
(1, 'Entrez_Gene_Id', 'INTEGER', 0, None, 0)
(2, 'NCBI_Build', 'INTEGER', 0, None, 0)
(3, 'Chromosome', 'TEXT', 0, None, 0)
(4, 'Start_position', 'INTEGER', 0, None, 0)
(5, 'End_position', 'INTEGER', 0, None, 0)
(6, 'Strand', 'TEXT', 0, None, 0)
(7, 'Variant_Classification', 'TEXT', 0, None, 0)
(8, 'Variant_Type', 'TEXT', 0, None, 0)
(9, 'Reference_Allele', 'TEXT', 0, None, 0)
(10, 'Alternate_Allele', 'TEXT

<b>There are 691,749 genes that are diploid.</b>

In [None]:
# Helpers for checking LLM output: extract the SQL from a response and sanity-check it
import re 
def extract_sql(self, llm_response: str) -> str:
    """Extract the SQL text from an LLM response.

    The first markdown code block tagged ``sql`` (tag matched
    case-insensitively) is preferred; failing that, the first code block of
    any kind is used. When the response contains no fenced block at all it
    is returned unchanged.

    Args:
        llm_response: Raw text returned by the LLM.

    Returns:
        The contents of the selected code block exactly as they appear
        between the fences (surrounding newlines are not stripped), or
        ``llm_response`` itself when there is no code block.
    """
    # Non-greedy captures stop at the first closing fence, so a response that
    # holds several code blocks yields only the first one instead of
    # everything between the first opening and the last closing fence.
    fence_patterns = (
        r"```sql\n(.*?)```",  # block explicitly tagged as SQL
        r"```(.*?)```",       # any fenced block
    )
    for pattern in fence_patterns:
        match = re.search(pattern, llm_response, re.DOTALL | re.IGNORECASE)
        if match:
            extracted = match.group(1)
            self.log(f"Output from LLM: {llm_response} \nExtracted SQL: {extracted}")
            return extracted

    return llm_response

def is_sql_valid(self, sql: str) -> bool:
    """Return True when ``sql`` contains the SELECT keyword.

    This is a deliberately simple gate for deciding whether generated SQL
    should be run: it only looks for ``SELECT`` as a whole word, in any
    letter case. It does not parse the statement, so a non-SELECT statement
    that embeds a sub-select still passes.

    Args:
        sql: The SQL text to check.

    Returns:
        True if the keyword is present as a standalone word, else False.
    """
    # Word boundaries keep identifiers such as `selected_rows` or
    # `preselect` from being mistaken for the keyword.
    return re.search(r"\bSELECT\b", sql, re.IGNORECASE) is not None
