In [1]:
# Dependencies for the whole notebook, plus one-time session setup.
import os
from pyprojroot import here
import pandas as pd
import chromadb
from openai import AzureOpenAI, OpenAI
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
from dotenv import load_dotenv
# Prints the boolean returned by load_dotenv().
print(load_dotenv())

True


In [2]:
# Read the credentials from the environment; os.environ[...] raises KeyError
# when a variable is missing.
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
# NOTE(review): not referenced by any cell visible in this notebook.
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [3]:
# API client used for embeddings and chat completions.
# NOTE: despite the "azure_" prefix this is the plain OpenAI client; only the
# API key is passed to it.
azure_client = OpenAI(api_key=azure_openai_api_key)

# Vector store persisted on disk under data/chroma (path built with here()).
chroma_client = chromadb.PersistentClient(
    path=str(here("data/chroma")),
)

**Create a collection for data ingestion**

Note: `create_collection` raises an error if a collection with this name already exists.

In [4]:
# Reuse the collection when it is already present in the persistent store and
# create it otherwise. A plain create_collection() raises as soon as the
# collection exists, which makes this cell fail on every re-run of the notebook.
collection = chroma_client.get_or_create_collection(name="datasets_rows")

In [5]:
# Location of the CSV to ingest (a file path, despite the variable name).
file_dir = here("data/for_upload/datasets_rows.csv")
# Load at most the first five data rows.
df = pd.read_csv(filepath_or_buffer=file_dir, nrows=5)

In [6]:
# Display the loaded rows (the read above is capped at nrows=5).
df

Unnamed: 0,dataset_id,user_id,name,dataset_type,file_extension,is_active,version,status,dataset_size,viewed_at,created_at,updated_at,is_shared,organization_id
0,3,1,Cars,TEXT,CSV,True,,,366,,2024-11-18 04:17:42.552+00,2024-11-18 04:17:42.552+00,False,
1,4,1,Random,TEXT,CSV,True,,,732,,2024-11-18 04:29:59.159+00,2024-11-18 04:29:59.159+00,False,
2,5,2,Test,TEXT,CSV,True,,,13192,,2024-11-21 02:01:07.822+00,2024-11-21 02:01:07.822+00,False,


NOTE: Process the rows in chunks if the dataset is large.

In [7]:
# Turn every row of `df` into one text chunk, then embed all chunks.
#
# Produces four parallel lists, aligned by position:
#   docs       - "col: value,\n" for every column of the row, concatenated
#   metadatas  - constant {"source": "datasets_rows"} per row
#   ids        - "id<index>" built from the DataFrame index label
#   embeddings - one vector per chunk from the embeddings endpoint
#
# The chunks are embedded in batches instead of one request per row, so the
# number of API round trips grows with len(df) / EMBEDDING_BATCH_SIZE.
EMBEDDING_BATCH_SIZE = 100  # chunks sent per embeddings request

docs = []
metadatas = []
ids = []
for index, row in df.iterrows():
    # Treat each row as a separate chunk
    docs.append("".join(f"{col}: {row[col]},\n" for col in df.columns))
    metadatas.append({"source": "datasets_rows"})
    ids.append(f"id{index}")

embeddings = []
for start in range(0, len(docs), EMBEDDING_BATCH_SIZE):
    response = azure_client.embeddings.create(
        input=docs[start:start + EMBEDDING_BATCH_SIZE],
        model="text-embedding-3-small",
    )
    # Order by the index reported for each item so embeddings[i] pairs with docs[i].
    embeddings.extend(
        item.embedding for item in sorted(response.data, key=lambda item: item.index)
    )

# Every chunk must have exactly one vector before anything is written to the store.
assert len(embeddings) == len(docs), "embedding count does not match chunk count"

In [8]:
# Inspect the text chunks built from the rows.
docs

['dataset_id: 3,\nuser_id: 1,\nname: Cars,\ndataset_type: TEXT,\nfile_extension: CSV,\nis_active: True,\nversion: nan,\nstatus: nan,\ndataset_size: 366,\nviewed_at: nan,\ncreated_at: 2024-11-18 04:17:42.552+00,\nupdated_at: 2024-11-18 04:17:42.552+00,\nis_shared: False,\norganization_id: nan,\n',
 'dataset_id: 4,\nuser_id: 1,\nname: Random,\ndataset_type: TEXT,\nfile_extension: CSV,\nis_active: True,\nversion: nan,\nstatus: nan,\ndataset_size: 732,\nviewed_at: nan,\ncreated_at: 2024-11-18 04:29:59.159+00,\nupdated_at: 2024-11-18 04:29:59.159+00,\nis_shared: False,\norganization_id: nan,\n',
 'dataset_id: 5,\nuser_id: 2,\nname: Test,\ndataset_type: TEXT,\nfile_extension: CSV,\nis_active: True,\nversion: nan,\nstatus: nan,\ndataset_size: 13192,\nviewed_at: nan,\ncreated_at: 2024-11-21 02:01:07.822+00,\nupdated_at: 2024-11-21 02:01:07.822+00,\nis_shared: False,\norganization_id: nan,\n']

In [9]:
# Show the per-chunk metadata and ids, one list per line.
print(metadatas, ids, sep="\n")

[{'source': 'datasets_rows'}, {'source': 'datasets_rows'}, {'source': 'datasets_rows'}]
['id0', 'id1', 'id2']


In [10]:
# Peek at the first 10 components of the first chunk's embedding vector.
embeddings[0][:10]

[-0.015110057778656483,
 -0.02865656279027462,
 0.023209765553474426,
 -0.029528051614761353,
 0.02415814995765686,
 -0.057723239064216614,
 -0.01983916014432907,
 -0.008785364218056202,
 -0.014981897547841072,
 -0.02452981285750866]

In [11]:
# Write the chunks to the collection; the four lists are aligned by position.
collection.add(
    ids=ids,
    embeddings=embeddings,
    metadatas=metadatas,
    documents=docs,
)

Verify the vectorDB creation

In [12]:
# Report how many records the collection now holds.
print(f"Number of vectors in vectordb: {collection.count()}")

Number of vectors in vectordb: 3


### RAG

In [14]:
# NOTE(review): both names are already imported in the notebook's first cell,
# so this re-import is redundant.
from openai import AzureOpenAI, OpenAI

In [15]:
# Embedding model used for the query embeddings in the cells below.
model_name = "text-embedding-3-small"
# The same two environment variables are already read near the top of the
# notebook; os.environ[...] raises KeyError when a variable is missing.
azure_openai_api_key = os.environ["OPENAI_API_KEY"]
# NOTE(review): not referenced by any cell visible in this notebook.
azure_openai_endpoint = os.environ["OPENAI_API_BASE"]

In [16]:
# Rebuild the API client for the RAG section. os.getenv returns None (instead
# of raising) when OPENAI_API_KEY is not set.
azure_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

**Perform similarity search**

**Generate embedding for Records**

In [None]:
# Alternative queries kept for reference:
# query_texts = "what's the name of the dataset with dataset_id = 4?"
# query_texts = "Using the schema from {schema_name} and associated records from {schema_samples}, match records from source {source_name} and return the table name and column name associated with each cell in the record."
#
# NOTE(review): this is a plain string that is never formatted, so
# {schema_name}, {schema_samples} and {input_name} are embedded as literal text.
query_texts = "Using the schema from {schema_name} and associated records from {schema_samples}, match records from input {input_name} return closest records."

# Embed the query with the same model name used for the stored chunks.
response = azure_client.embeddings.create(model=model_name, input=query_texts)
query_embeddings = response.data[0].embedding

In [None]:
# Alternative query kept for reference:
# query_texts = "what's the name of the dataset with dataset_id = 4?"
#
# NOTE(review): this is a plain string that is never formatted, so {name} and
# {query} are embedded as literal text.
query_texts = "in the collection {name}, find the closest matching table and column names to the query {query}."

# Replaces any query_texts / query_embeddings set by an earlier cell.
response = azure_client.embeddings.create(model=model_name, input=query_texts)
query_embeddings = response.data[0].embedding

In [None]:
# Alternative query kept for reference:
# query_texts = "what's the name of the dataset with dataset_id = 4?"
#
# NOTE(review): this is a plain string that is never formatted, so {name} and
# {query} are embedded as literal text.
query_texts = "in the collection {name}, find the closest matching table and column names to the query {query}."

# Embed the query and show the first 10 components of its vector.
response = azure_client.embeddings.create(model=model_name, input=query_texts)
query_embeddings = response.data[0].embedding
print(query_embeddings[:10])

[0.0029961096588522196, -0.01976878196001053, 0.03094029799103737, -0.004394088871777058, 0.01625843718647957, 0.0034641555976122618, -0.017637940123677254, 0.02667861618101597, -0.020433899015188217, -0.042937055230140686]


**Load the chromaDB collection for vector search**

In [None]:
# Look the collection up by name and show how many records it holds.
vectordb = chroma_client.get_collection(name="datasets_rows")
vectordb.count()

In [None]:
# Similarity search with the query vector; n_results is the top-k, so only the
# single closest record is returned.
results = vectordb.query(n_results=1, query_embeddings=query_embeddings)

results

Pass the results to an LLM

In [19]:
# System instruction plus a user turn carrying the query text and the raw
# search results; together they form the chat payload.
system_role = "You will recieve the user's question along with the search results of that question over a database. Generate SQL insert statements based on those answers for each respective table."
prompt = f"User's question: {query_texts} \n\n Search results:\n {results}"

messages = [
    dict(role="system", content=system_role),
    dict(role="user", content=prompt),
]

In [20]:
# Send the prepared messages to the chat endpoint. The model name comes from
# the environment; os.getenv yields None when gpt_deployment_name is unset.
response = azure_client.chat.completions.create(
    messages=messages,
    model=os.getenv("gpt_deployment_name"),
)

In [None]:
# Text of the first choice returned by the chat completion call.
response.choices[0].message.content

**Fact check**

In [None]:
# Re-display the loaded rows, presumably to compare them with the LLM output above.
df