In [1]:
import os
import chromadb
from dotenv import load_dotenv
from chromadb.utils import embedding_functions

# Load environment variables from a .env file, if one can be found
load_dotenv()

False

#### Create the ChromaDB Client

We structure our data as follows:

```bash
data/
|__ chromadb/ # This is where chromadb stores our data
|__ <your-dataset>.csv # The source CSV that BACKEND_DATA_SOURCE_PATH points to
```

In [6]:
# NOTE: keep these three values in sync with the matching entries in docker-compose.yaml
BACKEND_CHROMADB_PATH = "./data/chromadb"  # directory where ChromaDB persists its data
BACKEND_CHROMADB_COLLECTION_NAME = "np2024-dataset-test"
BACKEND_DATA_SOURCE_PATH = "./data/202407251532-pandas-cleaned.csv"  # CSV imported further below

# On-disk client rooted at BACKEND_CHROMADB_PATH
chroma_client = chromadb.PersistentClient(path=BACKEND_CHROMADB_PATH)

In [7]:
# Fetch the collection if it already exists, otherwise create it.
# No embedding_function is passed here, so ChromaDB's default one is used.
# To plug in a custom model instead, pass e.g.:
#   embedding_function=embedding_functions.SentenceTransformerEmbeddingFunction(model_name="Huffon/sentence-klue-roberta-base")
collection = chroma_client.get_or_create_collection(name=BACKEND_CHROMADB_COLLECTION_NAME)

In [8]:
# Sanity check: the collection created above should be listed by the client
collection_names = [entry.name for entry in chroma_client.list_collections()]
print(collection_names)

['np2024-dataset', 'np2024-dataset-test']


#### Import the dataset

In [10]:
import uuid
import pandas as pd
from glob import glob
from datetime import datetime

# Presumably the columns of the source CSV export; kept for reference, not used below
columns = ['url', 'repository_url', 'id', 'user', 'node_id', 'title', 'state', 'labels', 'created_at', 'updated_at', 'body']

# Node ID: Unique identifier of the defect (a uuid4 is generated per row when missing)
# Title: Name or short description of the defect
# State: Whether the defect is still open or closed
# URL: Link to more details on the defect
# Body: Description of the defect
# Created At: Date the defect was created
required_columns = ["node_id", "title", "state", "url", "body", "created_at"]

df = pd.read_csv(BACKEND_DATA_SOURCE_PATH)
df_columns = df.columns

# title and body cannot be defaulted. Validate them *before* dropna touches
# 'body', otherwise a missing column surfaces as an opaque KeyError instead.
if "title" not in df_columns or "body" not in df_columns:
    raise ValueError(f"title or body not found in df.")

# Rows without a body have nothing to embed
df = df.dropna(subset=['body'])

if "node_id" not in df_columns:
    print("node_id not found, creating default uuid4 column.")
    # One fresh uuid per row: assigning a single scalar would give every row
    # the same id, and the ids are later used as unique keys in ChromaDB.
    df["node_id"] = [uuid.uuid4().hex for _ in range(len(df))]
if "state" not in df_columns:
    print("state not found, creating default state column as open.")
    df["state"] = "open"
if "url" not in df_columns:
    print("url not found, creating default url column as null.")
    df["url"] = None
if "created_at" not in df_columns:
    print("created_at not found, creating default created_at column as today.")
    df["created_at"] = datetime.today()

#### Clean the dataset

In [11]:
import re

def clean_text(text: str):
    """Strip issue-template boilerplate from a raw issue body and normalise it.

    Steps, in order:
      1. Collapse everything from '### Pandas version checks' up to and including
         '### Reproducible Example' into just the '### Reproducible Example' heading.
      2. Remove the '### Installed Versions' section, up to the first '</details>'
         or the end of the text if there is none.
      3. Trim surrounding whitespace, lowercase, and drop the '###' heading markers.
      4. Replace runs of blank lines with a single space.

    Returns the cleaned string.
    """
    ## Change this text cleaning to that specific to your dataset
    version_checks_re = re.compile(r'### Pandas version checks.*?### Reproducible Example', re.DOTALL)
    installed_versions_re = re.compile(r'### Installed Versions.*?(</details>|$)', re.DOTALL)

    without_checks = version_checks_re.sub('### Reproducible Example', text)
    without_versions = installed_versions_re.sub('', without_checks)

    normalised = without_versions.strip().lower()
    normalised = normalised.replace("###", "")

    # Blank-line runs (optionally containing whitespace) become one space
    return re.sub(r'\n\s*\n+', ' ', normalised)

In [12]:
# Run every issue body through clean_text and keep the result next to the raw text
df['cleaned_body'] = df['body'].map(clean_text)

#### Create the dataset on ChromaDB

In [15]:
import uuid
from tqdm import tqdm

chunk_size = 1_000

# Upsert the documents in fixed-size positional windows so a single call never
# has to embed the whole dataset. Stepping the start offset by chunk_size means
# every row is sent exactly once (no overlapping tail chunk to re-embed), and
# an empty frame simply results in no upsert call at all.
for start in tqdm(range(0, len(df), chunk_size)):
    chunk_df = df.iloc[start:start + chunk_size]
    collection.upsert(
        documents=chunk_df["cleaned_body"].to_list(),
        ids=chunk_df["node_id"].to_list(),
    )

#### Test the Vector Database

In [16]:
# Free-text probe: ask the collection for the 10 closest stored documents
query = "pd.ExcelWriter cannot accept an io.BytesIO instance as first arg"
results = collection.query(
    query_texts=[query],
    n_results=10,
)

In [17]:
# Display the raw query response
results

{'ids': [['I_kwDOAA0YD86G8bmy',
   'MDExOlB1bGxSZXF1ZXN0Mzc5MDc2NzM=',
   'MDU6SXNzdWU4MjQwMDA1MzE=',
   'I_kwDOAA0YD85C6Y-i',
   'MDU6SXNzdWU5NzMwNDg0MTE=',
   'MDU6SXNzdWUzMDc5NDg3NQ==',
   'I_kwDOAA0YD850NvPV',
   'I_kwDOAA0YD85gj_j2',
   'I_kwDOAA0YD85ATo1u',
   'MDExOlB1bGxSZXF1ZXN0MjY2NzM0Mjc=']],
 'distances': [[0.6897023916244507,
   0.8048410415649414,
   0.8124653100967407,
   0.829188346862793,
   0.8643962740898132,
   0.8813604116439819,
   0.8921693563461304,
   0.9173852205276489,
   0.9438368082046509,
   0.9569565653800964]],
 'metadatas': [[None, None, None, None, None, None, None, None, None, None]],
 'embeddings': None,
 'documents': [[' reproducible example ```python\nimport pandas as pd\r excel_data = io.bytesio()\r\nwith pd.excelwriter(excel_data, engine="openpyxl", mode="w") as writer:\r\n    pd.dataframe({"a": [1, 2], "b": [3, 4]}).to_excel(writer, sheet_name="example", index=false)\n```  issue description the code above writes a dataframe in an excelfile direc

#### Copy the data/ directory to backend/

After copying, your backend directory should look something like this:

```bash
backend/
|__ data/
|__ src/
|__ README.md
|__ setup.py
|__ pyproject.toml
```