In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

False

#### Create the ChromaDB Client and set the embedding model

In [2]:
import chromadb
from chromadb.utils import embedding_functions

In [3]:
CHROMADB_PATH="/home/yikuang/workspace/defectsearch/notebooks/data/chromadb"

chroma_client = chromadb.PersistentClient(path=CHROMADB_PATH)

In [4]:
print([entry.name for entry in chroma_client.list_collections()])

['np2024-dataset']


In [5]:
collection = chroma_client.get_or_create_collection(
    name = "np2024-dataset", 
    # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="Huffon/sentence-klue-roberta-base")
)

#### Import the dataset

In [3]:
import pandas as pd
from glob import glob

files = glob("data/github-dataset/warehouse/*.csv")
dfs = [pd.read_csv(file) for file in files]
df = dfs[3].dropna(subset = ['body'])

In [5]:
dfs[3]

Unnamed: 0,url,repository_url,id,user,node_id,title,state,labels,created_at,updated_at,body
0,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,2428583885,sfc-gh-joshi,I_kwDOAA0YD86QwT_N,BUG: GroupBy.value_counts doesn't preserve ori...,open,Bug,2024-07-24T22:41:33Z,2024-07-24T22:41:33Z,### Pandas version checks\n\n- [X] I have chec...
1,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,2424738873,behrenhoff,I_kwDOAA0YD86QhpQ5,BUG: Error message in read_csv misleading when...,open,Bug,2024-07-23T09:39:47Z,2024-07-24T21:36:58Z,### Pandas version checks\r\n\r\n- [X] I have ...
2,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,2423712812,flownt,I_kwDOAA0YD86Qduws,BUG: pandas.to_datetime reports incorrect inde...,open,Bug,2024-07-22T20:26:26Z,2024-07-24T21:44:03Z,### Pandas version checks\n\n- [X] I have chec...
3,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,2421504459,aram-cinnamon,PR_kwDOAA0YD852BM9q,BUG: `query` on columns with characters like #...,open,Bug,2024-07-21T16:08:46Z,2024-07-23T21:01:33Z,- [x] closes #59285\r\n- [x] [Tests added and ...
4,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,2421257349,Jonathan-Wei,I_kwDOAA0YD86QUXSF,BUG: Couldn't run sql: 'Connection' object ha...,open,Bug,2024-07-21T06:53:08Z,2024-07-21T13:35:21Z,### Pandas version checks\n\n- [X] I have chec...
...,...,...,...,...,...,...,...,...,...,...,...
11679,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,1239398,wesm,MDU6SXNzdWUxMjM5Mzk4,DataFrame.__ne__ not implemented,closed,Bug,2011-07-18T02:15:11Z,2011-07-20T19:46:26Z,"See\n\n```\ndm = DataFrame({'col1':[1,2],'col2..."
11680,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,997165,wesm,MDU6SXNzdWU5OTcxNjU=,"""Proper"" boolean array with NA handling in Dat...",closed,Bug,2011-06-03T09:08:28Z,2011-06-14T14:06:41Z,Currently booleans are getting casted to float...
11681,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,934142,surbas,MDU6SXNzdWU5MzQxNDI=,Importing data using HDFStore with pre-epoch d...,closed,Bug,2011-05-20T22:13:31Z,2011-06-23T04:17:27Z,I have data with a DataFrame that goes back to...
11682,https://api.github.com/repos/pandas-dev/pandas...,https://api.github.com/repos/pandas-dev/pandas,358952,wesm,MDU6SXNzdWUzNTg5NTI=,weights option may not be working in pandas.st...,closed,Bug,2010-10-12T16:15:10Z,2011-12-31T16:53:33Z,Need to investigate (user notified)\n


#### Clean the data uploaded to ChromaDB

In [7]:
import re

def clean_text(text: str):
    pattern = re.compile(r'### Pandas version checks.*?### Reproducible Example', re.DOTALL)
    # Replace the matched section with '### Reproducible Example'
    cleaned_text = re.sub(pattern, '### Reproducible Example', text)
    # Regex to match the ### Installed Versions section and its content
    pattern = re.compile(r'### Installed Versions.*?(</details>|$)', re.DOTALL)
    # Replace the matched section with an empty string
    cleaned_text = re.sub(pattern, '', cleaned_text)
    cleaned_text = cleaned_text.strip().lower().replace("###", "")
    cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text)
    return cleaned_text

In [8]:
df['cleaned_body'] = df['body'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_body'] = df['body'].apply(clean_text)


#### Upload the data to ChromaDB

In [9]:
query = "pd.ExcelWriter cannot accept an io.BytesIO instance as first arg"
results = collection.query(query_texts=[query], n_results=10)

#### Import the Pretrained LLM Model

In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np

torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

  from .autonotebook import tqdm as notebook_tqdm
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  2.00it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [43]:
def craft_prompt(query: str, df: pd.DataFrame, chroma_collection, min_ref_docs: int = 2, distance_threshold = 0.1) -> str:

    """
    min_ref_docs : minimum number of documents we want the RAG model to reference.
    """
    
    results = chroma_collection.query(
        query_texts = [query],  # Chroma will embed this for you
        n_results = 10,  # How many results to return
    )

    reference_ids: List[str] = results['ids'][0]
    filtered_result = df[df.node_id.isin(reference_ids)]

    # Get values that deviate less than 0.1 distance away
    documents = np.array(filtered_result['cleaned_body'].to_list())
    distances = np.array(results["distances"][0])

    distance_ids = {k: v for k,v in zip(reference_ids, distances)}
    
    relevant_documents = documents[distances < min(distances) + distance_threshold]

    if len(relevant_documents) < min_ref_docs:
        relevant_documents = documents[:min_ref_docs]
    
    if len(relevant_documents) == 0:
        relevant_documents = documents[:1]  # At least take the top result if none within the threshold

    # Join results with new lines for the context
    context = "\n".join(relevant_documents) 

    return [
        {"role": "system", "content": "You are a helpful AI assistant that answers questions from a database of GitHub issues."},
        {
            "role": "user",
            "content": (
                "Answer the following question using only the provided context. Do not assume or add information beyond what is in the context. \n"
                "If the context does not contain sufficient information to answer the question, explicitly state that the context is insufficient and provide your own suggestions with a clear warning. \n"
                "The context are entries in a database, so your answer should say instead that you had referenced the database. \n"
                "Context:\n"
                "{context}\n\n"
                "Question: {query}\n\n"
                "Here are the entries in the database that might be relevant to your query:\n"
                "{entries}"
            ).format(context=context, query=query, entries="".join([f'- {entry}\n' for entry in relevant_documents]))
        }
    ], distance_ids

def search_term(query, pipe, df, chroma_collection):

    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False,
    }

    prompt, reference_ids = craft_prompt(query, df, chroma_collection)
    output = pipe(prompt, **generation_args)
    
    return output, reference_ids

In [44]:
# Example usage
# Ensure 'collection' is properly initialized and configured
result, reference_ids = search_term(
    "test",
    pipe, 
    df, 
    chroma_collection = collection
)

In [49]:
result

[{'generated_text': ' The context provided does not contain sufficient information to answer the question about the variable number of tests in `test_binops` failing with empty `.attrs` on Ubuntu 18. The context only mentions this issue without providing specific details or a solution. To answer this question, I would need to reference the database for more information on this issue.'}]

In [46]:
print(result[0]['generated_text'])

 The context provided does not contain sufficient information to answer the question about the variable number of tests in `test_binops` failing with empty `.attrs` on Ubuntu 18. The context only mentions this issue without providing specific details or a solution. To answer this question, I would need to reference the database for more information on this issue.


In [47]:
df[df.node_id == "MDU6SXNzdWU0MjYwMTc3MjQ="].title.values

array(['Memory leak with .rolling().max() in pandas 0.24.2 '],
      dtype=object)

In [48]:
reference_ids

{'MDExOlB1bGxSZXF1ZXN0MjAwNTUyNDE5': 1.1448930501937866,
 'MDExOlB1bGxSZXF1ZXN0NTk2NzA4ODY=': 1.163944959640503,
 'MDExOlB1bGxSZXF1ZXN0MTMzOTMyNTU=': 1.1797651052474976,
 'MDExOlB1bGxSZXF1ZXN0MjM3NDE2NzE=': 1.2089669704437256,
 'PR_kwDOAA0YD84wZZ_3': 1.2155227661132812,
 'MDExOlB1bGxSZXF1ZXN0NzIzMzE0MjI0': 1.2461981773376465,
 'MDExOlB1bGxSZXF1ZXN0MTkyNDM4NTY=': 1.2664517164230347,
 'PR_kwDOAA0YD84zGyJw': 1.269770622253418,
 'MDExOlB1bGxSZXF1ZXN0MjA1ODA3MzI0': 1.287787675857544,
 'MDU6SXNzdWU2MjQ1MDgzMTQ=': 1.298712968826294}

In [None]:
query = "groupby value count doesnt preserve my order when i added in the sort argument"

results = search_term(query)

In [None]:
print(results[-1]['generated_text'])