In [3]:
import os
from dotenv import load_dotenv

load_dotenv()

False

#### Create the ChromaDB Client and set the embedding model

In [4]:
import chromadb
from chromadb.utils import embedding_functions

In [5]:
CHROMADB_PATH="/home/yikuang/workspace/defectsearch/notebooks/data/chromadb"
CHROMADB_COLLECTION_NAME="np2024-dataset"

chroma_client = chromadb.PersistentClient(path=CHROMADB_PATH)

In [6]:
print([entry.name for entry in chroma_client.list_collections()])

['np2024-dataset']


In [7]:
collection = chroma_client.get_or_create_collection(
    name = CHROMADB_COLLECTION_NAME, 
    # embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="Huffon/sentence-klue-roberta-base")
)

#### Import the dataset

In [8]:
import pandas as pd
from glob import glob

files = glob("data/github-dataset/warehouse/*.csv")
dfs = [pd.read_csv(file) for file in files]
df = dfs[3].dropna(subset = ['body'])

#### Clean the data uploaded to ChromaDB

In [9]:
import re

def clean_text(text: str):
    pattern = re.compile(r'### Pandas version checks.*?### Reproducible Example', re.DOTALL)
    # Replace the matched section with '### Reproducible Example'
    cleaned_text = re.sub(pattern, '### Reproducible Example', text)
    # Regex to match the ### Installed Versions section and its content
    pattern = re.compile(r'### Installed Versions.*?(</details>|$)', re.DOTALL)
    # Replace the matched section with an empty string
    cleaned_text = re.sub(pattern, '', cleaned_text)
    cleaned_text = cleaned_text.strip().lower().replace("###", "")
    cleaned_text = re.sub(r'\n\s*\n+', ' ', cleaned_text)
    return cleaned_text

In [10]:
df['cleaned_body'] = df['body'].apply(clean_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_body'] = df['body'].apply(clean_text)


#### Upload the data to ChromaDB

In [11]:
query = "pd.ExcelWriter cannot accept an io.BytesIO instance as first arg"
results = collection.query(query_texts=[query], n_results=10)

#### Import the Pretrained LLM Model

In [12]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import numpy as np

torch.random.manual_seed(0)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████████████████| 2/2 [00:01<00:00,  1.74it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
def craft_prompt(query: str, df: pd.DataFrame, chroma_collection, min_ref_docs: int = 2, distance_threshold = 0.1) -> str:

    """
    min_ref_docs : minimum number of documents we want the RAG model to reference.
    """
    
    results = chroma_collection.query(
        query_texts = [query],  # Chroma will embed this for you
        n_results = 10,  # How many results to return
    )

    reference_ids: List[str] = results['ids'][0]
    filtered_result = df[df.node_id.isin(reference_ids)]

    # Get values that deviate less than 0.1 distance away
    documents = np.array(filtered_result['cleaned_body'].to_list())
    distances = np.array(results["distances"][0])

    distance_ids = {k: v for k,v in zip(reference_ids, distances)}
    
    relevant_documents = documents[distances < min(distances) + distance_threshold]

    if len(relevant_documents) < min_ref_docs:
        relevant_documents = documents[:min_ref_docs]
    
    if len(relevant_documents) == 0:
        relevant_documents = documents[:1]  # At least take the top result if none within the threshold

    # Join results with new lines for the context
    context = "\n".join(relevant_documents) 

    return relevant_documents

    return [
        {"role": "system", "content": "You are a helpful AI assistant that answers questions from a database of GitHub issues."},
        {
            "role": "user",
            "content": (
                "Answer the following question using only the provided context. Do not assume or add information beyond what is in the context. \n"
                "If the context does not contain sufficient information to answer the question, explicitly state that the context is insufficient and provide your own suggestions with a clear warning. \n"
                "The context are entries in a database, so your answer should say instead that you had referenced the database. \n"
                "Context:\n"
                "{context}\n\n"
                "Question: {query}\n\n"
                "Here are the entries in the database that might be relevant to your query:\n"
                "{entries}"
            ).format(context=context, query=query, entries="".join([f'- {entry}\n' for entry in relevant_documents]))
        }
    ], distance_ids

def search_term(query, pipe, df, chroma_collection):

    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False,
    }

    prompt = craft_prompt(query, df, chroma_collection)

    return prompt
    
    output = pipe(prompt, **generation_args)
    
    return output, reference_ids

In [20]:
# Example usage
# Ensure 'collection' is properly initialized and configured
query = "test"

relevant_documents = search_term(
    query,
    pipe, 
    df, 
    chroma_collection = collection
)

In [24]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-v2-m3')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-v2-m3')
model.eval()

# # Define pairs for similarity computation
# query = 'china has bears?'
# passages = [
#     'hi',
#     'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.'
# ]

# Prepare input pairs
pairs = [[query, passage] for passage in relevant_documents]

# Tokenize and compute scores
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    outputs = model(**inputs, return_dict=True)
    scores = outputs.logits.squeeze().float()  # Adjust based on model output shape
    print(scores)

# Pair the scores with the other list
paired_list = list(zip(scores, relevant_documents))
# Sort the pairs based on the scores in descending order
sorted_pairs = sorted(paired_list, key=lambda x: x[0], reverse=True)
# Separate the sorted pairs back into two lists
sorted_scores, sorted_other_list = zip(*sorted_pairs)

tensor([-10.9792,  -7.9862, -11.0184, -11.0404, -11.0475])
