In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.core import StorageContext
from llama_index.core import Settings
import chromadb
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

In [None]:
from llama_index.embeddings.fastembed import FastEmbedEmbedding #10-20x speed thank you!!!
# Embedding model used for both indexing and querying, served through FastEmbed.
EMBED_MODEL_NAME = "BAAI/bge-small-en-v1.5"
embed_model = FastEmbedEmbedding(model_name=EMBED_MODEL_NAME)

In [None]:
from llama_index.core import PromptTemplate
# Model choice (author's notes): the 13B variant kept crashing the machine; AQLM and
# GGUF builds were slow (vLLM speeds them up but is painful to set up), so GPTQ was
# picked as a balance. On CUDA out-of-memory errors, try the smaller settings
# suggested next to the LLM construction; expect somewhat slower runs.
selected_model = "TheBloke/Llama-2-7B-Chat-GPTQ"

# System instructions injected ahead of every user query.
SYSTEM_PROMPT = """You are an AI assistant providing responses in a professional North American business context. Follow these rules:
- Ensure clarity and readability in all responses
- Respond precisely to the specific query without extraneous text.
- Maintain a professional, business-oriented tone.
- Do not use filler phrases or references to the documentation sources such as "based on the context, from the document source, etc"
- Do not use offensive or inappropriate language.

"""

# Wrap the system prompt and the `{query_str}` placeholder in [INST]/<<SYS>> markers.
_prompt_pieces = ["[INST]<<SYS>>\n", SYSTEM_PROMPT, "<</SYS>>\n\n{query_str}[/INST] "]
query_wrapper_prompt = PromptTemplate("".join(_prompt_pieces))

In [None]:
# Local LLM wrapper. The parameters below are the main tuning knobs:
#   - temperature: still needs testing with different values.
#   - device_map: on CUDA out-of-memory, switch to "auto"; per the author's notes this
#     offloads part of the work to the CPU and is much slower.
#   - context_window / max_new_tokens: reduce these when short on time or memory.
llm = HuggingFaceLLM(
    model_name=selected_model,
    tokenizer_name=selected_model,
    query_wrapper_prompt=query_wrapper_prompt,
    device_map="cuda",
    context_window=3200,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.1},
    tokenizer_kwargs={"max_length": 3200},
    model_kwargs={"torch_dtype": torch.float16},
)

In [None]:
# Register the embedding model and the LLM as the global LlamaIndex defaults.
Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
# Recursively load every Markdown (.md) file found under the documentation folder.
DOCS_DIR = "C:/Users/Adel/Desktop/AWS_FULL_DOCS"
documents = SimpleDirectoryReader(
    DOCS_DIR,
    recursive=True,
    required_exts=[".md"],
).load_data()

In [None]:
#This cell is needed for nodes to be created. You can skip this cell if you don't want to use the hybrid retriever. If you do, then all nodes must be created again. 

from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.ingestion import IngestionPipeline
import nest_asyncio

# Works around a Jupyter-specific runtime error; may be unnecessary once this runs
# in a regular backend process.
nest_asyncio.apply()

# Chunking configuration. Background on choosing a chunk size:
# https://blog.llamaindex.ai/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5
# According to the author these match the library defaults; they are spelled out so
# they are easy to tweak.
text_splitter = TokenTextSplitter(
    chunk_size=1024,
    chunk_overlap=20,
    separator=" ",
)

# Single-step pipeline: split the loaded documents into nodes.
pipeline = IngestionPipeline(transformations=[text_splitter])
nodes = pipeline.run(documents=documents, show_progress=True)

# TODO: Re-add metadata extraction (title, question/answer pairs, etc.) to the nodes.
#       It was removed because it was extremely slow, but it improves result quality:
#       acceptable to skip for the demo, required for the final product.
# TODO: Persist the nodes. The hybrid retriever needs them, and re-running the ingestion
#       pipeline on every session is not practical (even though it only takes a few minutes).

In [None]:
# Build the embeddings once and persist them in ChromaDB.
# Safe to re-run: if the persistent collection already holds vectors, the index is
# attached to the stored embeddings instead of embedding and inserting every node
# again (which would duplicate entries in the collection).
# NOTE(review): a collection left half-filled by an interrupted run also counts as
# "populated" here; delete ./chroma_db to force a clean rebuild in that case.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("aws_documentation_test")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

if chroma_collection.count() > 0:
    # Embeddings already exist on disk: reuse them.
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=embed_model
    )
else:
    # Empty collection: embed all nodes and write them to Chroma.
    index = VectorStoreIndex(
        nodes, storage_context=storage_context, embed_model=embed_model, show_progress=True
    )


In [None]:
# Attach an index to the embeddings already persisted in ChromaDB (no re-embedding).
CHROMA_PATH = "./chroma_db"
CHROMA_COLLECTION_NAME = "aws_documentation_test"

db2 = chromadb.PersistentClient(path=CHROMA_PATH)
chroma_collection = db2.get_or_create_collection(CHROMA_COLLECTION_NAME)
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

In [None]:
#https://medium.com/@trent.niemeyer/10k-gpt-upgrading-fb94603cc38b
#https://docs.llamaindex.ai/en/stable/examples/retrievers/bm25_retriever.html
from llama_index.retrievers.bm25 import BM25Retriever

# Each retriever returns its 10 best-matching nodes.
RETRIEVER_TOP_K = 10

# Embedding-similarity retrieval over the vector index.
vector_retriever = index.as_retriever(similarity_top_k=RETRIEVER_TOP_K)

# BM25 retrieval over the in-memory nodes produced by the ingestion pipeline.
bm25_retriever = BM25Retriever.from_defaults(
    similarity_top_k=RETRIEVER_TOP_K,
    nodes=nodes,
)

In [None]:
from llama_index.core.retrievers import BaseRetriever

class HybridRetriever(BaseRetriever):
    """Retriever that merges the results of a BM25 retriever and a vector retriever.

    Both wrapped retrievers receive the same query. Their results are concatenated
    (BM25 results first) and de-duplicated by node id, keeping the first occurrence.
    Scores are left exactly as each underlying retriever returned them.
    """

    def __init__(self, vector_retriever, bm25_retriever):
        """Keep references to the two underlying retrievers, then initialise the base class."""
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        """Return the de-duplicated union of BM25 and vector results for ``query``."""
        keyword_hits = self.bm25_retriever.retrieve(query, **kwargs)
        embedding_hits = self.vector_retriever.retrieve(query, **kwargs)

        # Dicts preserve insertion order, and setdefault keeps the first hit seen for
        # each node id, so BM25 results win over duplicate vector results.
        first_hit_by_id = {}
        for hit in keyword_hits + embedding_hits:
            first_hit_by_id.setdefault(hit.node.node_id, hit)
        return list(first_hit_by_id.values())

In [None]:
# Combine the vector and BM25 retrievers into a single hybrid retriever.
# The former bare `index.as_retriever(similarity_top_k=5)` statement was dropped: its
# return value was never assigned or used, so it had no effect on the pipeline.
hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)

In [None]:
from llama_index.core.postprocessor import SentenceTransformerRerank

# Rerank the retrieved nodes with BAAI/bge-reranker-base and keep the 4 best.
reranker = SentenceTransformerRerank(
    model="BAAI/bge-reranker-base",
    top_n=4,
)

In [None]:
import pandas as pd
from llama_index.core.query_engine import RetrieverQueryEngine

# Evaluation run: answer every question in the QA sheet and save the results.
QA_INPUT_CSV = "C:/Users/Adel/Downloads/QA_true.csv"
QA_OUTPUT_CSV = "C:/Users/Adel/Downloads/QA_Updated.csv"
# Row position to start from; an earlier run crashed midway, so it resumes at 53.
START_ROW = 53

df = pd.read_csv(QA_INPUT_CSV)
if 'Generated_answer' not in df.columns:
    df['Generated_answer'] = ''
# Keep the column as object dtype so text answers can be written into it even when
# the CSV loaded it as an all-empty (NaN) column.
df['Generated_answer'] = df['Generated_answer'].astype(object)


# Slower than the standard query engine (roughly 2x: ~30s instead of ~15s per query)
# but gives better results thanks to the custom hybrid retriever and the reranker.
query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    node_postprocessors=[reranker],
)

def get_answer(question):
    """Run ``question`` through the query engine and return its response object."""
    response = query_engine.query(question)
    return response

# The loop variable is deliberately NOT called `index`: that name is the vector index
# used by the retriever cells, and rebinding it here would break re-running them.
for row_label, row in df.iloc[START_ROW:].iterrows():
    existing_answer = row['Generated_answer']
    if pd.isna(existing_answer) or existing_answer == '':
        question = row['Question']
        # Store plain text rather than the response object.
        answer = str(get_answer(question))
        df.at[row_label, 'Generated_answer'] = answer
        # Checkpoint after every answer so a crash does not lose finished work.
        df.to_csv(QA_OUTPUT_CSV, index=False)
        print(f"Question: {question}\nAnswer: {answer}")
    else:
        print(f"Question: {row['Question']}\nAnswer: {existing_answer} (Already answered)")
    print(f"Completed {row_label+1}/{len(df)} questions")

# Final write; also covers the case where no new answers were generated.
df.to_csv(QA_OUTPUT_CSV, index=False)

In [None]:
# Recovery step after a CUDA OUT OF MEMORY error.
# torch.cuda.empty_cache() only returns cached blocks that no live tensor occupies,
# so calling it repeatedly releases nothing extra. Run the garbage collector first so
# tensors that are no longer reachable are actually freed, then empty the cache once.
# Memory held by objects that are still referenced (e.g. the loaded model) is only
# released by deleting those references or restarting the kernel.
import gc

gc.collect()
torch.cuda.empty_cache()