In [1]:
import os

import chromadb
import torch
from IPython.display import Markdown, display
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.core import Settings
from llama_index.core import StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.embeddings.huggingface_optimum import OptimumEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.vector_stores.chroma import ChromaVectorStore

In [2]:
# Embedding model shared by index construction and the global Settings.
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

In [None]:
# Folder holding the source documents. Set the DOCUMENTS_DIR environment
# variable to point somewhere else; the default is the original author's
# local path, so existing setups keep working unchanged.
DOCUMENTS_DIR = os.environ.get(
    "DOCUMENTS_DIR", "C:/Users/skfrqt/Desktop/jk/milenium3/documents"
)

# Walk the folder recursively and load only Markdown and PDF files.
documents = SimpleDirectoryReader(
    DOCUMENTS_DIR,
    recursive=True,
    required_exts=[".md", ".pdf"],
).load_data()

In [None]:
from llama_index.core import PromptTemplate

# Model choice notes (author's experiments on their own machine):
# - The 13B variant kept crashing the PC.
# - AQLM/AQL and GGUF builds were slow; vLLM speeds things up but is very
#   painful to set up.
# - GPTQ is a nice balance.
# If you hit CUDA out-of-memory errors, try a model with fewer parameters:
# it will be a bit slower but runs on most devices.
selected_model = "TheBloke/Llama-2-7B-Chat-GPTQ"

# Standing instructions prepended to every query.
SYSTEM_PROMPT = """You are an AI assistant that answers questions in a friendly manner, based on the given source documents. Here are some rules you always follow:
- Generate human readable output, avoid creating output with gibberish text.
- Generate only the requested output, don't include any other language before or after the requested output.
- Never say thank you, that you are happy to help, that you are an AI agent, etc. Just answer directly.
- Generate professional language typically used in business documents in North America.
- Never generate offensive or foul language.
"""

# Wrap each query in [INST]...[/INST], with the system prompt placed inside
# the <<SYS>> tags ahead of the {query_str} placeholder.
_template_parts = (
    "[INST]<<SYS>>\n",
    SYSTEM_PROMPT,
    "<</SYS>>\n\n{query_str}[/INST] ",
)
query_wrapper_prompt = PromptTemplate("".join(_template_parts))

In [None]:
# Token budget used for both the LLM context window and the tokenizer limit,
# kept in one place so the two values cannot drift apart.
CONTEXT_WINDOW = 3900

# NOTE: the numbers below are either arbitrary or taken from the docs and have
# not been tuned yet -- experiment with them.
llm = HuggingFaceLLM(
    context_window=CONTEXT_WINDOW,
    max_new_tokens=256,
    # Greedy decoding. `temperature` only has an effect when do_sample=True,
    # so it is not passed here; to experiment with sampling, set
    # do_sample=True and add a temperature.
    generate_kwargs={"do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name=selected_model,
    model_name=selected_model,
    device_map="cuda",
    tokenizer_kwargs={"max_length": CONTEXT_WINDOW},
    # Lower precision (float16), passed at construction time. Re-assigning
    # llm.model_kwargs afterwards would not change the already-loaded model.
    model_kwargs={"torch_dtype": torch.float16},
)


In [None]:
# Register the embedding model and the LLM on llama_index's shared Settings
# object so later cells can rely on them without passing them explicitly.
Settings.embed_model = embed_model
Settings.llm = llm

In [None]:
# Open (or create) the on-disk Chroma database and the collection that backs
# the index.
db = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db.get_or_create_collection("math_docs")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# The collection is persistent, so ingesting on every run would append the
# same documents again and fill retrieval results with duplicate chunks.
# Only embed and store the documents when the collection is still empty;
# otherwise attach to what is already stored.
# To force a re-ingest (e.g. after the source documents changed), delete the
# "math_docs" collection or the ./chroma_db folder first.
if chroma_collection.count() == 0:
    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, embed_model=embed_model
    )
else:
    index = VectorStoreIndex.from_vector_store(
        vector_store, embed_model=embed_model
    )

In [None]:
# Re-open the persisted Chroma collection and build the index directly on top
# of the stored vectors, without going through the source documents again.
db2 = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = db2.get_or_create_collection("math_docs")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

In [None]:
# Ask a single question against the index and render the answer in bold.
question = "Tell me the rules of multiplying matrices if they are not square matrices"

query_engine = index.as_query_engine()
response = query_engine.query(question)
display(Markdown(f"<b>{response}</b>"))