# this is Pixegami tutorial
### from this video :https://www.youtube.com/watch?v=tcqEUSNCn8I

In [30]:
from dotenv import load_dotenv
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime

load_dotenv()
directory_path='/Users/matansharon/python/chat_with_docs/data/text'

def load_and_split_documents():
    loader = DirectoryLoader(directory_path)
    documents = loader.load()
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True
    )
    chunks=text_splitter.split_documents(documents)
    return chunks
def create_db(chunks):
    path='chroma_db'
    if not os.path.exists(path):
        
        db=Chroma.from_documents(documents=chunks,embedding=OpenAIEmbeddings(),persist_directory=path)
        return db
    return load_db()
def load_db():
    db = Chroma(persist_directory="chroma_db",embedding_function=OpenAIEmbeddings())
    return db
def get_results_with_scores(query,db):
    bar=0.5
    res=db.similarity_search_with_relevance_scores(query,k=3)
    if len(res)==0 or res[0][1]<bar:
        return 'No results found'
    r=[]
    for i in range(len(res)):
        if res[i][1]>bar:
            r.append(res[i])
    return r
def get_prompt_template(results,query):
    template="""
    answer the question base only on the following context:
    {context}
    answer the question base on the above context: {query}
    
    """
    context_texts = []
    for result in results:
        try:
            
            context_texts.append(result[0].page_content)
        except Exception as error:
            # Handle the error or skip the item
            print(f"Skipping item due to error: {datetime.now()}\n and the error is: {error}")
            
            break
    context_text = "\n\n---\n\n".join(context_texts)
    prompt_tamplate=ChatPromptTemplate.from_template(template)
    res=prompt_tamplate.format(context=context_text,query=query)
    return res
def get_response(query,db,model):
    results=get_results_with_scores(query,db)
    prompt_template=get_prompt_template(results,query)
    response=model.invoke(prompt_template)
    return response
def main_app():

    db=load_db()
    model=ChatOpenAI()
    return db,model


In [52]:
db=create_db(load_and_split_documents())

In [55]:
db.load()

AttributeError: 'Chroma' object has no attribute 'load'

In [43]:
query="what is qlora?"
response=get_response(query,db,model)
response.content

'QLORA is an efficient finetuning approach that reduces memory usage, allowing for the finetuning of large language models on a single GPU while maintaining high performance. It involves backpropagating gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters (LoRA). The best model family resulting from QLORA, named Guanaco, outperforms previously released models on the Vicuna benchmark. Additionally, QLORA introduces innovations such as a new data type called 4-bit NormalFloat (NF4) and a Double---processing technique to save memory without sacrificing performance.'

In [51]:
results=get_results_with_scores(query,db)
# print(results)
def get_prompt_template(results,query):
    template="""
    answer the question base only on the following context:
    {context}
    answer the question base on the above context: {query}
    
    """
    context_texts = []
    for result in results:
        try:
            
            context_texts.append(result[0].page_content)
        except Exception as error:
            # Handle the error or skip the item
            print(f"Skipping item due to error: {datetime.now()}\n and the error is: {error}")
            print(result[0].page_content)
            
            break
    context_text = "\n\n---\n\n".join(context_texts)
    prompt_tamplate=ChatPromptTemplate.from_template(template)
    res=prompt_tamplate.format(context=context_text,query=query)
    return res
p=get_prompt_template(results,query)
print(p)

Human: 
    answer the question base only on the following context:
    QL ORA: Efficient Finetuning of Quantized LLMs

Tim Dettmers∗Artidoro Pagnoni∗Ari Holtzman

Luke Zettlemoyer

University of Washington

{dettmers,artidoro,ahai,lsz}@cs.washington.edu

Abstract

We present QLORA, an efficient finetuning approach that reduces memory us-

age enough to finetune a 65B parameter model on a single 48GB GPU while

preserving full 16-bit finetuning task performance. QLORAbackpropagates gradi-

ents through a frozen, 4-bit quantized pretrained language model into Low Rank

Adapters (LoRA). Our best model family, which we name Guanaco , outperforms

all previous openly released models on the Vicuna benchmark, reaching 99.3%

of the performance level of ChatGPT while only requiring 24 hours of finetuning

on a single GPU. QLORAintroduces a number of innovations to save memory

without sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that

is information theoretically opti

In [38]:
res=results[0]
res[0].page_content

'QL ORA: Efficient Finetuning of Quantized LLMs\n\nTim Dettmers∗Artidoro Pagnoni∗Ari Holtzman\n\nLuke Zettlemoyer\n\nUniversity of Washington\n\n{dettmers,artidoro,ahai,lsz}@cs.washington.edu\n\nAbstract\n\nWe present QLORA, an efficient finetuning approach that reduces memory us-\n\nage enough to finetune a 65B parameter model on a single 48GB GPU while\n\npreserving full 16-bit finetuning task performance. QLORAbackpropagates gradi-\n\nents through a frozen, 4-bit quantized pretrained language model into Low Rank\n\nAdapters (LoRA). Our best model family, which we name Guanaco , outperforms\n\nall previous openly released models on the Vicuna benchmark, reaching 99.3%\n\nof the performance level of ChatGPT while only requiring 24 hours of finetuning\n\non a single GPU. QLORAintroduces a number of innovations to save memory\n\nwithout sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that\n\nis information theoretically optimal for normally distributed weights (b) 

In [27]:
template="""
    answer the question base only on the following context:
    {context}
    answer the question base on the above context: {query}
    
    """
context_texts = []
for result in results:
    try:
        res, _score = result
        context_texts.append(res.page_content)
    except ValueError:
        # Handle the error or skip the item
        print(f"Skipping item due to error: {result}")
context_text = "\n\n---\n\n".join(context_texts)
prompt_tamplate=ChatPromptTemplate.from_template(template)
res=prompt_tamplate.format(context=context_text,query=query)

In [29]:
print(res)

Human: 
    answer the question base only on the following context:
    QL ORA: Efficient Finetuning of Quantized LLMs

Tim Dettmers∗Artidoro Pagnoni∗Ari Holtzman

Luke Zettlemoyer

University of Washington

{dettmers,artidoro,ahai,lsz}@cs.washington.edu

Abstract

We present QLORA, an efficient finetuning approach that reduces memory us-

age enough to finetune a 65B parameter model on a single 48GB GPU while

preserving full 16-bit finetuning task performance. QLORAbackpropagates gradi-

ents through a frozen, 4-bit quantized pretrained language model into Low Rank

Adapters (LoRA). Our best model family, which we name Guanaco , outperforms

all previous openly released models on the Vicuna benchmark, reaching 99.3%

of the performance level of ChatGPT while only requiring 24 hours of finetuning

on a single GPU. QLORAintroduces a number of innovations to save memory

without sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that

is information theoretically opti

In [56]:
import streamlit as st
from helper_functions import *



def main():
    # st.header("Chat with me")
    chunks=load_and_split_documents()
    db=create_db(chunks)
    model=ChatOpenAI()
    # st.write("I am ready to chat")
    
    
    
    query="what is qlora?"
    results=get_results_with_scores(query,db)
    # st.write(results)
    # st.write(db.similarity_search(query))
    print(results)
    
    
    
if __name__ == "__main__":
    main()

[(Document(page_content='QL ORA: Efficient Finetuning of Quantized LLMs\n\nTim Dettmers∗Artidoro Pagnoni∗Ari Holtzman\n\nLuke Zettlemoyer\n\nUniversity of Washington\n\n{dettmers,artidoro,ahai,lsz}@cs.washington.edu\n\nAbstract\n\nWe present QLORA, an efficient finetuning approach that reduces memory us-\n\nage enough to finetune a 65B parameter model on a single 48GB GPU while\n\npreserving full 16-bit finetuning task performance. QLORAbackpropagates gradi-\n\nents through a frozen, 4-bit quantized pretrained language model into Low Rank\n\nAdapters (LoRA). Our best model family, which we name Guanaco , outperforms\n\nall previous openly released models on the Vicuna benchmark, reaching 99.3%\n\nof the performance level of ChatGPT while only requiring 24 hours of finetuning\n\non a single GPU. QLORAintroduces a number of innovations to save memory\n\nwithout sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that\n\nis information theoretically optimal for normally 