# this is Pixegami tutorial
### from this video :https://www.youtube.com/watch?v=tcqEUSNCn8I

In [1]:
from dotenv import load_dotenv
import os
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import TextSplitter, RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from datetime import datetime
import json
import PyPDF2
import chromadb
chroma_client = chromadb.Client()

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
DIRECTORY_PATH='/Users/matansharon/python/chat_with_docs/AI_Apps/chat_with_txt/data'
DOCS_PATH='/Users/matansharon/python/chat_with_docs/AI_Apps/chat_with_txt/docs.json'

def load_and_read_pdf(file_path):
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

def get_documents_list():
    
    with open(DOCS_PATH,'r') as f:
        data=json.load(f)
        docs=data['documents']
    return docs

def load_all_docs_in_data_folder():
    loader = DirectoryLoader(DIRECTORY_PATH)
    documents = loader.load()
    return documents

def split_text(text:str):
    
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True
    )
    chunks=text_splitter.split_text(text)
    return chunks

def create_new_db(chunks):
    chroma_client = chromadb.Client()
    return chroma_client
    # path='chroma_db'
    # if not os.path.exists(path):
        
    #     db=Chroma.from_texts(texts=[''],embedding=OpenAIEmbeddings(),persist_directory=path)
    #     return db
    # return load_db()

def load_db():
    db = Chroma(persist_directory="chroma_db",embedding_function=OpenAIEmbeddings())
    return db

def get_results_with_scores(query,db):
    bar=0.5
    res=db.similarity_search_with_relevance_scores(query,k=3)
    
    return res
def get_prompt_template(results,query):
    template="""
    answer the question base only on the following context:
    {context}
    answer the question base on the above context: {query}
    
    """
    context_texts = []
    for i in range(len(results)):
        context_texts.append(results[i][0].page_content)
    temp = "\n\n---\n\n".join(context_texts)
    prompt_tamplate=ChatPromptTemplate.from_template(template)
    res=prompt_tamplate.format(context=temp,query=query)
    return res

def get_response(query,db,model):
    results=get_results_with_scores(query,db)
    prompt_template=get_prompt_template(results,query)
    response=model.invoke(prompt_template)
    return response.content

def main_app():
    db=create_new_db('')
    print(db)
main_app()


<chromadb.api.client.Client object at 0x105ba7a00>


In [None]:
# len(chunks)
# db=Chroma.from_texts(texts=chunks,embedding=OpenAIEmbeddings())
# model=ChatOpenAI()
# query='what is qlora?'

# response=get_response(query,db,model)
# print(response)
# db=create_new_db('')

In [2]:
import chromadb.utils.embedding_functions as embedding_functions
import chromadb
from PyPDF2 import PdfReader
path='/Users/matansharon/python/chat_with_docs/data/pdf/qlora.pdf'
doc=PdfReader(path)
#get the name of the document
doc_name=path.split('/')[-1]

text=''
for page in doc.pages:
    text+=page.extract_text()



chunks=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100,length_function=len,add_start_index=True).split_text(text)

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
    api_key=OPENAI_API_KEY,
                model_name="text-embedding-3-small"
            )

chroma_client = chromadb.Client()
collection=chroma_client.create_collection(name="My_pdf_collection",embedding_function= openai_ef)


In [3]:
emb_list=openai_ef(chunks)
emb_list

[[0.021688230335712433,
  0.046878207474946976,
  0.03174163028597832,
  0.006223364267498255,
  -0.007525929249823093,
  -0.006703442428261042,
  -0.02835284359753132,
  0.03275826573371887,
  -0.06602484732866287,
  0.018920721486210823,
  0.007991887629032135,
  -0.004306582268327475,
  -0.05057763308286667,
  0.05342986062169075,
  0.02223890833556652,
  -0.0018426524475216866,
  0.032334666699171066,
  -0.04029831290245056,
  -0.02147643268108368,
  0.04716060683131218,
  0.00295989285223186,
  -0.0388580821454525,
  0.009538020938634872,
  -0.01602613367140293,
  0.003907694015651941,
  -0.04111726954579353,
  0.014741219580173492,
  0.0006534885615110397,
  0.0263478122651577,
  -0.03239114582538605,
  0.050972990691661835,
  -0.026799650862812996,
  -0.023382624611258507,
  -0.013286865316331387,
  -0.027364447712898254,
  0.0014852414606139064,
  0.044788457453250885,
  -0.008493144996464252,
  -0.011818391270935535,
  0.004055953584611416,
  -0.046031009405851364,
  -0.085792

In [4]:
metadatas=[{"page_number":i,"page_content":chunks[i]} for i in range(len(chunks))]
metadatas

[{'page_number': 0,
  'page_content': 'QL ORA: Efficient Finetuning of Quantized LLMs\nTim Dettmers∗Artidoro Pagnoni∗Ari Holtzman\nLuke Zettlemoyer\nUniversity of Washington\n{dettmers,artidoro,ahai,lsz}@cs.washington.edu\nAbstract\nWe present QLORA, an efficient finetuning approach that reduces memory us-\nage enough to finetune a 65B parameter model on a single 48GB GPU while\npreserving full 16-bit finetuning task performance. QLORAbackpropagates gradi-\nents through a frozen, 4-bit quantized pretrained language model into Low Rank\nAdapters (LoRA). Our best model family, which we name Guanaco , outperforms\nall previous openly released models on the Vicuna benchmark, reaching 99.3%\nof the performance level of ChatGPT while only requiring 24 hours of finetuning\non a single GPU. QLORAintroduces a number of innovations to save memory\nwithout sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that\nis information theoretically optimal for normally distributed weig

In [14]:
ids=["id"+str(i) for i in range(len(chunks))]

In [15]:
collection.add(embeddings=emb_list,documents=chunks,metadatas=metadatas,ids=ids)

In [23]:
res=collection.query(query_texts=["what is qlora?"],n_results=3)
content=res['documents'][0]


In [None]:
print(get_response('who is the authors of the paper "Attention Is All You Need"',db,model))

In [24]:
chat=ChatOpenAI(model='gpt-4-turbo-preview')


content='QLORA is an efficient fine-tuning approach designed for quantized large language models (LLMs), which significantly reduces memory usage without sacrificing performance. The key innovations introduced by QLORA include:\n\n1. **4-bit NormalFloat (NF4) Quantization:** This is a new data type optimized for normally distributed weights, which is information-theoretically optimal. It helps in reducing the memory footprint of the model while maintaining the integrity of the data.\n\n2. **Double Quantization:** This technique further aids in memory optimization during the fine-tuning process.\n\n3. **Paged Optimizers:** To address the issue of memory spikes during gradient checkpointing, which can lead to out-of-memory errors on single machines, QLORA introduces Paged Optimizers. This innovation allows for efficient memory management, preventing such errors and making it feasible to fine-tune large models on a single machine.\n\n4. **Low-Precision Storage and High-Precision Computati

In [25]:
print(chat.invoke(f'base on this: {content} what is qlora?'))

content="QLORA is an efficient finetuning approach designed to significantly reduce memory usage during the finetuning process of large language models (LLMs), enabling the finetuning of models with up to 65 billion parameters on a single 48GB GPU while preserving the full 16-bit finetuning task performance. It achieves this by backpropagating gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters (LoRA). The key innovations introduced by QLORA include:\n\n1. **4-bit NormalFloat (NF4) Quantization**: This is a new data type optimized for normally distributed weights. It builds upon Quantile Quantization, aiming to be information-theoretically optimal by ensuring each quantization bin has an equal number of data points, thus minimizing the loss of information due to quantization.\n\n2. **Double Quantization**: This technique is part of QLORA's approach to maintain high fidelity in the finetuning process, though the text does not provide an extensive

In [None]:
db2=Chroma.from_documents(documents=chunks,embedding=OpenAIEmbeddings(),persist_directory='chorma_test')

In [None]:
print(get_response('what is qlora?',db2,chat))

In [None]:
#read the json file
import json
with open('docs.json') as f:
    data = json.load(f)
    print(data['documents'])
