<a href="https://colab.research.google.com/github/mehrdad-bhm/Gen-AI/blob/main/RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain langchain_community pypdf chromadb langchain_huggingface openai tiktoken huggingface_hub accelerate

In [None]:
import os
import shutil
from getpass import getpass

from langchain.chat_models import ChatOpenAI
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_community.chat_models.huggingface import ChatHuggingFace
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

In [None]:
# Credentials are never written into the notebook source. A value already
# present in the environment is left untouched; otherwise the user is prompted
# without echo. Press Enter to leave a key empty if that provider is not used.
for _secret_name in ("HUGGINGFACEHUB_API_TOKEN", "OPENAI_API_KEY"):
    if not os.environ.get(_secret_name):
        os.environ[_secret_name] = getpass(f"{_secret_name}: ")

In [None]:
# Relative locations used by the cells below.
DATA_PATH: str = "data"      # directory handed to the PDF directory loader
CHROMA_PATH: str = "chroma"  # directory used as the Chroma persist directory

In [None]:
def load_documents():
    """Load the PDFs found under ``DATA_PATH``.

    Returns:
        The result of ``PyPDFDirectoryLoader(DATA_PATH).load()``.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

In [None]:
def split_text(
    documents: list[Document],
    chunk_size: int = 400,
    chunk_overlap: int = 100,
):
    """Split documents into overlapping chunks and report the counts.

    Args:
        documents: Documents to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
            Defaults to 400, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Ask the splitter to record where each chunk starts in its source text.
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    return chunks

In [None]:
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Any existing directory at ``CHROMA_PATH`` is removed first, so the new
    store is built only from the chunks passed in.

    Args:
        chunks: Document chunks to embed and store.

    Raises:
        ValueError: If ``chunks`` is empty. This is checked before anything is
            deleted, so a failed or empty load cannot wipe an existing store.
    """
    if not chunks:
        raise ValueError(
            f"No chunks to save; existing store at {CHROMA_PATH} left untouched."
        )

    # Start from a clean directory so stale entries from earlier runs are dropped.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    db = Chroma.from_documents(
        chunks, HuggingFaceEmbeddings(), persist_directory=CHROMA_PATH
    )
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [None]:
def generate_data_store():
    """Run the indexing pipeline: load the documents, split them, store the chunks."""
    save_to_chroma(split_text(load_documents()))


In [None]:
# Build the vector store. save_to_chroma removes any existing CHROMA_PATH
# directory first, so every run re-indexes from scratch.
generate_data_store()

In [None]:
# Question used both for the similarity search and for the final prompt.
query_text = "Explain how to discard structure results"

In [None]:
# Prompt skeleton with two placeholders: {context} receives the retrieved chunk
# texts and {question} receives the user query.
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {question}
"""


In [None]:
# Reopen the persisted store with the same embedding class used to build it.
db = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=HuggingFaceEmbeddings(),
)

# Retrieve up to three (document, relevance score) pairs for the query.
results = db.similarity_search_with_relevance_scores(query_text, k=3)

# Only the first result's score is checked against the 0.1 cut-off.
# NOTE(review): this only prints a message; later cells still run afterwards.
if not results or results[0][1] < 0.1:
    print("Unable to find matching results.")

In [None]:
from langchain.prompts import ChatPromptTemplate

In [None]:
# Concatenate the retrieved chunk texts, separated by a "---" divider.
chunk_texts = (doc.page_content for doc, _score in results)
context_text = "\n\n---\n\n".join(chunk_texts)

# Fill the template with the retrieved context and the user question.
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(question=query_text, context=context_text)
print(prompt)

Human: 
Answer the question based only on the following context:

simulation result fields (volume fields) from the memory. 
 
Selecting Discard Volume Files eliminates only the simulation result fields (volume 
fields) that are currently loaded  from the memory. The structure displayed in the 
visualization area is not deleted.

---

into GeoDict (green dot), or not (red dot). 
The Structure model is identified by File Name, Description, Voxel Count, Voxel 
Length, Domain Size and the internally hashed Structure ID number. A structure 
generated, e.g., with FiberGeo, GrainGeo, PaperGeo, WeaveGeo, GridGeo, or 
PleatGeo, with the same parameters and the same random seed, has always the same

---

GeoDict Graphical User Interface – Menu bar 
GeoDict 2024 User Guide  17 
DISCARD STRUCTURE OR SIMULATION RESULTS  
Selecting Discard Structure and Volume Files  makes the structure, and all 
simulation result fields available for this structure, disappear from memory and from 
the Visualizatio

In [None]:
# Log in to the Hugging Face Hub from the shell.
# NOTE(review): HUGGINGFACEHUB_API_TOKEN is already set in an earlier cell —
# confirm whether this extra login step is still required.
!huggingface-cli login

In [None]:
from langchain_huggingface import ChatHuggingFace


# Alternative LLM back ends are kept below for reference; only option 3 is active.

# Option 1: OpenAI
# from langchain.chat_models import ChatOpenAI
# llm = ChatOpenAI(
#     model_name="gpt-3.5-turbo",
#     openai_api_key=os.environ["OPENAI_API_KEY"],
#     max_tokens=512,
#     temperature=0.7
# )

# Option 2: Hugging Face pipeline
# from langchain_huggingface import HuggingFacePipeline
# llm = HuggingFacePipeline.from_model_id(
#     # model_id="gpt2",
#     model_id="HuggingFaceH4/zephyr-7b-beta",
#     task="text-generation",
#     pipeline_kwargs=dict(
#         max_new_tokens=512,
#         do_sample=False,
#         repetition_penalty=1.03,
#     ),
# )

# Option 3: Hugging Face endpoint
from langchain_huggingface import HuggingFaceEndpoint
llm = HuggingFaceEndpoint(repo_id="HuggingFaceH4/zephyr-7b-beta")


model = ChatHuggingFace(llm=llm)
# `invoke` is the supported replacement for the deprecated `predict`; the
# returned message's `.content` holds the generated text.
response_text = model.invoke(prompt).content
# Collect the source recorded in each retrieved chunk's metadata (None if absent).
sources = [doc.metadata.get("source") for doc, _score in results]
formatted_response = f"Response: {response_text}\n\nSources: {sources}"

In [None]:
# Show the model's answer followed by the sources of the retrieved chunks.
print(formatted_response)

Response: Based on the given context, to discard both structure and simulation result fields in GeoDict, you need to follow these steps:

1. Open the GeoDict software and load the structure you want to discard by selecting it in the Project tree or by browsing in the Open dialog.

2. In the menu bar, go to "File" and select "Discard Structure and Volume Files". This will delete both the structure and all simulation result fields associated with

Sources: ['data/Base Reference 2024.pdf', 'data/Base Reference 2024.pdf', 'data/Base Reference 2024.pdf']
