In [68]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain_ollama.llms import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import RetrievalQA
from langchain import hub

from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatMessagePromptTemplate, PromptTemplate
from langchain.schema.runnable import RunnablePassthrough

import pickle
import os

In [2]:

# Load the cleaned_documents list from the file
with open('final_pdf_pages.pkl', 'rb') as file:
    docs = pickle.load(file)

print("Documents loaded successfully!")


Documents loaded successfully!


In [3]:
len(docs)

220

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    separators=['\n'],
    chunk_size=2024,
    chunk_overlap=204,
    length_function=len,
)

In [7]:
data = text_splitter.split_documents(docs)

In [8]:
len(data)

615

In [10]:
#loading the embedding model from huggingface
embedding_model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
  model_name=embedding_model_name,
  model_kwargs=model_kwargs
)

  embeddings = HuggingFaceEmbeddings(
  from tqdm.autonotebook import tqdm, trange


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
#loading the data and correspond embedding into the FAISS
vectorstore = FAISS.from_documents(docs, embeddings)

In [16]:
vectorstore.save_local("faiss_index_")

In [17]:
# Load from local storage
persisted_vectorstore = FAISS.load_local("faiss_index_", embeddings,allow_dangerous_deserialization=True)

In [18]:
#creating a retriever on top of database
retriever = persisted_vectorstore.as_retriever()

In [48]:
template = """
        Task: You are an assistant that generates multiple variations of a given question. 
        For each variation, maintain the original intent of the question, but change the phrasing, structure, 
        or tone to create a diverse set of queries.

Generate 5-7 variations that cover:

Synonym replacements while keeping the question concise.
Alternative structures, such as rephrasing into "why," "how," or "what" forms if relevant.
Casual and formal tones.
Slightly more specific or broader wording.
Examples:

Original Question: "What is the impact of inflation on the stock market?"
Variations:
"How does inflation affect stock prices?"
"What are the effects of inflation on the stock market?"
"In what ways does inflation influence stock market trends?"
"Could inflation lead to changes in stock market values?"
"How does rising inflation impact the performance of stocks?"
"What influence does inflation have on market prices?"
"What happens to stock prices when inflation increases?"

the final answer should be a python list: [
    "What tests can be used to detect stomach ulcers?",
    "Why are endoscopies commonly used in gastric ulcer diagnosis?",
    "How do doctors figure out if someone has a stomach ulcer?",
    "Can an upper endoscopy be used to diagnose all types of gastric ulcers, or are there other methods more effective for certain cases?",
    "What tests and procedures are typically involved in diagnosing gastric ulcers, including acid reflux disease?",
    "How can a patient determine if they have been diagnosed with a stomach ulcer based on symptoms, test results, or imaging studies?",
    "When considering the diagnosis of a gastric ulcer, what role does symptom severity play in determining the need for further testing or treatment?"
]

Now generate the list of variations for the given question: {question}

        Answer:
        """

prompt = ChatPromptTemplate.from_template(template)

# model = OllamaLLM(model="llama3.2")

chain = prompt | model

print(chain.invoke({"question": "How is a gastric ulcer diagnosed?"}))

Here are 7 variations that cover different aspects of diagnosing gastric ulcers:

1. "What tests are used to confirm the presence of a stomach ulcer?"
2. "How does a doctor determine if a patient has a gastric ulcer?"
3. "Can an endoscopy be used to diagnose all types of gastric ulcers, or are there other methods more effective for certain cases?"
4. "In what ways can symptom severity influence the need for further testing or treatment when diagnosing a gastric ulcer?"
5. "What imaging studies and procedures are typically involved in diagnosing gastric ulcers?"
6. "How does a patient's medical history impact the diagnosis of a stomach ulcer?"
7. "Can a patient be diagnosed with a gastric ulcer based solely on symptoms, or do they need to undergo further testing?"

These variations maintain the original intent of the question while changing the phrasing, structure, and tone to create a diverse set of queries.


In [51]:
fusion_template = """
        Task: You are an assistant that generates multiple variations of a given question. 
        For each variation, maintain the original intent of the question, but change the phrasing, structure, 
        or tone to create a diverse set of queries.

Generate 5-7 variations that cover:

Synonym replacements while keeping the question concise.
Alternative structures, such as rephrasing into "why," "how," or "what" forms if relevant.
Casual and formal tones.
Slightly more specific or broader wording.
Examples:

Original Question: "What is the impact of inflation on the stock market?"
Variations:
"How does inflation affect stock prices?"
"What are the effects of inflation on the stock market?"
"In what ways does inflation influence stock market trends?"
"Could inflation lead to changes in stock market values?"
"How does rising inflation impact the performance of stocks?"
"What influence does inflation have on market prices?"
"What happens to stock prices when inflation increases?"

the final answer should be a python list: [
    "What tests can be used to detect stomach ulcers?",
    "Why are endoscopies commonly used in gastric ulcer diagnosis?",
    "How do doctors figure out if someone has a stomach ulcer?",
    "Can an upper endoscopy be used to diagnose all types of gastric ulcers, or are there other methods more effective for certain cases?",
    "What tests and procedures are typically involved in diagnosing gastric ulcers, including acid reflux disease?",
    "How can a patient determine if they have been diagnosed with a stomach ulcer based on symptoms, test results, or imaging studies?",
    "When considering the diagnosis of a gastric ulcer, what role does symptom severity play in determining the need for further testing or treatment?"
]

Now generate the list of variations for the given question
        """

In [52]:
prompt = ChatPromptTemplate(input_variables=['original_query'],
                            messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[],template=fusion_template)),
                            HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['original_query'], template='Generate multiple search queries related to: {question} \n OUTPUT (7 queries):'))])
     

In [53]:
original_query = "How is a gastric ulcer diagnosed?"

In [54]:
generate_queries = (
    prompt | model | StrOutputParser() | (lambda x: x.split("\n"))
)

In [61]:
print(generate_queries)

first=ChatPromptTemplate(input_variables=['question'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='\n        Task: You are an assistant that generates multiple variations of a given question. \n        For each variation, maintain the original intent of the question, but change the phrasing, structure, \n        or tone to create a diverse set of queries.\n\nGenerate 5-7 variations that cover:\n\nSynonym replacements while keeping the question concise.\nAlternative structures, such as rephrasing into "why," "how," or "what" forms if relevant.\nCasual and formal tones.\nSlightly more specific or broader wording.\nExamples:\n\nOriginal Question: "What is the impact of inflation on the stock market?"\nVariations:\n"How does inflation affect stock prices?"\n"What are the effects of inflation on the stock market?"\n"In what ways does inflation influence stock market tren

In [62]:
from langchain.load import dumps, loads


def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}
    for docs in results:
        # Assumes the docs are returned in sorted order of relevance
        for rank, doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1 / (rank + k)

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]
    return reranked_results

In [63]:
ragfusion_chain = generate_queries | retriever.map() | reciprocal_rank_fusion

In [65]:
import langchain
langchain.debug = True

In [66]:

ragfusion_chain.input_schema.schema()

{'properties': {'question': {'title': 'Question', 'type': 'string'}},
 'required': ['question'],
 'title': 'PromptInput',
 'type': 'object'}

In [None]:
ragfusion_chain.invoke({"question": original_query})

In [86]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

full_rag_fusion_chain = (
    {
        "context": ragfusion_chain,
        "question": RunnablePassthrough()
    }
    | prompt
    | model
    | StrOutputParser()
)

In [87]:
full_rag_fusion_chain.input_schema.schema()

{'properties': {'question': {'title': 'Question', 'type': 'string'},
  'root': {'title': 'Root'}},
 'required': ['question', 'root'],
 'title': 'RunnableParallel<context,question>Input',
 'type': 'object'}

In [None]:
result = full_rag_fusion_chain.invoke({"question": "How long does it typically take for a gastric ulcer to heal?"})

In [89]:
print(result)

The question is not directly answered in the provided text, but I can provide some general information on the healing time of gastric ulcers.

The healing time for a gastric ulcer can vary depending on several factors, such as the size and location of the ulcer, the effectiveness of treatment, and the individual's overall health.

Typically, with proper treatment, including medication to reduce acid production (e.g., proton pump inhibitors) and possibly surgery in some cases, it can take several weeks to several months for a gastric ulcer to heal.

Here are some general guidelines:

* Small ulcers: 2-6 weeks to heal
* Medium-sized ulcers: 4-12 weeks to heal
* Large ulcers: 8-20 weeks or more to heal

It's essential to note that everyone's healing process is different, and the time it takes for an ulcer to heal can vary significantly. Regular follow-up with a healthcare provider is crucial to monitor progress and adjust treatment as needed.

The provided text does not offer a specific t