In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
import os
os.getcwd()
import warnings
warnings.filterwarnings("ignore")
from langchain_community.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from rank_bm25 import BM25Okapi
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

import textwrap
model_name = "Qwen/Qwen2.5-3B-Instruct"
#model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Load the causal language model and its tokenizer.
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run locally -- confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto", #device_map='cuda'
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
print(model.dtype)

# Count parameters and their storage size in a single pass. The byte count uses
# each tensor's real element size instead of assuming 2 bytes per parameter, so
# the estimate stays right whatever dtype "auto" resolves to.
total_params = 0
total_param_bytes = 0
for param in model.parameters():
    total_params += param.numel()
    total_param_bytes += param.numel() * param.element_size()
print(f"Total Parameters: {total_params / 1e6} million")
memory_footprint = total_param_bytes / (1024 ** 2)  # Convert to MB
print(f"Estimated Memory Footprint: {memory_footprint:.2f} MB")

# Create a pipeline: greedy decoding, returning only the newly generated text.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=False,
    max_new_tokens=5000,
    do_sample=False,
)
import requests
import re
from langchain.schema import Document

# Plain-text URL of the book used as the retrieval corpus.
url = "https://archive.org/stream/feeling-good-the-new-mood-therapy/David%20Burns%20-%20Feeling%20Good_djvu.txt"

# Download the book. Fail loudly on an HTTP error or a stalled connection
# instead of silently indexing an error page.
response = requests.get(url, timeout=60)
response.raise_for_status()
text = response.text

# Markers delimiting the portion of the book to keep.
start_marker = "I have been amazed by the interest in cognitive behavioral "
end_marker = "Chapter 15"

# Find start and end positions. The end marker is searched from the start
# marker onward so an earlier occurrence cannot yield an empty slice.
start = text.find(start_marker)
end = text.find(end_marker, start) if start != -1 else -1

if start != -1 and end != -1:
    text = text[start:end + len(end_marker)]
else:
    print("Could not find one or both markers. Proceeding with full text.")

# Clean up: drop carriage returns, replace non-breaking spaces, collapse runs
# of blank lines into a single blank line.
text = text.replace('\r', '').replace('\xa0', ' ')
text = re.sub(r'\n{2,}', '\n\n', text.strip())  # Normalize spacing

# Wrap into a LangChain Document
documents = [Document(page_content=text)]

# Chunking text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Prepare documents and their metadata (parallel lists, one entry per chunk)
texts = [chunk.page_content for chunk in chunks]
metadata = [chunk.metadata for chunk in chunks]
print(len(texts))
print(chunks[10])

# Initialize embedding model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
#model_name="BAAI/bge-small-en", sentence-transformers/all-MiniLM-L6-v2

# Create FAISS vector store and persist it to disk
vectordb = FAISS.from_texts(texts, embedding_model, metadatas=metadata)
vectordb.save_local("faiss_index_chunked")

# BM25 indexing over whitespace-tokenized chunks; retrieve() tokenizes the
# query the same way.
tokenized_texts = [chunk_text.split() for chunk_text in texts]
bm25 = BM25Okapi(tokenized_texts)

def reciprocal_rank_fusion(results_bm25, results_embedding, k=2):
    """Fuse two ranked result lists by summing reciprocal ranks.

    Args:
        results_bm25: ranked sequence of ``(document, score)`` pairs from BM25.
        results_embedding: ranked sequence of ``(document, score)`` pairs from
            the dense retriever.
        k: accepted for interface compatibility; the current formula does not
            use it.

    Returns:
        A list of ``(page_content, fused_score)`` tuples, highest score first.

    Only the position in each list matters: a document at 0-based position
    ``r`` contributes ``1 / (r + 1)``. Documents are identified by their
    ``page_content``. Each running total is printed, prefixed with the name of
    the list being processed.
    """
    fused = {}

    # Walk BM25 first, then the dense list, so the print order is stable.
    for label, ranked in (("BM25", results_bm25), ("Dense", results_embedding)):
        for position, (doc, _score) in enumerate(ranked, start=1):
            key = doc.page_content
            fused[key] = fused.get(key, 0) + 1 / position
            print(label, fused[key])

    return sorted(fused.items(), key=lambda item: item[1], reverse=True)


# Extract page content and metadata properly
def format_response(doc):
    """Return ``"Page <page>: <text>"`` for a retrieved document.

    The page comes from ``doc.metadata['page']`` and falls back to
    ``'Unknown'`` when that key is absent; the text is ``doc.page_content``
    with surrounding whitespace stripped.
    """
    page_label = doc.metadata.get('page', 'Unknown')
    body = doc.page_content.strip()
    return f"Page {page_label}: {body}"
# Retrieve function
def retrieve(query, k=3):
    """Hybrid retrieval: fuse dense (FAISS) and sparse (BM25) results.

    Args:
        query: the user question as a string.
        k: number of hits to take from each retriever before fusion.

    Returns:
        A list of strings produced by ``format_response``, ordered by fused
        score (best first). It holds one entry per distinct chunk found by
        either retriever, so its length is between ``k`` and ``2 * k`` when
        the corpus has at least ``k`` chunks.

    The top hits of both retrievers are printed as a side effect.
    """
    # --- Dense retrieval -------------------------------------------------
    query_embedding = embedding_model.embed_query(query)
    # The vector store returns its nearest neighbours best-first, and the score
    # is a distance (lower means more similar). Keep that order: re-sorting by
    # descending score would rank the weakest of the top-k first in the fusion.
    results_embedding = list(
        vectordb.similarity_search_with_score_by_vector(query_embedding, k=k)
    )

    print("============Dense Embeddings=============")
    for doc, score in results_embedding:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # --- Sparse retrieval ------------------------------------------------
    # Score the whole corpus once, then keep the k highest-scoring chunks.
    bm25_scores = bm25.get_scores(query.split())
    top_bm25 = sorted(enumerate(bm25_scores), key=lambda pair: pair[1], reverse=True)[:k]
    # Convert BM25 results to (Document, score) format
    results_bm25_docs = [
        (Document(page_content=texts[idx], metadata=metadata[idx]), score)
        for idx, score in top_bm25
    ]

    print("************BM25 Results*************")
    for doc, score in results_bm25_docs:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # --- Fusion ----------------------------------------------------------
    # The fusion step identifies documents by their text, so build a lookup
    # {document content -> Document object}. Dense hits override BM25 hits
    # that share the same text.
    doc_lookup = {doc.page_content: doc for doc, _ in results_bm25_docs}
    doc_lookup.update({doc.page_content: doc for doc, _ in results_embedding})

    fused_results = reciprocal_rank_fusion(results_bm25_docs, results_embedding)

    # Every fused id originates from one of the two lists above, so the lookup
    # always succeeds.
    return [format_response(doc_lookup[doc_id]) for doc_id, _ in fused_results]


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


torch.bfloat16
Total Parameters: 3085.938688 million
Estimated Memory Footprint: 5885.96 MB
881
page_content='The effectiveness of cognitive therapy has been con¬ 
firmed by many outcome studies by researchers throughout 
the world during the past two decades. In a recent landmark 
article entitled “Psychotherapy vs. Medication for Depres¬ 
sion; Challenging the Conventional Wisdom with Data,” 
Drs. David O. Antonuccio and William G. Danton from the 
University of Nevada and Dr. Gurland Y. DeNelsky from 
the Cleveland CUnic reviewed many of the most carefully 
conducted studies on depression that have been published 
in scientific journals throughout the world.' The studies re¬ 
viewed compared the antidepressant medications with psy¬ 
chotherapy in the treatment of depression and anxiety. 
Short-term studies as well as long-term follow-up studies 
were included in this review. The authors came to a number 
of startling conclusions that are at odds with the conven¬ 
tional wisdom:'


In [None]:

# Query example
question = "What is depression?"
retrieved_responses = retrieve(question, k=3)
for response in retrieved_responses:
    print(response)
    print("-------")

# Only the three best fused passages go into the prompt. Building the numbered
# list from a slice keeps the cell working when fewer than three come back.
top_responses = retrieved_responses[:3]
retrieved_block = "\n\n".join(
    f"{rank}. {response}" for rank, response in enumerate(top_responses, start=1)
)

# Construct the RAG prompt
prompt = f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge.

### **Retrieved Information**:
{retrieved_block}

### **Question**:
{question}

### **Instructions**:
- Integrate the key points from all retrieved responses into a **cohesive, well-structured answer**.
- If the responses are **contradictory**, mention the different perspectives.
- If none of the retrieved responses contain relevant information, reply:
  **"I couldn't find a good response to your query in the database."**
"""



page Unknown - Score: 0.7980 - Although the idea has been around for ages, most de¬ 
pressed people do not leally comprehend it. If...
page Unknown - Score: 0.7623 - When a depression clearly appears after an obvious stress, 
such as ill health, the death of a loved...
page Unknown - Score: 0.7509 - Either depression or sadness can develop after a loss or 
a failure in your efforts to reach a goal ...
************BM25 Results*************
page Unknown - Score: 9.0592 - Chapter 9 

“Dr. Bums, you seem to be claiming that distorted thinking 
is the only cause of depress...
page Unknown - Score: 8.8582 - then we could put the finger someplace. Since 
we don’t know, isn’t it silly to blame Hal for his 
o...
page Unknown - Score: 7.3204 - You will notice that the vertical-arrow technique is the 
opposite of the usual strategy you use whe...
BM25 1.0
BM25 0.5
BM25 0.3333333333333333
Dense 1.0
Dense 0.5
Dense 0.3333333333333333
Page Unknown: Chapter 9 

“Dr. Bums, you seem to be claiming that

In [None]:
# Send the RAG prompt to the text-generation pipeline as a single user turn
# and print the reply wrapped to 80 columns.
messages = [dict(role="user", content=prompt)]
output = generator(messages)
generated_text = output[0]["generated_text"]
print(textwrap.fill(generated_text, width=80))

Based on the information provided, depression is a complex mental health
condition characterized by persistent feelings of sadness, hopelessness, and a
lack of interest or pleasure in activities. It's important to note that while
many people believe depression is caused by external factors like financial
troubles, old age, permanent disabilities, terminal illnesses, or the loss of
loved ones, these issues alone do not necessarily lead to depression.  The core
issue lies in how individuals perceive and react to their circumstances.
Depression involves a distorted way of thinking that leads to negative self-
perceptions and behaviors. People who experience depression often have a
pessimistic outlook on life, viewing themselves and their situations negatively.
This negative perspective can manifest as feelings of worthlessness,
helplessness, and hopelessness, which can significantly impact their daily
functioning and overall quality of life.  It's crucial to distinguish between
healthy sa