In [4]:
# RAG System Tools and IBM Cloud Connectors
import os
from docling.document_converter import DocumentConverter
from langchain_docling import DoclingLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter # FIX 1 (Package install: langchain-text-splitters)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_classic.chains import RetrievalQA # FIX 2 (Package install: langchain-classic)
from langchain_ibm import WatsonxLLM

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# IBM watsonx.ai Credentials and Connection Setup
import os

os.environ["WATSONX_API_KEY"] = "heqOEuyYXy-ngbBIHva-GR8-0HNBFystyN9V6Vv2oMJB"
os.environ["IBM_PROJECT_ID"] = "0ed5949a-bd21-4f66-9733-4646506adc34"
os.environ["WATSONX_URL"] = "https://us-south.ml.cloud.ibm.com" 

print("✅ Credentials and URL loaded into session memory (Correct variable names used).")

✅ Credentials and URL loaded into session memory (Correct variable names used).


In [5]:
import os
print(f"Current Working Directory: {os.getcwd()}")

Current Working Directory: c:\FYP\FYP_RAG


In [6]:
# RAG Data Ingestion and Preparation Pipeline
import os
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# 1. Define the correct directory path
directory_path = os.path.join(os.getcwd(), "granite-snack-cookbook", "fyp_document")

print(f"Attempting to load all PDFs from directory: {directory_path}")

# 2. Load Documents using DirectoryLoader with PyPDFLoader
# PyPDFLoader is now correctly imported and used.
loader = DirectoryLoader(
    path=directory_path,
    glob="*.pdf",  # Only load files ending in .pdf
    loader_cls=PyPDFLoader  
)

# This should execute and load the files quickly.
all_documents = loader.load()

print(f"✅ Loaded {len(all_documents)} total pages/documents across all files.")

# 3. Split Text into Chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=100 
)
texts = text_splitter.split_documents(all_documents)
print(f"✅ Split documents into {len(texts)} chunks of size 1000.")

Attempting to load all PDFs from directory: c:\FYP\FYP_RAG\granite-snack-cookbook\fyp_document
✅ Loaded 130 total pages/documents across all files.
✅ Split documents into 551 chunks of size 1000.


In [7]:
# Vector Indexing and Storage
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS 

# 1. Create Embeddings
# This uses a pre-trained model to convert text chunks into numerical vectors.
embeddings = HuggingFaceEmbeddings()

# 2. Create Vector Store (FAISS Index)
# This stores the vectors for fast retrieval.
db = FAISS.from_documents(texts, embeddings)
print("✅ Vector store created successfully using FAISS.")

# OPTIONAL: Save the index for faster reloading later
db.save_local("faiss_index_who_report")

  embeddings = HuggingFaceEmbeddings()
  embeddings = HuggingFaceEmbeddings()
2025-11-25 21:14:15,146 - INFO - Use pytorch device_name: cpu
2025-11-25 21:14:15,147 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2025-11-25 21:17:37,996 - INFO - Loading faiss with AVX2 support.
2025-11-25 21:17:38,079 - INFO - Successfully loaded faiss with AVX2 support.


✅ Vector store created successfully using FAISS.


In [8]:
# The Highly Optimized RAG Application Setup
import os 
from langchain_ibm import WatsonxLLM 
from langchain_classic.chains import RetrievalQA

# Corrected Imports for RAG components
from langchain_community.document_loaders import DirectoryLoader, PDFPlumberLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter 
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# --- 1. LLM Initialization (Reliability & Output Length) ---
watsonx_llm = WatsonxLLM(                  
    model_id="ibm/granite-3-8b-instruct",                   
    project_id=os.getenv("IBM_PROJECT_ID"),
    # FIX: Use 'params' dictionary for generation controls (required by your LangChain-IBM version)
    params={'max_new_tokens': 1024, 'temperature': 0.1} # Lower temperature for factual accuracy
)

# --- 2. Document Processing and Vector Store Creation (Accuracy Tuning) ---
loader = DirectoryLoader(
    path='./granite-snack-cookbook/fyp_document', 
    loader_cls=PDFPlumberLoader, # ⬅️ USE PDFPlumberLoader HERE
    glob='*.pdf'             
)
documents = loader.load()

# OPTIMIZATION: Adjusted Chunking for better metadata retrieval (like ISBN/DOI)
text_splitter = RecursiveCharacterTextSplitter( 
    chunk_size=300,     # Very small chunk size
    chunk_overlap=150   # Greatly increased overlap to capture context across breaks
)
docs = text_splitter.split_documents(documents)

embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2" # Explicitly named embedding model
)

db = FAISS.from_documents(docs, embeddings) 

# --- 3. Create the RetrievalQA Chain (Citation & Speed) ---
qa_chain = RetrievalQA.from_chain_type(
    llm=watsonx_llm,           
    chain_type="stuff",      
    # OPTIMIZATION: Set k=3 for faster response time and less noise
    retriever=db.as_retriever(search_kwargs={'k': 10}), 
    # CRITICAL FOR CITATION: Returns the document chunks that were used by the LLM
    return_source_documents=True 
)     

print("✅ OPTIMIZED RAG Setup Complete.")

2025-11-25 21:20:10,218 - INFO - Client successfully initialized
2025-11-25 21:20:12,359 - INFO - HTTP Request: GET https://us-south.ml.cloud.ibm.com/ml/v1/foundation_model_specs?version=2025-10-29&project_id=0ed5949a-bd21-4f66-9733-4646506adc34&filters=%21lifecycle_withdrawn&limit=200 "HTTP/1.1 200 OK"
2025-11-25 21:20:12,638 - INFO - Successfully finished Get available foundation models for url: 'https://us-south.ml.cloud.ibm.com/ml/v1/foundation_model_specs?version=2025-10-29&project_id=0ed5949a-bd21-4f66-9733-4646506adc34&filters=%21lifecycle_withdrawn&limit=200'
2025-11-25 21:21:30,058 - INFO - Use pytorch device_name: cpu
2025-11-25 21:21:30,059 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2


✅ OPTIMIZED RAG Setup Complete.


In [9]:
# 1. Manually retrieve the top 5 most similar documents
retrieved_docs = db.similarity_search(question, k=5)

print(f"✅ Retrieved {len(retrieved_docs)} documents.")
print("\n--- CONTENT OF RETRIEVED CHUNKS ---")

# 2. Print the content and page number of each retrieved document
for i, doc in enumerate(retrieved_docs):
    # This prints the first 200 characters of the chunk
    content_snippet = doc.page_content[:200].replace('\n', ' ') 
    page_number = doc.metadata.get('page', 'N/A')
    
    # CRITICAL CHECK: Look for the ISBN, DOI, or 'PAPERBACK' in the output below.
    print(f"\n--- Document {i+1} (Page {page_number}) ---")
    print(f"Snippet: {content_snippet}...")

✅ Retrieved 5 documents.

--- CONTENT OF RETRIEVED CHUNKS ---

--- Document 1 (Page 8) ---
Snippet: Development Goal (SDG) targets by 2030. This is evidenced by the falling annual rate of reduction in indicators such as the maternal mortality ratio, under-five and neonatal mortality rates, premature...

--- Document 2 (Page 11) ---
Snippet: Reductions in both maternal and child mortality were among the targets of the Millennium Development Goals (MDGs), declared in 2000, that the world strived to achieve by 2015. They continue to be amon...

--- Document 3 (Page 5) ---
Snippet: risks to health, particularly for the most vulnerable and societies. populations. For the world to attain the targets of the Sustainable Development Goals (SDGs) by 2030, a substantial increase in foc...

--- Document 4 (Page 11) ---
Snippet: child mortality Improvement of maternal and child health has been high on the global development agenda since the turn of the millennium. Reductions in both maternal and ch

In [10]:
question = "What is the Sustainable Development Goal (SDG) target for the global maternal mortality ratio by 2030?"

# ⬇️ FIX: Use .invoke() instead of .run() ⬇️
result_dict = qa_chain.invoke({'query': question})

# --- Extract the Answer ---
answer = result_dict['result']

# --- Extract the Source Documents for Citation ---
sources = result_dict['source_documents']

print(f"Question: {question}")
print("---")
print(f"Answer: {answer}")

# --- CITATION SECTION (meets your criteria) ---
print("\n--- Sources Used (for Citation) ---")
for doc in sources:
    # This extracts the source file name and page number from the document metadata
    source_file = doc.metadata.get('source', 'N/A').split('/')[-1]
    page_number = doc.metadata.get('page', 'N/A')
    print(f"File: {source_file} (Page: {page_number})")

2025-11-25 21:25:25,185 - INFO - HTTP Request: POST https://us-south.ml.cloud.ibm.com/ml/v1/text/generation?version=2025-10-29 "HTTP/1.1 200 OK"
2025-11-25 21:25:25,190 - INFO - Successfully finished generate for url: 'https://us-south.ml.cloud.ibm.com/ml/v1/text/generation?version=2025-10-29'


Question: What is the Sustainable Development Goal (SDG) target for the global maternal mortality ratio by 2030?
---
Answer: 

The Sustainable Development Goal (SDG) target for the global maternal mortality ratio by 2030 is to reduce it to less than 70 maternal deaths per 100,000 live births. This represents a 2.7% (UI: 2.0--3.2%) average annual rate of reduction (ARR).

--- Sources Used (for Citation) ---
File: granite-snack-cookbook\fyp_document\world-health-statistics-2023_20230519_.pdf (Page: 8)
File: granite-snack-cookbook\fyp_document\world-health-statistics-2023_20230519_.pdf (Page: 11)
File: granite-snack-cookbook\fyp_document\world-health-statistics-2023_20230519_.pdf (Page: 5)
File: granite-snack-cookbook\fyp_document\world-health-statistics-2023_20230519_.pdf (Page: 11)
File: granite-snack-cookbook\fyp_document\world-health-statistics-2023_20230519_.pdf (Page: 12)
File: granite-snack-cookbook\fyp_document\world-health-statistics-2023_20230519_.pdf (Page: 8)
File: granite-sna