In [None]:
!pip install rank-bm25
!pip install -U langchain-community


In [None]:
import requests
from bs4 import BeautifulSoup
import os
from concurrent.futures import ThreadPoolExecutor


# Configuration
SAVE_FOLDER = r"C:\Users\Imtiaz.arif\Downloads\HadithTexts"
os.makedirs(SAVE_FOLDER, exist_ok=True)

# List of URLs to scrape with descriptive names
urls = [
    ("bukhari_5", "https://sunnah.com/bukhari/5"),
    ("bukhari_1", "https://sunnah.com/bukhari/1"),
    ("bukhari_2", "https://sunnah.com/bukhari/2"),
    ("bukhari_3", "https://sunnah.com/bukhari/3"),
    ("bukhari_4", "https://sunnah.com/bukhari/4"),
    ("bukhari_6", "https://sunnah.com/bukhari/6"),
    ("bukhari_7", "https://sunnah.com/bukhari/7"),
    ("bukhari_8", "https://sunnah.com/bukhari/8"),
    ("bukhari_9", "https://sunnah.com/bukhari/9"),
    ("bukhari_10", "https://sunnah.com/bukhari/10"),
    ("bukhari_11", "https://sunnah.com/bukhari/11"),
    ("bukhari_12", "https://sunnah.com/bukhari/12"),
    ("bukhari_13", "https://sunnah.com/bukhari/13"),
    ("bukhari_14", "https://sunnah.com/bukhari/14"),
    ("bukhari_15", "https://sunnah.com/bukhari/15"),
    ("bukhari_16", "https://sunnah.com/bukhari/16"),
    ("bukhari_18", "https://sunnah.com/bukhari/18"),
]

def scrape_single_url(url_data):
    name, url = url_data
    output_file = os.path.join(SAVE_FOLDER, f"{name}_english.txt")

    print(f"Scraping: {url}")
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')
        english_text = ""

        # Find all main hadith containers
        main_containers = soup.find_all('div', class_='actualHadithContainer')

        if not main_containers:
            print(f"No hadith containers found at {url}")
            return

        for container in main_containers:
            # Find the hadith text container
            text_container = container.find('div', class_='hadithTextContainers')
            if not text_container:
                continue

            # Find the english container
            english_container = text_container.find('div', class_='englishcontainer')
            if not english_container:
                continue

            # Extract narrator and hadith text
            narrator = english_container.find('div', class_='hadith_narrated')
            text_details = english_container.find('div', class_='text_details')

            # Extract reference
            reference = container.find('div', class_='hadith_reference')
            ref_text = reference.get_text(strip=True) if reference else "Reference not available"

            # Extract hadith number
            number = container.find('div', class_='hadith_number')
            num_text = number.get_text(strip=True) if number else ""

            # Format the output
            english_text += "="*50 + "\n"
            if narrator:
                english_text += "Narrator: " + narrator.get_text(strip=True) + "\n\n"
            if text_details:
                english_text += "Hadith: " + text_details.get_text(strip=True) + "\n\n"
            english_text += f"Reference: {ref_text} {num_text}\n\n"

        # Save to file
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(english_text)
        print(f"Successfully saved hadiths to {output_file}")

    except Exception as e:
        print(f"Error processing {url}: {str(e)}")

def scrape_all_urls():
    print(f"Starting scraping. Saving files to: {SAVE_FOLDER}")

    # Use ThreadPoolExecutor for parallel scraping
    with ThreadPoolExecutor(max_workers=5) as executor:
        executor.map(scrape_single_url, urls)

    print("All scraping completed!")

# Run the script
if __name__ == "__main__":
    scrape_all_urls()

In [4]:
import os
os.getcwd()
import warnings
warnings.filterwarnings("ignore")

#from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
#from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.document_loaders import TextLoader
from rank_bm25 import BM25Okapi
import numpy as np

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline

import textwrap
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from tqdm import tqdm

In [5]:
# Configuration
SAVE_FOLDER = r"C:\Users\Imtiaz.arif\Downloads\HadithTexts"
EMBEDDINGS_INDEX = os.path.join(SAVE_FOLDER, "faiss_index_hadith")
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"

In [None]:
#model_name = "meta-llama/Llama-3.2-3B-Instruct"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,
    device_map="cuda", #device_map='cuda'
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
print(model.dtype)
total_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {total_params / 1e6} million")
memory_footprint = total_params * 2 / (1024 ** 2)  # Convert to MB
print(f"Estimated Memory Footprint: {memory_footprint:.2f} MB")

# Create a pipeline
generator = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
return_full_text=False,
max_new_tokens=5000,
do_sample=False
)

In [None]:
!pip install chromadb

In [None]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from tqdm import tqdm

# Configuration
SAVE_FOLDER = r"C:\Users\Imtiaz.arif\Downloads\HadithTexts"
EMBEDDINGS_INDEX = os.path.join(SAVE_FOLDER, "faiss_index_hadith")
MODEL_NAME = "BAAI/bge-small-en"

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    add_start_index=True
)

# Initialize variables
documents = []
failed_files = []
hadith_files = [f for f in os.listdir(SAVE_FOLDER) if f.endswith("_english.txt")]

# Process each file
for file_name in tqdm(hadith_files, desc="Processing files"):
    try:
        file_path = os.path.join(SAVE_FOLDER, file_name)

        # Read file content
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            content = f.read().strip()

        # Split into individual Hadiths
        hadiths = [h.strip() for h in content.split('='*50) if h.strip()]

        for hadith_num, hadith_text in enumerate(hadiths, 1):
            # Create PROPER LangChain Document object
            doc = Document(
                page_content=hadith_text,
                metadata={
                    "source": file_name,
                    "hadith_num": hadith_num,
                    "is_chunked": False
                }
            )

            # Split the Hadith into chunks
            hadith_chunks = text_splitter.split_documents([doc])

            # Update metadata for chunks

            for chunk_num, chunk in enumerate(hadith_chunks, 1):
                chunk.metadata.update({
                    "is_chunked": len(hadith_chunks) > 1,
                    "chunk_num": chunk_num if len(hadith_chunks) > 1 else 0,  # Use 0 instead of None
                    "total_chunks": len(hadith_chunks) if len(hadith_chunks) > 1 else 1  # Use 1 if not chunked
    })

            documents.extend(hadith_chunks)

    except Exception as e:
        failed_files.append((file_name, str(e)))
        continue

# Print processing summary
print(f"\nProcessed {len(documents)} chunks from {len(set(d.metadata['source'] for d in documents))} files")
if failed_files:
    print(f"\nFailed to process {len(failed_files)} files:")
    for f, err in failed_files[:3]:
        print(f"- {f}: {err}")

# Only proceed if we have documents
if documents:
    print("\nCreating vector store...")
    texts = [doc.page_content for doc in documents]
    metadata = [doc.metadata for doc in documents]

    from langchain_community.vectorstores import Chroma
    from langchain.embeddings import HuggingFaceEmbeddings

    # Generate embeddings using HuggingFaceEmbeddings
    embedding_model = HuggingFaceEmbeddings(model_name=MODEL_NAME)
    #embeddings = embedding_model.embed_documents(texts)

    # Create a Chroma vector store
    vector_store = Chroma.from_texts(texts, embedding_model, metadatas=metadata)

    # Optionally save the vector store
    vector_store.persist()
    print(f"Vector store saved to {EMBEDDINGS_INDEX}")
else:
    print("\nNo documents processed - cannot create vector store")

In [10]:
!# BM25 Indexing
tokenized_texts = [text.split() for text in texts]
bm25 = BM25Okapi(tokenized_texts)

def reciprocal_rank_fusion(results_bm25, results_embedding, k=2):
    scores = {}

    # Use document content or metadata as the key
    for rank, (doc, score) in enumerate(results_bm25):
        doc_id = doc.page_content  # Or use doc.metadata.get("source", "unknown") if available
        scores[doc_id] = scores.get(doc_id, 0) + 1 / (rank+1) # (k + rank + 1)
        print("BM25", scores[doc_id])

    for rank, (doc, score) in enumerate(results_embedding):
        doc_id = doc.page_content  # Use the same identifier
        scores[doc_id] = scores.get(doc_id, 0) + 1 / (rank+1) # (k + rank + 1)
        print("Dense", scores[doc_id])

    return sorted(scores.items(), key=lambda x: x[1], reverse=True)


# Extract page content and metadata properly
def format_response(doc):
    return f"Page {doc.metadata.get('page', 'Unknown')}: {doc.page_content.strip()}"

In [11]:
# Retrieve function
def retrieve(query, k=3):
    query_embedding = embedding_model.embed_query(query)
    results_embedding = vector_store.similarity_search_with_score(query, k=k)
    results_embedding = sorted(results_embedding, key=lambda x: x[1], reverse=True)

    print("============Dense Embeddings=============")
    for doc, score in results_embedding:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # Get BM25 scores for all documents and sort to get top-k results
    results_bm25 = [(idx, bm25.get_scores(query.split())[idx]) for idx in range(len(texts))]
    results_bm25 = sorted(results_bm25, key=lambda x: x[1], reverse=True)[:k]  # Keep only top-k results
    # Convert BM25 results to (Document, score) format
    results_bm25_docs = [(Document(page_content=texts[idx], metadata=metadata[idx]), score) for idx, score in results_bm25]

    print("************BM25 Results*************")
    for doc, score in results_bm25_docs:
        print(f"page {doc.metadata.get('page','Unknown')} - Score: {score:.4f} - {doc.page_content[:100]}...")

    # Create a lookup dictionary {document content -> Document object}
    doc_lookup = {doc.page_content: doc for doc, _ in results_bm25_docs}
    doc_lookup.update({doc.page_content: doc for doc, _ in results_embedding})

    # Fuse results
    fused_results = reciprocal_rank_fusion(results_bm25_docs, results_embedding)

    # Format results, ensuring document IDs are mapped back to actual Documents
    return [format_response(doc_lookup[doc_id]) for doc_id, _ in fused_results if doc_id in doc_lookup]

    #fused_results = reciprocal_rank_fusion(results_bm25, results_embedding)
    #return [(texts[idx], metadata[idx]["page"] if "page" in metadata[idx] else "Unknown") for idx, _ in fused_results]


In [12]:
# Example usage:
question = "What are the teachings regarding wudu?"
results = retrieve(question, k=5)
for i, result in enumerate(results, 1):
    print(f"\nRESULT #{i}:")
    print(result)

page Unknown - Score: 0.3149 - Narrator: Narrated `Abdullah:

Hadith: The Prophet (ﷺ) used to get a Harba planted in front of him (...
page Unknown - Score: 0.3123 - Narrator: Narrated Ibn `Abbas:

Hadith: The Prophet (ﷺ) and Maimuna used to take a bath from a singl...
page Unknown - Score: 0.3098 - Narrator: Narrated Abu Huraira:

Hadith: Allah's Messenger (ﷺ) said, "You see me facing the Qibla; b...
page Unknown - Score: 0.3059 - Narrator: Narrated Ibn `Abbas:

Hadith: When the Prophet (ﷺ) entered the Ka`ba, he invoked Allah in ...
page Unknown - Score: 0.3014 - Narrator: Narrated `Urwa on the authority of `Aisha:...
************BM25 Results*************
page Unknown - Score: 8.6036 - Hadith: that he differed with Hur bin Qais bin Hisn Al-Fazari regarding the companion of the Prophet...
page Unknown - Score: 8.3675 - Hadith: That he differed with Hur bin Qais bin Hisn Al-Fazari regarding the companion of (the Prophe...
page Unknown - Score: 8.0758 - Narrator: Narrated Maimuna:

Hadit

In [13]:
for i in range(0,len(results)):
    print(results[i])
    print("-------")

Page Unknown: Hadith: that he differed with Hur bin Qais bin Hisn Al-Fazari regarding the companion of the Prophet (ﷺ) Moses. 
Meanwhile, Ubai bin Ka`b passed by them and Ibn `Abbas called him saying, "My friend (Hur) and I 
have differed regarding Moses' companion whom Moses asked the way to meet. Have you heard 
Allah's Messenger (ﷺ) mentioning something about him? Ubai bin Ka`b said: "Yes, I heard the Prophet (ﷺ)
-------
Page Unknown: Narrator: Narrated `Abdullah:

Hadith: The Prophet (ﷺ) used to get a Harba planted in front of him (as a Sutra) and pray behind it.

Reference: Reference not available
-------
Page Unknown: Hadith: That he differed with Hur bin Qais bin Hisn Al-Fazari regarding the companion of (the Prophet) 
Moses. Ibn `Abbas said that he was Al Khadir. Meanwhile, Ubai bin Ka`b passed by them and Ibn 
`Abbas called him, saying "My friend (Hur) and I have differed regarding Moses' companion, whom 
Moses asked the way to meet. Have you heard the Prophet (ﷺ) mentioning s

In [14]:
# Construct the RAG prompt
prompt = f"""
You are an AI assistant tasked with answering questions based on retrieved knowledge.

### **Retrieved Information**:
1. {results[0]}

2. {results[1]}

3. {results[2]}



### **Question**:
{question}

### **Instructions**:
- Integrate the key points from all retrieved responses into a **cohesive, well-structured answer**.
- If the responses are **contradictory**, mention the different perspectives.
- If none of the retrieved responses contain relevant information, reply:
  **"I couldn't find a good response to your query in the database."**
"""

# Generate response using Qwen2.5
messages = [{"role": "user", "content": prompt}]
output = generator(messages)
print(textwrap.fill(output[0]["generated_text"], width=80))

I couldn't find a good response to your query in the database. The provided
hadiths do not discuss teachings regarding wudu (ablution). They focus on
different aspects of Islamic tradition, such as the companions of the Prophet
Muhammad (ﷺ) and his interactions with others. For information on wudu, you may
want to refer to authoritative sources on Islamic jurisprudence or religious
texts.


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def evaluate_rag_output(query, generated_answer, retrieved_docs, embedding_model):
    """
    Evaluate the RAG output for faithfulness and relevance

    Args:
        query: User's original question
        generated_answer: The answer produced by the LLM
        retrieved_docs: List of retrieved documents (with scores)
        embedding_model: The embedding model used for similarity calculations

    Returns:
        dict: Contains faithfulness and relevance scores
    """
    # Calculate relevance scores
    relevance_scores = calculate_relevance(query, retrieved_docs, embedding_model)

    # Calculate faithfulness score
    faithfulness_score = calculate_faithfulness(generated_answer, retrieved_docs, embedding_model)

    return {
        "faithfulness": faithfulness_score,
        "relevance": relevance_scores
    }

def calculate_relevance(query, retrieved_docs, embedding_model):
    """
    Calculate relevance of retrieved documents to the query

    Args:
        query: Original user question
        retrieved_docs: List of (doc, score) tuples
        embedding_model: Embedding model for similarity calculation

    Returns:
        dict: Contains average and individual relevance scores
    """
    # Get query embedding
    query_embedding = embedding_model.embed_query(query)
    query_embedding = np.array(query_embedding).reshape(1, -1)

    relevance_scores = []

    for doc, _ in retrieved_docs:
        # Get document embedding
        doc_embedding = embedding_model.embed_query(doc.page_content)
        doc_embedding = np.array(doc_embedding).reshape(1, -1)

        # Calculate cosine similarity
        similarity = cosine_similarity(query_embedding, doc_embedding)[0][0]
        relevance_scores.append(similarity)

    return {
        "average_relevance": np.mean(relevance_scores),
        "individual_relevance": relevance_scores
    }

def calculate_faithfulness(generated_answer, retrieved_docs, embedding_model):
    """
    Calculate how faithful the generated answer is to the retrieved documents

    Args:
        generated_answer: LLM's generated answer
        retrieved_docs: List of retrieved documents
        embedding_model: Embedding model for similarity calculation

    Returns:
        float: Faithfulness score (0-1)
    """
    # Combine all retrieved documents into one context
    context = " ".join([doc.page_content for doc, _ in retrieved_docs])

    # Get embeddings
    answer_embedding = np.array(embedding_model.embed_query(generated_answer)).reshape(1, -1)
    context_embedding = np.array(embedding_model.embed_query(context)).reshape(1, -1)

    # Calculate cosine similarity between answer and context
    faithfulness_score = cosine_similarity(answer_embedding, context_embedding)[0][0]

    return max(0, min(1, faithfulness_score))  # Ensure score is between 0 and 1

# Example usage after generating your RAG response:
# Assuming you have:
# - query = the original question
# - output = the generated answer from your LLM
# - results = the retrieved documents

# First format the retrieved documents properly:
retrieved_docs_formatted = []
for doc in results:
    # Assuming results contains strings formatted as "Page X: content..."
    content = doc.split(":", 1)[1].strip() if ":" in doc else doc
    retrieved_docs_formatted.append((Document(page_content=content), 1.0))  # Using dummy score

# Then evaluate:
evaluation = evaluate_rag_output(
    query=question,
    generated_answer=output[0]["generated_text"],
    retrieved_docs=retrieved_docs_formatted,
    embedding_model=embedding_model
)

print("\nEvaluation Results:")
print(f"Faithfulness Score: {evaluation['faithfulness']:.3f}")
print(f"Average Relevance: {evaluation['relevance']['average_relevance']:.3f}")
print("Individual Relevance Scores:")
for i, score in enumerate(evaluation['relevance']['individual_relevance']):
    print(f"  Doc {i+1}: {score:.3f}")


Evaluation Results:
Faithfulness Score: 0.892
Average Relevance: 0.823
Individual Relevance Scores:
  Doc 1: 0.780
  Doc 2: 0.843
  Doc 3: 0.768
  Doc 4: 0.844
  Doc 5: 0.825
  Doc 6: 0.845
  Doc 7: 0.817
  Doc 8: 0.847
  Doc 9: 0.812
  Doc 10: 0.849
