***RAG Implementation Workflow***
1. Load Documents
2. Generate Document Chunks
3. Vectorize Document Chunks
4. Store Embeddings with Document Chunk IDs
5. Vectorize Question/Query
6. Use Question Embeddings to Retrieve Relevant Document Chunk IDs
7. Use Document Chunk IDs to Retrieve Document Chunks from Storage
8. Use Question + Relevant Document Chunks + Prompt to Answer Questions
9. Generate Answer

**Load Documents**

**Task** : Fetch PDF documents from ICAR-CRIDA website.

In [67]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import json
import os

In [68]:
# URL of the webpage to scrape
url = "https://www.icar-crida.res.in/publications_annualreports.html"

def fetch_pdf_urls(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        content = BeautifulSoup(response.content, 'html.parser')
        links = content.find_all('a')
        pdf_links = []
        for link in links:
            href = link.get('href')
            if href and '.pdf' in href:
                if not href.startswith('http'):
                    href = 'https://www.icar-crida.res.in/' + href.lstrip('/')
                pdf_links.append(href)
        return pdf_links
    except requests.RequestException as e:
        print(f"Failed to fetch webpage: {url} with error: {e}")
        return []

def download_pdfs(pdf_urls, download_dir='downloaded_pdfs'):
    os.makedirs(download_dir, exist_ok=True)
    for url in pdf_urls:
        filename = url.split('/')[-1]
        filepath = os.path.join(download_dir, filename)
        if os.path.exists(filepath):
            print(f"Skipping {filename}. Already downloaded.")
            continue
        try:
            response = requests.get(url)
            response.raise_for_status()
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded {filename} to {download_dir}")
        except requests.RequestException as e:
            print(f"Failed to download {filename}: {e}")

# Fetch PDF URLs and download PDFs
pdf_urls = fetch_pdf_urls(url)
# download_pdfs(pdf_urls)

# Save PDF URLs to a JSON file
with open('icar_crida_report_urls.json', 'w', encoding='utf-8') as f:
    json.dump(pdf_urls, f, ensure_ascii=False, indent=2)

print(pdf_urls)
print("PDF download and URL scraping completed!")


['https://www.icar-crida.res.in/./assets_c/img/Annualreports/AR22.pdf', 'https://www.icar-crida.res.in/./assets_c/img/Annualreports/AR21.pdf', 'https://www.icar-crida.res.in/./assets_c/img/Annualreports/AR20.pdf', 'https://www.icar-crida.res.in/./assets/img/Annualreports/AR19.pdf', 'https://www.icar-crida.res.in/./assets_c/img//Annualreports/AR18-19.pdf', 'https://www.icar-crida.res.in/./assets_c/img//Annualreports/AR17-18.pdf', 'https://www.icar-crida.res.in/assets_c/img/Annualreports/AR16-17.pdf', 'https://www.icar-crida.res.in/assets_c/img/Annualreports/AR15-16.pdf', 'https://www.icar-crida.res.in/assets_c/img/Annualreports/AR14-15.pdf', 'https://www.icar-crida.res.in/assets_c/img/Annualreports/AR13-14.pdf', 'https://www.icar-crida.res.in/assets_c/img/Annualreports/AICRPDA/AR18-19.pdf', 'https://www.icar-crida.res.in/assets_c/img/Annualreports/AICRPDA/AR17-18.pdf', 'https://www.icar-crida.res.in/assets_c/img/Annualreports/AICRPDA/AR16-17.pdf', 'https://www.icar-crida.res.in/assets_c

***Generate Document Chunks***

**Task** : Divide the text of each document into smaller chunks.

In [69]:
# Import required libraries
import fitz
from tqdm import tqdm
import re


In [70]:
# Function to clean text
def clean_text(text):
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\n+', '\n', text)
    text = re.sub(r'[^\w\s.,!?]', ' ', text)
    return text

# Function to chunk data
def chunk_data(text, lines_per_chunk=10):
    lines = text.split('\n')
    chunks = ['\n'.join(lines[i:i + lines_per_chunk]) for i in range(0, len(lines), lines_per_chunk)]
    return chunks

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = "\n".join(page.get_text() for page in doc)
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return ""

# Function to process PDFs and generate chunks
def process_pdfs(pdf_dir='downloaded_pdfs'):
    pdf_texts = []
    for filename in tqdm(os.listdir(pdf_dir), desc="Processing PDFs"):
        if filename.endswith('.pdf'):
            pdf_path = os.path.join(pdf_dir, filename)
            text = extract_text_from_pdf(pdf_path)
            if text:
                clean_text_content = clean_text(text)
                chunks = chunk_data(clean_text_content)
                pdf_texts.append({
                    'filename': filename,
                    'chunks': chunks,
                    'total_chunks': len(chunks)
                })
    return pdf_texts

# Process PDFs and generate chunked text
pdf_texts = process_pdfs()

# Dump pdf_texts to JSON file
output_file = 'icar_crida_reports_processed_pdfs.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(pdf_texts, f, ensure_ascii=False, indent=2)

print(f"PDF processing completed. Results saved to {output_file}")


Processing PDFs:   0%|          | 0/1 [00:00<?, ?it/s]

Processing PDFs: 100%|██████████| 1/1 [00:00<00:00,  1.10it/s]

PDF processing completed. Results saved to icar_crida_reports_processed_pdfs.json





***Vectorize Document Chunks***

**Task** : Convert each chunk of text into numerical vectors (embeddings).

In [71]:
# Import required libraries
from transformers import BertModel, BertTokenizer
import torch
import pickle

In [72]:
# Tokenization and saving tokenized chunks
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text_chunks(pdf_texts):
    for pdf in tqdm(pdf_texts, desc="Tokenizing Chunks"):
        for chunk in pdf['chunks']:
            tokenized_text = tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)
            original_text = chunk
            tokenized_chunk = {
                'original_text': original_text,
                'input_ids': tokenized_text['input_ids'].tolist(),
                'attention_mask': tokenized_text['attention_mask'].tolist()
            }
            pdf.setdefault('tokenized_chunks', []).append(tokenized_chunk)
    return pdf_texts

# Load processed PDF texts
with open('icar_crida_reports_processed_pdfs.json', 'r', encoding='utf-8') as f:
    pdf_texts = json.load(f)

# Tokenize the chunks
pdf_texts = tokenize_text_chunks(pdf_texts)

# Save tokenized chunks to a JSON file
tokenized_output_file = 'icar_crida_reports_tokenized.json'
with open(tokenized_output_file, 'w', encoding='utf-8') as f:
    json.dump(pdf_texts, f, ensure_ascii=False, indent=2)

print(f"Tokenization completed. Results saved to {tokenized_output_file}")


Tokenizing Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.42s/it]

Tokenization completed. Results saved to icar_crida_reports_tokenized.json





In [73]:
# Initialize BERT model and tokenizer
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to embed text
def embed(text, nums):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    num_tokens = inputs['input_ids'].size(1)  # Get the number of tokens
    nums.append(num_tokens)
    with torch.no_grad():
        outputs = model(**inputs)
    token_embeddings = outputs.last_hidden_state
    mean_embedding = torch.mean(token_embeddings, dim=1)
    return mean_embedding.squeeze().tolist()

# Function to process and embed PDFs
def process_and_embed_pdfs(pdf_texts):
    nums = []  # List to store number of tokens processed per chunk
    all_embeddings = []  # List to store all embeddings
    for pdf in tqdm(pdf_texts, desc="Embedding Chunks"):
        pdf.setdefault('tokenized_chunks', [])
        for chunk in pdf['chunks']:
            original_text = chunk
            embeddings = embed(original_text, nums)
            tokenized_chunk = {
                'original_text': original_text,
                'embeddings': embeddings
            }
            pdf['tokenized_chunks'].append(tokenized_chunk)
            all_embeddings.append(embeddings)
    return pdf_texts, all_embeddings

# Load tokenized PDF texts
with open('icar_crida_reports_tokenized.json', 'r', encoding='utf-8') as f:
    pdf_texts = json.load(f)

# Process and embed the PDFs
pdf_texts, all_embeddings = process_and_embed_pdfs(pdf_texts)

# Check the shape of embeddings in all_embeddings
print(f"Shape of embeddings in all_embeddings: {len(all_embeddings)}, {len(all_embeddings[0])}")


Embedding Chunks: 100%|██████████| 1/1 [00:02<00:00,  2.80s/it]

Shape of embeddings in all_embeddings: 1, 768





In [74]:
print(max(nums))
print(all_embeddings[:3])

512
[[-0.3430640697479248, 0.08783604204654694, 0.035757556557655334, -0.1816527098417282, 0.5251249670982361, -0.1327868402004242, 0.02471209689974785, 0.34966331720352173, -0.09805116057395935, -0.12245539575815201, -0.041326649487018585, -0.34544122219085693, -0.0604957677423954, 0.6165102124214172, 0.14404965937137604, 0.7463268041610718, 0.419908732175827, -0.1181224063038826, -0.226030170917511, 0.31585773825645447, 0.2541721761226654, -0.07760126143693924, 0.2717641592025757, 0.7797965407371521, 0.35269099473953247, 0.13884028792381287, 0.29306456446647644, 0.18969954550266266, -0.022022098302841187, -0.06472940742969513, 0.5332173705101013, -0.03516602888703346, -0.18561290204524994, -0.037354208528995514, 0.18858635425567627, -0.10143300890922546, -0.2946678400039673, -0.1413755863904953, -0.14595064520835876, 0.011805424466729164, -0.37029996514320374, -0.5466173887252808, -0.25966358184814453, 0.02492462657392025, -0.14755800366401672, -0.07804714143276215, 0.101397432386875

***Store Embeddings with Document Chunk IDs***

**Task** : Save the embeddings along with metadata (document chunk IDs, original text).

In [75]:
# Save embeddings and text chunks using pickle
with open('allembeddings.pkl', 'wb') as f:
    pickle.dump(all_embeddings, f)

# Save text chunks using pickle
text_chunks = [pdf['chunks'] for pdf in pdf_texts]  # Extract chunks from each PDF
with open('text_chunks.pkl', 'wb') as f:
    pickle.dump(text_chunks, f)

print("Embeddings and text chunks saved to pickle files.")

Embeddings and text chunks saved to pickle files.


***Vectorize Question/Query***

**Task** : Convert user queries into numerical vectors (embeddings) for similarity matching.

In [83]:
# Example query text
query_text = "Who is the current director of ICAR-CRIDA, and who were the members of the editorial committee for the 2022 annual report?"
query_nums = []
query_embedding = embed(query_text, query_nums)
print(f"Shape of query_embedding: {len(query_embedding)}")

print("Query vectorization completed!")

Shape of query_embedding: 768
Query vectorization completed!


***Use Question Embeddings to Retrieve Relevant Document Chunk IDs***

**Task** : Retrieve document chunk IDs that are most relevant to the user query.

In [84]:
from scipy.spatial.distance import cosine

def find_nearest_embeddings(query_embedding, embeddings, top_k=5):
    distances = []
    for idx, embedding in enumerate(embeddings):
        distance = cosine(query_embedding, embedding)
        distances.append((idx, embedding, distance))
    distances.sort(key=lambda x: x[2])  # Sort by distance (smaller is closer)
    nearest_embeddings = [embedding for _, embedding, _ in distances[:top_k]]
    nearest_ids = [idx for idx, _, _ in distances[:top_k]]
    return nearest_ids, nearest_embeddings

def query(query_embedding, top_k=5):
    # Load stored embeddings
    with open('allembeddings.pkl', 'rb') as f:
        loaded_allembeddings = pickle.load(f)
    # Find and return the nearest embeddings
    return find_nearest_embeddings(query_embedding, loaded_allembeddings, top_k)

# Perform the query
nearest_ids, nearest_embeddings = query(query_embedding)
print("Nearest IDs:", nearest_ids)


Nearest IDs: [0]


***Use Document Chunk IDs to Retrieve Document Chunks from Storage***

**Task** : Fetch the actual document chunks corresponding to the retrieved chunk IDs.

In [82]:
# Perform the query
nearest_ids, nearest_embeddings = query(query_embedding)
print(nearest_ids)

[0]


***Use Question + Relevant Document Chunks + Prompt to Answer Questions***

**Task** : Combine the user query, relevant document chunks, and a prompt to generate an answer.

In [80]:
import pickle
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Load pre-trained BERT model for question answering
qa_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
qa_tokenizer = BertTokenizer.from_pretrained(qa_model_name)
qa_model = BertForQuestionAnswering.from_pretrained(qa_model_name)

def generate_answer(query, context):
    # Tokenize text chunk and query
    query_tokens = qa_tokenizer.encode(query, add_special_tokens=False)
    text_chunk_tokens = qa_tokenizer.encode(context, add_special_tokens=False, truncation=True, max_length=500-len(query_tokens))
    
    # Combine text and query tokens with special tokens
    input_tokens = [qa_tokenizer.cls_token_id] + query_tokens + [qa_tokenizer.sep_token_id] + text_chunk_tokens + [qa_tokenizer.sep_token_id]
    
    # Convert tokens to tensors
    input_ids = torch.tensor(input_tokens).unsqueeze(0)  # Batch size 1

    # Perform inference
    outputs = qa_model(input_ids)
    start_logits = outputs.start_logits
    end_logits = outputs.end_logits

    # Find the answer span with the highest probability
    start_index = torch.argmax(start_logits)
    end_index = torch.argmax(end_logits)

    # Get the answer tokens and convert them to string
    answer_tokens = input_tokens[start_index:end_index+1]
    answer = qa_tokenizer.decode(answer_tokens)
    return answer

def get_context(nearest_ids):
    with open('text_chunks.pkl', 'rb') as f:
        loaded_chunks = pickle.load(f)
    
    chunks = [chunk['original_text'] for idx, chunk in enumerate(loaded_chunks) if idx in nearest_ids]
    context = " ".join(chunks)
    return context

def query_and_answer(query_text):
    # Perform query and answer generation
    nearest_ids = [0, 1]  # Adjust as per your requirement
    
    # Get context based on nearest_ids
    context = get_context(nearest_ids)
    
    # Generate answer
    answer = generate_answer(query_text, context)
    return answer

query_text = "Who is the current director of ICAR-CRIDA, and who were the members of the editorial committee for the 2022 annual report?"
answer = query_and_answer(query_text)
print("Answer:", answer)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: list indices must be integers or slices, not str

***Generate Answer***

**Task** : Produce a final answer to the user query.

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

def generate_answer(query: str, context: str, model_name: str = 'gpt2', max_length: int = 1024) -> str:
    """
    Generate an answer based on the query and context using a GPT-2 model.
    
    :param query: The query to be answered.
    :param context: The context in which to find the answer.
    :param model_name: The name of the GPT-2 model to use (default is 'gpt2').
    :param max_length: The maximum length of the generated answer.
    :return: The generated answer as a string.
    """

    # Load the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Prepare the input text
    input_text = f"Context: {context}\n\nQuery: {query}\n\nAnswer:"
    
    # Encode the input text
    input_ids = tokenizer.encode(input_text, return_tensors='pt')

    # Generate the output
    output_ids = model.generate(input_ids, 
                                max_length=max_length, 
                                num_return_sequences=1,
                                no_repeat_ngram_size=2, 
                                temperature=0.5,
                                top_p=0.9,
                                do_sample=True,
                                pad_token_id=tokenizer.eos_token_id)

    # Decode the output
    answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # Post-process the output to extract the answer part
    # answer = answer.split("Answer:")[1].strip()
    answer = answer.split("Answer:")[1].strip().split("\n")[0]

    # Tokenize the answer into sentences
    sentences = sent_tokenize(answer)

    # Reconstruct the answer without the last incomplete sentence
    complete_answer = ' '.join(sentences[:-1]) if not answer.endswith('.') else answer

    print("complete answer: ", complete_answer)
    return answer


def get_context(nearest_ids):
    with open('text_chunks.pkl', 'rb') as f:
        loaded_chunks = pickle.load(f)

    # nearest_ids.sort()
    chunks = [loaded_chunks[i] for i in nearest_ids]
    context = " ".join([i for i in chunks])
    print("context: ", context)
    return context

context = get_context(nearest_ids)
answer = generate_answer(query, context)
print("answer: ", answer)