In [1]:
!pip install -q PyPDF2 sentence-transformers faiss-cpu transformers

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.7/345.7 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m53.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import PyPDF2
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
import torch

In [3]:
def extract_text_from_pdf(path):
    reader = PyPDF2.PdfReader(path)
    text = ""
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"
    return text

pdf_text = extract_text_from_pdf('/content/offer letter (3).pdf')
print(f"Extracted {len(pdf_text)} characters from PDF.")

Extracted 35778 characters from PDF.


In [4]:
chunk_size = 1000
chunks = [pdf_text[i:i+chunk_size] for i in range(0, len(pdf_text), chunk_size)]
print(f"Split text into {len(chunks)} chunks, each up to {chunk_size} characters.")

Split text into 36 chunks, each up to 1000 characters.


In [5]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
chunk_embeddings = embedder.encode(chunks, convert_to_numpy=True)
print(f"Generated embeddings of shape: {chunk_embeddings.shape}")

Generated embeddings of shape: (36, 384)


In [7]:
d = chunk_embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(np.array(chunk_embeddings))
print(f"FAISS index built with {index.ntotal} vectors of dimension {d}.")

FAISS index built with 36 vectors of dimension 384.


In [8]:
tokenizer = GPT2Tokenizer.from_pretrained('EleutherAI/gpt-neo-125M')
model = GPTNeoForCausalLM.from_pretrained('EleutherAI/gpt-neo-125M')

tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [18]:
def answer_question(question, k=3):
    # 1. Embed the question
    q_emb = embedder.encode([question], convert_to_numpy=True)

    # 2. Search FAISS for k nearest chunks
    D, I = index.search(np.array(q_emb), k)

    # 3. Retrieve the top-k relevant chunks
    retrieved_chunks = [chunks[i] for i in I[0]]
    context = " ".join(retrieved_chunks)

    # 4. Build prompt for GPT-Neo
    prompt = f"Context: {context}\nQuestion: {question}\nAnswer:"
    inputs = tokenizer(prompt, return_tensors='pt').input_ids

    # 5. Generate answer (sampling to allow variety)
    outputs = model.generate(
        inputs,
        max_new_tokens=200,
        do_sample=True, top_p=0.95, top_k=50
    )

    # 6. Decode and return only the answer
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).split('Answer:')[-1].strip()
    return answer

In [21]:
user_question = "Tell about Personal Allowance?"
print("Answer:", answer_question(user_question))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Answer: 1)TCSL has the capacity to manage personal care at all TCSl facilities and TCSl
facilities are responsible for this type of health insurance
2) TCSL does not have the right to offer personal medical benefits to any person under
the scheme and it is only to cover the personal medical benefits. 
3) TCSL will not have any personal insurance to cover for you if you are a 
personally insured. 
4) TCSL will not have any direct medical coverage for you if you are a personally 
insured. 
5) TCSL is not accountable for the quality of health insurance you have. However 
in the event you are a personally insured, TCSL will not have to compensate you in the 
event you are not physically a person as per your health plan. 
6) TCSL has a professional medical practice to provide you with proper care for a


In [22]:
import json

notebook_path = "/content/RAG_with_Zephyr.ipynb"

# Load the notebook file
with open(notebook_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Remove the widget metadata if present
if "widgets" in data.get("metadata", {}):
    print("Cleaning widget metadata...")
    del data["metadata"]["widgets"]

    # Save the cleaned notebook
    with open(notebook_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=1)

    print("✅ Notebook cleaned! You can now re-open it.")
else:
    print("No widget metadata found.")

FileNotFoundError: [Errno 2] No such file or directory: '/content/RAG_with_Zephyr.ipynb'