In [1]:
#!pip install ragatouille==0.0.8 tqdm

In [3]:
#pip install "transformers==4.36.2" "torch>=2.1.0" "faiss-cpu" --upgrade


In [1]:
import json
import warnings
from pathlib import Path

import torch
from ragatouille import RAGPretrainedModel
from tqdm import tqdm

warnings.filterwarnings('ignore')

********************************************************************************
--------------------------------------------
RAGatouille version 0.0.10 will be migrating to a PyLate backend 
instead of the current Stanford ColBERT backend.
PyLate is a fully mature, feature-equivalent backend, that greatly facilitates compatibility.
However, please pin version <0.0.10 if you require the Stanford ColBERT backend.
********************************************************************************
  from ragatouille import RAGPretrainedModel
  import pynvml  # type: ignore[import]
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Report whether torch can see a CUDA device, and describe device 0 if so.
cuda_ok = torch.cuda.is_available()
print(f"CUDA Available: {cuda_ok}")
if not cuda_ok:
    print("WARNING: GPU not detected, will use CPU")
else:
    print(f"GPU Device: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to decimal gigabytes for display.
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_gb:.2f} GB")


CUDA Available: True
GPU Device: Tesla T4
GPU Memory: 15.83 GB


In [18]:
# ============================================
# CONFIGURATION
# ============================================

# File paths (all relative to the notebook's working directory).
# CONTEXT_FILE: JSONL of passages; each record's "content" field is what gets indexed.
CONTEXT_FILE = "all_context.jsonl"
# QUESTION_FILE: JSONL of questions; the retrieval step reads the "question",
# "answerChoices" and "correctAnswer" fields of every record.
QUESTION_FILE = "test_questions.jsonl"  # Change to "test.jsonl" or "val.jsonl" as needed
# OUTPUT_FILE: destination JSONL, one enhanced record (question + retrieved context) per line.
OUTPUT_FILE = "test_fine_tune_with_context.jsonl"  # Change accordingly

In [4]:
# Retrieval settings
# TOP_K is passed as `k` to rag.search for every question.
TOP_K = 2  # Number of contexts to retrieve
# INDEX_NAME is passed as `index_name` to rag.index.
INDEX_NAME = "tqa_colbert_index"  # Name for the ColBERT index
# MAX_DOC_LENGTH is passed as `max_document_length` to rag.index.
# NOTE(review): the 510-token figure is the author's measurement — confirm if the corpus changes.
MAX_DOC_LENGTH = 512  # Max passage size is 510 tokens, so 512 is perfect


In [5]:
# ============================================
# STEP 1: Load Context Data
# ============================================

def load_jsonl(file_path):
    """Load a JSON Lines file into a list of parsed records.

    Blank or whitespace-only lines (for example a trailing newline at the end
    of the file) are skipped instead of being passed to the JSON parser.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON value per line.

    Returns:
        list: The parsed value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message names the file and the 1-based line number.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # Tolerate empty lines rather than failing on json.loads("").
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as exc:
                # Keep the exception type, but say where the bad record is.
                raise json.JSONDecodeError(
                    f"{exc.msg} (in {file_path}, line {line_no})", exc.doc, exc.pos
                ) from exc
    return data

# Read every passage record from the context file into memory.
print("Loading context data...")
contexts = load_jsonl(CONTEXT_FILE)
passage_count = len(contexts)
print(f"✓ Loaded {passage_count} passages from {CONTEXT_FILE}")


Loading context data...
✓ Loaded 6810 passages from all_context.jsonl


In [6]:
# ============================================
# STEP 2: Initialize ColBERT Model
# ============================================

# Report what torch actually sees rather than unconditionally claiming "GPU":
# the previous messages printed "on GPU" even on a CPU-only machine.
# NOTE(review): this reflects CUDA visibility to torch, not a verified model
# placement — presumably RAGatouille uses the GPU when one is available; confirm.
device_label = "GPU" if torch.cuda.is_available() else "CPU"

print(f"\nInitializing ColBERT model (torch sees: {device_label})...")
# Load the pretrained colbert-ir/colbertv2.0 checkpoint.
rag = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
print(f"✓ ColBERT model loaded (expected device: {device_label})")


Initializing ColBERT model with GPU acceleration...
✓ ColBERT model loaded on GPU


In [7]:
# # Optional: measure passage token lengths to sanity-check MAX_DOC_LENGTH. Not needed for the pipeline to run.
# from transformers import AutoTokenizer
# import statistics

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# lengths = [len(tokenizer.encode(ctx["content"])) for ctx in contexts]
# print("Avg tokens:", int(statistics.mean(lengths)))
# print("Max tokens:", max(lengths))


In [8]:
# ============================================
# STEP 3: Build Index (reused when already on disk)
# ============================================

# Extract just the content text for indexing
documents = [ctx["content"] for ctx in contexts]

# Directory where earlier runs of this notebook stored the index
# (taken from the indexing log output).
index_path = Path(".ragatouille") / "colbert" / "indexes" / INDEX_NAME
# The indexing log reports saving ivf.pid.pt near the end of a build, so it is
# used here as the "index is complete" marker.
# NOTE(review): confirm this is a sufficient completeness check for your
# RAGatouille version. Delete the directory to force a rebuild (for example
# after changing CONTEXT_FILE or MAX_DOC_LENGTH).
index_is_complete = (index_path / "ivf.pid.pt").exists()

if index_is_complete:
    # Re-running rag.index would delete and rebuild the existing index, which
    # takes many minutes; load the saved one instead.
    print(f"\nReusing existing ColBERT index '{INDEX_NAME}' from {index_path}")
    rag = RAGPretrainedModel.from_index(str(index_path))
    print(f"✓ Index '{INDEX_NAME}' loaded from disk")
else:
    print(f"\nBuilding ColBERT index '{INDEX_NAME}'...")
    print("Note: This can take several minutes. Later runs reuse the saved index.")

    # Build the index; RAGatouille saves it to disk for reuse.
    rag.index(
        collection=documents,
        index_name=INDEX_NAME,
        max_document_length=MAX_DOC_LENGTH,  # Set to 512 for your 510-token passages
        split_documents=False  # Your passages are already chunked
    )
    print(f"✓ Index built and saved as '{INDEX_NAME}'")
# NOTE (author): the GPU was not being used for this step; indexing ran on CPU.


Building ColBERT index 'tqa_colbert_index' on GPU...
Note: With T4 GPU, this should take ~5-10 minutes. This is a ONE-TIME operation.
This is a behaviour change from RAGatouille 0.8.0 onwards.
This works fine for most users and smallish datasets, but can be considerably slower than FAISS and could cause worse results in some situations.
If you're confident with FAISS working on your machine, pass use_faiss=True to revert to the FAISS-using behaviour.
--------------------


[Oct 28, 20:18:29] #> Note: Output directory .ragatouille/colbert/indexes/tqa_colbert_index already exists


[Oct 28, 20:18:29] #> Will delete 1 files already at .ragatouille/colbert/indexes/tqa_colbert_index in 20 seconds...
[Oct 28, 20:18:49] [0] 		 #> Encoding 6810 passages..
[Oct 28, 20:20:02] [0] 		 avg_doclen_est = 114.52349853515625 	 len(local_sample) = 6,810
[Oct 28, 20:20:03] [0] 		 Creating 8,192 partitions.
[Oct 28, 20:20:03] [0] 		 *Estimated* 779,905 embeddings.
[Oct 28, 20:20:03] [0] 		 #> Saving the 

0it [00:00, ?it/s]

[Oct 28, 20:33:09] [0] 		 #> Encoding 6810 passages..


1it [01:11, 71.88s/it]
100%|██████████| 1/1 [00:00<00:00, 351.55it/s]

[Oct 28, 20:34:21] #> Optimizing IVF to store map from centroids to list of pids..
[Oct 28, 20:34:21] #> Building the emb2pid mapping..
[Oct 28, 20:34:21] len(emb2pid) = 779905



100%|██████████| 8192/8192 [00:00<00:00, 55539.15it/s]

[Oct 28, 20:34:21] #> Saved optimized IVF to .ragatouille/colbert/indexes/tqa_colbert_index/ivf.pid.pt





Done indexing!
✓ Index built and saved as 'tqa_colbert_index'


In [19]:
# ============================================
# STEP 4: Load Questions
# ============================================

print(f"\nLoading questions from {QUESTION_FILE}...")
questions = load_jsonl(QUESTION_FILE)

# Fail fast on malformed records: the retrieval step reads these three fields
# from every question, and a missing one would otherwise only surface as a
# KeyError partway through the (slow) retrieval loop.
REQUIRED_QUESTION_KEYS = ("question", "answerChoices", "correctAnswer")
for record_idx, record in enumerate(questions):
    missing_keys = [key for key in REQUIRED_QUESTION_KEYS if key not in record]
    if missing_keys:
        raise KeyError(
            f"Record {record_idx} in {QUESTION_FILE} is missing required field(s): {missing_keys}"
        )

print(f"✓ Loaded {len(questions)} questions")


Loading questions from test_questions.jsonl...
✓ Loaded 2512 questions


In [20]:
# ============================================
# STEP 5: Retrieve Contexts for Each Question
# ============================================

def _with_retrieved_context(record):
    """Return a new entry for `record` with its top-k passages joined as "context"."""
    # Each hit is read through its 'content' key; hits are joined with a blank line.
    hits = rag.search(record["question"], k=TOP_K)
    passages = "\n\n".join(hit["content"] for hit in hits)
    return {
        "question": record["question"],
        "answerChoices": record["answerChoices"],
        "correctAnswer": record["correctAnswer"],
        "context": passages,
    }


print(f"\nRetrieving top-{TOP_K} contexts for each question...")

# Append one entry at a time so partial results survive an interruption.
enhanced_data = []
for q_item in tqdm(questions, desc="Processing questions"):
    enhanced_data.append(_with_retrieved_context(q_item))

print(f"✓ Retrieved contexts for all {len(questions)} questions")



Retrieving top-2 contexts for each question...


Processing questions: 100%|██████████| 2512/2512 [01:17<00:00, 32.25it/s]

✓ Retrieved contexts for all 2512 questions





In [21]:
# ============================================
# STEP 6: Save Enhanced Dataset
# ============================================

print(f"\nSaving enhanced dataset to {OUTPUT_FILE}...")
# One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.
with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
    for item in enhanced_data:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"✓ Saved {len(enhanced_data)} question-answer pairs with contexts")
print("\n" + "="*50)
print("PIPELINE COMPLETE!")
print("="*50)
print(f"\nOutput file: {OUTPUT_FILE}")
print("Each entry now contains:")
print("  - question")
print("  - answerChoices")
print("  - correctAnswer")
# Report the configured TOP_K so the summary stays correct when it is changed.
print(f"  - context (top-{TOP_K} passages combined)")
print("\nTo process other files:")
print("1. Change QUESTION_FILE = 'test.jsonl' or 'val.jsonl'")
print("2. Change OUTPUT_FILE accordingly")
# The index cell does not need re-running: the index stays loaded in this session.
print("3. Re-run from STEP 4 onward (the index from STEP 3 stays loaded in this session)")



Saving enhanced dataset to test_fine_tune_with_context.jsonl...
✓ Saved 2512 question-answer pairs with contexts

PIPELINE COMPLETE!

Output file: test_fine_tune_with_context.jsonl
Each entry now contains:
  - question
  - answerChoices
  - correctAnswer
  - context (top-2 passages combined)

To process other files:
1. Change QUESTION_FILE = 'test.jsonl' or 'val.jsonl'
2. Change OUTPUT_FILE accordingly
3. Re-run the script (index reuse will be automatic)


In [22]:
# ============================================
# OPTIONAL: Preview Sample Output
# ============================================

banner = "=" * 50
print("\n" + banner)
print("SAMPLE OUTPUT:")
print(banner)
# Pretty-print the first enhanced entry, truncated to its first 1000 characters.
sample_json = json.dumps(enhanced_data[0], indent=2, ensure_ascii=False)
print(sample_json[:1000] + "...")


SAMPLE OUTPUT:
{
  "question": "Steps of the scientific method include all of the following except",
  "answerChoices": "(A) doing background research.. (B) constructing a hypothesis.. (C) asking a question.. (D) proving a theory..",
  "correctAnswer": "D",
  "context": "The scientific method is a process used to investigate the unknown ( Figure 1.1). It is the general process of a scientific investigation. This process uses evidence and testing. Scientists use the scientific method so they can find information. A common method allows all scientists to answer questions in a similar way. Scientists who use this method can reproduce another scientists experiments. Almost all versions of the scientific method include the following steps, although some scientists do use slight variations. 1. 2. 3. 4. 5. 6. 7. Make observations. Identify a question you would like to answer based on the observation. Find out what is already known about your observation (research). Form a hypothesis. Test th