In [1]:
from src.config import LocalConfig, ColabConfig, is_colab

# Pick the settings object for the current runtime: the Colab variant (with an
# explicit embedding model) when running on Colab, the local variant otherwise.
if is_colab():
    config = ColabConfig(embedding_model="BAAI/bge-base-en")
else:
    config = LocalConfig()

# Report which configuration class is active and where the project is rooted.
print("Using configuration:", type(config).__name__)
print("Base directory:", config.BASE_DIR)

# Per the recorded output, this logs one line for each directory it ensures exists.
config.ensure_dirs()

Using configuration: LocalConfig
Base directory: /mnt/c/dev/ml/rag-qa
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/.hf_cache
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data/train
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data/validation
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data/test


In [2]:
# Single cell: safe to run on a fresh environment
from src.load_data import ensure_data_available

# ✅ Creates folders if missing and downloads only if needed
# (behaviour as described by the cell's author; the logic lives in src.load_data).
ensure_data_available(config=config)

# Status marker so a "Run All" log shows the dataset step finished.
print("🚀 Dataset ready")

  from .autonotebook import tqdm as notebook_tqdm


✔ Dataset already downloaded — skipping.
🚀 Dataset ready


In [3]:
# Single cell to explore dataset shards
from src.explore_data import load_shards, explore_dataset

# Cap each split at three shards (max_shards=3) to keep memory usage low.
train_ds = load_shards(config.TRAIN_DIR, max_shards=3)
val_ds = load_shards(config.VAL_DIR, max_shards=3)
test_ds = load_shards(config.TEST_DIR, max_shards=3)

# Summarise the splits one after another, in the order they were loaded.
for _split, _title in (
    (train_ds, "Train set"),
    (val_ds, "Validation set"),
    (test_ds, "Test set"),
):
    explore_dataset(_split, _title)



Exploring Train set:
Total examples across all shards: 3000
Columns: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer']

Column types:
 - question: Value('string')
 - question_id: Value('string')
 - question_source: Value('string')
 - entity_pages: {'doc_source': List(Value('string')), 'filename': List(Value('string')), 'title': List(Value('string')), 'wiki_context': List(Value('string'))}
 - search_results: {'description': List(Value('null')), 'filename': List(Value('null')), 'rank': List(Value('null')), 'search_context': List(Value('null')), 'title': List(Value('null')), 'url': List(Value('null'))}
 - answer: {'aliases': List(Value('string')), 'matched_wiki_entity_name': Value('string'), 'normalized_aliases': List(Value('string')), 'normalized_matched_wiki_entity_name': Value('string'), 'normalized_value': Value('string'), 'type': Value('string'), 'value': Value('string')}

Sample data from first 3 examples (strings truncated to 50 chars):
{'q

In [4]:
from src.analyze_data import load_shards_concat, dataset_info, analyze_lengths, most_common_answers, print_sample_qa

# Load the splits without a shard cap (presumably every shard — confirm in
# src.analyze_data). NOTE: this rebinds train_ds / val_ds / test_ds, which the
# exploration cell above bound to 3-shard subsets.
train_ds = load_shards_concat(config.TRAIN_DIR)
val_ds = load_shards_concat(config.VAL_DIR)
test_ds = load_shards_concat(config.TEST_DIR)

# Toggle for the per-split analysis. It defaults to False so the cell behaves
# exactly as before (load only); flip it to True to run the analysis instead of
# keeping that code commented out.
RUN_SPLIT_ANALYSIS = False

if RUN_SPLIT_ANALYSIS:
    # Explore datasets; per the original author's note the helpers save plots
    # in the 'plots/' folder (not verifiable from this notebook — confirm).
    for name, ds in [("Train", train_ds), ("Validation", val_ds), ("Test", test_ds)]:
        if ds is None:
            # A split can be absent; report it and move on to the next one.
            print(f"No dataset found for {name}")
            continue
        dataset_info(ds, name)
        analyze_lengths(ds, "question", name)
        analyze_lengths(ds, "answer", name)
        most_common_answers(ds)
        print_sample_qa(ds, name, n=5)


In [4]:
from src.compute_embeddings import compute_embeddings, retrieve_top_k

# Number of passages to retrieve in the smoke test below. A single constant keeps
# the retrieval call and the printed header in sync.
RETRIEVAL_TOP_K = 3

# Compute embeddings (will load from file if already exists, per the original
# author's note). In the recorded run this step took roughly 22 minutes on CUDA.
corpus, corpus_embeddings = compute_embeddings(config=config)

# Test retrieval with a simple factual question.
query = "What is the capital of france?"
results, scores = retrieve_top_k(
    query=query,
    corpus=corpus,
    corpus_embeddings=corpus_embeddings,
    config=config,
    top_k=RETRIEVAL_TOP_K,
)

print(f"\nTop {RETRIEVAL_TOP_K} retrieved passages for query:")
for passage, score in zip(results, scores):
    print(f"[score: {score:.4f}] {passage}\n---")


Embeddings not found or force_recompute=True, computing embeddings...
Reused cached passages from /mnt/c/dev/ml/rag-qa/corpus_passages.pkl (978526 passages).
Using model: SentenceTransformer
Using device: cuda


Computing embeddings: 100%|██████████| 1912/1912 [21:56<00:00,  1.45it/s]


Saved embeddings to /mnt/c/dev/ml/rag-qa/corpus_embeddings_unique.pkl
Saved FAISS index to /mnt/c/dev/ml/rag-qa/corpus_faiss.index (dim=384, n=978526)

Top 3 retrieved passages for query:
[score: 0.6164] Geography of France: france is a country in western europe. france borders the atlantic ocean and the mediterranean. to the west is the bay of biscay, to the north is the english channel and the north sea. france also has territory in south america, the caribbean, and the indian ocean, as well as numerous territories of various status. area * total area : 673, 801 km * * ( whole territory of the french republic, including all the overseas departments and territories, but excluding the disputed french territory of terre adelie in antarctica ) * metropolitan france : 551, 695 km * * ( metropolitan - i. e. european - france only, french national geographic institute data ) * metropolitan france : 543, 965 km * * ( metropolitan - i. e. european - france only, french land register data, whi

In [7]:
from src.generator import load_embeddings, generate_answer_combined

# Bind the generator's corpus to its own name so it does not overwrite the global
# `corpus` returned by compute_embeddings(); otherwise `corpus` and
# `corpus_embeddings` would no longer describe the same passages.
# NOTE(review): the recorded run loaded only 100 passages here, while the index
# built earlier holds 978526, and the retrieved contexts are unrelated to the
# query. load_embeddings() presumably resolves a stale/relative
# corpus_embeddings_unique.pkl — confirm which file it reads.
gen_corpus, emb = load_embeddings()

query = "What is the capital of france?"
answer, ctx = generate_answer_combined(query, gen_corpus, emb, top_k=5)

print("\n🔍 Used Context Passages:\n")
for rank, ctx_passage in enumerate(ctx, start=1):
    # Show a single-line preview: first 200 characters with newlines flattened.
    preview = ctx_passage[:200].replace("\n", " ")
    print(f"{rank}. {preview}...\n")

print("💡 Final Answer:\n", answer)

🔹 Loaded 100 passages from corpus_embeddings_unique.pkl
🔹 Loading FLAN-T5 model...
🔹 Loading embedding model...

🔍 Used Context Passages:

1. Mediterranean Sea: the city of Haifa, Israel File:Gaza Beach.jpg|Beach on the Gaza Strip, State of Palestine File:Coast of Alexandria, A view From Bibliotheca Alexandrina, Egypt.jpg|Coast of Alexandri...

2. Mediterranean Sea: of Cape Trafalgar (Spain) and Cape Spartel (Africa). **On the northeast: The west coast of Italy. In the Strait of Messina a line joining the north extreme of Cape Paci (15°42'E) wi...

3. Mediterranean Sea: as well as food (from fishing and the gathering of other seafood) for numerous communities throughout the ages. Due to the shared climate, geology, and access to the sea, cultures c...

4. Cannes Film Festival: last section of the Official Selection: la Cinéfondation. Its aim was to support the creation of works of cinema in the world and to contribute to the entry of the new scenario w...

5. Mediterranea

# Evaluate Retrieval-Only Performance

In [5]:
from src.evaluate_retrieve import run_evaluation

# Retrieval-only evaluation driven by the shared config object.
# In the recorded run it loads the FAISS index and reports Recall@1/3/5/7/10 on
# the first 1000 samples of the train, validation and test splits.
run_evaluation(config=config)

🔹 Loaded FAISS index with 978526 passages

=== 🔥 Evaluating TRAIN — first 1000 samples ===


Loading /mnt/c/dev/ml/rag-qa/data/train: 100%|██████████| 54/54 [00:00<00:00, 61.66it/s]
Evaluating Recall: 100%|██████████| 1000/1000 [00:44<00:00, 22.50it/s]


Recall@1: 0.6310
Recall@3: 0.8350
Recall@5: 0.8930
Recall@7: 0.9140
Recall@10: 0.9290

=== 🔥 Evaluating VALIDATION — first 1000 samples ===


Loading /mnt/c/dev/ml/rag-qa/data/validation: 100%|██████████| 8/8 [00:00<00:00, 75.20it/s]
Evaluating Recall: 100%|██████████| 1000/1000 [00:44<00:00, 22.26it/s]


Recall@1: 0.5240
Recall@3: 0.7410
Recall@5: 0.7890
Recall@7: 0.8180
Recall@10: 0.8440

=== 🔥 Evaluating TEST — first 1000 samples ===


Loading /mnt/c/dev/ml/rag-qa/data/test: 100%|██████████| 8/8 [00:00<00:00, 73.01it/s]
Evaluating Recall: 100%|██████████| 1000/1000 [00:44<00:00, 22.67it/s]

Recall@1: 0.6280
Recall@3: 0.8030
Recall@5: 0.8480
Recall@7: 0.8700
Recall@10: 0.8940





In [None]:
from src.evaluate_rag_full import run_full_rag_eval

# End-to-end RAG evaluation driven by the shared config object.
# NOTE(review): the recorded run did not complete. It failed on the first of 100
# test samples with `OutOfMemoryError: CUDA out of memory` (864 MiB requested,
# 6.54 GiB already allocated by PyTorch on a 7.58 GiB GPU).
# NOTE(review): the tokenizer also warned that the input was 3761 tokens against
# a 512-token model limit, so the prompt presumably needs truncating (or fewer /
# shorter context passages) inside src.evaluate_rag_full — confirm there.
# NOTE(review): models and embeddings loaded by earlier cells are presumably
# still resident on the GPU; consider restarting the kernel and running only the
# config cell before this one — TODO confirm.
run_full_rag_eval(config=config)


=== Loading embeddings ===



=== Loading Test dataset (100 samples) ===

=== Running RAG Evaluation ===


  0%|          | 0/100 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3761 > 512). Running this sequence through the model will result in indexing errors
  0%|          | 0/100 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 864.00 MiB. GPU 0 has a total capacity of 7.58 GiB of which 854.50 MiB is free. Including non-PyTorch memory, this process has 6.71 GiB memory in use. Of the allocated memory 6.54 GiB is allocated by PyTorch, and 32.87 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)