In [1]:
from src.config import LocalConfig, ColabConfig, is_colab

# Choose the configuration class for the current runtime first, then build it
# once so the embedding model name is written in a single place.
config_cls = ColabConfig if is_colab() else LocalConfig
config = config_cls(embedding_model="BAAI/bge-base-en")

# Report which configuration is active and where it is rooted.
print("Using configuration:", type(config).__name__)
print("Base directory:", config.BASE_DIR)

# Let the config prepare its directories before later cells rely on them.
config.ensure_dirs()

Using configuration: LocalConfig
Base directory: /mnt/c/dev/ml/rag-qa
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/.hf_cache
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data/train
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data/validation
✅ Ensured directory exists: /mnt/c/dev/ml/rag-qa/data/test


In [2]:
# Single cell: safe to run on a fresh environment
from src.load_data import ensure_data_available

# ✅ Creates folders if missing and downloads only if needed
ensure_data_available(config=config)

# The status emoji is stored as real UTF-8 here (it had been mis-encoded).
print("🚀 Dataset ready")

  from .autonotebook import tqdm as notebook_tqdm


✔ Dataset already downloaded — skipping.
🚀 Dataset ready


In [3]:
# Single cell to explore dataset shards
from src.explore_data import load_shards, explore_dataset

# Read only a handful of shards per split so the preview stays light on memory.
PREVIEW_SHARDS = 3
train_ds = load_shards(config.TRAIN_DIR, max_shards=PREVIEW_SHARDS)
val_ds = load_shards(config.VAL_DIR, max_shards=PREVIEW_SHARDS)
test_ds = load_shards(config.TEST_DIR, max_shards=PREVIEW_SHARDS)

# Summarise each split in turn, once all three have been loaded.
explore_dataset(train_ds, "Train set")
explore_dataset(val_ds, "Validation set")
explore_dataset(test_ds, "Test set")



Exploring Train set:
Total examples across all shards: 3000
Columns: ['question', 'question_id', 'question_source', 'entity_pages', 'search_results', 'answer']

Column types:
 - question: Value('string')
 - question_id: Value('string')
 - question_source: Value('string')
 - entity_pages: {'doc_source': List(Value('string')), 'filename': List(Value('string')), 'title': List(Value('string')), 'wiki_context': List(Value('string'))}
 - search_results: {'description': List(Value('null')), 'filename': List(Value('null')), 'rank': List(Value('null')), 'search_context': List(Value('null')), 'title': List(Value('null')), 'url': List(Value('null'))}
 - answer: {'aliases': List(Value('string')), 'matched_wiki_entity_name': Value('string'), 'normalized_aliases': List(Value('string')), 'normalized_matched_wiki_entity_name': Value('string'), 'normalized_value': Value('string'), 'type': Value('string'), 'value': Value('string')}

Sample data from first 3 examples (strings truncated to 50 chars):
{'q

In [4]:
from src.analyze_data import load_shards_concat, dataset_info, analyze_lengths, most_common_answers, print_sample_qa

# Load each split through load_shards_concat.
# NOTE: this rebinds train_ds / val_ds / test_ds, replacing whatever those
# names held before this cell ran.
train_ds = load_shards_concat(config.TRAIN_DIR)
val_ds = load_shards_concat(config.VAL_DIR)
test_ds = load_shards_concat(config.TEST_DIR)

# The per-split analysis is opt-in and off by default. Keeping it as real code
# behind a flag (rather than a commented-out block) means the imported helpers
# are actually referenced and the loop stays syntax-checked.
RUN_SPLIT_ANALYSIS = False

if RUN_SPLIT_ANALYSIS:
    # Explore datasets and save plots in the 'plots/' folder
    # (the output location is per the original note — TODO confirm in src.analyze_data).
    for name, ds in [("Train", train_ds), ("Validation", val_ds), ("Test", test_ds)]:
        if ds is None:
            print(f"No dataset found for {name}")
            continue
        dataset_info(ds, name)
        analyze_lengths(ds, "question", name)
        analyze_lengths(ds, "answer", name)
        most_common_answers(ds)
        print_sample_qa(ds, name, n=5)


In [5]:
from src.compute_embeddings import compute_embeddings, retrieve_top_k

# Compute embeddings (will load from file if already exists)
corpus, corpus_embeddings = compute_embeddings(config=config)

# Smoke-test retrieval with one query. TOP_K feeds both the retrieval call and
# the header line below, so the printed count cannot drift from the real one.
TOP_K = 3
query = "What is the capital of france?"
results, scores = retrieve_top_k(
    query=query,
    corpus=corpus,
    corpus_embeddings=corpus_embeddings,
    config=config,
    top_k=TOP_K,
)

print(f"\nTop {TOP_K} retrieved passages for query:")
for passage, score in zip(results, scores):
    print(f"[score: {score:.4f}] {passage}\n---")


Loading saved embeddings from /mnt/c/dev/ml/rag-qa/corpus_embeddings_unique.pkl...
Loaded 978526 passages.

Top 3 retrieved passages for query:
[score: 0.8731] Paris: paris ( french : ) is the capital and most populous city of france. situated on the river seine in northern metropolitan france, it is in the centre of the ile - de - france region, also known as the region parisienne, " paris region ". the commune of paris largely retains its one and a half century old administrative boundaries, with an area of 105 km² ( 41 mi² ) and a population of 2, 241, 346. together with its suburbs, the whole agglomeration has a population of 10, 550, 350 ( jan. 2012 census ). paris'metropolitan area spans most of the ile - de - france region and has a population of 12, 405, 426 ( jan. 2013 census ), constituting one - fifth of the population of france. the administrative region covers 12, 012 km² ( 4, 638 mi² ), with approximately 12 million inhabitants, and has its own regional council and pr

In [None]:
from src.generator import load_embeddings, generate_answer_combined

# Fetch the corpus and its embeddings through the generator module's loader.
# NOTE(review): load_embeddings() is called without `config`, unlike the
# config-driven helpers used elsewhere in this notebook — confirm it resolves
# the same embeddings file.
corpus, emb = load_embeddings()

query = "What is the capital of france?"
answer, ctx = generate_answer_combined(query, corpus, emb, top_k=5)

print("\n🔍 Used Context Passages:\n")
for rank, passage in enumerate(ctx, start=1):
    # One-line preview: the first 200 characters with newlines flattened to
    # spaces. Computed outside the f-string so a plain "\n" literal can be used.
    preview = passage[:200].replace("\n", " ")
    print(f"{rank}. {preview}...\n")

print("💡 Final Answer:\n", answer)

# Evaluate Retrieval Performance Only

In [6]:
from src.evaluate_retrieve import run_evaluation

# Run the retrieval-only evaluation with the active config. The recorded run
# below reports Recall@1/3/5/7/10 on the first 1000 samples of each split.
run_evaluation(config=config)

🔹 Loaded FAISS index with 978526 passages
BAAI/bge-base-en

=== 🔥 Evaluating TRAIN — first 1000 samples ===


Loading /mnt/c/dev/ml/rag-qa/data/train: 100%|██████████| 54/54 [00:00<00:00, 140.49it/s]
Evaluating Recall: 100%|██████████| 1000/1000 [02:27<00:00,  6.77it/s]


Recall@1: 0.8130
Recall@3: 0.9050
Recall@5: 0.9260
Recall@7: 0.9390
Recall@10: 0.9410

=== 🔥 Evaluating VALIDATION — first 1000 samples ===


Loading /mnt/c/dev/ml/rag-qa/data/validation: 100%|██████████| 8/8 [00:00<00:00, 126.32it/s]
Evaluating Recall: 100%|██████████| 1000/1000 [02:27<00:00,  6.79it/s]


Recall@1: 0.7260
Recall@3: 0.8320
Recall@5: 0.8580
Recall@7: 0.8700
Recall@10: 0.8820

=== 🔥 Evaluating TEST — first 1000 samples ===


Loading /mnt/c/dev/ml/rag-qa/data/test: 100%|██████████| 8/8 [00:00<00:00, 131.04it/s]
Evaluating Recall: 100%|██████████| 1000/1000 [02:27<00:00,  6.76it/s]

Recall@1: 0.7820
Recall@3: 0.8790
Recall@5: 0.9030
Recall@7: 0.9140
Recall@10: 0.9250





In [None]:
from src.evaluate_rag_full import run_full_rag_eval

# Run the full RAG evaluation with the active config.
# NOTE(review): this cell has no recorded output (In [None]), so what it
# reports cannot be seen here — confirm against src.evaluate_rag_full.
run_full_rag_eval(config=config)