In [None]:
import os

from src.config import LocalConfig, ColabConfig, is_colab, OllamaConfig

# Embedding model shared by every configuration flavour below.
EMBEDDING_MODEL = "BAAI/bge-base-en"

# Set to False to fall back to the Colab/local configuration.
USE_OLLAMA = True

if USE_OLLAMA:
    # The Ollama endpoint is machine-specific. It can be overridden without
    # editing the notebook via RAG_OLLAMA_HOST / RAG_OLLAMA_PORT; the defaults
    # are the previously hard-coded values, so behaviour is unchanged when the
    # variables are unset. (OLLAMA_HOST itself is deliberately not read: the
    # Ollama tooling uses that variable with a "host:port" format.)
    OLLAMA_HOST = os.environ.get("RAG_OLLAMA_HOST", "172.19.176.1")
    OLLAMA_PORT = int(os.environ.get("RAG_OLLAMA_PORT", "11434"))
    OLLAMA_URL = f"http://{OLLAMA_HOST}:{OLLAMA_PORT}/api/chat"
    config = OllamaConfig(embedding_model=EMBEDDING_MODEL, ollama_url=OLLAMA_URL)
elif is_colab():
    config = ColabConfig(embedding_model=EMBEDDING_MODEL)
else:
    config = LocalConfig(embedding_model=EMBEDDING_MODEL)

print("Using configuration:", type(config).__name__)
print("Base directory:", config.BASE_DIR)

# Presumably creates the directories the config points at -- see src.config.
config.ensure_dirs()

In [None]:
import datasets

# Single cell: safe to run on a fresh environment
from src.load_data import ensure_data_available

# ✅ Creates folders if missing and downloads only if needed
# (per the original author's note; the logic lives in src.load_data)
ensure_data_available(config=config)

# Status message: the emoji was previously stored as mis-decoded bytes.
print("🚀 Dataset ready")

In [None]:
# Quick look at a sample of each dataset split.
from src.explore_data import load_shards, explore_dataset

# Cap every split at three shards to keep memory usage low.
train_ds, val_ds, test_ds = (
    load_shards(split_dir, max_shards=3)
    for split_dir in (config.TRAIN_DIR, config.VAL_DIR, config.TEST_DIR)
)

# Report on each sampled split in turn, in train / validation / test order.
for _label, _split in (
    ("Train set", train_ds),
    ("Validation set", val_ds),
    ("Test set", test_ds),
):
    explore_dataset(_split, _label)


In [None]:
from src.analyze_data import load_shards_concat, dataset_info, analyze_lengths, most_common_answers, print_sample_qa

# Flip to True to run the per-split analysis below. It is off by default,
# which matches the previous behaviour where the block was commented out.
RUN_SPLIT_ANALYSIS = False

# Load datasets (no shard cap is passed here, unlike the exploration cell)
train_ds = load_shards_concat(config.TRAIN_DIR)
val_ds   = load_shards_concat(config.VAL_DIR)
test_ds  = load_shards_concat(config.TEST_DIR)

if RUN_SPLIT_ANALYSIS:
    # Explore datasets and save plots in the 'plots/' folder
    # (output location per the original note -- confirm in src.analyze_data)
    for name, ds in [("Train", train_ds), ("Validation", val_ds), ("Test", test_ds)]:
        if ds is None:
            print(f"No dataset found for {name}")
            continue
        dataset_info(ds, name)
        analyze_lengths(ds, "question", name)
        analyze_lengths(ds, "answer", name)
        most_common_answers(ds)
        print_sample_qa(ds, name, n=5)


In [None]:
from src.compute_embeddings import compute_embeddings, retrieve_top_k

# Obtain the corpus and its embeddings (reused from file when one already exists).
corpus, corpus_embeddings = compute_embeddings(config=config)
print("embeddings loaded")

# Smoke-test retrieval with a single fixed question.
query = "What is the capital of france?"
results, scores = retrieve_top_k(
    query=query,
    corpus=corpus,
    corpus_embeddings=corpus_embeddings,
    config=config,
    top_k=3,
)

# Show each retrieved passage next to its score.
print("\nTop 3 retrieved passages for query:")
for passage, score in zip(results, scores):
    print(f"[score: {score:.4f}] {passage}\n---")


In [None]:
from src.generator import load_embeddings, generate_answer_combined

# Load the corpus and its embeddings through the generator module.
corpus, emb = load_embeddings(config=config)

# Ask one fixed question; the call returns the answer together with the
# context passages, which are printed below.
query = "What is the capital of france?"
answer, ctx = generate_answer_combined(query, corpus, emb, config=config, top_k=5)

# Headings below had their emoji stored as mis-decoded bytes; restored here.
print("\n🔍 Used Context Passages:\n")
for i, p in enumerate(ctx, 1):
    # Preview only the first 200 characters, flattened onto one line.
    # chr(10) is "\n": a backslash is not allowed inside an f-string
    # expression before Python 3.12.
    print(f"{i}. {p[:200].replace(chr(10),' ')}...\n")

print("💡 Final Answer:\n", answer)

# Evaluate Retrieval-Only Performance

In [None]:
from src.evaluate_retrieve import run_evaluation

# Run the evaluation entry point of src.evaluate_retrieve with the active config.
# NOTE(review): judging by the module name this covers the retrieval stage only;
# what it measures and reports is defined in src.evaluate_retrieve -- confirm there.
run_evaluation(config=config)

In [None]:
from src.evaluate_rag_full import run_full_rag_eval

# Full RAG evaluation with the active config; max_questions is set to 10,000.
run_full_rag_eval(
    config=config,
    max_questions=10_000,
)