In [1]:
import os
from typing import Literal

REPO_URL = "https://github.com/lucas937-code/rag-qa"
REPO_DIR = "rag-qa"
BRANCH = "data_preprocessing"
MODE = "DEBUG"

# Clone repo only if it does not exist yet
if not os.path.isdir(REPO_DIR):
    print(f"Cloning repository from {REPO_URL}...")
    !git clone {REPO_URL} {REPO_DIR}
else:
    print(f"Repository '{REPO_DIR}' already exists, skipping clone.")

# Change into repo directory
%cd {REPO_DIR}

# Checkout the correct branch
if BRANCH != "main":
  !git checkout {BRANCH}

# Install dependencies
!pip install -r requirements.txt

Cloning repository from https://github.com/lucas937-code/rag-qa...
Cloning into 'rag-qa'...
remote: Enumerating objects: 166, done.[K
remote: Counting objects: 100% (166/166), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 166 (delta 79), reused 115 (delta 39), pack-reused 0 (from 0)[K
Receiving objects: 100% (166/166), 211.01 KiB | 8.44 MiB/s, done.
Resolving deltas: 100% (79/79), done.
/content/rag-qa
Branch 'data_preprocessing' set up to track remote branch 'data_preprocessing' from 'origin'.
Switched to a new branch 'data_preprocessing'
Collecting datasets (from -r requirements.txt (line 1))
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting faiss-cpu (from -r requirements.txt (line 5))
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets->-r requirements.txt (line 1))
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Col

In [2]:
from google.colab import drive
from pathlib import Path
import os

# Make the user's Google Drive available below /content/drive
drive.mount('/content/drive')

# Root folder on Drive; every other working directory is derived from it
PROJECT_ROOT = "/content/drive/MyDrive/RAG-Project"

RAW_DIR = PROJECT_ROOT + "/raw_dataset"
PROCESSED_DIR = PROJECT_ROOT + "/processed"
CHUNK_DIR = PROCESSED_DIR + "/chunks"
SPLIT_DIR = PROCESSED_DIR + "/splits"
EMB_DIR = PROCESSED_DIR + "/embeddings"
INDEX_DIR = PROJECT_ROOT + "/indexes"

# Ensure the whole layout exists; folders that are already present are kept
for dir_path in (RAW_DIR, PROCESSED_DIR, CHUNK_DIR, SPLIT_DIR, EMB_DIR, INDEX_DIR):
    os.makedirs(dir_path, exist_ok=True)

# Echo the resolved locations so the run log documents where data lands
print("Working directories ready:")
for label, location in (
    ("RAW_DIR      =", RAW_DIR),
    ("PROCESSED_DIR=", PROCESSED_DIR),
    ("SPLIT_DIR    =", SPLIT_DIR),
    ("CHUNK_DIR    =", CHUNK_DIR),
    ("EMB_DIR      =", EMB_DIR),
    ("INDEX_DIR    =", INDEX_DIR),
):
    print(label, location)

Mounted at /content/drive
Working directories ready:
RAW_DIR      = /content/drive/MyDrive/RAG-Project/raw_dataset
PROCESSED_DIR= /content/drive/MyDrive/RAG-Project/processed
SPLIT_DIR    = /content/drive/MyDrive/RAG-Project/processed/splits
CHUNK_DIR    = /content/drive/MyDrive/RAG-Project/processed/chunks
EMB_DIR      = /content/drive/MyDrive/RAG-Project/processed/embeddings
INDEX_DIR    = /content/drive/MyDrive/RAG-Project/indexes


In [3]:
from pathlib import Path
from src.data_prep.load_dataset import prepare_full_dataset

raw_train_parquet = Path(RAW_DIR) / "train.parquet"

# Run the dataset preparation only when RAW_DIR holds no train file yet;
# otherwise reuse the file that is already there.
if not raw_train_parquet.exists():
    print("No raw train dataset found, preparing full dataset...")
    paths = prepare_full_dataset(
        output_dir=RAW_DIR,
        dataset_name="trivia_qa",
        subset="rc.wikipedia",
        splits=("train",),
        # max_examples_per_split={"train": 50},  # optional for debugging
    )
else:
    print(f"Found existing raw train dataset at {raw_train_parquet}, skipping download.")
    paths = {"train": raw_train_parquet}

# Normalise to a Path regardless of which branch produced the mapping
raw_train_path = Path(paths["train"])
print(f"Using raw_train_path = {raw_train_path}")

No raw train dataset found, preparing full dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

rc.wikipedia/train-00000-of-00007.parque(…):   0%|          | 0.00/240M [00:00<?, ?B/s]

rc.wikipedia/train-00001-of-00007.parque(…):   0%|          | 0.00/261M [00:00<?, ?B/s]

rc.wikipedia/train-00002-of-00007.parque(…):   0%|          | 0.00/319M [00:00<?, ?B/s]

rc.wikipedia/train-00003-of-00007.parque(…):   0%|          | 0.00/266M [00:00<?, ?B/s]

rc.wikipedia/train-00004-of-00007.parque(…):   0%|          | 0.00/240M [00:00<?, ?B/s]

rc.wikipedia/train-00005-of-00007.parque(…):   0%|          | 0.00/259M [00:00<?, ?B/s]

rc.wikipedia/train-00006-of-00007.parque(…):   0%|          | 0.00/253M [00:00<?, ?B/s]

rc.wikipedia/validation-00000-of-00001.p(…):   0%|          | 0.00/235M [00:00<?, ?B/s]

rc.wikipedia/test-00000-of-00001.parquet:   0%|          | 0.00/221M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61888 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7993 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7701 [00:00<?, ? examples/s]

Using raw_train_path = /content/drive/MyDrive/RAG-Project/raw_dataset/train.parquet


In [4]:
from pathlib import Path
from src.data_prep.split_dataset import create_train_val_split

split_train_path = Path(SPLIT_DIR, "train.parquet")
split_val_path = Path(SPLIT_DIR, "val_7900.parquet")
split_meta_path = Path(SPLIT_DIR, "splits_meta.json")

# The split counts as cached only when both data files are present;
# the metadata file is optional.
splits_cached = split_train_path.exists() and split_val_path.exists()

if not splits_cached:
    print("No existing splits found, creating new train/val split...")
    split_paths = create_train_val_split(
        raw_train_path=raw_train_path,
        output_dir=SPLIT_DIR,
        val_size=7900,
        strategy="first_n",
    )
else:
    print("Found existing splits, skipping split creation.")
    # Report the metadata path only if the file is actually there
    meta_path = split_meta_path if split_meta_path.exists() else None
    split_paths = {
        "train": split_train_path,
        "val": split_val_path,
        "meta": meta_path,
    }

print("Split paths:")
for split_name, location in split_paths.items():
    print(f"  {split_name}: {location}")

No existing splits found, creating new train/val split...
Split paths:
  train: /content/drive/MyDrive/RAG-Project/processed/splits/train.parquet
  val: /content/drive/MyDrive/RAG-Project/processed/splits/val_7900.parquet
  meta: /content/drive/MyDrive/RAG-Project/processed/splits/splits_meta.json


In [5]:
# DEBUG-only sanity check: verify the size of the validation split and show
# one sampled training row for a quick visual inspection.
if MODE == "DEBUG":
    import pandas as pd
    from pathlib import Path

    # Fixed seed so that Restart & Run All shows the same example every time;
    # set to None to draw a different row on each run.
    SAMPLE_SEED = 42

    val_path = Path(split_paths["val"])
    df_val = pd.read_parquet(val_path)

    print("Validation shape:", df_val.shape)

    # Fail loudly if the validation split does not have the requested size
    expected_val_size = 7900
    if df_val.shape[0] != expected_val_size:
        raise ValueError(
            f"Expected {expected_val_size} validation examples, "
            f"but found {df_val.shape[0]}"
        )
    print(f"Validation split has the expected size of {expected_val_size} rows.")

    train_path = Path(split_paths["train"])
    df_train = pd.read_parquet(train_path)

    print("Train shape:", df_train.shape)

    # Show one sampled training row. The row is taken positionally from the
    # sample itself, so exactly one row is displayed even if index labels
    # are not unique.
    sampled = df_train.sample(1, random_state=SAMPLE_SEED)
    rand_idx = sampled.index[0]
    print(f"\n--- Random example (index={rand_idx}) ---")
    display(sampled.iloc[0])

Validation shape: (7900, 7)
Validation split has the expected size of 7900 rows.
Train shape: (53988, 7)

--- Example 0 ---

--- Random example (index=9158) ---


Unnamed: 0,9158
orig_index,17058
question_id,bb_2520
question,The French term ligne (equating to 'line' in E...
answer_aliases_json,"[""Biggity"", ""Resized"", ""Size changing"", ""Sized..."
answer_normalized_json,"[""resize"", ""resized"", ""sized"", ""sizable"", ""siz..."
doc_titles_json,"[""Watch""]"
evidence_text,A watch is a small timepiece intended to be ca...
