# TRAIN DATA SPLIT INTO TRAIN / VALIDATION AND SAVED TO JSON FILES

## Train Folder Structure

```text
asr_train_set_30_sec/
├── audio_segments/
│   └── olomeleia-xxxx_yyy.wav
└── transcriptions/
    └── olomeleia-xxxx_yyy.txt
```

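## `train.json` / `validation.json` Format

Each entry pairs one audio segment with its transcription file. Paths are built from `BASE_DIR`, so the generated files contain absolute paths (e.g. `/data/asr_train_set_30_sec/...`); the example below omits the leading `/data/` prefix.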

```json
[
  {
    "audio": "asr_train_set_30_sec/audio_segments/olomeleia-0001_001.wav",
    "text": "asr_train_set_30_sec/transcriptions/olomeleia-0001_001.txt"
  }
]
```


In [11]:
import os
import json
from collections import defaultdict

# Root of the 30-second training segments; audio and transcripts live in
# sibling sub-folders underneath it.
BASE_DIR = "/data/asr_train_set_30_sec"
AUDIO_DIR, TEXT_DIR = (
    os.path.join(BASE_DIR, subdir)
    for subdir in ("audio_segments", "transcriptions")
)

# Output manifests, written relative to the current working directory.
TRAIN_JSON, VAL_JSON = "train.json", "validation.json"

# Fraction of each recording's segments that goes to the training split.
SPLIT_RATIO = 0.7

# Maps a recording ID to the list of segment file names that belong to it.
groups = defaultdict(list)

In [3]:
# Rebuild the grouping from scratch so that re-running this cell never
# appends the same segment twice to an already-populated `groups`.
groups.clear()

for fname in sorted(os.listdir(AUDIO_DIR)):
    if not fname.endswith(".wav"):
        continue  # ignore anything that is not an audio segment

    # File names look like "<recording-id>_<segment>.wav".  Split on the LAST
    # underscore so a recording ID that itself contains "_" stays intact;
    # a name without a usable prefix falls back to the whole stem.
    stem = fname[: -len(".wav")]
    base_id = stem.rpartition("_")[0] or stem
    groups[base_id].append(fname)

In [6]:
# Show the recording IDs discovered while grouping the audio segments.
groups.keys()

dict_keys(['olomeleia-20250530', 'olomeleia-20250611'])

In [7]:
def _segment_sort_key(fname):
    """Return a sort key that orders segment files by numeric index.

    The index is the part of the file stem after the last underscore.
    Files are ordered by the text before that underscore first, then by the
    index compared as a number; names whose index is not a plain number are
    placed after the numeric ones and ordered by file name.
    """
    stem = os.path.splitext(fname)[0]
    prefix, _, index = stem.rpartition("_")
    if index.isdecimal():
        return (prefix, 0, int(index), fname)
    return (prefix, 1, 0, fname)


train_entries = []
val_entries = []

# Sequential split per recording: the first SPLIT_RATIO share of each
# recording's segments goes to train, the remainder to validation.
for audio_files in groups.values():
    # Order by numeric segment index rather than by raw string, so that
    # "_10" does not sort before "_2" when indices are not zero-padded.
    audio_files = sorted(audio_files, key=_segment_sort_key)
    total = len(audio_files)
    split_idx = int(total * SPLIT_RATIO)  # int() rounds down

    for i, audio_file in enumerate(audio_files):
        # Swap only the extension; str.replace would also rewrite any
        # ".wav" that occurs earlier in the file name.
        txt_file = os.path.splitext(audio_file)[0] + ".txt"

        entry = {
            "audio": os.path.join(AUDIO_DIR, audio_file),
            "text": os.path.join(TEXT_DIR, txt_file),
        }

        if i < split_idx:
            train_entries.append(entry)
        else:
            val_entries.append(entry)

In [8]:
# Display the training entries to spot-check the generated audio/text paths.
train_entries

[{'audio': '/data/asr_train_set_30_sec/audio_segments/olomeleia-20250530_120.wav',
  'text': '/data/asr_train_set_30_sec/transcriptions/olomeleia-20250530_120.txt'},
 {'audio': '/data/asr_train_set_30_sec/audio_segments/olomeleia-20250530_121.wav',
  'text': '/data/asr_train_set_30_sec/transcriptions/olomeleia-20250530_121.txt'},
 {'audio': '/data/asr_train_set_30_sec/audio_segments/olomeleia-20250530_122.wav',
  'text': '/data/asr_train_set_30_sec/transcriptions/olomeleia-20250530_122.txt'},
 {'audio': '/data/asr_train_set_30_sec/audio_segments/olomeleia-20250530_123.wav',
  'text': '/data/asr_train_set_30_sec/transcriptions/olomeleia-20250530_123.txt'},
 {'audio': '/data/asr_train_set_30_sec/audio_segments/olomeleia-20250530_124.wav',
  'text': '/data/asr_train_set_30_sec/transcriptions/olomeleia-20250530_124.txt'},
 {'audio': '/data/asr_train_set_30_sec/audio_segments/olomeleia-20250530_125.wav',
  'text': '/data/asr_train_set_30_sec/transcriptions/olomeleia-20250530_125.txt'},
 {'a

In [9]:
# Persist both manifests as pretty-printed UTF-8 JSON.  All files are written
# first and the confirmations are printed afterwards.
manifests = ((TRAIN_JSON, train_entries), (VAL_JSON, val_entries))

for path, split_entries in manifests:
    with open(path, "w", encoding="utf-8") as f:
        json.dump(split_entries, f, indent=2, ensure_ascii=False)

for path, split_entries in manifests:
    print(f"✅ Wrote {len(split_entries)} entries to {path}")


✅ Wrote 202 entries to train.json
✅ Wrote 89 entries to validation.json


# TEST DATA SAVED TO JSON FILES

## Test Folder Structure

```text
asr_test_set_30_sec/
├── olomeleia-0001/
│   ├── audio_segments/
│   │   └── olomeleia-0001_001.wav
│   └── transcriptions/
│       └── olomeleia-0001_001.txt
├── olomeleia-0002/
│   ├── audio_segments/
│   └── transcriptions/
└── ...
```

## `test.json` Format (Test Set)

```json
[
  {
    "audio": "asr_test_set_30_sec/olomeleia-0001/audio_segments/olomeleia-0001_001.wav",
    "text": "asr_test_set_30_sec/olomeleia-0001/transcriptions/olomeleia-0001_001.txt"
  }
]
```


In [12]:
import os
import json

# NOTE: this rebinds BASE_DIR, which the training cells set to the train root.
BASE_DIR = "/data/asr_test_set_30_sec"
TEST_JSON = "test.json"

entries = []

# Each sub-folder of BASE_DIR is one recording ("olomeleia-xxxx") holding its
# own audio_segments/ and transcriptions/ directories.
for recording in sorted(os.listdir(BASE_DIR)):
    rec_path = os.path.join(BASE_DIR, recording)
    audio_dir = os.path.join(rec_path, "audio_segments")
    text_dir = os.path.join(rec_path, "transcriptions")

    # Skip stray files and recordings that lack either sub-folder.  When
    # rec_path is not a directory, neither sub-folder check can succeed.
    if not (os.path.isdir(audio_dir) and os.path.isdir(text_dir)):
        continue

    for fname in sorted(os.listdir(audio_dir)):
        if not fname.endswith(".wav"):
            continue

        # Swap only the extension; str.replace would also rewrite any
        # ".wav" that occurs earlier in the file name.
        txt_file = os.path.splitext(fname)[0] + ".txt"

        entries.append(
            {
                "audio": os.path.join(audio_dir, fname),
                "text": os.path.join(text_dir, txt_file),
            }
        )


In [16]:
# Serialise the collected test entries to TEST_JSON as indented UTF-8 JSON,
# then report how many entries were written.
with open(TEST_JSON, "w", encoding="utf-8") as manifest:
    json.dump(entries, manifest, ensure_ascii=False, indent=2)

n_written = len(entries)
print(f"✅ Wrote {n_written} entries to {TEST_JSON}")

✅ Wrote 360 entries to test.json
