## 1 Data Subsampling

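This notebook builds a small Sursilvan dataset from the original corpus: it draws roughly 4 hours of audio from each of the `train` and `validated` splits, copies the `test` split in full, and copies only the audio clips those TSVs reference. The resulting subsample will be used to train the first prototype.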

In [6]:
import os
import pandas as pd
import shutil
import soundfile as sf
from tqdm import tqdm
import random

In [9]:
# --- Input layout: <DATA_ROOT>/<IDIOM_FOLDER>/<split>.tsv plus a clips/ folder ---
DATA_ROOT = "romansh-data"
IDIOM_FOLDER = "rmsursilv-cc-2021-05-28"
BASE_PATH = os.path.join(DATA_ROOT, IDIOM_FOLDER)
CLIPS_PATH = os.path.join(BASE_PATH, "clips")

# --- Output: the reduced dataset is written next to the original one ---
OUTPUT_FOLDER = os.path.join(DATA_ROOT, "sursilvan-small")

# --- Sampling parameters ---
# Audio budget per split, in hours; the keys double as TSV base names.
TARGET_HOURS = dict(train=4.0, validated=4.0)
# Seed for the shuffle that precedes subsampling.
RANDOM_SEED = 42

def get_audio_duration(path):
  """Compute the length of an audio file in seconds.

  The file is opened with ``soundfile`` and the duration is the frame count
  divided by the sample rate.

  Args:
    path: Location of the audio file.

  Returns:
    Duration in seconds, or ``0.0`` when the file cannot be read. Failures
    are reported on stdout instead of being raised, so callers can treat a
    zero duration as "skip this clip".
  """
  try:
    with sf.SoundFile(path) as audio:
      n_frames, rate = len(audio), audio.samplerate
      seconds = n_frames / rate
  except Exception as err:
    # Best-effort: one unreadable clip must not abort a whole subsampling run.
    print(f"‚ö†Ô∏è Could not read {path}: {err}")
    seconds = 0.0
  return seconds


def subsample_split(df, split_name, target_hours):
  """Return a shuffled subsample of ``df`` totaling ~target_hours of audio.

  The rows are shuffled with ``RANDOM_SEED`` and then taken in order. Clips
  whose duration cannot be determined (reported as 0) are skipped, and
  selection stops at the first clip that would push the running total past
  the budget, so the result never exceeds ``target_hours``.

  Args:
    df: Split metadata; must have a ``path`` column holding clip file names
      relative to ``CLIPS_PATH``.
    split_name: Label used in the progress bar and the summary line.
    target_hours: Audio budget in hours.

  Returns:
    A DataFrame with the selected rows. It always keeps the columns and
    dtypes of ``df``, including when no row is selected.
  """
  shuffled = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)
  budget_seconds = target_hours * 3600

  # Record positions instead of row Series: slicing the frame once at the end
  # preserves columns/dtypes (a DataFrame built from an empty list of rows has
  # no columns at all, which breaks the later df["path"] lookup and TSV header).
  selected_positions = []
  total_seconds = 0.0

  # Only the "path" column is needed while scanning, so iterate over it alone.
  progress = tqdm(enumerate(shuffled["path"]), total=len(shuffled), desc=f"Subsampling {split_name}")
  for position, rel_path in progress:
    duration = get_audio_duration(os.path.join(CLIPS_PATH, rel_path))
    if duration == 0:
      # Unreadable clip: get_audio_duration already reported it.
      continue
    if total_seconds + duration > budget_seconds:
      break
    selected_positions.append(position)
    total_seconds += duration

  # After reset_index the positions equal the index labels, so the resulting
  # index matches what building the frame from the row Series produced.
  sub_df = shuffled.iloc[selected_positions]
  print(f"‚úÖ {split_name}: {len(sub_df)} utterances, {total_seconds/3600:.2f} hours")
  return sub_df


def copy_required_clips(df_list, output_clips_path):
  """Copy only the audio files referenced by the given DataFrames.

  Clip names are read from each frame's ``path`` column, de-duplicated across
  frames, and copied from ``CLIPS_PATH`` into ``output_clips_path``, keeping
  any relative sub-directory structure.

  Args:
    df_list: Iterable of DataFrames with a ``path`` column. Empty frames are
      ignored, even if they carry no columns.
    output_clips_path: Destination directory; created if it does not exist.

  Raises:
    KeyError: If a non-empty frame has no ``path`` column.
    OSError: If a referenced clip cannot be copied (e.g. it is missing).
  """
  all_paths = set()
  for df in df_list:
    # An empty frame references no clips; it may also lack the "path" column
    # entirely (e.g. a DataFrame built from an empty list of rows).
    if df.empty:
      continue
    all_paths.update(df["path"].tolist())

  os.makedirs(output_clips_path, exist_ok=True)

  # Sorted so the copy order is deterministic from run to run.
  for rel_path in tqdm(sorted(all_paths), desc="Copying clips"):
    src_path = os.path.join(CLIPS_PATH, rel_path)
    dst_path = os.path.join(output_clips_path, rel_path)
    # rel_path may contain sub-directories; make sure the parent exists.
    os.makedirs(os.path.dirname(dst_path), exist_ok=True)
    shutil.copy2(src_path, dst_path)

# --- Driver: subsample the configured splits, pass the test split through,
# --- then copy every clip the written TSVs reference.
random.seed(RANDOM_SEED)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

dfs_to_copy = []
# Names of the TSVs actually written, so the final summary reports only those.
written_splits = []

for split_name, hours in TARGET_HOURS.items():
  tsv_path = os.path.join(BASE_PATH, f"{split_name}.tsv")
  if not os.path.isfile(tsv_path):
    print(f"‚ùå Missing {split_name}.tsv")
    continue

  df = pd.read_csv(tsv_path, sep="\t")
  sub_df = subsample_split(df, split_name, hours)
  output_tsv = os.path.join(OUTPUT_FOLDER, f"{split_name}.tsv")
  sub_df.to_csv(output_tsv, sep="\t", index=False)
  dfs_to_copy.append(sub_df)
  written_splits.append(split_name)

# The test split is not subsampled: its TSV is copied as-is with all its clips.
test_tsv = os.path.join(BASE_PATH, "test.tsv")
if os.path.isfile(test_tsv):
  df_test = pd.read_csv(test_tsv, sep="\t")
  output_test_tsv = os.path.join(OUTPUT_FOLDER, "test.tsv")
  df_test.to_csv(output_test_tsv, sep="\t", index=False)
  dfs_to_copy.append(df_test)
  written_splits.append("test")
  print(f"‚úÖ test set: {len(df_test)} utterances")

# NOTE(review): the recorded run copied 1504 unique clips for
# 754 + 750 + 94 = 1598 TSV rows, so the written TSVs share some clips.
# Confirm this overlap is acceptable for how the splits are used downstream.
output_clips_path = os.path.join(OUTPUT_FOLDER, "clips")
copy_required_clips(dfs_to_copy, output_clips_path)

print(f"\nüéâ Mini Sursilvan folder ready at '{OUTPUT_FOLDER}'")
print("Contains:")
print(f" - {len(os.listdir(output_clips_path))} audio files (referenced in TSVs)")
# Report only what was really written; a missing input TSV is skipped above.
print(f" - {', '.join(written_splits) or 'no'} TSVs")

Subsampling train:  11%|‚ñà         | 754/6888 [00:00<00:00, 8336.33it/s]


‚úÖ train: 754 utterances, 4.00 hours


Subsampling validated:   0%|          | 0/6982 [00:00<?, ?it/s]

Subsampling validated:  11%|‚ñà         | 750/6982 [00:00<00:00, 8504.13it/s]


‚úÖ validated: 750 utterances, 4.00 hours
‚úÖ test set: 94 utterances


Copying clips: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1504/1504 [00:00<00:00, 1857.78it/s]


üéâ Mini Sursilvan folder ready at 'romansh-data/sursilvan-small'
Contains:
 - 1504 audio files (referenced in TSVs)
 - train, validated, test TSVs



