In [2]:
from datasets import load_from_disk, DatasetDict
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor
import jiwer
import torchaudio
import os
import pandas as pd

In [3]:
# ==== Paths ====
# Directory the raw dataset is read from, and directory the preprocessed copy is written to.
input_path = "/scratch/pippalin2/jupyter/GMM-DistilHuBERT/data/hf_librispeech_clean100"
output_path = "/scratch/pippalin2/jupyter/GMM-DistilHuBERT/data/hf_librispeech_clean100_preprocessed"

# ==== Load raw dataset ====
ds = load_from_disk(input_path)

# ==== Load tokenizer/processor ====
# Both objects are loaded from the same pretrained checkpoint, tokenizer first.
tokenizer, processor = (
    loader.from_pretrained("facebook/wav2vec2-base-960h")
    for loader in (Wav2Vec2CTCTokenizer, Wav2Vec2Processor)
)

# ==== Text cleaning ====
# Steps run in list order: lower-case, drop punctuation, collapse repeated
# spaces, then trim leading/trailing whitespace.
# NOTE(review): RemovePunctuation presumably strips apostrophes as well
# ("don't" -> "dont") — confirm that is intended for the label text.
text_cleaning_steps = [
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
]
transform = jiwer.Compose(text_cleaning_steps)

def clean_text(batch):
    """Normalize the transcript of one example with the module-level ``transform``.

    Args:
        batch: Mapping with a ``"text"`` entry; it is modified in place.

    Returns:
        The same mapping, with ``"text"`` replaced by the transformed value.
    """
    normalized = transform(batch["text"])
    batch["text"] = normalized
    return batch

# ==== Feature extraction ====
def extract_features(batch):
    """Turn one example into model inputs and CTC label ids.

    Args:
        batch: Mapping with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"text"`` transcript.

    Returns:
        The same mapping with ``"input_values"`` and ``"labels"`` added.
    """
    audio = batch["audio"]
    features = processor(audio["array"], sampling_rate=audio["sampling_rate"])
    # Keep index 0 of the returned input_values, as one waveform was passed in.
    batch["input_values"] = features.input_values[0]
    # The facebook/wav2vec2-base-960h CTC vocabulary is upper-case and this
    # tokenizer does not change case by default, so lower-cased transcripts
    # would encode every letter as the unknown token. Upper-case here so the
    # labels are valid regardless of how the text was normalized upstream.
    batch["labels"] = tokenizer(batch["text"].upper()).input_ids
    return batch

# ==== Apply transforms ====
# Text cleaning runs first, then feature extraction. The column names are taken
# from the dataset as loaded, so all of its original columns are dropped after
# feature extraction.
ds = ds.map(clean_text).map(extract_features, remove_columns=ds.column_names)

# ==== Save preprocessed dataset ====
# The result is wrapped under a "train" key before being written to disk.
train_split = DatasetDict({"train": ds})
train_split.save_to_disk(output_path)
print(f"✅ Preprocessed dataset saved to: {output_path}")

Loading dataset from disk:   0%|          | 0/47 [00:00<?, ?it/s]

Map:   0%|          | 0/28523 [00:00<?, ? examples/s]

Map:   0%|          | 0/28523 [00:00<?, ? examples/s]

Saving the dataset (0/47 shards):   0%|          | 0/28523 [00:00<?, ? examples/s]

✅ Preprocessed dataset saved to: /scratch/pippalin2/jupyter/GMM-DistilHuBERT/data/hf_librispeech_clean100_preprocessed


In [8]:
from datasets import load_from_disk
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2Processor
import jiwer

# Directory the raw evaluation dataset is read from, and where the preprocessed copy goes.
input_path = "/scratch/pippalin2/jupyter/GMM-DistilHuBERT/data/hf_test_clean"
output_path = "/scratch/pippalin2/jupyter/GMM-DistilHuBERT/data/hf_test_clean_preprocessed"

# Tokenizer and processor are loaded from the same pretrained checkpoint.
checkpoint_name = "facebook/wav2vec2-base-960h"
tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(checkpoint_name)
processor = Wav2Vec2Processor.from_pretrained(checkpoint_name)

# Text normalization, applied in list order: lower-case, drop punctuation,
# collapse repeated spaces, trim surrounding whitespace.
# NOTE(review): RemovePunctuation presumably strips apostrophes as well —
# confirm that is intended for the label text.
normalization_steps = [
    jiwer.ToLowerCase(),
    jiwer.RemovePunctuation(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
]
transform = jiwer.Compose(normalization_steps)

def clean_text(batch):
    """Run the module-level ``transform`` over the ``"text"`` field of one example.

    Args:
        batch: Mapping with a ``"text"`` entry; it is updated in place.

    Returns:
        The same mapping, carrying the transformed text.
    """
    cleaned = transform(batch["text"])
    batch["text"] = cleaned
    return batch

def extract_features(batch):
    """Turn one example into model inputs and CTC label ids.

    Args:
        batch: Mapping with an ``"audio"`` entry (providing ``"array"`` and
            ``"sampling_rate"``) and a ``"text"`` transcript.

    Returns:
        The same mapping with ``"input_values"`` and ``"labels"`` added.
    """
    audio = batch["audio"]
    features = processor(audio["array"], sampling_rate=audio["sampling_rate"])
    # Keep index 0 of the returned input_values, as one waveform was passed in.
    batch["input_values"] = features.input_values[0]
    # The facebook/wav2vec2-base-960h CTC vocabulary is upper-case and this
    # tokenizer does not change case by default, so lower-cased transcripts
    # would encode every letter as the unknown token. Upper-case here so the
    # labels are valid regardless of how the text was normalized upstream.
    batch["labels"] = tokenizer(batch["text"].upper()).input_ids
    return batch

# ds is a DatasetDict, e.g. {"train": test-clean}
ds = load_from_disk(input_path)

# Clean the transcripts, then extract features and drop the listed source columns.
source_columns = ["path", "utt_id", "text", "audio"]
ds = ds.map(clean_text).map(extract_features, remove_columns=source_columns)

# ✅ save without wrapping again
ds.save_to_disk(output_path)

print("✅ Test-clean preprocessed and saved correctly.")

Loading dataset from disk:   0%|          | 0/80 [00:00<?, ?it/s]

Map:   0%|          | 0/49256 [00:00<?, ? examples/s]

Saving the dataset (0/80 shards):   0%|          | 0/49256 [00:00<?, ? examples/s]

✅ Test-clean preprocessed and saved correctly.


In [10]:
# Each preprocessed dataset is read back from disk and its "train" entry selected.
train_dataset = load_from_disk(
    "/scratch/pippalin2/jupyter/GMM-DistilHuBERT/data/hf_librispeech_clean100_preprocessed"
)["train"]
eval_dataset = load_from_disk(
    "/scratch/pippalin2/jupyter/GMM-DistilHuBERT/data/hf_test_clean_preprocessed"
)["train"]

Loading dataset from disk:   0%|          | 0/47 [00:00<?, ?it/s]

Loading dataset from disk:   0%|          | 0/80 [00:00<?, ?it/s]

In [2]:
import torch
# Query availability once so the device-name lookup can be guarded: asking for
# the name of device 0 raises when no CUDA device is usable.
cuda_available = torch.cuda.is_available()
print(cuda_available)  # Should return: True
if cuda_available:
    print(torch.cuda.get_device_name(0))  # Should return: 'NVIDIA GeForce RTX 2080 Ti'
else:
    print("CUDA is not available; skipping device name lookup.")


True
NVIDIA GeForce RTX 2080 Ti


In [10]:
from datasets import load_from_disk
import numpy as np

data_dir = "/scratch/pippalin2/jupyter/GMM-DistilHuBERT"
dataset_path = f"{data_dir}/data/hf_librispeech_clean100"
dataset = load_from_disk(dataset_path)

soft_labels = np.load(f"{data_dir}/script/train_classifier/soft_labels_clean100.npy")
print(f"soft_labels: {soft_labels.shape}, dataset: {len(dataset)}")

# Fail early with a readable message instead of an IndexError on shape[1] below.
if soft_labels.ndim != 2:
    raise ValueError(
        f"Expected soft_labels to be 2-D (rows, label_dim), got shape {soft_labels.shape}"
    )

# Step 1: Determine label dim
K = soft_labels.shape[1]

# Step 2: Pad if needed
# NOTE(review): this assumes soft_labels[i] belongs to dataset[i] and that any
# missing rows are the trailing ones — confirm against how the .npy file was
# produced. Padded rows are all-zero vectors, not real labels, so downstream
# code should be checked for how it treats them.
pad_count = len(dataset) - soft_labels.shape[0]
if pad_count > 0:
    print(f"Padding with {pad_count} zero-vectors of shape ({K},)")
    padding = np.zeros((pad_count, K), dtype=np.float32)
    soft_labels = np.concatenate([soft_labels, padding], axis=0)
elif pad_count < 0:
    print(f"Truncating {abs(pad_count)} entries")
    soft_labels = soft_labels[:len(dataset)]

# The new column needs exactly one label vector per dataset row; verify that
# before the (slow) list conversion and save.
assert soft_labels.shape == (len(dataset), K), (
    f"soft_labels shape {soft_labels.shape} does not match ({len(dataset)}, {K})"
)

# Step 3: Add to dataset and save
dataset = dataset.add_column("soft_labels", soft_labels.tolist())
dataset.save_to_disk(f"{dataset_path}_with_softlabels")
print("✅ Saved padded dataset.")


Loading dataset from disk:   0%|          | 0/47 [00:00<?, ?it/s]

soft_labels: (26816, 500), dataset: 28523
Padding with 1707 zero-vectors of shape (500,)


Saving the dataset (0/47 shards):   0%|          | 0/28523 [00:00<?, ? examples/s]

✅ Saved padded dataset.


In [13]:
import os

checkpoint_path = "/mnt/scratch/pippalin2/jupyter/GMM-DistilHuBERT/script/train_classifier/GMM_DHuBERT_pretrain/gmm_pretrained_distilhubert/checkpoint.pt"
log_path = "/mnt/scratch/pippalin2/jupyter/GMM-DistilHuBERT/script/train_classifier/GMM_DHuBERT_pretrain/gmm_pretrained_distilhubert/training_log.csv"

# Delete if they exist
# Each file is paired with the message printed once it has been removed;
# files that are not present are skipped silently.
stale_artifacts = (
    (checkpoint_path, "✅ Removed old checkpoint."),
    (log_path, "✅ Removed old training log."),
)
for stale_path, removed_message in stale_artifacts:
    if not os.path.exists(stale_path):
        continue
    os.remove(stale_path)
    print(removed_message)


✅ Removed old checkpoint.
✅ Removed old training log.
