In [1]:
!pip install yt-dlp pydub datasets soundfile
!pip install openai-whisper

Collecting yt-dlp
  Downloading yt_dlp-2025.8.27-py3-none-any.whl.metadata (175 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.0/176.0 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Downloading yt_dlp-2025.8.27-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp
Successfully installed yt-dlp-2025.8.27
Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai

In [2]:
import os, json
import yt_dlp
import whisper
import nltk
import torchaudio
from datasets import Dataset, Audio
from huggingface_hub import HfApi, HfFolder, Repository

nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
import os
import yt_dlp
from google.colab import files

# Create folder for audio
os.makedirs("audio_files", exist_ok=True)

choice = input("Enter '1' to upload a file manually OR '2' to provide a YouTube link: ")

audio_files = []

if choice.strip() == "1":
    print("📤 Please upload your audio file (mp3, wav, m4a supported)")
    uploaded = files.upload()
    for filename in uploaded.keys():
        dst_path = os.path.join("audio_files", filename)
        os.rename(filename, dst_path)
        audio_files.append(dst_path)

elif choice.strip() == "2":
    url = input("🔗 Enter YouTube link: ").strip()

    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": "audio_files/%(id)s.%(ext)s",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "192",
            }
        ],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        filename = ydl.prepare_filename(info)
        audio_path = os.path.splitext(filename)[0] + ".wav"
        audio_files.append(audio_path)

else:
    print("❌ Invalid choice. Please restart and select 1 or 2.")

print("✅ Audio files ready:", audio_files)


Enter '1' to upload a file manually OR '2' to provide a YouTube link: 2
🔗 Enter YouTube link: https://youtu.be/1DOqousyDjc?si=9i3wN1SAdg9ozM7Q
[youtube] Extracting URL: https://youtu.be/1DOqousyDjc?si=9i3wN1SAdg9ozM7Q
[youtube] 1DOqousyDjc: Downloading webpage
[youtube] 1DOqousyDjc: Downloading tv simply player API JSON
[youtube] 1DOqousyDjc: Downloading tv client config
[youtube] 1DOqousyDjc: Downloading player 6742b2b9-main
[youtube] 1DOqousyDjc: Downloading tv player API JSON
[info] 1DOqousyDjc: Downloading 1 format(s): 251
[download] Destination: audio_files/1DOqousyDjc.webm
[download] 100% of    4.58MiB in 00:00:00 at 25.37MiB/s  
[ExtractAudio] Destination: audio_files/1DOqousyDjc.wav
Deleting original file audio_files/1DOqousyDjc.webm (pass -k to keep)
✅ Audio files ready: ['audio_files/1DOqousyDjc.wav']


In [10]:
files = [f for f in os.listdir("audio_files") if f.endswith(".wav")]
print("Audio files found:", files)

Audio files found: ['1DOqousyDjc.wav']


In [11]:
model = whisper.load_model("base")

all_transcriptions = []

for f in os.listdir("audio_files"):
    if f.endswith((".wav",".mp3")):
        result = model.transcribe(f"audio_files/{f}")
        all_transcriptions.append({
            "file": f,
            "text": result["text"],
            "segments": result["segments"]
        })

with open("transcriptions.json", "w") as f:
    json.dump(all_transcriptions, f, indent=2)



In [12]:
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")   # NEW

sentences = []
for entry in all_transcriptions:
    for seg in entry["segments"]:
        for s in nltk.sent_tokenize(seg["text"]):
            sentences.append({"file": entry["file"], "sentence": s})

with open("sentences.json", "w") as f:
    json.dump(sentences, f, indent=2)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [14]:
import math

clip_dataset = []
clip_text = ""
clip_time = 0
clip_id = 0

max_duration = 30  # seconds

for entry in all_transcriptions:
    waveform, sr = torchaudio.load(f"raw_audio/{entry['file']}")

    for seg in entry["segments"]:
        seg_duration = seg["end"] - seg["start"]
        if clip_time + seg_duration <= max_duration:
            clip_text += " " + seg["text"]
            clip_time += seg_duration
        else:
            clip_dataset.append({
                "id": f"clip_{clip_id}",
                "text": clip_text.strip()
            })
            clip_id += 1
            clip_text = seg["text"]
            clip_time = seg_duration

# save last
if clip_text:
    clip_dataset.append({"id": f"clip_{clip_id}", "text": clip_text.strip()})

with open("clips.json", "w") as f:
    json.dump(clip_dataset, f, indent=2)


RuntimeError: Failed to open the input "raw_audio/1DOqousyDjc.wav" (No such file or directory).

In [15]:
import os, torchaudio

os.makedirs("clips_audio", exist_ok=True)
new_dataset = []

clip_id = 0
for entry in all_transcriptions:
    wav, sr = torchaudio.load(f"raw_audio/{entry['file']}")

    clip_text = ""
    clip_start = 0.0
    clip_end = 0.0

    for seg in entry["segments"]:
        seg_dur = seg["end"] - seg["start"]
        if (clip_end - clip_start) + seg_dur <= 30:
            # accumulate text
            clip_text += " " + seg["text"]
            clip_end = seg["end"]
        else:
            # save clip audio
            filename = f"clips_audio/clip_{clip_id}.wav"
            start_frame = int(clip_start * sr)
            end_frame = int(clip_end * sr)
            torchaudio.save(filename, wav[:, start_frame:end_frame], sr)

            new_dataset.append({"path": filename, "text": clip_text.strip()})

            clip_id += 1
            # reset for next
            clip_text = seg["text"]
            clip_start = seg["start"]
            clip_end = seg["end"]

# save last clip
if clip_text:
    filename = f"clips_audio/clip_{clip_id}.wav"
    start_frame = int(clip_start * sr)
    end_frame = int(clip_end * sr)
    torchaudio.save(filename, wav[:, start_frame:end_frame], sr)
    new_dataset.append({"path": filename, "text": clip_text.strip()})

# Build HF dataset with audio + text
from datasets import Dataset, Audio

ds = Dataset.from_list(new_dataset)
ds = ds.cast_column("path", Audio())
ds.save_to_disk("processed_dataset")


RuntimeError: Failed to open the input "raw_audio/1DOqousyDjc.wav" (No such file or directory).

In [None]:
from huggingface_hub import login
from datasets import load_from_disk

# 🔑 Login (paste your HF token when prompted)
login()

# 📂 Load the dataset from disk
ds = load_from_disk("processed_dataset")

# 🆕 Change this to your own repo name
repo_id = "nty23/fine"

# Push to Hub
ds.push_to_hub(repo_id)


In [None]:
from datasets import Dataset, Audio

files = [os.path.join("audio_files", f) for f in os.listdir("audio_files") if f.endswith(".wav")]

if len(files) == 0:
    raise ValueError("❌ No audio files found. Please upload or download again.")

dataset = Dataset.from_dict({"audio": files})
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

print(dataset)


In [None]:
# Cell 9 — flatten processed_dataset to plain strings (path, text)
from datasets import load_from_disk, Dataset
import os

# Load the saved dataset
ds_raw = load_from_disk("processed_dataset")

# SAFELY extract filepaths from the Arrow struct without decoding audio
# (We avoid .map() so nothing tries to auto-decode)
paths = []
texts = []
table = ds_raw.data  # this is a pyarrow.Table

# The "path" column is a struct<bytes: binary, path: string>. We want its "path" field only.
for chunk in table.column("path").chunks:
    # chunk is a pyarrow.StructArray
    # get the "path" field (string) and extend to python list
    paths.extend(chunk.field("path").to_pylist())

# text column is plain string; gather all rows
for chunk in table.column("text").chunks:
    texts.extend(chunk.to_pylist())

assert len(paths) == len(texts), "Mismatch between paths and texts length."

# Build a clean list of dicts (no Audio feature anywhere)
clean_list = [{"path": p, "text": t} for p, t in zip(paths, texts)]

# Rebuild a fresh HF dataset with ONLY strings
ds = Dataset.from_list(clean_list)

print(ds)
print("Example row:", ds[0])
print("✅ Dataset rebuilt with plain strings only.")


In [None]:
from datasets import Dataset, Audio

# Collect downloaded wav files
files = [os.path.join("audio_files", f) for f in os.listdir("audio_files") if f.endswith(".wav")]

dataset = Dataset.from_dict({"audio": files})
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

print(dataset)


In [None]:
!pip install transformers accelerate librosa

In [None]:
!pip install git+https://github.com/m-bain/whisperx.git


In [None]:
from transformers import pipeline

# Load Whisper pipeline with word timestamps
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-small",  # small for speed, can change to base/medium/large
    generate_kwargs={"task": "transcribe", "return_timestamps": "word"}
)

In [None]:
import whisperx
import torch
from datasets import load_from_disk

# Load cleaned dataset
cleaned_ds = load_from_disk("cleaned_dataset")

# Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisperx.load_model("small", device)

# Alignment model for word-level timestamps
align_model, metadata = whisperx.load_align_model(language_code="en", device=device)

def transcribe_with_timestamps(batch):
    try:
        import numpy as np
        audio_array = np.array(batch["array"], dtype=np.float32)

        # Run ASR
        result = model.transcribe(audio_array, batch_size=8)

        # Align to word-level
        result_aligned = whisperx.align(result["segments"], align_model, metadata, audio_array, device)

        batch["transcription"] = result["text"]
        batch["segments"] = result_aligned["segments"]   # sentence-level with start/end
        batch["words"] = result_aligned["word_segments"] # word-level with start/end
    except Exception as e:
        batch["transcription"] = f"❌ Error: {e}"
        batch["segments"] = []
        batch["words"] = []
    return batch

# Map over dataset
transcribed_ds = cleaned_ds.map(transcribe_with_timestamps)

# Save
transcribed_ds.save_to_disk("transcribed_dataset")
print("📦 Saved transcribed_dataset with word-level timestamps.")


In [None]:
import librosa
import numpy as np

transcriptions = []

for i in range(len(dataset)):
    # Get audio path from dataset
    audio_path = dataset[i]["audio"]["path"]

    # Load audio with librosa (16kHz mono)
    audio_data, sr = librosa.load(audio_path, sr=16000, mono=True)

    # Split into 30s chunks
    chunk_size = sr * 30  # 30 seconds in samples
    for start in range(0, len(audio_data), chunk_size):
        chunk = audio_data[start:start + chunk_size]

        # Transcribe each chunk (timestamps disabled)
        result = pipe({"array": chunk, "sampling_rate": sr}, return_timestamps=False)
        transcriptions.append(result["text"])

print("✅ Transcription completed. Number of chunks processed:", len(transcriptions))


In [None]:
import json

with open("transcriptions.json", "w") as f:
    json.dump(transcriptions, f, indent=2)

print("📄 Transcriptions saved to transcriptions.json")


In [None]:
import re

def split_by_punctuation(words, punctuations=".?!"):
    segments = []
    current = []
    for w in words:
        current.append(w)
        if any(w["word"].endswith(p) for p in punctuations):
            segments.append(current)
            current = []
    if current:
        segments.append(current)
    return segments

# Example: split first transcription into sentences
first_words = transcriptions[0]["chunks"]  # list of {word, timestamp}
segments = split_by_punctuation(first_words)

for idx, seg in enumerate(segments[:5]):
    text = " ".join([w["text"] for w in seg])
    start = seg[0]["timestamp"][0]
    end = seg[-1]["timestamp"][1]
    print(f"Segment {idx+1}: {text} ({start:.2f}s → {end:.2f}s)")


In [None]:
# Join all chunk transcriptions into one text
full_transcript = " ".join(transcriptions)

print("✅ Combined transcript length:", len(full_transcript))
print("🔹 Preview:\n", full_transcript[:500])  # show first 500 chars


In [None]:
!pip install nltk

import nltk
nltk.download("punkt")


In [None]:
from nltk.tokenize import sent_tokenize

# Split transcript into sentences (NLTK handles punctuation intelligently)
sentences = sent_tokenize(full_transcript)

print("✅ Transcript split into", len(sentences), "sentences")
print("🔹 First 5 sentences:\n", sentences[:5])


In [None]:
from nltk.tokenize import word_tokenize

word_chunks = [word_tokenize(sentence) for sentence in sentences]

print("✅ Example word tokens from first sentence:\n", word_chunks[0])


In [None]:
!pip install unsloth datasets transformers accelerate bitsandbytes



In [None]:
from datasets import load_dataset, Dataset, Audio
from unsloth import FastLanguageModel
import torch


In [None]:
# Suppose you have audio file paths + corresponding transcripts
# Example: audio_files = ["audio1.wav", "audio2.wav", ...]
#          transcripts = ["Hello world", "This is a test", ...]

dataset = Dataset.from_dict({
    "audio": audio_files,   # list of file paths
    "text": transcripts,    # aligned text
})

# Convert audio column into Hugging Face Audio type
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

print(dataset[0])


In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "openai/whisper-small",  # can also try "tiny" or "base"
    max_seq_length = 512,
    dtype = torch.float16,
    load_in_4bit = True,
)


In [None]:
def prepare_batch(batch):
    audio = batch["audio"]["array"]
    inputs = tokenizer(batch["text"], return_tensors="pt", truncation=True)
    batch["input_ids"] = inputs["input_ids"][0]
    batch["attention_mask"] = inputs["attention_mask"][0]
    return batch

dataset = dataset.map(prepare_batch)


In [None]:
from unsloth import Trainer

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset.select(range(2)),  # just 2 samples for quick sanity check
    tokenizer=tokenizer,
    max_steps=100,    # increase for real training
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    save_steps=50,
)

trainer.train()


In [None]:
model.save_pretrained("finetuned-whisper-unsloth")
tokenizer.save_pretrained("finetuned-whisper-unsloth")


In [None]:
!pip install unsloth datasets transformers accelerate bitsandbytes peft


In [None]:
from datasets import Dataset, Audio
from unsloth import FastLanguageModel
from peft import LoraConfig
import torch


In [None]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "openai/whisper-small",  # try "tiny" or "base" if low VRAM
    max_seq_length = 512,
    dtype = torch.float16,
    load_in_4bit = True,
)


In [None]:
# Configure LoRA
lora_config = LoraConfig(
    r=16,                # rank
    lora_alpha=32,       # scaling
    target_modules=["q_proj","v_proj"],  # attention projection layers
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM",  # Whisper is seq2seq
)

model = FastLanguageModel.get_peft_model(model, lora_config)


In [None]:
dataset = Dataset.from_dict({
    "audio": audio_files,
    "text": transcripts,
})
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))

def prepare_batch(batch):
    inputs = tokenizer(batch["text"], return_tensors="pt", truncation=True)
    batch["input_ids"] = inputs["input_ids"][0]
    batch["attention_mask"] = inputs["attention_mask"][0]
    return batch

dataset = dataset.map(prepare_batch)


In [None]:
from unsloth import Trainer

trainer = Trainer(
    model=model,
    train_dataset=dataset,
    eval_dataset=dataset.select(range(2)),
    tokenizer=tokenizer,
    max_steps=200,    # increase for real training
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    save_steps=50,
)

trainer.train()


In [None]:
model.save_pretrained("whisper-lora-finetuned")
tokenizer.save_pretrained("whisper-lora-finetuned")


In [None]:
from peft import PeftModel

# Reload base + LoRA
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "openai/whisper-small",
    max_seq_length = 512,
    dtype = torch.float16,
    load_in_4bit = True,
)

model = PeftModel.from_pretrained(base_model, "whisper-lora-finetuned")

# Test transcription
inputs = tokenizer("dummy input", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_length=100)
print(tokenizer.decode(outputs[0]))


In [None]:
from datasets import DatasetDict

# Shuffle before splitting (important!)
dataset = dataset.shuffle(seed=42)

# 90% train, 10% eval
split_dataset = dataset.train_test_split(test_size=0.1)

train_dataset = split_dataset["train"]
eval_dataset  = split_dataset["test"]

print(train_dataset)
print(eval_dataset)


In [None]:
from unsloth import Trainer

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    max_steps=200,   # increase for real runs
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    eval_steps=50,   # evaluate regularly
    save_steps=50,
)


In [None]:
import evaluate
wer = evaluate.load("wer")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return {"wer": wer.compute(predictions=pred_str, references=label_str)}

trainer.compute_metrics = compute_metrics


In [None]:
!pip install sesame-ai


In [None]:
from sesame import SesameTrainer


In [None]:
sesame_trainer = SesameTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    max_steps=200,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    logging_steps=10,
    eval_steps=50,
    save_steps=50,
    save_strategy="steps",
    evaluation_strategy="steps",
)


In [None]:
sesame_trainer.train()


In [None]:
sesame_trainer.save_model("whisper-lora-sesame")
tokenizer.save_pretrained("whisper-lora-sesame")


In [None]:
# Base model transcription
base_text = base_pipe(audio_path)["text"]

# Fine-tuned model transcription
fine_text = lora_pipe(audio_path)["text"]

print("Ground Truth:", ground_truth_text)
print("Base Whisper:", base_text)
print("Fine-tuned Whisper:", fine_text)


In [None]:
from transformers import AutoProcessor, AutoModelForTextToWaveform
import soundfile as sf
import torch

# Load model + processor
processor = AutoProcessor.from_pretrained("coqui/XTTS-v2")
model = AutoModelForTextToWaveform.from_pretrained("coqui/XTTS-v2").to("cuda")

# Use original speaker audio for cloning
with open(audio_path, "rb") as f:
    speaker_ref = f.read()

def synthesize_tts(text, ref_audio, filename):
    inputs = processor(text=text, speaker_prompt=ref_audio, sampling_rate=22050, return_tensors="pt")
    with torch.no_grad():
        audio = model(**inputs).waveform
    sf.write(filename, audio.cpu().numpy().squeeze(), 22050)

# Generate
synthesize_tts(base_text, speaker_ref, "base_tts.wav")
synthesize_tts(fine_text, speaker_ref, "fine_tts.wav")


In [None]:
from speechbrain.pretrained import EncoderClassifier
import torchaudio

classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb"
)

def get_embedding(path):
    signal, fs = torchaudio.load(path)
    embedding = classifier.encode_batch(signal)
    return embedding.mean(dim=1)

# Compute embeddings
emb_real = get_embedding("real_voice.wav")
emb_base = get_embedding("base_tts.wav")
emb_fine = get_embedding("fine_tts.wav")

# Cosine similarities
import torch
sim_base = torch.cosine_similarity(emb_real, emb_base).item()
sim_fine = torch.cosine_similarity(emb_real, emb_fine).item()

print(f"Similarity (Real vs Base TTS): {sim_base:.3f}")
print(f"Similarity (Real vs Fine-tuned TTS): {sim_fine:.3f}")


In [None]:
!pip install git+https://github.com/huggingface/transformers
!pip install soundfile torchaudio speechbrain accelerate
!pip install git+https://github.com/coqui-ai/TTS


In [None]:
import torch
import torchaudio
import soundfile as sf
from transformers import pipeline

# Paths
audio_path = "real_voice.wav"  # replace with your uploaded/recorded file

# Load Whisper base
base_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=0)

# Load fine-tuned (LoRA + Sesame) Whisper
# Assuming you've saved it after training with Unsloth
fine_pipe = pipeline("automatic-speech-recognition", model="./finetuned_whisper", device=0)


In [None]:
# Base transcription
base_text = base_pipe(audio_path)["text"]

# Fine-tuned transcription
fine_text = fine_pipe(audio_path)["text"]

print("Base Whisper:", base_text)
print("Fine-tuned Whisper:", fine_text)


In [None]:
from TTS.api import TTS

# Load XTTS model (multilingual + voice cloning)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to("cuda")

# Clone voice and synthesize
tts.tts_to_file(text=base_text, speaker_wav=audio_path, file_path="base_clone.wav")
tts.tts_to_file(text=fine_text, speaker_wav=audio_path, file_path="fine_clone.wav")


In [None]:
from speechbrain.pretrained import EncoderClassifier

# Load speaker embedding model
classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-ecapa-voxceleb")

def get_embedding(path):
    signal, fs = torchaudio.load(path)
    embedding = classifier.encode_batch(signal)
    return embedding.mean(dim=1)

# Get embeddings
emb_real = get_embedding(audio_path)
emb_base = get_embedding("base_clone.wav")
emb_fine = get_embedding("fine_clone.wav")

# Cosine similarity
sim_base = torch.cosine_similarity(emb_real, emb_base).item()
sim_fine = torch.cosine_similarity(emb_real, emb_fine).item()

print(f"Similarity (Real vs Base Clone): {sim_base:.3f}")
print(f"Similarity (Real vs Fine Clone): {sim_fine:.3f}")


In [None]:
import IPython.display as ipd

print("🔊 Real Voice:")
display(ipd.Audio(audio_path))

print("🔊 Base Whisper Clone:")
display(ipd.Audio("base_clone.wav"))

print("🔊 Fine-tuned Whisper Clone:")
display(ipd.Audio("fine_clone.wav"))
