In [1]:
!pip install openai-whisper
!pip install yt_dlp

Collecting openai-whisper
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/803.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m573.4/803.2 kB[0m [31m18.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=d32c4e498f0741037756e72daf67dc500a18d6101427d7697f6a86bc85bed062
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d1

In [3]:
import os ,json,subprocess,uuid ,torch,whisper,glob,yt_dlp,io
from pathlib import Path
from IPython.display import Audio
from itertools import chain
import pandas as pd
import numpy as np
from pydub import AudioSegment

In [4]:
Audio_dir = Path("/content/audio")
Audio_dir.mkdir(exist_ok=True)
url = ['https://www.youtube.com/watch?v=mKBbP4T5fbk&t=142s&ab_channel=SpeakEnglishWithTiffani']
ydl_opts = {
    'format': 'bestaudio/best',
    'extractaudio': True,
    'audioformat': 'm4a',
     'outtmpl': os.path.join(Audio_dir, '%(title)s.%(ext)s')
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    ydl.download(url)

[youtube] Extracting URL: https://www.youtube.com/watch?v=mKBbP4T5fbk&t=142s&ab_channel=SpeakEnglishWithTiffani
[youtube] mKBbP4T5fbk: Downloading webpage
[youtube] mKBbP4T5fbk: Downloading tv simply player API JSON
[youtube] mKBbP4T5fbk: Downloading tv client config
[youtube] mKBbP4T5fbk: Downloading player 6742b2b9-main
[youtube] mKBbP4T5fbk: Downloading tv player API JSON
[info] mKBbP4T5fbk: Downloading 1 format(s): 251
[download] Destination: /content/audio/ENGLISH MASTERCLASS ｜ THINK & SPEAK ENGLISH [FULL LESSON].webm
[download] 100% of   55.56MiB in 00:00:04 at 13.65MiB/s  


In [5]:
def get_duration_sec(path:Path)->float:
  # return seconds using ffprob
  pr  = subprocess.run(['ffprobe','-v','error','-show_entries','format=duration','-of','json',str(path)],capture_output=True,text=True)
  try :
    return float(json.loads(pr.stdout)["format"]["duration"])
  except (json.JSONDecodeError,ValueError,KeyError) as e:
    print(f"Error getting duration for {path}:{e}")
    return 0.0

audio_paths = sorted(chain(
    Path('/content/audio').glob('*.m4a'),
    Path('/content/audio').glob('*.mp3'),
    Path('/content/audio').glob('*.wav'),
    Path('/content/audio').glob('*.webm'),
    ))
print(f" Found {len(audio_paths)} audio files: ")

total_duration = 0
for files in audio_paths:
  dur_sec = get_duration_sec(files)
  total_duration += dur_sec
  print(f" {files.name:<40} : {dur_sec/60:6.2f} min")

print(f"Total duration: {total_duration/60:6.2f} min")

 Found 1 audio files: 
 ENGLISH MASTERCLASS ｜ THINK & SPEAK ENGLISH [FULL LESSON].webm :  71.80 min
Total duration:  71.80 min


In [6]:
clean_Audio_dir = Path('/content/clean_audio')
clean_Audio_dir.mkdir(exist_ok=True)

def clean_audio(src:Path)->Path:
  #FFmpeg filters to clean and normalise audio
  out = clean_Audio_dir/f'{src.stem}_clean.wav'
  filters = (
      "highpass=f=90,"                # this is use to remove low frequency noise below 90hz
      "afftdn,"                       # this is FFT based denoiser to reduce broadband noise
      "loudnorm=I=-16:LRA=11:TP=-1.5," # Apply loudness normalization
      "dynaudnorm=f=200,"             #dynamic audio normalization to adjust volume based on given frame size
      "apad=pad_dur=0.1"             # add 100ms of silence padding at the end of the video
  )
  subprocess.run(["ffmpeg","-y","-i",str(src),
                  "-af",filters,"-ar","32000","-ac","1",str(out)],check=True)#sample rate 40000 and output as mono channel
  return out

In [7]:
total_cleaned= []
total_duration_cleaned = 0
for files in audio_paths:
  clean_file = Path(clean_Audio_dir/f'{files.stem}_clean.wav')
  if clean_file.exists():
    print(f"Skipping {clean_file.name} file already Exist ")
    total_duration_cleaned += get_duration_sec(clean_file)
    total_cleaned.append(clean_file)
  else:
    print(f'Cleaning {files.name} - ',end="")
    total_duration_cleaned += get_duration_sec(files)
    total_cleaned.append(clean_audio(files))
    print("Done ",)

print(f"Total duration: {total_duration/60:6.2f} min")

Cleaning ENGLISH MASTERCLASS ｜ THINK & SPEAK ENGLISH [FULL LESSON].webm - Done 
Total duration:  71.80 min


In [8]:
MODEL_SIZE = "large"  #@param ["tiny", "base", "small", "medium", "large"]
DEVICE = 'cuda'        #"cuda" tells PyTorch to use an NVIDIA GPU if available.
                      #"cpu" means run on the CPU (slower).
model = whisper.load_model(MODEL_SIZE,device=DEVICE)
Trancripts_dir = Path('/content/trancripts')
Trancripts_dir.mkdir(exist_ok=True)

for audio_path in total_cleaned:
  out_json = Trancripts_dir/f'{audio_path.stem}.json'

  if out_json.exists():
    print(f"Skipping {out_json.name} file already Exist ")
    continue

  print(f"Transcribing - {out_json.stem}..")

  result = model.transcribe(str(audio_path),word_timestamps=True,
                            fp16=(DEVICE=='cuda'),verbose=True)
  with out_json.open('w') as f:
    json.dump(result,f,indent=2)

  print(f"Saved {out_json.name} in Transcripts Directory ")
print("All transcried are Done")

100%|█████████████████████████████████████| 2.88G/2.88G [02:12<00:00, 23.3MiB/s]


Transcribing - ENGLISH MASTERCLASS ｜ THINK & SPEAK ENGLISH [FULL LESSON]_clean..
Detecting language using up to the first 30 seconds. Use `--language` to specify the language
Detected language: English
[00:00.000 --> 00:04.480]  Your goal is to speak English fluently, like a native English speaker.
[00:04.640 --> 00:10.360]  In order to speak English fluently, you must think like a native English speaker.
[00:11.240 --> 00:14.900]  And today I am going to help you think and speak in English,
[00:15.020 --> 00:16.740]  like a native English speaker.
[00:17.080 --> 00:18.020]  Are you ready?
[00:18.420 --> 00:20.740]  Well, then I'm teacher Tiffany.
[00:21.020 --> 00:22.980]  Let's jump right in.
[00:23.140 --> 00:23.800]  All right.
[00:23.820 --> 00:26.380]  The very first thing you must do is think in English.
[00:26.380 --> 00:31.400]  So if someone asks you the question, what did you eat yesterday?
[00:31.760 --> 00:38.400]  You first need to start off by answering the five W's.
[00

In [9]:
def create_audio_text_dataset(json_file_path, audio_file_path, output_csv_path, audio_chunks_dir,chunk_duration=30):

    # Create output directory for audio segments
    os.makedirs(audio_chunks_dir, exist_ok=True)

    # Read the JSON file
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Try to load audio processing libraries

    print("Using pydub for audio processing...")
    audio = AudioSegment.from_file(audio_file_path)
    sample_rate = audio.frame_rate
    print(f"Audio loaded successfully: {len(audio)/1000:.2f} seconds at {sample_rate} Hz")
    total_duration = len(audio) / 1000


    # Extract segments and create dataset rows
    segments = data.get('segments', [])

    rows = []
    chunk_id = 0
    current_chunk_start = 0.0
    current_chunk_texts = []
    current_chunk_end = 0.0

    for segment in segments:
        seg_start = segment.get('start', 0.0)
        seg_end = segment.get('end', 0.0)
        seg_text = segment.get('text', '').strip()

        if not seg_text:
                continue

        # Check if adding this segment would exceed 30 seconds
        potential_chunk_duration = seg_end - current_chunk_start

        # If this would make chunk too long (>30s), save current chunk and start new one
        if potential_chunk_duration > 30.0 and current_chunk_texts:
             # Save current chunk
            try:
                # Extract audio for current chunk
                start_ms = int(current_chunk_start * 1000)
                end_ms = int(current_chunk_end * 1000)

                audio_chunk = audio[start_ms:end_ms]

                # Save audio chunk to file
                audio_filename = f"chunks_{chunk_id:03d}.wav"
                audio_file_path = os.path.join(audio_chunks_dir, audio_filename)
                audio_chunk.export(audio_file_path, format="wav")

                combined_text = ' '.join(current_chunk_texts).strip()

                # Create row with perfectly aligned audio and text
                audio_with_path = {
                    'path':  audio_file_path
                }

                row = {
                    'audio': audio_with_path,
                    'text': combined_text,
                    'source': "0"
                }
                rows.append(row)

                chunk_duration_actual = current_chunk_end - current_chunk_start
                print(f"Chunk {chunk_id}: {current_chunk_start:.1f}s-{current_chunk_end:.1f}s ({chunk_duration_actual:.1f}s), {len(combined_text)} chars")
                chunk_id += 1

            except Exception as e:
                print(f"Error processing chunk {chunk_id}: {e}")

                # Start new chunk with current segment
            current_chunk_start = seg_start
            current_chunk_texts = [seg_text]
            current_chunk_end = seg_end
        else:
            # Add this segment to current chunk
            if not current_chunk_texts:  # First segment in chunk
                current_chunk_start = seg_start
            current_chunk_texts.append(seg_text)
            current_chunk_end = seg_end

        # Don't forget the last chunk
    if current_chunk_texts:
        try:
            start_ms = int(current_chunk_start * 1000)
            end_ms = int(current_chunk_end * 1000)

            audio_chunk = audio[start_ms:end_ms]

            # Save audio chunk to file
            audio_filename = f"chunks_{chunk_id:03d}.wav"
            audio_file_path = os.path.join(audio_chunks_dir, audio_filename)
            audio_chunk.export(audio_file_path, format="wav")

            combined_text = ' '.join(current_chunk_texts).strip()

            audio_with_path = {
                'path': audio_file_path
            }

            row = {
                'audio': audio_with_path,
                'text': combined_text,
                'source': "0"
            }
            rows.append(row)

            chunk_duration_actual = current_chunk_end - current_chunk_start
            print(f"Chunk {chunk_id}: {current_chunk_start:.1f}s-{current_chunk_end:.1f}s ({chunk_duration_actual:.1f}s), {len(combined_text)} chars")

        except Exception as e:
            print(f"Error processing final chunk: {e}")
    # Create DataFrame and save to CSV
    df = pd.DataFrame(rows)
    df.to_json(output_csv_path, index=False,orient="records")

    print(f"\n{'='*60}")
    print(f"DATASET CREATED SUCCESSFULLY!")
    print(f"{'='*60}")
    print(f" Dataset file: {output_csv_path}")
    print(f" Audio segments folder: {audio_chunks_dir}")
    print(df.head().to_string())
    return df

In [10]:
json_file = r"/content/trancripts/ENGLISH MASTERCLASS ｜ THINK & SPEAK ENGLISH [FULL LESSON]_clean.json"
audio_file = r"/content/clean_audio/ENGLISH MASTERCLASS ｜ THINK & SPEAK ENGLISH [FULL LESSON]_clean.wav"
dataset_json = r"/content/our_dataset/dataset_json.json"
audio_segments_dir = r"/content/myaudio_segments"

our_dataset = Path("/content/our_dataset")
our_dataset.mkdir(exist_ok = True)
# Check if files exist
if not os.path.exists(json_file):
    print(f"JSON file not found: {json_file}")
    exit(1)

if not os.path.exists(audio_file):
    print(f"Audio file not found: {audio_file}")
    print("Please ensure the audio file is in the Downloads folder")
    exit(1)

print("Starting dataset creation...")
main_df = create_audio_text_dataset(json_file, audio_file,dataset_json, audio_segments_dir)

Starting dataset creation...
Using pydub for audio processing...
Audio loaded successfully: 4308.31 seconds at 32000 Hz
Chunk 0: 0.0s-26.4s (26.4s), 377 chars
Chunk 1: 26.4s-56.4s (30.0s), 308 chars
Chunk 2: 56.4s-86.3s (29.9s), 325 chars
Chunk 3: 86.3s-116.2s (30.0s), 406 chars
Chunk 4: 116.2s-146.0s (29.8s), 338 chars
Chunk 5: 146.2s-176.1s (29.9s), 361 chars
Chunk 6: 176.2s-205.8s (29.6s), 355 chars
Chunk 7: 206.2s-236.0s (29.7s), 342 chars
Chunk 8: 236.0s-265.8s (29.8s), 401 chars
Chunk 9: 266.0s-296.0s (30.0s), 362 chars
Chunk 10: 296.0s-325.8s (29.8s), 359 chars
Chunk 11: 326.0s-356.0s (30.0s), 361 chars
Chunk 12: 356.0s-385.6s (29.7s), 444 chars
Chunk 13: 386.0s-416.0s (30.0s), 419 chars
Chunk 14: 416.0s-445.8s (29.9s), 373 chars
Chunk 15: 446.0s-475.8s (29.8s), 337 chars
Chunk 16: 475.8s-505.8s (30.0s), 372 chars
Chunk 17: 505.8s-535.8s (30.0s), 391 chars
Chunk 18: 535.8s-565.7s (30.0s), 390 chars
Chunk 19: 565.8s-595.7s (30.0s), 387 chars
Chunk 20: 595.8s-625.4s (29.6s), 378 c

In [11]:
main_df

Unnamed: 0,audio,text,source
0,{'path': '/content/myaudio_segments/chunks_000...,"Your goal is to speak English fluently, like a...",0
1,{'path': '/content/myaudio_segments/chunks_001...,"So if someone asks you the question, what did ...",0
2,{'path': '/content/myaudio_segments/chunks_002...,Fries. When last night at around 6. P.m. Where...,0
3,{'path': '/content/myaudio_segments/chunks_003...,"Questions, right? You understand what you did,...",0
4,{'path': '/content/myaudio_segments/chunks_004...,Because they wanted to meet for dinner. It was...,0
...,...,...,...
141,{'path': '/content/myaudio_segments/chunks_141...,I didn't even think anything of it. So my frie...,0
142,{'path': '/content/myaudio_segments/chunks_142...,At their apartment. And when I got to the door...,0
143,{'path': '/content/myaudio_segments/chunks_143...,I looked behind him. My entire computer was co...,0
144,{'path': '/content/myaudio_segments/chunks_144...,"And La La Land just excited. She picks him up,...",0


In [12]:
!pip install unsloth[all]

Collecting unsloth[all]
  Downloading unsloth-2025.8.10-py3-none-any.whl.metadata (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[0mCollecting unsloth_zoo>=2025.8.9 (from unsloth[all])
  Downloading unsloth_zoo-2025.8.9-py3-none-any.whl.metadata (9.5 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[all])
  Downloading xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Collecting bitsandbytes (from unsloth[all])
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Collecting tyro (from unsloth[all])
  Downloading tyro-0.9.31-py3-none-any.whl.metadata (11 kB)
Collecting datasets<4.0.0,>=3.4.1 (from unsloth[all])
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting trl!=0.15.0,!=0.19.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.

In [13]:
# Gatting the predefined model from hugging face
from unsloth import FastModel
from transformers import CsmForConditionalGeneration

model_name = "unsloth/csm-1b"

model,processor = FastModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 2048,
    dtype = None,
    auto_model = CsmForConditionalGeneration,
    load_in_4bit = False,
    full_finetuning = False)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.10: Fast Csm patching. Transformers: 4.55.4.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/264 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/449 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

unsloth/csm-1b does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.


In [14]:
print(model)

CsmForConditionalGeneration(
  (lm_head): Linear(in_features=2048, out_features=2051, bias=False)
  (embed_text_tokens): Embedding(128256, 2048)
  (backbone_model): CsmBackboneModel(
    (embed_tokens): CsmBackboneModelEmbeddings(
      (embed_audio_tokens): Embedding(65632, 2048)
    )
    (layers): ModuleList(
      (0-15): 16 x CsmDecoderLayer(
        (self_attn): CsmAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): CsmMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        

In [15]:
model  = FastModel.get_peft_model(
     model,
     r= 32,
     target_modules = ["q_proj","k_proj","v_proj",
                       "o_proj","gate_proj","up_proj",
                       "down_proj",],
     lora_alpha = 16,
     lora_dropout = 0,
     bias = "none",
     use_gradient_checkpointing = "unsloth",
     randomstate = 3407,
     use_rslora = True,
     loftq_config = None,
 )

Unsloth: Making `model.base_model.model.backbone_model` require gradients


In [16]:
model.print_trainable_parameters()

trainable params: 29,032,448 || all params: 1,661,132,609 || trainable%: 1.7478


In [17]:
sample_rate = 24000

import os
from transformers import AutoProcessor
from datasets import load_dataset,Audio

processor = AutoProcessor.from_pretrained("unsloth/csm-1b")
# raw_ds = load_dataset(exa_dataset,split="train")

dataset = load_dataset("json",data_files="/content/our_dataset/dataset_json.json")
dataset = dataset.cast_column("audio", Audio())
raw_ds = dataset["train"]

print(f'Dataset loaded with features: {raw_ds.features["audio"]}')

if "source" not in raw_ds.column_names:
   print("Unsloth No speaker Found")
   new_column = ["0"]*len(raw_ds)
   raw_ds = raw_ds.add_column(name="source",column=new_column)
elif "source" in raw_ds.column_names:
   speaker_key = "source"

target_sampling_rate = sample_rate
raw_ds = raw_ds.cast_column("audio",Audio(sampling_rate=target_sampling_rate))

import math
from datasets import DatasetDict

totalrows = len(raw_ds)
test_rows = min(30,max(1,math.ceil(0.10*totalrows)))

split:DatasetDict = raw_ds.train_test_split(test_size=test_rows,shuffle=True,seed=42)

rawTrainDs = split["train"]
rawTestDs = split["test"]

print(f"Train rows: {len(rawTrainDs)}")
print(f"Eval rows: {len(rawTestDs)}")

preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset loaded with features: Audio(sampling_rate=None, mono=True, decode=True, id=None)
Train rows: 131
Eval rows: 15


In [18]:
max_audio_length = 0
max_text_length = 0 # Initialize max_text_length
for row in raw_ds:
    text_length = len(row['text'])
    audio_length = len(row['audio']['array'])
    max_text_length = max(max_text_length, text_length)
    max_audio_length = max(max_audio_length, audio_length)

print(f"Maximum text length: {max_text_length}")
print(f"Maximum audio length: {max_audio_length}")

Maximum text length: 507
Maximum audio length: 719544


In [19]:
def preprocess(example):
  conversation = [
      {
          "role": str(example["source"]),
          "content": [
              {"type": "text", "text": example["text"]},
              {"type": "audio", "audio": example["audio"]["array"]} # Pass the numpy array
          ],
      }
  ]
  try :
    model_inputs = processor.apply_chat_template(
        conversation,
        tokenize=True,
        return_dict =True,
        output_labels = True,
        text_kwargs={
            "padding":"max_length",
            "max_length":max_text_length,
            "pad_to_multiple_of":8,
            "padding_side":"right",
        },
        audio_kwargs={
            "sampling_rate":24000,
            "padding":"max_length",
            "max_length":max_audio_length,
        },
        common_kwargs = {"return_tensors":"pt"}
    )
  except Exception as e:
    print(e)
    return None
  required_keys = ["input_ids","attention_mask","labels","input_values","input_values_cutoffs"] # Corrected typo here
  processed = {}

  for key in required_keys:
    if key not in model_inputs:
      print(f"{key} missing from model_inputs")
      return None
    value = model_inputs[key][0]
    processed[key] = value

  if not all(isinstance(processed[key],torch.Tensor) for key in processed):
    print("Not all values in processed are tensors")
    return None
  return processed

processed_train_ds = rawTrainDs.map(preprocess,remove_columns=rawTrainDs.column_names,desc="Preprocessing Train data")
processed_test_ds = rawTestDs.map(preprocess,remove_columns=rawTestDs.column_names,desc="Preprocessing Test data")

Preprocessing Train data:   0%|          | 0/131 [00:00<?, ? examples/s]

Preprocessing Test data:   0%|          | 0/15 [00:00<?, ? examples/s]

In [20]:
print(processed_train_ds)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels', 'input_values', 'input_values_cutoffs'],
    num_rows: 131
})


In [21]:
import time
run_name =model_name.split("/")[-1]+'-lora-ft'+time.strftime("_%Y%m%d_%H%M%S")
print(run_name)

csm-1b-lora-ft_20250831_044226


In [22]:
from transformers import TrainingArguments,Trainer
from unsloth import is_bfloat16_supported

trainer=Trainer(
    model =model,
    train_dataset = processed_train_ds,
    eval_dataset = processed_test_ds,
    args = TrainingArguments(
        per_device_train_batch_size =1,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs =3,
        eval_strategy = "steps",
        eval_steps = 0.2,
        learning_rate = 2e-4, #for lora,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps =1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "constant",
        seed = 3407,
        output_dir = "outputs",
        report_to = 'tensorboard',
        logging_dir = f"logs/{run_name}"
        )
)

In [23]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 131 | Num Epochs = 3 | Total steps = 99
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 4 x 1) = 4
 "-____-"     Trainable parameters = 29,032,448 of 1,661,132,609 (1.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
20,24.9853,6.211152
40,23.4813,6.073267
60,23.5552,6.073123
80,21.7534,6.091333


Unsloth: Not an error, but CsmForConditionalGeneration does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [24]:
import soundfile as sf
from IPython.display import Audio, display
text = """Resemble AI can clone a voice with just a few minutes of recorded data, making it possible to
generate realistic speech that closely matches the original speaker’s tone, style, and personality
.Learning a new skill is like planting a seed; with patience, effort, and care, it will grow into something meaningful and rewarding over time.
In today’s fast-paced digital world, communication has become more instant than ever, but the value of genuine human connection remains timeless.
Technology is changing the way we live, work, and communicate with each other.
Artificial intelligence is not here to replace humans, but to help us work smarter and faster.
Reading books can transport you into different worlds and expand your imagination.
"""
speaker_id = 0
inputs = processor(
    f"[{speaker_id}]{text}",
    add_special_tokens = True,
    return_tensors = "pt",
).to("cuda")

audio_values = model.generate(
    **inputs,
    max_new_tokens=375,#125 tokens is 10 seconds of audio ,for longer speech increase this
    #play with below parameters for better output
    depth_decoder_temperature = 0.6,
    depth_decoder_top_k = 0,
    depth_decoder_top_p = 0.9,
    temperature = 0.8,
    top_k = 50,
    top_p = 1.0,
    output_audio = True
)

audio = audio_values[0].to(torch.float32).cpu().numpy()
sf.write("finetunedAudio.wav",audio ,sample_rate)
print("Fine-tuned-Audio:")
display(Audio(audio, rate=sample_rate))
print("Real: ")
display(Audio(rawTestDs[3]["audio"]["array"],rate =sample_rate))



Fine-tuned-Audio:


Real: 


In [27]:
speaker_id = 0

cloned = rawTestDs[1]["audio"]["array"]
cloned_text = rawTestDs[1]["text"]
conversation = [
{"role": str(speaker_id),"content": [{"type": "text", "text": cloned_text},{"type": "audio", "audio": cloned}],}
  ,{"role": str(speaker_id),"content": [{"type": "text", "text": text}],}
                 ]
inputs = processor.apply_chat_template(
        conversation,
        tokenize=True,
        return_dict =True,
        return_tensors='pt')

inputs = {k: v.to("cuda") for k, v in inputs.items()}

audio_values = model.generate(
    **inputs,
    max_new_tokens=375,#125 tokens is 10 seconds of audio ,for longer speech increase this ~12.5 tokens ≈ 1 second of audio
    #play with below parameters for better output
    depth_decoder_temperature = 0.6,
    depth_decoder_top_k = 0,
    depth_decoder_top_p = 0.9,
    temperature = 0.8,
    top_k = 50,
    top_p = 1.0,
    output_audio = True
)

audio = audio_values[0].to(torch.float32).cpu().numpy()
sf.write("clonedVoiceAudio.wav",audio ,sample_rate)
print("cloned")
display(Audio(audio, rate=sample_rate))
print("Real: ")
RealAudio = rawTestDs[3]["audio"]["array"]
display(Audio(RealAudio,rate =sample_rate))
sf.write("RealAudio.wav",RealAudio ,sample_rate)


cloned


Real: 
