In [None]:
import os
import whisperx
from pydub import AudioSegment
import shutil

# Checkpoint name handed to whisperx.load_model below.
MODEL_SIZE = "large" 
# Device used for both the transcription model and the alignment model.
DEVICE = "cpu" #cuda
# Passed through as load_model's compute_type argument.
COMPUTE_TYPE = "float32"

# Both models are loaded once at module level; later cells read `model`,
# `model_a`, `metadata` and `DEVICE` as globals, so this cell must run first.
print(f"Loading Whisper model: {MODEL_SIZE}...")
model = whisperx.load_model(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
# The alignment model is loaded for language code "vi" only.
model_a, metadata = whisperx.load_align_model(language_code="vi", device=DEVICE)

In [None]:
def cut_audio(input_dir, output_dir, other_dir, target_duration, stride):
    """Cut Vietnamese recordings into overlapping, word-aligned WAV clips.

    Every ``.wav``/``.mp3`` file (extension matched case-insensitively) in
    ``input_dir`` is transcribed with the module-level ``model``. Files whose
    detected language is not ``"vi"`` are copied unchanged into ``other_dir``.
    Vietnamese files are word-aligned with ``whisperx.align`` (using the
    module-level ``model_a``, ``metadata`` and ``DEVICE``) and then sliced
    with a sliding window: a time mark advances by ``stride``, each clip
    starts at the first aligned word at or after the mark and is extended
    word by word until it spans at least ``target_duration``. Clips are
    written to ``output_dir`` as ``<base>_seg_<NNN>.wav``.

    Segmentation of a file stops when fewer than 80% of ``target_duration``
    remains after the mark, when no word starts at or after the mark, when
    the start word is the same as (or earlier than) the previous clip's, or
    when the candidate clip is shorter than 80% of ``target_duration``.

    Args:
        input_dir: Directory scanned (non-recursively) for audio files.
        output_dir: Existing directory that receives the exported clips.
        other_dir: Existing directory that receives non-Vietnamese files.
        target_duration: Minimum clip length to aim for, in seconds.
        stride: Amount the start mark advances after each clip, in seconds.

    Returns:
        None. Per-file errors are caught, printed, and do not stop the run.
    """
    # A clip shorter than 80% of the target is considered too short to keep.
    min_duration = target_duration * 0.8

    for filename in os.listdir(input_dir):
        if not filename.lower().endswith((".wav", ".mp3")):
            continue

        input_path = os.path.join(input_dir, filename)
        base_name = os.path.splitext(filename)[0]
        print(f"\nProcessing: {filename}")

        try:
            result = model.transcribe(input_path)
            detected_lang = result.get("language", "unknown")

            # Anything that is not Vietnamese is set aside untouched.
            if detected_lang != "vi":
                print(f"-> {detected_lang} detected: {input_path}")
                shutil.copy(input_path, os.path.join(other_dir, filename))
                continue

            result_aligned = whisperx.align(result["segments"], model_a, metadata, input_path, DEVICE)
            # Keep only words with both timestamps: the loop below reads
            # "end" as well as "start", so a half-aligned word would raise.
            words = [
                w for w in result_aligned["word_segments"]
                if w.get("start") is not None and w.get("end") is not None
            ]

            audio = AudioSegment.from_file(input_path)
            total_duration = len(audio) / 1000.0

            current_mark = 0.0
            seg_index = 1
            # Start time of the previously exported clip. It must be reset
            # for every file; None means "no clip exported yet".
            last_start_time = None

            while current_mark < total_duration - min_duration:
                # First aligned word starting at or after the current mark.
                start_word_idx = next(
                    (idx for idx, w in enumerate(words) if w["start"] >= current_mark),
                    None,
                )
                if start_word_idx is None:
                    break

                actual_start_time = words[start_word_idx]["start"]
                # NOTE(review): a gap between words longer than `stride` maps
                # two consecutive marks onto the same word and therefore ends
                # segmentation of this file here. Confirm that is intended.
                if last_start_time is not None and actual_start_time <= last_start_time:
                    break

                # Extend word by word until the clip reaches the target
                # length, or the words run out.
                segment_end_time = actual_start_time
                for w in words[start_word_idx:]:
                    segment_end_time = w["end"]
                    if segment_end_time - actual_start_time >= target_duration:
                        break

                if segment_end_time - actual_start_time < min_duration:
                    break

                start_ms = int(actual_start_time * 1000)
                end_ms = int(segment_end_time * 1000)

                out_filename = f"{base_name}_seg_{seg_index:03d}.wav"
                audio[start_ms:end_ms].export(os.path.join(output_dir, out_filename), format="wav")

                last_start_time = actual_start_time
                current_mark += stride
                seg_index += 1

            print(f"-> Done: {filename}")

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"Error processing file {filename}: {e}")

    print("\nAll done.")

In [None]:
# Sliding-window settings, in seconds.
TARGET_SECONDS, STRIDE_SECONDS = 5.0, 1.0

# Source recordings and the two destinations for the cutting step.
input_dir = r"D:\Study\7-SP26\DATxSLP\Data_after_preprocessing\test"
output_dir = r"D:\Study\7-SP26\DATxSLP\Data_after_cut\test_output"
other_lang_dir = r"D:\Study\7-SP26\DATxSLP\Data_after_cut\other_languages"

# Both destinations must exist before anything is written to them.
for _destination in (output_dir, other_lang_dir):
    os.makedirs(_destination, exist_ok=True)