### Code chuẩn

In [1]:
import os
import whisperx
from pydub import AudioSegment
import shutil
import torch
import csv
import gc

# --- ENVIRONMENT SETUP ---
# If the local virtualenv ships the NVIDIA cuBLAS "bin" folder, append it to
# PATH (presumably so the CUDA DLLs can be located on Windows — TODO confirm).
venv_path = os.path.join(os.getcwd(), "venv", "Lib", "site-packages", "nvidia", "cublas", "bin")
if os.path.exists(venv_path):
    os.environ["PATH"] = os.pathsep.join((os.environ["PATH"], venv_path))

# Absolute locations of the ffmpeg / ffprobe executables used for audio I/O.
ffmpeg_path = r"D:\Speech_Verification\SpeechVeri_MultiFeatures\data_preparation\ffmpeg.exe"
ffprobe_path = r"D:\Speech_Verification\SpeechVeri_MultiFeatures\data_preparation\ffprobe.exe"
ffmpeg_dir = os.path.dirname(ffmpeg_path)

# Tell pydub explicitly which binaries to use.
AudioSegment.converter = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path

# Put the ffmpeg folder at the front of PATH so it wins over any other copy.
os.environ["PATH"] = os.pathsep.join((ffmpeg_dir, os.environ["PATH"]))

# Work around the torch.load + omegaconf failure caused by the
# `weights_only=True` default introduced in PyTorch 2.6+.
import warnings
warnings.filterwarnings("ignore")

try:
    from omegaconf import ListConfig, DictConfig
except ImportError:
    # omegaconf is not installed: nothing to allow-list, the patch below
    # is still applied.
    pass
else:
    # `add_safe_globals` only exists on newer PyTorch releases.
    if hasattr(torch.serialization, 'add_safe_globals'):
        torch.serialization.add_safe_globals([ListConfig, DictConfig])

# Patch torch.load so every call runs with weights_only=False.
# SECURITY NOTE: weights_only=False lets torch.load unpickle arbitrary
# objects, which can execute code embedded in a checkpoint. Only load
# checkpoints that come from sources you trust.
# The guard keeps the patch idempotent: re-running this cell must not wrap
# the already-patched function in yet another wrapper.
if not getattr(torch.load, "_weights_only_patched", False):
    _original_torch_load = torch.load

    def _custom_torch_load(*args, **kwargs):
        """Call the original torch.load, forcing ``weights_only=False``.

        The value is overwritten (not defaulted) on purpose, so callers that
        explicitly pass ``weights_only=True`` are overridden as well.
        """
        kwargs['weights_only'] = False
        return _original_torch_load(*args, **kwargs)

    _custom_torch_load._weights_only_patched = True
    torch.load = _custom_torch_load

# --- CONFIGURATION ---
input_dir = r"D:\Speech_Verification\audio_5s"
output_dir = r"D:\Speech_Verification\cut_audio_5s"
english_dir = r"D:\Speech_Verification\other_language_files"

# Fail fast when the input folder is missing: otherwise the (slow) model load
# below would run first and the script would only crash later, when the main
# loop tries to list the folder.
if not os.path.isdir(input_dir):
    raise FileNotFoundError(f"❌ Input folder không tồn tại: {input_dir}")
audio_file_count = sum(
    1 for f in os.listdir(input_dir) if f.lower().endswith(('.wav', '.mp3'))
)
print(f"✓ Tìm thấy {audio_file_count} file audio")

os.makedirs(output_dir, exist_ok=True)
os.makedirs(english_dir, exist_ok=True)

MODEL_SIZE = "large-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
COMPUTE_TYPE = "float16" if DEVICE == "cuda" else "int8"

# Segment length window: a segment is accepted when its duration falls in
# [MIN_DUR, MAX_DUR]; consecutive windows start STRIDE_SECONDS apart.
TARGET_SECONDS = 5.0
STRIDE_SECONDS = 1.0
MARGIN = 0.5
MIN_DUR = TARGET_SECONDS - MARGIN
MAX_DUR = TARGET_SECONDS + MARGIN

# Load the model BEFORE opening the metadata file, so that a failed model
# load does not leave an open file handle behind.
print(f"Loading Whisper {MODEL_SIZE} on {DEVICE}...")
model = whisperx.load_model(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE)
align_models = {}  # cache: language code -> value returned by load_align_model

# Metadata CSV (opened in append mode so earlier runs are preserved).
# An existing-but-empty file is treated like a missing one, so the header is
# still written if a previous run died before writing it.
meta_path = os.path.join(output_dir, "metadata.csv")
meta_exists = os.path.exists(meta_path) and os.path.getsize(meta_path) > 0
meta_file = open(meta_path, "a", newline="", encoding="utf-8")
writer = csv.writer(meta_file)
if not meta_exists:
    writer.writerow(["target_duration", "speaker_id", "filename", "lang", "duration", "start", "end", "text"])
    meta_file.flush()

# --- [HÀM MỚI] KIỂM TRA TEXT RÁC (LẶP TỪ) ---
def is_valid_text(words_list):
    """Return True when a segment's words do not look like repetition garbage.

    Args:
        words_list: sequence of dicts, each with a ``"word"`` string.

    Returns:
        False when the list is empty, when no token is left after removing
        surrounding punctuation, or when there are more than 3 tokens and
        fewer than 40% of them are distinct; True otherwise.

    Example: "con, con. con Con!" -> 4 tokens, 1 distinct -> ratio 0.25 -> False.
    """
    if not words_list:
        return False

    # Punctuation glued to a word must not make repeated words look distinct
    # ("con," vs "con."), so it is stripped before comparing.
    punctuation = ".,!?;:…\"'()[]{}-–—"
    tokens = []
    for entry in words_list:
        token = entry["word"].strip().strip(punctuation).strip().lower()
        if token:
            tokens.append(token)

    # Only punctuation / whitespace: no usable text (also avoids dividing by 0).
    if not tokens:
        return False

    unique_ratio = len(set(tokens)) / len(tokens)
    too_repetitive = len(tokens) > 3 and unique_ratio < 0.4
    return not too_repetitive
# ---------------------------------------------

print("\n>>> CHẾ ĐỘ: CỬA SỔ TRƯỢT INDEX + LỌC RÁC + MEMORY OPTIMIZATION <<<")

# --- CHECKPOINT: FIND SOURCE FILES THAT WERE ALREADY HANDLED ---
processed_files = set()

# 1) Source files that already produced at least one exported segment.
if os.path.exists(output_dir):
    for out_file in os.listdir(output_dir):
        if "_seg_" in out_file and out_file.endswith(".wav"):
            # "sp001_seg_001.wav" -> "sp001". rsplit keeps base names that
            # themselves contain "_seg_" intact.
            processed_files.add(out_file.rsplit("_seg_", 1)[0])

# 2) Source files that an earlier run copied into english_dir. They never
#    produce segments, so without this they would be transcribed again on
#    every run.
if os.path.exists(english_dir):
    for diverted_file in os.listdir(english_dir):
        if diverted_file.lower().endswith((".wav", ".mp3")):
            processed_files.add(os.path.splitext(diverted_file)[0])

if processed_files:
    print(f"✓ Đã xử lý: {len(processed_files)} file gốc")
    print(f"  Danh sách: {', '.join(sorted(processed_files)[:10])}" + ("..." if len(processed_files) > 10 else ""))
else:
    print("• Chưa xử lý file nào, bắt đầu từ đầu")
# ---

# Persistent checkpoint log: one finished base name per line. Scanning the
# output folder for "*_seg_*.wav" cannot see source files that finished
# WITHOUT producing a segment (no aligned words, nothing in the duration
# window, copied to english_dir), so those would be re-transcribed on every
# run. The log makes their completion survive restarts.
checkpoint_path = os.path.join(output_dir, "processed_files.txt")
if os.path.exists(checkpoint_path):
    with open(checkpoint_path, encoding="utf-8") as checkpoint_file:
        processed_files.update(
            line.strip() for line in checkpoint_file if line.strip()
        )


def _mark_processed(base_name):
    """Record base_name as finished, in memory and in the checkpoint log."""
    processed_files.add(base_name)
    with open(checkpoint_path, "a", encoding="utf-8") as checkpoint_file:
        checkpoint_file.write(base_name + "\n")


def _free_memory():
    """Run the garbage collector and release cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()


def _get_align_model(lang):
    """Return the cached (model, metadata) alignment pair for lang, loading it once."""
    if lang not in align_models:
        if lang == "vi":
            align_models[lang] = whisperx.load_align_model(language_code="vi", device=DEVICE, model_name="nguyenvulebinh/wav2vec2-base-vi")
        else:
            align_models[lang] = whisperx.load_align_model(language_code=lang, device=DEVICE)
    return align_models[lang]


def _find_segment_end(words, start_idx, min_dur, max_dur):
    """Return the index of the first word that makes the window long enough.

    The window starts at words[start_idx]["start"]. Returns None when the
    duration jumps past max_dur before reaching min_dur, or when the words
    run out first.
    """
    window_start = words[start_idx]["start"]
    for idx in range(start_idx, len(words)):
        duration = words[idx]["end"] - window_start
        if duration > max_dur:
            return None
        if duration >= min_dur:
            return idx
    return None


def _find_next_start(words, start_idx, stride):
    """Return the index of the first later word starting >= stride seconds after the current window start, or None."""
    wanted_time = words[start_idx]["start"] + stride
    for idx in range(start_idx + 1, len(words)):
        if words[idx]["start"] >= wanted_time:
            return idx
    return None


def _export_segments(audio, words, base_name, speaker_id, lang):
    """Slide a window over words, export each valid segment, return how many were saved.

    `words` must be non-empty. For every exported segment a WAV file is
    written to output_dir and one row is appended to the metadata CSV.
    """
    saved = 0
    start_idx = 0
    while start_idx is not None:
        start_time = words[start_idx]["start"]
        end_idx = _find_segment_end(words, start_idx, MIN_DUR, MAX_DUR)

        if end_idx is not None:
            segment_words = words[start_idx:end_idx + 1]
            # Skip windows whose text is mostly one repeated word.
            if is_valid_text(segment_words):
                end_time = segment_words[-1]["end"]
                # pydub slices in milliseconds.
                clip = audio[int(start_time * 1000):int(end_time * 1000)]
                out_name = f"{base_name}_seg_{saved + 1:03d}.wav"
                clip.export(os.path.join(output_dir, out_name), format="wav")

                text_seg = " ".join([w["word"].strip() for w in segment_words])
                writer.writerow([
                    TARGET_SECONDS, speaker_id, out_name, lang,
                    round(end_time - start_time, 3),
                    round(start_time, 3),
                    round(end_time, 3),
                    text_seg
                ])
                # Flush per row so the CSV stays in step with the files on disk.
                meta_file.flush()
                saved += 1

        start_idx = _find_next_start(words, start_idx, STRIDE_SECONDS)
    return saved


def _process_file(filename, base_name):
    """Transcribe, align and cut one input file.

    Returns the number of segments saved, or None when the file was only
    diverted (detected as "en" and copied to english_dir) or produced no
    aligned words.
    """
    input_path = os.path.join(input_dir, filename)
    speaker_id = base_name.split("_")[0]

    # 1. Transcribe
    audio_data = whisperx.load_audio(input_path)
    result = model.transcribe(audio_data, batch_size=16)
    detected_lang = result.get("language", "unknown")

    if detected_lang == "en":
        shutil.copy(input_path, os.path.join(english_dir, filename))
        return None

    # 2. Align. The already-decoded waveform is reused instead of handing
    #    the path to whisperx.align, which would decode the file a second time.
    align_model, align_metadata = _get_align_model(detected_lang)
    result_aligned = whisperx.align(result["segments"], align_model, align_metadata, audio_data, DEVICE)
    words = [w for w in result_aligned["word_segments"] if "start" in w and "end" in w]

    # Drop the transcription objects before the full audio is loaded by pydub.
    del audio_data, result, result_aligned
    _free_memory()

    if not words:
        return None

    # 3. Cut the audio (word-index based sliding window)
    audio = AudioSegment.from_file(input_path)
    return _export_segments(audio, words, base_name, speaker_id, detected_lang)


try:
    for filename in os.listdir(input_dir):
        if not filename.lower().endswith((".wav", ".mp3")):
            continue

        base_name = os.path.splitext(filename)[0]

        # Skip files that were already handled
        if base_name in processed_files:
            print(f"⊘ Bỏ qua (đã xử lý): {filename}")
            continue

        print(f"\n--- Xử lý: {filename} ---")

        try:
            saved_count = _process_file(filename, base_name)
        except Exception as e:
            # A failed file is NOT marked as processed, so it is retried on
            # the next run.
            print(f"Lỗi {filename}: {e}")
        else:
            if saved_count is not None:
                print(f"-> Đã lưu: {saved_count} file.")
            _mark_processed(base_name)
        finally:
            # Runs after the except clause has released the exception (and
            # with it the failed call's locals), so they can really be freed.
            _free_memory()

finally:
    meta_file.close()
    print("\n>>> HOÀN TẤT <<<")
    # Final cleanup: release the models
    del model, align_models
    gc.collect()
    torch.cuda.empty_cache()




✓ Tìm thấy 86665 file audio
Loading Whisper large-v2 on cuda...
2026-02-02 21:09:47 - whisperx.asr - INFO - No language specified, language will be detected for each audio file (increases inference time)
2026-02-02 21:09:47 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint d:\Speech_Verification\venv\lib\site-packages\whisperx\assets\pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.7.1+cu128. Bad things might happen unless you revert torch to 1.x.

>>> CHẾ ĐỘ: CỬA SỔ TRƯỢT INDEX + LỌC RÁC + MEMORY OPTIMIZATION <<<
✓ Đã xử lý: 71594 file gốc
  Danh sách: id00000_00000, id00000_00001, id00000_00002, id00000_00005, id00000_00007, id00000_00008, id00000_00009, id00000_00010, id00000_00012, id00000_00013...
⊘ Bỏ qua (đã xử lý): id00000_00000.wav
⊘ Bỏ qua (đã xử lý): id00000_00001.wav
⊘ Bỏ qua (đã xử lý): id00000_00002.wav
⊘ Bỏ qua (đã xử lý): id00000_00005.wav
⊘ Bỏ qua (đã xử lý): id00000_00007.wav
⊘ Bỏ qua (đã xử lý): id00000_00008.wav
⊘ Bỏ qua (đã xử lý): id00000_00009.wav
⊘ Bỏ qua (đã xử lý): id00000_00010.wav
⊘ Bỏ qua (đã xử lý): id00000_00012.wav
⊘ Bỏ qua (đã xử lý): id00000_00013.wav
⊘ Bỏ qua (đã xử lý): id00000_00014.wav
⊘ Bỏ qua (đã xử lý): id00000_00015.wav
⊘ Bỏ qua (đã xử lý): id00

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vi and are newly initialized: ['lm_head.bias', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-> Đã lưu: 0 file.
⊘ Bỏ qua (đã xử lý): id00000_00031.wav
⊘ Bỏ qua (đã xử lý): id00000_00032.wav
⊘ Bỏ qua (đã xử lý): id00000_00033.wav

--- Xử lý: id00000_00034.wav ---
2026-02-02 21:09:57 - whisperx.asr - INFO - Detected language: vi (0.99) in first 30s of audio
-> Đã lưu: 0 file.
⊘ Bỏ qua (đã xử lý): id00000_00035.wav
⊘ Bỏ qua (đã xử lý): id00000_00036.wav
⊘ Bỏ qua (đã xử lý): id00000_00037.wav
⊘ Bỏ qua (đã xử lý): id00000_00038.wav
⊘ Bỏ qua (đã xử lý): id00000_00039.wav
⊘ Bỏ qua (đã xử lý): id00000_00040.wav
⊘ Bỏ qua (đã xử lý): id00000_00041.wav
⊘ Bỏ qua (đã xử lý): id00000_00043.wav
⊘ Bỏ qua (đã xử lý): id00000_00044.wav
⊘ Bỏ qua (đã xử lý): id00000_00045.wav
⊘ Bỏ qua (đã xử lý): id00000_00046.wav
⊘ Bỏ qua (đã xử lý): id00000_00047.wav
⊘ Bỏ qua (đã xử lý): id00000_00048.wav
⊘ Bỏ qua (đã xử lý): id00000_00049.wav
⊘ Bỏ qua (đã xử lý): id00000_00050.wav
⊘ Bỏ qua (đã xử lý): id00000_00051.wav

--- Xử lý: id00000_00052.wav ---
2026-02-02 21:09:58 - whisperx.asr - INFO - Detected langu