In [None]:
import os
import whisperx
from pydub import AudioSegment
import shutil
import torch

# Add the cuBLAS "bin" directory that lives inside the local venv to PATH
# (Windows venv layout: venv\Lib\site-packages\nvidia\cublas\bin).
venv_path = os.path.join(os.getcwd(), "venv", "Lib", "site-packages", "nvidia", "cublas", "bin")
if os.path.exists(venv_path):
    os.environ["PATH"] += os.pathsep + venv_path

# --- WORKAROUND FOR WINERROR 2 ---
# 1. Point pydub directly at the ffmpeg / ffprobe executables.
ffmpeg_path = r"D:\Study\7-SP26\DATxSLP\ffmpeg.exe"
ffprobe_path = r"D:\Study\7-SP26\DATxSLP\ffprobe.exe"  # this file must exist in the folder

AudioSegment.converter = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path

# 2. Also append the folder holding those executables to this process's PATH.
os.environ["PATH"] += os.pathsep + r"D:\Study\7-SP26\DATxSLP"

# --- CONFIGURATION ---
input_dir = r"D:\Study\7-SP26\DATxSLP\Data_after_preprocessing\test\id00005"
output_dir = r"D:\Study\7-SP26\DATxSLP\Data_after_cut\test_output"
english_dir = r"D:\Study\7-SP26\DATxSLP\Data_after_cut\file_english"

os.makedirs(output_dir, exist_ok=True)
os.makedirs(english_dir, exist_ok=True)

MODEL_SIZE = "large-v2"
TARGET_SECONDS = 5.0
STRIDE_SECONDS = 1.0
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# int8_float16 keeps RAM/VRAM usage low, but it is only usable on a CUDA
# device; when DEVICE falls back to "cpu" the model must be loaded with a
# CPU-supported type, otherwise loading fails before any file is processed.
COMPUTE_TYPE = "int8_float16" if DEVICE == "cuda" else "int8"

# Allowed tolerance below the target length (0.5 s).
ALLOWED_MARGIN = 0.5
# Minimum accepted segment length = 5.0 - 0.5 = 4.5 s.
MIN_DURATION_THRESHOLD = TARGET_SECONDS - ALLOWED_MARGIN

asr_options = {
    # Force 128 mel bins (the original note says this is for large-v3).
    # NOTE(review): MODEL_SIZE above is "large-v2" -- confirm this override
    # is still intended for that checkpoint.
    "n_mels": 128
}

# 1. Load the Whisper model.
print(f"Loading Whisper model: {MODEL_SIZE} on {DEVICE}...")
model = whisperx.load_model(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE, asr_options=asr_options)

# Cache of alignment models keyed by language code, so each language is
# loaded only once.
align_models = {}

# Main pipeline: for every .wav/.mp3 in input_dir, transcribe it, set English
# files aside, word-align the rest and export ~TARGET_SECONDS clips whose
# boundaries fall on word boundaries.
for filename in os.listdir(input_dir):
    if not filename.lower().endswith((".wav", ".mp3")):
        continue

    input_path = os.path.join(input_dir, filename)
    base_name = os.path.splitext(filename)[0]
    print(f"\n--- Đang xử lý file: {filename} ---")

    try:
        # 2. First-pass transcription; its result also carries the detected language.
        audio_data = whisperx.load_audio(input_path)
        result = model.transcribe(audio_data, batch_size=16)
        detected_lang = result.get("language", "unknown")
        print(f"Ngôn ngữ phát hiện: {detected_lang}")

        # 3. English files are copied to english_dir and not segmented.
        if detected_lang == "en":
            print(f"-> Phát hiện tiếng Anh. Đang copy file vào: {english_dir}")
            shutil.copy(input_path, os.path.join(english_dir, filename))
            continue

        # 4. Load the alignment model for this language (cached per language).
        if detected_lang not in align_models:
            print(f"Loading alignment model cho: {detected_lang}...")
            try:
                if detected_lang == "vi":
                    # Vietnamese uses an explicitly chosen wav2vec2 checkpoint.
                    align_model_name = "nguyenvulebinh/wav2vec2-base-vi"
                    align_models[detected_lang] = whisperx.load_align_model(
                        language_code=detected_lang,
                        device=DEVICE,
                        model_name=align_model_name,
                    )
                else:
                    # Every other language relies on whisperx's default model.
                    align_models[detected_lang] = whisperx.load_align_model(
                        language_code=detected_lang,
                        device=DEVICE,
                    )
            except Exception as e:
                print(f"Không hỗ trợ Alignment cho ngôn ngữ '{detected_lang}'. Lỗi: {e}")
                continue

        model_a, metadata = align_models[detected_lang]

        # 5. Forced alignment to obtain per-word timestamps.
        result_aligned = whisperx.align(result["segments"], model_a, metadata, input_path, DEVICE)
        # Keep only words that carry BOTH timestamps. Using .get() avoids a
        # KeyError when an entry has "start" but no "end", and also drops
        # entries whose "start" is None (which would break the comparisons
        # below).
        words = [
            w for w in result_aligned["word_segments"]
            if w.get("start") is not None and w.get("end") is not None
        ]

        if not words:
            print("Không tìm thấy mốc thời gian từ (word segments) để cắt.")
            continue

        # 6. Cut the audio with pydub (len(audio) is in milliseconds).
        audio = AudioSegment.from_file(input_path)
        total_duration = len(audio) / 1000.0
        current_mark = 0.0
        seg_index = 1

        while current_mark + TARGET_SECONDS <= total_duration:
            # First word that starts at or after the current window mark.
            start_word_idx = next(
                (idx for idx, w in enumerate(words) if w["start"] >= current_mark),
                None,
            )
            if start_word_idx is None:
                break

            actual_start_time = words[start_word_idx]["start"]

            # Extend the segment word by word until it spans TARGET_SECONDS
            # (or the words run out). At least one word is always consumed.
            last_word = words[start_word_idx]
            for j in range(start_word_idx, len(words)):
                last_word = words[j]
                if last_word["end"] - actual_start_time >= TARGET_SECONDS:
                    break

            seg_duration = last_word["end"] - actual_start_time

            # Export only segments of at least MIN_DURATION_THRESHOLD seconds
            # (TARGET_SECONDS - ALLOWED_MARGIN); shorter ones are skipped.
            if seg_duration >= MIN_DURATION_THRESHOLD:
                start_ms = int(actual_start_time * 1000)
                end_ms = int(last_word["end"] * 1000)

                segment_audio = audio[start_ms:end_ms]
                out_filename = f"{base_name}_seg_{seg_index:03d}.wav"
                segment_audio.export(os.path.join(output_dir, out_filename), format="wav")
                seg_index += 1

            # Slide the window forward by a fixed stride.
            current_mark += STRIDE_SECONDS

        print(f"-> Hoàn tất! Đã cắt được {seg_index-1} đoạn.")

    except Exception as e:
        # Best-effort batch job: report the failure and move on to the next file.
        print(f"Lỗi khi xử lý file {filename}: {e}")

print("\n>>> TẤT CẢ FILE ĐÃ ĐƯỢC XỬ LÝ XONG! <<<")

### Code chuẩn

In [1]:
import os
import whisperx
from pydub import AudioSegment
import shutil
import torch
import csv

# --- ENVIRONMENT SETUP ---
# Add the cuBLAS "bin" directory inside the local venv to PATH (Windows layout).
venv_path = os.path.join(os.getcwd(), "venv", "Lib", "site-packages", "nvidia", "cublas", "bin")
if os.path.exists(venv_path):
    os.environ["PATH"] += os.pathsep + venv_path

# Point pydub at the ffmpeg / ffprobe executables and expose their folder on PATH.
ffmpeg_path = r"D:\Study\7-SP26\DATxSLP\ffmpeg.exe"
ffprobe_path = r"D:\Study\7-SP26\DATxSLP\ffprobe.exe"
AudioSegment.converter = ffmpeg_path
AudioSegment.ffprobe = ffprobe_path
os.environ["PATH"] += os.pathsep + r"D:\Study\7-SP26\DATxSLP"

# Patch torch.load so that calls which do not pass `weights_only` get
# weights_only=False. The marker attribute makes the patch idempotent:
# re-running this cell no longer stacks a new wrapper on top of the old one.
# SECURITY NOTE: weights_only=False permits full pickle deserialization, so
# only load checkpoints from sources you trust.
if not getattr(torch.load, "_forces_weights_only_false", False):
    _original_torch_load = torch.load

    def _custom_torch_load(*args, **kwargs):
        """Call the original torch.load, defaulting `weights_only` to False."""
        kwargs.setdefault("weights_only", False)
        return _original_torch_load(*args, **kwargs)

    _custom_torch_load._forces_weights_only_false = True
    torch.load = _custom_torch_load

# --- CONFIGURATION ---
input_dir = r"E:\speech_data\audio_6.5s"
output_dir = r"E:\speech_data\cut_audio_7s"
english_dir = r"E:\speech_data\file_english"

os.makedirs(output_dir, exist_ok=True)
os.makedirs(english_dir, exist_ok=True)

MODEL_SIZE = "large-v2"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# int8_float16 is only usable on a CUDA device; fall back to int8 when the
# script runs on CPU so that model loading does not fail.
COMPUTE_TYPE = "int8_float16" if DEVICE == "cuda" else "int8"
# NOTE(review): 128 mel bins is the large-v3 setting per the earlier draft of
# this script, while MODEL_SIZE is "large-v2" -- confirm this is intended.
ASR_OPTIONS = {"n_mels": 128}

TARGET_SECONDS = 7.0
STRIDE_SECONDS = 1.0
MARGIN = 0.5
MIN_DUR = TARGET_SECONDS - MARGIN  # shortest accepted segment, in seconds
MAX_DUR = TARGET_SECONDS + MARGIN  # longest accepted segment, in seconds

# Metadata CSV, opened in append mode inside output_dir. The path is derived
# from output_dir (previously a hard-coded absolute path was joined on, which
# made os.path.join ignore output_dir entirely).
# The handle stays open for the whole run and is closed by the processing
# loop's `finally` clause.
meta_path = os.path.join(output_dir, "metadata.csv")
meta_exists = os.path.exists(meta_path)
meta_file = open(meta_path, "a", newline="", encoding="utf-8")
writer = csv.writer(meta_file)
if not meta_exists:
    # Write the header row only when the file is being created.
    writer.writerow(["target_duration", "speaker_id", "filename", "lang", "duration", "start", "end", "text"])
    meta_file.flush()

print(f"Loading Whisper {MODEL_SIZE} on {DEVICE}...")
model = whisperx.load_model(MODEL_SIZE, device=DEVICE, compute_type=COMPUTE_TYPE, asr_options=ASR_OPTIONS)
# Cache of alignment models keyed by language code.
align_models = {}

# --- [NEW FUNCTION] JUNK-TEXT CHECK (REPEATED WORDS) ---
def is_valid_text(words_list):
    """Return False for an empty or overly repetitive word list, else True.

    Each item of *words_list* is a mapping with a "word" entry. Words are
    compared after stripping surrounding whitespace and lower-casing. A list
    with more than 3 words is rejected when fewer than 40% of its words are
    distinct, e.g. "con con con con" -> 1 distinct out of 4 -> ratio 0.25.
    """
    if not words_list:
        return False

    normalized = []
    for entry in words_list:
        normalized.append(entry["word"].strip().lower())

    total = len(normalized)
    distinct_share = len(set(normalized)) / total

    # Short lists (3 words or fewer) are never rejected for repetition.
    too_repetitive = total > 3 and distinct_share < 0.4
    return not too_repetitive
# ---------------------------------------------

# Banner announcing the processing mode (word-index sliding window + junk filter).
print(f"\n>>> CHẾ ĐỘ: CỬA SỔ TRƯỢT INDEX + LỌC RÁC <<<")

# Main pipeline. The outer try/finally guarantees the metadata CSV handle is
# closed even if the run is interrupted part-way through.
try:
    for filename in os.listdir(input_dir):
        # Only .wav / .mp3 files are processed.
        if not filename.lower().endswith((".wav", ".mp3")): continue
        
        print(f"\n--- Xử lý: {filename} ---")
        input_path = os.path.join(input_dir, filename)
        base_name = os.path.splitext(filename)[0]
        # Speaker id = the part of the file name before the first underscore.
        speaker_id = base_name.split("_")[0]

        try:
            # 1. Transcribe; the result also carries the detected language.
            audio_data = whisperx.load_audio(input_path)
            result = model.transcribe(audio_data, batch_size=16)
            detected_lang = result.get("language", "unknown")

            # English files are copied to english_dir and not segmented.
            if detected_lang == "en":
                shutil.copy(input_path, os.path.join(english_dir, filename))
                continue 

            # Load the alignment model once per language and cache it.
            # Vietnamese uses an explicitly named checkpoint; other languages
            # use whisperx's default. A failure here is caught by the
            # per-file `except` below.
            if detected_lang not in align_models:
                if detected_lang == "vi":
                    align_models["vi"] = whisperx.load_align_model(language_code="vi", device=DEVICE, model_name="nguyenvulebinh/wav2vec2-base-vi")
                else:
                    align_models[detected_lang] = whisperx.load_align_model(language_code=detected_lang, device=DEVICE)
            
            # Forced alignment; keep only words that have both timestamps.
            result_aligned = whisperx.align(result["segments"], align_models[detected_lang][0], align_models[detected_lang][1], input_path, DEVICE)
            words = [w for w in result_aligned["word_segments"] if "start" in w and "end" in w]

            if not words: continue

            # 2. CUT AUDIO (index-based): the window start is a word index,
            #    not a wall-clock mark.
            audio = AudioSegment.from_file(input_path)
            start_word_idx = 0
            seg_index = 1
            
            while start_word_idx < len(words):
                actual_start_time = words[start_word_idx]["start"]
                end_word_idx = None
                valid_segment_found = False
                
                # A. Collect words: pick the first word whose end makes the
                #    span reach MIN_DUR; give up on this start as soon as a
                #    word would push the span beyond MAX_DUR.
                for j in range(start_word_idx, len(words)):
                    current_word = words[j]
                    current_dur = current_word["end"] - actual_start_time
                    
                    if current_dur > MAX_DUR: break 
                    if current_dur >= MIN_DUR:
                        end_word_idx = j
                        valid_segment_found = True
                        break 

                # B. Export the clip (after the repeated-word junk check).
                if valid_segment_found and end_word_idx is not None:
                    current_segment_words = words[start_word_idx : end_word_idx + 1]
                    
                    # Skip segments whose text is mostly the same word repeated.
                    if is_valid_text(current_segment_words):
                        final_dur = current_segment_words[-1]["end"] - actual_start_time
                        # pydub slices in milliseconds.
                        start_ms = int(actual_start_time * 1000)
                        end_ms = int(current_segment_words[-1]["end"] * 1000)
                        
                        seg_audio = audio[start_ms:end_ms]
                        out_name = f"{base_name}_seg_{seg_index:03d}.wav"
                        seg_audio.export(os.path.join(output_dir, out_name), format="wav")
                        
                        # One metadata row per exported clip, in the same
                        # column order as the CSV header; flushed immediately
                        # so rows already written are on disk.
                        text_seg = " ".join([w["word"].strip() for w in current_segment_words])
                        writer.writerow([
                            TARGET_SECONDS, speaker_id, out_name, detected_lang, 
                            round(final_dur, 3), 
                            round(actual_start_time, 3), 
                            round(current_segment_words[-1]["end"], 3), 
                            text_seg
                        ])
                        meta_file.flush()
                        seg_index += 1
                    else:
                        pass 
                        # Debug hint: log here to see which junk (repeated-word) segments are skipped.

                # C. Sliding window (by index): move to the first later word
                #    that starts at least STRIDE_SECONDS after the current
                #    start; stop when no such word exists.
                next_start_idx = None
                desired_next_time = actual_start_time + STRIDE_SECONDS
                for k in range(start_word_idx + 1, len(words)):
                    if words[k]["start"] >= desired_next_time:
                        next_start_idx = k
                        break
                
                if next_start_idx is None: break 
                start_word_idx = next_start_idx

            print(f"-> Đã lưu: {seg_index-1} file.")

        except Exception as e:
            # Best-effort batch job: report the failure and continue with the next file.
            print(f"Lỗi {filename}: {e}")

finally:
    # Always release the metadata file handle, then print the completion banner.
    meta_file.close()
    print("\n>>> HOÀN TẤT <<<")

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torchaudio.set_audio_backend("soundfile")
  backend = torchaudio.get_audio_backend()
  from speechbrain.pretrained import (
  torchaudio.set_audio_backend(backend)
  from torchaudio.backend.common import AudioMetaData


Loading Whisper large-v2 on cuda...
No language specified, language will be first be detected for each audio file (increases inference time).


  return _original_torch_load(*args, **kwargs)
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.6.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\Lenovo\.cache\torch\whisperx-vad-segmentation.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.0.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.5.1+cu118. Bad things might happen unless you revert torch to 1.x.

>>> CHẾ ĐỘ: CỬA SỔ TRƯỢT INDEX + LỌC RÁC <<<

--- Xử lý: id01226_00951.wav ---


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



Detected language: vi (1.00) in first 30s of audio...


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at nguyenvulebinh/wav2vec2-base-vi and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'lm_head.bias', 'lm_head.weight', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


-> Đã lưu: 0 file.

--- Xử lý: id01226_00966.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_00967.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_00968.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_00977.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_00981.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_00992.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_00996.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_01018.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id01226_01020.wav ---
Detected language: vi (1.00) in first 30s of audio...
-> Đã lưu: 0 file.

--- Xử lý: id0122

Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-portuguese were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-portuguese and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.origina

-> Đã lưu: 0 file.

--- Xử lý: id01310_00173.wav ---
Detected language: en (0.50) in first 30s of audio...

--- Xử lý: id01310_00174.wav ---
Detected language: th (0.98) in first 30s of audio...
There is no default alignment model set for this language (th).                Please find a wav2vec2.0 model finetuned on this language in https://huggingface.co/models, then pass the model name in --align_model [MODEL_NAME]
Lỗi id01310_00174.wav: No default align-model for language: th

--- Xử lý: id01310_00175.wav ---
Detected language: th (0.83) in first 30s of audio...
There is no default alignment model set for this language (th).                Please find a wav2vec2.0 model finetuned on this language in https://huggingface.co/models, then pass the model name in --align_model [MODEL_NAME]
Lỗi id01310_00175.wav: No default align-model for language: th

--- Xử lý: id01310_00185.wav ---
Detected language: en (0.96) in first 30s of audio...

--- Xử lý: id01311_00021.wav ---
Detected language

KeyboardInterrupt: 