# Libraries and dependencies

In [None]:
# Install the packages listed in requirements.txt before running the cells below
pip install -r requirements.txt

In [None]:
import os
import re

import numpy as np
import pandas as pd
import soundfile as sf
import librosa

from jiwer import wer, cer
from tqdm import tqdm

import torch

from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor


# cv-valid-dev evaluation

In [None]:
# Text normalisation applied to both reference transcripts and model output
_UNK_TOKEN = re.compile(r"<unk>", flags=re.IGNORECASE)
_DISALLOWED_CHARS = re.compile(r"[^A-Z' ]+")
_WHITESPACE_RUN = re.compile(r"\s+")


def normalize_text(text):
    """Return `text` reduced to upper-case letters, apostrophes and single spaces.

    Steps, in order: drop every "<unk>" marker (any letter case), upper-case
    and trim the string, delete every character other than A-Z, apostrophe
    and space, then collapse whitespace runs to one space and trim again.
    """
    without_unk = _UNK_TOKEN.sub("", text)
    uppercased = without_unk.upper().strip()
    allowed_only = _DISALLOWED_CHARS.sub("", uppercased)
    return _WHITESPACE_RUN.sub(" ", allowed_only).strip()

## Load dev dataset

In [None]:
# Base directory that the clip filenames from the CSV are resolved against
DATA_DIR_DEV = "../data/cv-valid-dev"

# Read the dev-split metadata and add a normalised copy of each reference transcript
dev_df = pd.read_csv("../data/cv-valid-dev.csv")
dev_df["normalized_text"] = dev_df["text"].map(normalize_text)


## Load finetuned model

In [None]:
# Location of the fine-tuned checkpoint that both the model and the processor are loaded from
model_path = "/content/drive/MyDrive/Colab Notebooks/my-htx-repo/asr-train/wav2vec2-large-960h-cv"

# Restore the processor and the CTC model from that checkpoint
processor = Wav2Vec2Processor.from_pretrained(model_path)
model = Wav2Vec2ForCTC.from_pretrained(model_path)

# Use the GPU when one is available, otherwise the CPU, and switch the model to eval mode
inference_device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(inference_device).eval()


## Inference of dev dataset

In [None]:
TARGET_SR = 16000  # sampling rate passed to the processor for every clip


def _load_audio(file_path):
    """Read one clip as a mono waveform at TARGET_SR, peak-scaled when possible.

    Args:
        file_path: Path of the audio file to read with soundfile.

    Returns:
        1-D NumPy array of samples. Zero-length and all-zero clips are
        returned unscaled.
    """
    audio, sr = sf.read(file_path)
    if audio.ndim > 1:
        # Down-mix multi-channel audio by averaging across channels
        audio = np.mean(audio, axis=1)
    if sr != TARGET_SR:
        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SR)

    # `initial=0.0` keeps the reduction defined for zero-length clips
    peak = np.max(np.abs(audio), initial=0.0)
    if peak > 0:
        # Skipped for silent clips: dividing by a zero peak would turn every sample into NaN
        audio = audio / peak
    return audio


def _transcribe(audio):
    """Greedy-decode one waveform with the loaded model and processor.

    Args:
        audio: 1-D waveform sampled at TARGET_SR.

    Returns:
        The decoded transcript string, before text normalisation.
    """
    inputs = processor(
        audio, sampling_rate=TARGET_SR, return_tensors="pt", padding=True
    ).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Highest-scoring token id per frame, then decoded back to text
    pred_ids = torch.argmax(logits, dim=-1)
    return processor.batch_decode(pred_ids)[0]


ground_truths = []
predictions = []

for row in tqdm(dev_df.itertuples(), total=len(dev_df)):
    audio = _load_audio(os.path.join(DATA_DIR_DEV, row.filename))

    if audio.size:
        pred_text = normalize_text(_transcribe(audio))
    else:
        # Nothing to transcribe in a zero-length clip; record an empty
        # hypothesis instead of calling the model
        pred_text = ""

    predictions.append(pred_text)
    ground_truths.append(row.normalized_text)

## Evaluate WER and CER on the dev set

In [None]:
# Word and character error rates computed by jiwer over the full reference/prediction lists
final_cer = cer(ground_truths, predictions)
final_wer = wer(ground_truths, predictions)

# Print a header followed by each metric rounded to four decimals
print("\n Finetuned Dev Set Evaluation:")
for metric_name, metric_value in (("WER", final_wer), ("CER", final_cer)):
    print(f"{metric_name}: {metric_value:.4f}")


## Save predictions

In [None]:
# Collect filename, reference and prediction for every dev clip and write them to a CSV file
predictions_csv = "cv-valid-dev_predictions.csv"

results_df = pd.DataFrame(
    dict(
        filename=dev_df["filename"].tolist(),
        ground_truth=ground_truths,
        prediction=predictions,
    )
)
results_df.to_csv(predictions_csv, index=False)
print(f"Predictions saved to {predictions_csv}")


## Filter rows by hot words - BE CAREFUL, DESTROY, STRANGER

In [None]:
# Reload the saved predictions from disk.
# dtype=str with keep_default_na=False keeps empty transcripts as "" (and words
# such as "NULL" or "NA" as text) instead of NaN; a NaN value would make the
# substring checks below raise TypeError.
results_df = pd.read_csv(
    "cv-valid-dev_predictions.csv", dtype=str, keep_default_na=False
)

# Upper-case both transcript columns so they match the upper-case keywords
results_df["ground_truth"] = results_df["ground_truth"].str.upper()
results_df["prediction"] = results_df["prediction"].str.upper()

# Hot words to look for (plain substring match, so "DESTROY" also matches "DESTROYED")
keywords = ["BE CAREFUL", "DESTROY", "STRANGER"]


def _first_keyword(text):
    """Return the first entry of `keywords` that occurs in `text`, or None."""
    return next((keyword for keyword in keywords if keyword in text), None)


def has_keyword(text):
    """Return True if `text` contains at least one of the keywords."""
    return _first_keyword(text) is not None


def get_matching_keyword(row):
    """Return the first keyword found in the row's ground truth, or None."""
    return _first_keyword(row["ground_truth"])


# Keep only rows whose ground truth contains at least one keyword
keyword_mask = results_df["ground_truth"].map(has_keyword).astype(bool)
filtered_dev_df = results_df[keyword_mask].copy()

# Record the first keyword (in `keywords` order) found in each ground truth.
# Column-wise map instead of a row-wise apply so an empty selection still works.
filtered_dev_df["keyword"] = filtered_dev_df["ground_truth"].map(_first_keyword)

# label = 1 when the prediction contains that same keyword, otherwise 0
filtered_dev_df["label"] = [
    int(keyword in prediction)
    for keyword, prediction in zip(
        filtered_dev_df["keyword"], filtered_dev_df["prediction"]
    )
]

# Save the filtered DataFrame
filtered_dev_df.to_csv("cv-valid-dev_filtered_keywords.csv", index=False)
print("Filtered keyword detection results saved to cv-valid-dev_filtered_keywords.csv")


## Save detected.txt

In [None]:
# Filenames of the rows whose prediction contains the hot word (label == 1)
is_detected = filtered_dev_df["label"] == 1
detected_filenames = filtered_dev_df.loc[is_detected, "filename"]

# Write them to detected.txt, one filename per line
with open("detected.txt", "w") as f:
    f.writelines(f"{filename}\n" for filename in detected_filenames)

print("Hotword-detected filenames saved to detected.txt")
