In [1]:
# Hugging Face checkpoint identifier handed to AudioProcessor when it is instantiated.
model_path = "openai/whisper-medium"

In [3]:
import re
from typing import Dict, List, Tuple

import torch
import torchaudio
from iso639 import Lang
from torchaudio.transforms import Resample
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
)


class AudioProcessor:
    """Whisper-based analyser for a single audio file.

    Calling an instance with a file path returns a dict containing:
      * ``accents``      - language-token probabilities from the first decoder step,
      * ``fillers``      - a ratio comparing filler-token probability with the rest of the vocab,
      * ``transcript``   - timestamped segments parsed from the generated text,
      * ``token_scores`` - best score of every generated step, keyed by decoded token text,
      * ``mean_score``   - mean of those scores over non-special tokens.
    """

    # Decoder prompt token used as the only decoder input when probing first-step logits.
    START_OF_TRANSCRIPT_TOKEN = "<|startoftranscript|>"
    # Matches decoded tokens that consist solely of a ``<|...|>`` marker.
    _SPECIAL_TOKEN_RE = re.compile(r"^<\|.*\|>$")
    # Captures ``<|12.34|>`` timestamp markers together with the text that follows them.
    _TIMESTAMP_SEGMENT_RE = re.compile(r"<\|([\d.]+)\|>([^<]*)")

    def __init__(self, model_path: str = "openai/whisper-tiny", device=None):
        """Load model, tokenizer and processor from ``model_path``.

        Args:
            model_path: Checkpoint name or local path understood by ``from_pretrained``.
            device: Torch device (string or ``torch.device``). When omitted, CUDA is
                used if available, otherwise the CPU, so the class also works on
                machines without a GPU.
        """
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_path).to(self.device)
        self.tokenizer = WhisperTokenizer.from_pretrained(model_path)
        self.processor = WhisperProcessor.from_pretrained(model_path)
        print(f"Audio backends: {torchaudio.list_audio_backends()}")

    def load_audio(self, audio_file_path: str, max_duration: int = 60) -> Tuple[torch.Tensor, int]:
        """Load an audio file, crop it to ``max_duration`` seconds and resample to 16 kHz.

        Returns:
            ``(waveform, sample_rate)`` where ``sample_rate`` is 16000 after this call.
        """
        waveform, sample_rate = torchaudio.load(uri=audio_file_path, backend="soundfile")
        # Crop before resampling so the limit is expressed in source samples;
        # int() keeps the slice valid when a fractional duration is passed.
        max_samples = int(max_duration * sample_rate)
        waveform = waveform[:, :max_samples]
        if sample_rate != 16000:
            waveform = Resample(orig_freq=sample_rate, new_freq=16000)(waveform)
            sample_rate = 16000
        return waveform, sample_rate

    def audio_preprocess(self, waveform: torch.Tensor, sample_rate: int) -> torch.Tensor:
        """Convert a waveform into the model's input features via the Whisper processor."""
        return self.processor(
            waveform.squeeze().numpy(), sampling_rate=sample_rate, return_tensors="pt"
        ).input_features

    def get_lang_tokens(self) -> List[str]:
        """Return the tokenizer's additional special tokens that are exactly six characters long.

        NOTE(review): this is presumably meant to select two-letter language tags such
        as ``<|en|>``; longer tags are not selected by this length filter.
        """
        return [t for t in self.tokenizer.additional_special_tokens if len(t) == 6]

    def get_model_logits(self, input_features: torch.Tensor) -> torch.Tensor:
        """Run one decoder step with only the start-of-transcript token as decoder input.

        The start token id is looked up by name instead of being hard-coded, and the
        decoder input is created on the same device as ``input_features``.
        """
        start_token_id = self.tokenizer.convert_tokens_to_ids(self.START_OF_TRANSCRIPT_TOKEN)
        decoder_input_ids = torch.full(
            (input_features.shape[0], 1),
            start_token_id,
            dtype=torch.long,
            device=input_features.device,
        )
        return self.model(input_features, decoder_input_ids=decoder_input_ids).logits

    def get_tokens_probabilities(
        self, logits: torch.Tensor, token_ids: List[int]
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Restrict ``logits`` to ``token_ids`` and renormalise.

        Every vocabulary position that is not listed in ``token_ids`` is set to
        ``-inf`` on a copy of ``logits`` before the softmax over the last dimension.

        Returns:
            ``(masked_logits, probabilities)``; the probabilities are moved to the CPU.
        """
        logits = logits.clone()
        mask = torch.ones_like(logits, dtype=torch.bool)
        mask[:, :, token_ids] = False
        logits[mask] = -float("inf")
        return logits, logits.softmax(dim=-1).cpu()

    def get_sorted_results(self, logits: torch.Tensor, tokens: List[str]) -> Dict[str, float]:
        """Average the first-step probability of each token over the batch, sorted descending.

        Args:
            logits: Tensor indexed as ``[input, step, vocab]``; only step 0 is read.
            tokens: Token strings to score (any iterable is accepted).
        """
        tokens = list(tokens)
        if not tokens:
            return {}
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        _, probs = self.get_tokens_probabilities(logits, token_ids)

        # One bulk conversion instead of one ``.item()`` call per token and input.
        per_input = probs[:, 0, token_ids].tolist()
        n_inputs = len(per_input)
        average_results = {
            token: sum([row[col] for row in per_input]) / n_inputs
            for col, token in enumerate(tokens)
        }
        return dict(sorted(average_results.items(), key=lambda item: item[1], reverse=True))

    def get_language_accent(self, logits: torch.Tensor) -> Dict[str, float]:
        """Map language-token probabilities to language names via ``iso639.Lang``."""
        # ``<|jw|>`` is skipped explicitly (presumably because ``Lang`` cannot resolve "jw").
        lang_tokens = [lang for lang in self.get_lang_tokens() if lang not in ["<|jw|>"]]
        sorted_results = self.get_sorted_results(logits, lang_tokens)
        # Strip the surrounding ``<|`` and ``|>`` to obtain the bare language code.
        return {Lang(lang[2:-2]).name: value for lang, value in sorted_results.items()}

    def get_filler(self, logits: torch.Tensor) -> float:
        """Return ``filler_mean / (filler_mean + other_mean)`` over first-step probabilities.

        ``filler_mean`` is the mean probability of the filler tokens present in the
        vocabulary and ``other_mean`` the mean probability of all remaining tokens.
        Filler tokens absent from the vocabulary are ignored; ``0.0`` is returned
        when the ratio is undefined.
        """
        vocab_tokens = list(self.tokenizer.get_vocab())
        sorted_results = self.get_sorted_results(logits, vocab_tokens)

        filler_tokens = ["uh", "um", "ah", "mm", "er"]
        filler_set = set(filler_tokens)

        filler_values = [sorted_results[token] for token in filler_tokens if token in sorted_results]
        other_values = [sorted_results[token] for token in vocab_tokens if token not in filler_set]

        filler_mean = sum(filler_values) / len(filler_values) if filler_values else 0.0
        other_mean = sum(other_values) / len(other_values) if other_values else 0.0

        denominator = filler_mean + other_mean
        return filler_mean / denominator if denominator else 0.0

    def convert_transcription_to_structured(self, transcription: str) -> List[Dict]:
        """Split a timestamp-annotated transcription into segments.

        Each ``<|t|>text`` pair with non-blank text becomes
        ``{"start_t": t, "end_t": <next timestamp or None>, "text": text}``.
        """
        matches = self._TIMESTAMP_SEGMENT_RE.findall(transcription)

        structured_data = []
        for idx, (start_t, raw_text) in enumerate(matches):
            text = raw_text.strip()
            if not text:
                continue
            # The end of a segment is whatever timestamp marker comes next, if any.
            end_t = matches[idx + 1][0] if idx + 1 < len(matches) else None
            structured_data.append(
                {
                    "start_t": start_t.strip(),
                    "end_t": end_t.strip() if end_t else None,
                    "text": text,
                }
            )
        return structured_data

    def transcribe(self, input_features: torch.Tensor) -> Dict[str, object]:
        """Generate a timestamped transcription plus per-token confidence scores.

        Returns:
            Dict with ``transcript`` (list of segments), ``token_scores`` (decoded
            token text -> best score at that generation step; tokens decoding to
            the same text share one entry, the last occurrence wins) and
            ``mean_score`` (mean over entries that are not ``<|...|>`` markers,
            ``0`` if there are none).
        """
        generation = self.model.generate(
            input_features,
            return_timestamps=True,
            output_scores=True,
            return_dict_in_generate=True,
        )
        step_scores = generation.scores
        tokens = generation.sequences[0]

        transcription = self.tokenizer.decode(tokens, skip_special_tokens=True, decode_with_timestamps=True)
        transcription = self.convert_transcription_to_structured(transcription)

        # ``scores`` holds one entry per *generated* step, whereas ``sequences`` may
        # start with the decoder prompt. Skip that prefix so every score is paired
        # with the token it produced rather than with a prompt token.
        prompt_length = max(len(tokens) - len(step_scores), 0)
        generated_tokens = tokens[prompt_length:]

        # Row 0 only: the tokens above come from the first sequence of the batch.
        token_scores_dict = {
            self.tokenizer.decode([token]): score[0].max().item()
            for token, score in zip(generated_tokens, step_scores)
        }

        word_scores = [
            score for token, score in token_scores_dict.items() if not self._SPECIAL_TOKEN_RE.match(token)
        ]
        mean_score = sum(word_scores) / len(word_scores) if word_scores else 0

        return {
            "transcript": transcription,
            "token_scores": token_scores_dict,
            "mean_score": mean_score,
        }

    def __call__(self, audio_file_path: str, max_duration: int = 60) -> Dict[str, object]:
        """Analyse one audio file and return accent, filler and transcription results."""
        waveform, sample_rate = self.load_audio(audio_file_path, max_duration)
        input_features = self.audio_preprocess(waveform, sample_rate).to(self.device)
        logits = self.get_model_logits(input_features)

        accent_result = self.get_language_accent(logits)
        fillers_result = self.get_filler(logits)
        transcription_results = self.transcribe(input_features)

        return {
            "accents": accent_result,
            "fillers": fillers_result,
            **transcription_results,
        }


# Build the analyser once at module level; process_audio_files() uses this instance.
processor = AudioProcessor(model_path=model_path)

config.json:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Audio backends: ['ffmpeg', 'sox', 'soundfile']


In [4]:
import csv
import json
import os
from glob import glob

import torch
from tqdm import tqdm


def process_audio_files(folder_path, output_file, audio_processor=None):
    """Analyse every file directly inside ``folder_path`` and dump the results as JSON.

    Args:
        folder_path: Directory whose immediate entries are analysed.
        output_file: Destination path of the JSON list (one dict per file, each
            extended with a ``file_name`` key holding the file's base name).
        audio_processor: Callable taking a file path and returning a result dict.
            Defaults to the notebook-level ``processor`` instance.
    """
    if audio_processor is None:
        # Reading a module-level name needs no ``global`` statement.
        audio_processor = processor

    # Keep regular files only (the ``*`` glob also matches sub-directories) and sort
    # them so the output order is reproducible between runs.
    audio_files = sorted(
        path for path in glob(os.path.join(folder_path, "*")) if os.path.isfile(path)
    )
    results_list = []

    # Inference only: disable autograd once for the whole loop.
    with torch.no_grad():
        for audio_file in tqdm(audio_files):
            result = audio_processor(audio_file)
            result["file_name"] = os.path.basename(audio_file)
            results_list.append(result)

    with open(output_file, "w") as f:
        json.dump(results_list, f, indent=4)

    print(f"Processed results saved to {output_file}")


def load_and_calculate_means(json_file):
    """Read a results JSON file and return the mean value of every accent.

    The file must contain a list of dicts; each dict may carry an ``accents``
    mapping of accent name -> value. An accent's mean is taken over the entries
    that mention it, and accents appear in order of first occurrence.
    """
    with open(json_file, "r") as handle:
        entries = json.load(handle)

    # accent -> [running total, number of entries that reported it]
    running = {}
    for entry in entries:
        for accent, value in entry.get("accents", {}).items():
            stats = running.setdefault(accent, [0, 0])
            stats[0] += value
            stats[1] += 1

    return {accent: total / count for accent, (total, count) in running.items()}


def save_means_to_csv(means, output_csv):
    """Write ``means`` (accent -> mean) to ``output_csv`` below an ``Accent,Mean`` header."""
    with open(output_csv, "w", newline="") as csvfile:
        rows = [["Accent", "Mean"]]
        rows.extend([accent, mean] for accent, mean in means.items())
        csv.writer(csvfile).writerows(rows)

In [5]:
# Analyse both speechocean762 splits, then aggregate and export per-accent means.

# Inputs and outputs of the train split.
train_folder = "/kaggle/input/speechocean762/train"
train_output_file = "/kaggle/working/train.json"
train_csv = "/kaggle/working/train_accent_means.csv"

# Inputs and outputs of the test split.
test_folder = "/kaggle/input/speechocean762/test"
test_output_file = "/kaggle/working/test.json"
test_csv = "/kaggle/working/test_accent_means.csv"

# Stage 1: per-file analysis, written as JSON.
process_audio_files(train_folder, train_output_file)
process_audio_files(test_folder, test_output_file)

# Stage 2: mean value per accent, computed from the JSON files.
train_accent_means = load_and_calculate_means(train_output_file)
test_accent_means = load_and_calculate_means(test_output_file)

# Stage 3: export the means as CSV and show them.
save_means_to_csv(train_accent_means, train_csv)
save_means_to_csv(test_accent_means, test_csv)

print("Train Accent Means:", train_accent_means)
print("Test Accent Means:", test_accent_means)

  0%|          | 0/2500 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed t

Processed results saved to /kaggle/working/train.json


100%|██████████| 2500/2500 [1:10:05<00:00,  1.68s/it]


Processed results saved to /kaggle/working/test.json
Train Accent Means: {'English': 0.7051987406270113, 'Chinese': 0.1545658992620836, 'Norwegian Nynorsk': 0.013873178503603412, 'Welsh': 0.014819614144940351, 'Malay (macrolanguage)': 0.011636508948838081, 'Japanese': 0.01431274796390453, 'Korean': 0.009380414244541953, 'Latin': 0.018275934812962805, 'German': 0.0025031598469416622, 'Thai': 0.0024497907615411807, 'French': 0.0023213868543528974, 'Italian': 0.0006629226805232065, 'Khmer': 0.0016772620422373734, 'Burmese': 0.00984911193476379, 'Tibetan': 0.004100653456382938, 'Spanish': 0.001085257676695619, 'Vietnamese': 0.0006541767716267145, 'Arabic': 0.0018436331870868117, 'Urdu': 0.0005751339445852693, 'Hindi': 0.0005146765732202596, 'Maori': 0.002648595049092863, 'Sinhala': 0.0008576318442581936, 'Slovenian': 0.0006138523228289578, 'Finnish': 0.0005412157170124175, 'Danish': 0.0022240435679937265, 'Portuguese': 0.001165518401266229, 'Sanskrit': 0.00034307935641071766, 'Dutch': 0.00

In [6]:
# Dataset slug; it names both the Kaggle input folder and the output files below.
base_name = "speech-accent-archive"
folder = f"/kaggle/input/{base_name}/recordings/recordings"
output_file = f"/kaggle/working/{base_name}.json"
output_csv = f"/kaggle/working/{base_name}_accent_means.csv"

# Analyse every recording, then aggregate the per-accent means and export them as CSV.
process_audio_files(folder, output_file)
accent_means = load_and_calculate_means(output_file)
save_means_to_csv(accent_means, output_csv)

100%|██████████| 2138/2138 [2:08:13<00:00,  3.60s/it]


Processed results saved to /kaggle/working/speech-accent-archive.json


In [7]:
import polars as pl

# Speaker metadata of the Speech Accent Archive dataset.
csv_path = "/kaggle/input/speech-accent-archive/speakers_all.csv"

# Force `age` to Float64. `schema_overrides` is the current name of the deprecated
# `dtypes` argument of `pl.read_csv` (the warning emitted by this cell pointed at
# this call).
df = pl.read_csv(
    csv_path,
    schema_overrides={"age": pl.Float64},
)

print(df.head())
print(df.schema)

shape: (5, 12)
┌──────┬───────────┬──────────────┬──────────┬───┬──────────────┬──────┬─────────────┬─────────────┐
│ age  ┆ age_onset ┆ birthplace   ┆ filename ┆ … ┆ file_missing ┆      ┆ _duplicated ┆ _duplicated │
│ ---  ┆ ---       ┆ ---          ┆ ---      ┆   ┆ ?            ┆ ---  ┆ _0          ┆ _1          │
│ f64  ┆ f64       ┆ str          ┆ str      ┆   ┆ ---          ┆ str  ┆ ---         ┆ ---         │
│      ┆           ┆              ┆          ┆   ┆ bool         ┆      ┆ str         ┆ str         │
╞══════╪═══════════╪══════════════╪══════════╪═══╪══════════════╪══════╪═════════════╪═════════════╡
│ 24.0 ┆ 12.0      ┆ koussi,      ┆ balanta  ┆ … ┆ true         ┆ null ┆ null        ┆ null        │
│      ┆           ┆ senegal      ┆          ┆   ┆              ┆      ┆             ┆             │
│ 18.0 ┆ 10.0      ┆ buea,        ┆ cameroon ┆ … ┆ true         ┆ null ┆ null        ┆ null        │
│      ┆           ┆ cameroon     ┆          ┆   ┆              ┆      ┆    

  df = pl.read_csv(


In [8]:
# Distinct values of the `native_language` column, printed as a plain list.
unique_languages = df.select("native_language").unique()
print(unique_languages["native_language"].to_list())

['french', 'fang', 'moore', 'yiddish', 'khmer', 'korean', 'sarua', 'tatar', 'mauritian', 'bamun', 'mongolian', 'poonchi', 'icelandic', 'igbo', 'sindhi', 'naxi', 'uzbek', 'lamaholot', 'xiang', 'twi', 'sardinian', 'kikongo', 'hadiyya', 'lithuanian', 'marathi', 'urdu', 'tetun-dili', 'agni', 'hmong', 'rotuman', 'satawalese', 'synthesized', 'yakut', 'nicaragua', 'croatian', 'kazakh', 'vietnamese', 'tibetan', 'gedeo', 'bari', 'fanti', 'pulaar', 'tamil', 'xasonga', 'amazigh', 'rwanda', 'estonian', 'nandi', 'ga', 'sundanese', 'hindi', 'bafang', 'danish', 'chamorro', 'gujarati', 'haitian', 'slovak', 'hindko', 'thai', 'mizo', 'afrikaans', 'tagalog', 'hungarian', 'frisian', 'yoruba', 'yupik', 'jamaican', 'tswana', 'nuer', 'quechua', 'susu', 'hainanese', 'japanese', 'jola', 'slovenian', 'tok', 'northern', 'greek', 'faroese', 'ashanti', 'baga', 'somali', 'miskito', 'nepali', 'oriya', 'russian', 'garifuna', 'ife', 'arabic', 'liberian', 'catalan', 'pashto', 'mankanya', 'zulu', 'luo', 'temne', 'azerba