In [1]:
%pip install -q -U datasets evaluate accelerate python-iso639

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/510.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m256.0/510.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [9

In [2]:
from dataclasses import dataclass, asdict
from typing import Any
import datetime

import datasets
import transformers
from transformers import AutoModel, Trainer, AutoModelForAudioClassification, TrainingArguments
from torch.utils.data import DataLoader
import evaluate
from tqdm.auto import trange, tqdm
import numpy as np
import iso639

In [3]:
# Mount Google Drive at /content/drive (this import is only available inside Google Colab).
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Base directory on the mounted Drive; the evaluation loop writes each model's result files
# into a subdirectory of it named after the model.
results_output_path = "/content/drive/MyDrive/LangId Evaluation/"

# Load datasets

In [5]:
# Stream the validation and test splits of FLEURS (the "all" configuration) and chain them into
# one iterable dataset that the evaluation loop reads from.
# NOTE(review): the warning printed by this cell says `trust_remote_code=True` will become
# mandatory for this dataset in the next major `datasets` release.
fleurs_val = datasets.load_dataset("google/fleurs", "all", split="validation", streaming=True)
fleurs_test = datasets.load_dataset("google/fleurs", "all", split="test", streaming=True)
fleurs = datasets.concatenate_datasets([fleurs_val, fleurs_test])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/12.6k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/13.3k [00:00<?, ?B/s]

In [6]:
# Stream MInDS-14 (the "all" configuration).
# NOTE(review): no `split` is passed, so this is presumably a mapping of splits rather than a
# single dataset — confirm before iterating over it.
minds_14 = datasets.load_dataset("PolyAI/minds14", "all", streaming=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/5.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.29k [00:00<?, ?B/s]

# Set up results

In [7]:
@dataclass
class LangIdResult:
  """One evaluated audio sample: its reference language and the classifier's predictions for it."""
  # Name of the dataset the sample came from, e.g. "google/fleurs".
  dataset: str
  # The sample's "id" field as provided by the dataset.
  sample_id: int
  # Reference language as a global id (see iso639_part3_to_global_id).
  label: int
  # Classifier predictions; the evaluation loop stores dicts with a "score" and a global-id "label".
  predictions: list[dict[str, Any]]

In [8]:
def lower_letter_to_num(letter:str) -> int:
  """Returns the zero-based alphabet index of a lowercase English letter.

  Args:
    letter: a string of exactly one character in the range "a".."z".

  Returns:
    0 for "a" through 25 for "z".

  Raises:
    AssertionError: if ``letter`` is not a single lowercase ASCII letter.

  >>> lower_letter_to_num("a")
  0
  >>> lower_letter_to_num("z")
  25
  >>> lower_letter_to_num("k")
  10
  """
  assert len(letter) == 1, f'Letter "{letter}" must be a single letter'
  letter_num = ord(letter) - ord('a')
  # Upper bound is exclusive: index 26 would be "{", the character right after "z".
  assert 0 <= letter_num < 26, f'Letter "{letter}" must be a lowercase English (ASCII) letter'
  return letter_num

In [9]:
def iso639_part3_to_global_id(lang:str) -> int:
  """Maps a three-letter ISO 639-3 code to the integer id shared by every model and dataset.

  ISO 639-3 codes are unique on their own, but parts of the Huggingface tooling want integer
  labels. The code is therefore read as a three-digit base-26 number, most significant letter
  first, with "a" as digit 0 and "z" as digit 25.

  >>> iso639_part3_to_global_id("yue")
  16748
  >>> iso639_part3_to_global_id("eng")
  3048
  """
  assert len(lang) == 3, f'lang must be an ISO 639-3 language code. Got "{lang}"'
  global_id = 0
  for letter in lang:
    # Horner's scheme: shift the digits gathered so far one place left, then add the next one.
    global_id = global_id * 26 + lower_letter_to_num(letter)
  return global_id

In [10]:
def letter_num_to_letter(letter_num:int) -> str:
  """Returns the lowercase English letter at a zero-based alphabet index (0 -> "a", 25 -> "z").

  The index must lie in the half-open range [0, 26); anything else trips the assertion.
  """
  assert letter_num >= 0 and letter_num < 26
  first_letter_code = ord('a')
  return chr(first_letter_code + letter_num)

In [11]:
def global_id_to_iso639_part3(id:int) -> str:
  """Turns a global integer id back into its three-letter code.

  The three lowest base-26 digits of ``id`` become the letters, most significant first;
  anything above the third digit is discarded by the modulo.
  """
  remaining, third_digit = divmod(id, 26)
  remaining, second_digit = divmod(remaining, 26)
  first_digit = remaining % 26
  return "".join(
    letter_num_to_letter(digit) for digit in (first_digit, second_digit, third_digit)
  )

In [12]:
def language_to_global_id(lang:str) -> int:
  """Resolves a language identifier via ``iso639.Language.match`` and returns its global id."""
  matched_language = iso639.Language.match(lang)
  return iso639_part3_to_global_id(matched_language.part3)

In [13]:
def remove_hyphen_language_to_global_id(label:str):
  """Looks up the global id for a label after swapping every hyphen for a space."""
  spaced_label = " ".join(label.split("-"))
  return language_to_global_id(spaced_label)

In [14]:
def sanchit_gandhi_whisper_medium_fleurs_lang_id_output_to_global_id(label:str):
  """Converts an output label of sanchit-gandhi/whisper-medium-fleurs-lang-id to a global id.

  The label is first mapped to the model's class index, the class index is then used to pick a
  FLEURS locale string, and the language part of that locale (the text before the first "_")
  is finally resolved to a global id.

  Raises KeyError when ``label`` is not one of the names in the table below.
  """
  # Class index for each label the model emits. Generated from the model's config:
  # {k:int(v) for k, v in classifier.model.config.label2id.items()}
  model_label_to_index = {
      'Afrikaans': 0,
      'Amharic': 1,
      'Arabic': 2,
      'Armenian': 35,
      'Assamese': 3,
      'Asturian': 4,
      'Azerbaijani': 5,
      'Belarusian': 6,
      'Bengali': 8,
      'Bosnian': 9,
      'Bulgarian': 7,
      'Burmese': 64,
      'Cantonese Chinese': 100,
      'Catalan': 10,
      'Cebuano': 11,
      'Croatian': 33,
      'Czech': 14,
      'Danish': 16,
      'Dutch': 67,
      'English': 19,
      'Estonian': 21,
      'Filipino': 25,
      'Finnish': 24,
      'French': 26,
      'Fula': 23,
      'Galician': 28,
      'Ganda': 51,
      'Georgian': 42,
      'German': 17,
      'Greek': 18,
      'Gujarati': 29,
      'Hausa': 30,
      'Hebrew': 31,
      'Hindi': 32,
      'Hungarian': 34,
      'Icelandic': 38,
      'Igbo': 37,
      'Indonesian': 36,
      'Irish': 27,
      'Italian': 39,
      'Japanese': 40,
      'Javanese': 41,
      'Kabuverdianu': 44,
      'Kamba': 43,
      'Kannada': 47,
      'Kazakh': 45,
      'Khmer': 46,
      'Korean': 48,
      'Kyrgyz': 49,
      'Lao': 53,
      'Latvian': 56,
      'Lingala': 52,
      'Lithuanian': 54,
      'Luo': 55,
      'Luxembourgish': 50,
      'Macedonian': 58,
      'Malay': 62,
      'Malayalam': 59,
      'Maltese': 63,
      'Mandarin Chinese': 13,
      'Maori': 57,
      'Marathi': 61,
      'Mongolian': 60,
      'Nepali': 66,
      'Northern-Sotho': 68,
      'Norwegian': 65,
      'Nyanja': 69,
      'Occitan': 70,
      'Oriya': 72,
      'Oromo': 71,
      'Pashto': 75,
      'Persian': 22,
      'Polish': 74,
      'Portuguese': 76,
      'Punjabi': 73,
      'Romanian': 77,
      'Russian': 78,
      'Serbian': 84,
      'Shona': 82,
      'Sindhi': 79,
      'Slovak': 80,
      'Slovenian': 81,
      'Somali': 83,
      'Sorani-Kurdish': 12,
      'Spanish': 20,
      'Swahili': 86,
      'Swedish': 85,
      'Tajik': 89,
      'Tamil': 87,
      'Telugu': 88,
      'Thai': 90,
      'Turkish': 91,
      'Ukrainian': 92,
      'Umbundu': 93,
      'Urdu': 94,
      'Uzbek': 95,
      'Vietnamese': 96,
      'Welsh': 15,
      'Wolof': 97,
      'Xhosa': 98,
      'Yoruba': 99,
      'Zulu': 101
  }
  # FLEURS locale for each class index (list position == class index). Generated from the
  # fleurs mappings: fleurs.features["lang_id"]._int2str
  fleurs_locale_by_index = [
      'af_za',
      'am_et',
      'ar_eg',
      'as_in',
      'ast_es',
      'az_az',
      'be_by',
      'bg_bg',
      'bn_in',
      'bs_ba',
      'ca_es',
      'ceb_ph',
      'ckb_iq',
      'cmn_hans_cn',
      'cs_cz',
      'cy_gb',
      'da_dk',
      'de_de',
      'el_gr',
      'en_us',
      'es_419',
      'et_ee',
      'fa_ir',
      'ff_sn',
      'fi_fi',
      'fil_ph',
      'fr_fr',
      'ga_ie',
      'gl_es',
      'gu_in',
      'ha_ng',
      'he_il',
      'hi_in',
      'hr_hr',
      'hu_hu',
      'hy_am',
      'id_id',
      'ig_ng',
      'is_is',
      'it_it',
      'ja_jp',
      'jv_id',
      'ka_ge',
      'kam_ke',
      'kea_cv',
      'kk_kz',
      'km_kh',
      'kn_in',
      'ko_kr',
      'ky_kg',
      'lb_lu',
      'lg_ug',
      'ln_cd',
      'lo_la',
      'lt_lt',
      'luo_ke',
      'lv_lv',
      'mi_nz',
      'mk_mk',
      'ml_in',
      'mn_mn',
      'mr_in',
      'ms_my',
      'mt_mt',
      'my_mm',
      'nb_no',
      'ne_np',
      'nl_nl',
      'nso_za',
      'ny_mw',
      'oc_fr',
      'om_et',
      'or_in',
      'pa_in',
      'pl_pl',
      'ps_af',
      'pt_br',
      'ro_ro',
      'ru_ru',
      'sd_in',
      'sk_sk',
      'sl_si',
      'sn_zw',
      'so_so',
      'sr_rs',
      'sv_se',
      'sw_ke',
      'ta_in',
      'te_in',
      'tg_tj',
      'th_th',
      'tr_tr',
      'uk_ua',
      'umb_ao',
      'ur_pk',
      'uz_uz',
      'vi_vn',
      'wo_sn',
      'xh_za',
      'yo_ng',
      'yue_hant_hk',
      'zu_za',
      'all',
  ]

  class_index = model_label_to_index[label]
  locale = fleurs_locale_by_index[class_index]
  # The language code is everything before the first underscore, e.g. "yue" in "yue_hant_hk".
  language_code, _, _ = locale.partition("_")
  return language_to_global_id(language_code)

In [15]:
# Models to evaluate. Each entry has:
#   "model_or_path": model id handed to the audio-classification pipeline
#   "output_label_to_global_id": function turning one of the model's output labels into a global id
models = [
  {
      "model_or_path": "facebook/mms-lid-126",
      "output_label_to_global_id": language_to_global_id,
      # Mappings for non-standard language names not recognized by iso639 to the proper ISO 639 part 3 code
      # "custom_language_mappings": {},
  }, {
      "model_or_path": "sanchit-gandhi/whisper-medium-fleurs-lang-id",
      "output_label_to_global_id": sanchit_gandhi_whisper_medium_fleurs_lang_id_output_to_global_id,
      # This model has the same ids as fleurs, but the labels it outputs aren't recognized by iso639. We can use fleurs str2id
      # "custom_language_mappings": {
      # },
  }
]

In [16]:
def classifier_predictions_to_ids(predictions: list[dict]) -> list[dict]:
  """Rewrites predictions so each carries an ISO 639-3 code under "lang_id".

  Each prediction's "label" is matched to a language and converted to its three-letter code;
  the "score" is copied over unchanged.
  """
  converted = []
  for pred in predictions:
    score = pred["score"]
    lang_id = global_id_to_iso639_part3(language_to_global_id(pred["label"]))
    converted.append({"score": score, "lang_id": lang_id})
  return converted

In [17]:
def classifier_predictions_to_iso_639_part3(predictions: list[dict]) -> list[dict]:
  """Rewrites predictions whose "label" is a global id so they carry an ISO 639-3 "lang_id".

  The "score" of every prediction is copied over unchanged.
  """
  converted = []
  for pred in predictions:
    score = pred["score"]
    lang_id = global_id_to_iso639_part3(pred["label"])
    converted.append({"score": score, "lang_id": lang_id})
  return converted

In [18]:
import json
import os

def write_results(output_basepath, results: list[LangIdResult], time=None):
  """Writes evaluation results as two JSON files inside ``output_basepath``.

  Files written (``<prefix>`` is ``<timestamp>_`` when ``time`` is given, otherwise empty):
    * ``<prefix>results_global_ids.json``: every result as-is, labels as global integer ids.
    * ``<prefix>results_iso_639-3.json``: the same results with labels and predictions
      converted to ISO 639-3 codes.

  Args:
    output_basepath: directory to write into; created (with parents) if it does not exist.
    results: the results to serialize.
    time: optional datetime used to prefix the file names, formatted as
      ``%Y-%m-%dT%H_%M_%S``. Without it the files are named without any prefix.
  """
  # exist_ok tolerates re-runs into the same directory; other OS errors still propagate.
  os.makedirs(output_basepath, exist_ok=True)

  # Add the separating underscore only when there is a timestamp to separate, so the
  # un-timestamped names are plain "results_*.json" rather than "_results_*.json".
  if time is not None:
    file_prefix = time.strftime("%Y-%m-%dT%H_%M_%S") + "_"
  else:
    file_prefix = ""

  global_ids_path = os.path.join(output_basepath, f"{file_prefix}results_global_ids.json")
  with open(global_ids_path, "w") as out_file:
    json.dump([asdict(r) for r in results], out_file)

  # Built under its own name so the `results` argument is not shadowed.
  iso_results = [{
    "dataset": r.dataset,
    "sample_id": r.sample_id,
    "predictions": classifier_predictions_to_iso_639_part3(r.predictions),
    "label": global_id_to_iso639_part3(r.label),
  } for r in results]
  iso_path = os.path.join(output_basepath, f"{file_prefix}results_iso_639-3.json")
  with open(iso_path, "w") as out_file:
    json.dump(iso_results, out_file)


In [19]:
def get_id_and_label_mappings(model):
  """Returns ``(label2id, id2label)`` for a model whose config may define only one of them.

  Whichever mapping is missing (``None``) is derived by inverting the other one. When both
  are present they are returned unchanged.

  Args:
    model: object exposing ``config.label2id`` and ``config.id2label``.

  Returns:
    A ``(label2id, id2label)`` tuple of dicts.

  Raises:
    AssertionError: if the config has neither mapping.
  """
  label2id = model.config.label2id
  id2label = model.config.id2label
  assert label2id is not None or id2label is not None, f"Model has neither label2id nor id2label. label2id: {label2id}. id2label: {id2label}"
  if id2label is None:
    # Swap keys and values: label -> id becomes id -> label.
    id2label = {class_id: label for label, class_id in label2id.items()}
  elif label2id is None:
    label2id = {label: class_id for class_id, label in id2label.items()}

  return label2id, id2label

# Run evaluations

At this point, each loaded dataset should have two columns:
1. "audio" with the audio data
2. "language" with the labeled language of the audio, in any format that `python-iso639` can match (for example an ISO 639-3 code or a human-readable language name)

In [20]:
# Number of FLEURS samples scored per model. Kept tiny for a quick smoke test; raise it for a
# real evaluation.
num_eval_samples = 1

for model_info in models:
  model_name_or_path = model_info["model_or_path"]
  output_label_to_global_id = model_info["output_label_to_global_id"]
  print("Evaluating", model_name_or_path)
  classifier = transformers.pipeline(
    "audio-classification", model=model_name_or_path,
    num_workers=2
  )

  # Metrics are loaded inside the loop so each model accumulates only its own samples.
  accuracy = evaluate.load("accuracy")
  f1 = evaluate.load("f1")

  # List of evaluation results. Each evaluation result is a LangIdResult
  results: list[LangIdResult] = []
  # From dataset card on Huggingface (not used yet; the loop below only takes num_eval_samples)
  fleurs_length = 403_860
  # Consider batching on GPUs. Should measure performance to decide if batching is useful.
  for batch in tqdm(fleurs.take(num_eval_samples), total=num_eval_samples):
    predictions = classifier(batch["audio"])

    # Convert every predicted label to a global id once; the first entry is scored as the
    # model's answer for this sample.
    preds_with_global_ids = [{
      "score": pred["score"],
      "label": output_label_to_global_id(pred["label"]),
    } for pred in predictions]
    pred_global_id = preds_with_global_ids[0]["label"]

    ref_language = iso639.Language.match(batch["language"])
    reference_global_id = iso639_part3_to_global_id(ref_language.part3)

    accuracy.add_batch(references=[reference_global_id], predictions=[pred_global_id])
    f1.add_batch(references=[reference_global_id], predictions=[pred_global_id])

    results.append(LangIdResult(
        dataset="google/fleurs",
        sample_id=batch["id"],
        label=reference_global_id,
        predictions=preds_with_global_ids
    ))

  print(accuracy.compute())
  # Labels are global language ids (many classes), so the metric's default binary averaging
  # does not apply; macro-average the per-language F1 scores instead.
  print(f1.compute(average="macro"))

  # os.path.join avoids a doubled "/" when results_output_path already ends with one.
  output_basepath = os.path.join(results_output_path, classifier.model.name_or_path)
  write_results(output_basepath, results, datetime.datetime.now())

  # We only need one model at a time. Try to prevent memory growing as more models are loaded over time
  #del classifier

Evaluating facebook/mms-lid-126


config.json:   0%|          | 0.00/4.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/mms-lid-126 were not used when initializing Wav2Vec2ForSequenceClassification: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/mms-lid-126 and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametriza

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'accuracy': 1.0}
Evaluating sanchit-gandhi/whisper-medium-fleurs-lang-id


config.json:   0%|          | 0.00/6.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/615M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

  0%|          | 0/50 [00:00<?, ?it/s]

{'accuracy': 0.78}


In [89]:
# The evaluation loop calls write_results with a timestamp, so the file name carries a time
# prefix. Load the most recently written ISO 639-3 results file instead of assuming an
# un-prefixed name.
iso_results_paths = [
  os.path.join(output_basepath, name)
  for name in os.listdir(output_basepath)
  if name.endswith("results_iso_639-3.json")
]
with open(max(iso_results_paths, key=os.path.getmtime), "r") as in_file:
  latest_iso_results = json.load(in_file)
latest_iso_results

[{'dataset': 'google/fleurs',
  'sample_id': 1326,
  'predictions': [{'score': 0.9999330043792725, 'lang_id': 'afr'},
   {'score': 7.093016847647959e-06, 'lang_id': 'nso'},
   {'score': 4.269149485480739e-06, 'lang_id': 'isl'},
   {'score': 3.2661141631251667e-06, 'lang_id': 'dan'},
   {'score': 3.258075366829871e-06, 'lang_id': 'yue'}],
  'label': 'afr'}]

In [90]:
# The evaluation loop calls write_results with a timestamp, so the file name carries a time
# prefix. Load the most recently written global-id results file instead of assuming an
# un-prefixed name.
global_id_results_paths = [
  os.path.join(output_basepath, name)
  for name in os.listdir(output_basepath)
  if name.endswith("results_global_ids.json")
]
with open(max(global_id_results_paths, key=os.path.getmtime), "r") as in_file:
  latest_global_id_results = json.load(in_file)
latest_global_id_results

[{'dataset': 'google/fleurs',
  'sample_id': 1326,
  'label': 147,
  'predictions': [{'score': 0.9999330043792725, 'label': 147},
   {'score': 7.093016847647959e-06, 'label': 9270},
   {'score': 4.269149485480739e-06, 'label': 5887},
   {'score': 3.2661141631251667e-06, 'label': 2041},
   {'score': 3.258075366829871e-06, 'label': 16748}]}]

In [None]:
def classifier_prediction_to_ids(predictions: list[dict]):
  """Replaces each prediction's "label" with its ``lang_to_id`` entry, stored under "lang_id".

  The "score" of every prediction is copied over unchanged.

  NOTE(review): ``lang_to_id`` is not defined in the visible part of the notebook — confirm it
  exists before relying on this cell.
  """
  converted = []
  for pred in predictions:
    score = pred["score"]
    converted.append({"score": score, "lang_id": lang_to_id[pred["label"]]})
  return converted