In [None]:
# Requirements
! pip install fasttext
! pip install huggingface_hub
! pip install "numpy<2.0"
! pip install datasets
! pip install gcld3
! pip install GlotScript

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25ldone
[?25h  Created wheel for fasttext: filename=fasttext-0.9.3-cp312-cp312-linux_x86_64.whl size=326817 sha256=904111c5a6638ec51a98928c5ea59ba8286b89961091895f2dd022a77572a497
  Stored in directory: /home/onyxia/.cache/pip/wheels/20/27/95/a7baf1b435f1cbde017cabdf1e9688526d2b0e929255a359c6
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.3 pybind11-2.13.6
Note: you may need to restart the kernel to use updated packages.


## 1. GlotLID-M on UDHR

In [None]:
# Imports
import fasttext
from huggingface_hub import hf_hub_download
from datasets import load_dataset
import pandas as pd
from metrics import compute_f1_false_positive

In [None]:
# Load the model
# cache_dir is the path to the folder where the downloaded model will be stored/cached.
model_path = hf_hub_download(repo_id="cis-lmu/glotlid", filename="model.bin", cache_dir=None)
print("model path:", model_path)

# load the model
glotlid_m = fasttext.load_model(model_path)

model path: /home/onyxia/.cache/huggingface/hub/models--cis-lmu--glotlid/snapshots/74cb50b709c9eefe0f790030c6c95c461b4e3b77/model.bin


In [None]:
# First test
"""Language Identification"""
# predict language label (call this function as many times as needed)
glotlid_m.predict("Hello, world!")
# (('__label__eng_Latn',), array([0.99850202]))

(('__label__eng_Latn',), array([0.99637443]))

In [114]:
# Load test dataset 1 : UDHR - clean version for LID
udhr = load_dataset('cis-lmu/udhr-lid', split='test')
print(udhr[0])
udhr = pd.DataFrame(udhr)
udhr = udhr.rename(columns={'id': 'iso_script'})

{'id': 'tir_Ethi', 'sentence': 'ሕድሕድ ኦብ ስራሕ ዝርከብ ሰብ ሰብኦዊ ክብሩን ክብሪ ቤተሰቡን ዝሕለወሉ ኦድላዪ እንተኾይኑ’ውን ብካልኦት ማሕበራዊ ውሕስነታት ዝድገፍ ፍትሓውን ዘተባብዕን ፃማ ሪሃፁ ናይ ምርካብ መሰል ኦለዎ፡፡', 'iso639-3': 'tir', 'iso15924': 'Ethi', 'language': 'Tigrinya'}


In [None]:
# Apply the model to predict labels
udhr_glotlid_pred = udhr
udhr_glotlid_pred['top_pred'] = udhr_glotlid_pred['sentence'].apply(lambda x: glotlid_m.predict(x)[0][0].replace("__label__", ""))
print(udhr_glotlid_pred.head())

         id                                           sentence iso639-3  \
0  tir_Ethi  ሕድሕድ ኦብ ስራሕ ዝርከብ ሰብ ሰብኦዊ ክብሩን ክብሪ ቤተሰቡን ዝሕለወሉ ...      tir   
1  tir_Ethi  ሕድሕድ ሰብ ከቢድ ስቓይ ዘውርድ፣ ጭካነ ዝመልኦ፣ ኢሰብኦዊ ወይ ሰብኦዊ ...      tir   
2  tir_Ethi  ሕድሕድ ሰብ መሰሉን ግቡእን፣ ከምኡ’ውን ንዝቐርበሉ ዝኾነ ይኹን ገበናዊ ...      tir   
3  tir_Ethi  ንዓቕመ ኦዳምን ሄዋንን ዝበፅሑ ደቂ ተባዕትዮን ደቂ ኦንስትዮን ዘርኢ፣ ዜ...      tir   
4  tir_Ethi                       ሕድሕድ ሰብ ብማሕበር ንኽውደብ ኦይግደድን፡፡      tir   

  iso15924  language  top_pred  
0     Ethi  Tigrinya  tir_Ethi  
1     Ethi  Tigrinya  tir_Ethi  
2     Ethi  Tigrinya  tir_Ethi  
3     Ethi  Tigrinya  tir_Ethi  
4     Ethi  Tigrinya  tir_Ethi  


In [None]:
# Compute F1 and false positive rate
result_udhr_glotlid = compute_f1_false_positive(udhr_glotlid_pred)

print("Result DataFrame:")
print(result_udhr_glotlid)

Result DataFrame:
        label  f1_score  precision_score  recall_score  false_positive_rate
0    pnb_Arab  0.748466         0.598039      1.000000             0.001480
1    ido_Latn  1.000000         1.000000      1.000000             0.000000
2    hnj_Latn  0.000000         0.000000      0.000000             0.000000
3    pap_Latn  0.943089         0.892308      1.000000             0.000253
4    kan_Knda  1.000000         1.000000      1.000000             0.000000
..        ...       ...              ...           ...                  ...
963  fub_Latn  0.000000         0.000000      0.000000             0.000252
964  kng_Latn  0.191617         0.307692      0.139130             0.001302
965  kab_Latn  0.957983         0.934426      0.982759             0.000144
966  nso_Latn  1.000000         1.000000      1.000000             0.000000
967  mcu_Latn  0.000000         0.000000      0.000000             0.000036

[968 rows x 5 columns]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Compute average results
average_udhr_glotlid = result_udhr_glotlid.drop(columns=['label']).mean()
print("Averaged result DataFrame:")
print(average_udhr_glotlid)

Averaged result DataFrame:
f1_score               0.325932
precision_score        0.335005
recall_score           0.331207
false_positive_rate    0.000266
dtype: float64
Averaged result DataFrame on lines with f1_score>0:
f1_score               0.904019
precision_score        0.929182
recall_score           0.918648
false_positive_rate    0.000233
dtype: float64


In [None]:
# To reproduce prior work, macro averages must be computed on the intersection L = intersection of languages of GlotLID-M and languages present in UDHR

# Create a list of GlotLID languages
languages_glotlid = pd.read_csv('languages_glotlid.csv', header=None)[0].tolist()
print(languages_glotlid)
print(f"Number of languages in GlotLID: {len(languages_glotlid)}")

# Create a list of UDHR languages
languages_udhr = list(set(udhr['iso_script']))
print(languages_udhr)
print(f"Number of languages in UDHR: {len(languages_udhr)}")

# Get the intersection
languages_intersection = list(set(languages_glotlid) & set(languages_udhr))
print(languages_intersection)
print(f"Number of languages in the intersection: {len(languages_intersection)}")

['aai_Latn', 'aak_Latn', 'aau_Latn', 'aaz_Latn', 'aba_Latn', 'abi_Latn', 'abk_Cyrl', 'abn_Latn', 'abq_Cyrl', 'abs_Latn', 'abt_Latn', 'abx_Latn', 'aby_Latn', 'abz_Latn', 'aca_Latn', 'acd_Latn', 'ace_Arab', 'ace_Latn', 'acf_Latn', 'ach_Latn', 'acm_Arab', 'acn_Latn', 'acr_Latn', 'acu_Latn', 'ada_Latn', 'ade_Latn', 'adh_Latn', 'adi_Latn', 'adj_Latn', 'adl_Latn', 'ady_Cyrl', 'adz_Latn', 'aeb_Arab', 'aer_Latn', 'aeu_Latn', 'aey_Latn', 'afr_Latn', 'agd_Latn', 'agg_Latn', 'agm_Latn', 'agn_Latn', 'agr_Latn', 'agt_Latn', 'agu_Latn', 'agw_Latn', 'agx_Cyrl', 'aha_Latn', 'ahk_Latn', 'aia_Latn', 'aii_Syrc', 'aim_Latn', 'ain_Latn', 'ajg_Latn', 'aji_Latn', 'ajp_Arab', 'ajz_Latn', 'akb_Latn', 'ake_Latn', 'akh_Latn', 'akp_Latn', 'ald_Latn', 'alj_Latn', 'aln_Latn', 'alp_Latn', 'alq_Latn', 'als_Latn', 'alt_Cyrl', 'aly_Latn', 'alz_Latn', 'ame_Latn', 'amf_Latn', 'amh_Ethi', 'ami_Latn', 'amk_Latn', 'amm_Latn', 'amn_Latn', 'amp_Latn', 'amr_Latn', 'amu_Latn', 'amx_Latn', 'ang_Latn', 'anm_Latn', 'ann_Latn', 'an

In [None]:
# Filter results
result_udhr_glotlid_filt = result_udhr_glotlid[result_udhr_glotlid['label'].isin(languages_intersection)]
print(result_udhr_glotlid_filt)

# Compute the macro averages on the filtered results
average_udhr_glotlid_filt = result_udhr_glotlid_filt.drop(columns=['label']).mean()
print("Averaged result DataFrame (computed on languages present in both GlotLID and UDHR datasets):")
print(average_udhr_glotlid_filt)


        label  f1_score  precision_score  recall_score  false_positive_rate
0    pnb_Arab  0.748466         0.598039      1.000000             0.001480
1    ido_Latn  1.000000         1.000000      1.000000             0.000000
2    hnj_Latn  0.000000         0.000000      0.000000             0.000000
3    pap_Latn  0.943089         0.892308      1.000000             0.000253
4    kan_Knda  1.000000         1.000000      1.000000             0.000000
..        ...       ...              ...           ...                  ...
959  tdt_Latn  0.662651         0.504587      0.964912             0.001949
961  ton_Latn  0.991736         0.983607      1.000000             0.000036
964  kng_Latn  0.191617         0.307692      0.139130             0.001302
965  kab_Latn  0.957983         0.934426      0.982759             0.000144
966  nso_Latn  1.000000         1.000000      1.000000             0.000000

[371 rows x 5 columns]
Averaged result DataFrame (computed on languages present in both

## 2. Baselines on UDHR

In [None]:
# Imports
import baselines

In [109]:
# Load the model
CLD3 = baselines.CLD3()

# First prediction
CLD3.predict_lang_with_confidence("Hello, world!")

('kir-Cyrl', 0.7191885113716125)

In [None]:
# Apply the model to predict labels
udhr_cld3_pred = udhr
udhr_cld3_pred['top_pred'] = udhr_cld3_pred['sentence'].apply(lambda x: CLD3.predict_lang_with_confidence(x)[0])
print(udhr_cld3_pred.head(100))

   iso_script                                           sentence iso639-3  \
0    tir_Ethi  ሕድሕድ ኦብ ስራሕ ዝርከብ ሰብ ሰብኦዊ ክብሩን ክብሪ ቤተሰቡን ዝሕለወሉ ...      tir   
1    tir_Ethi  ሕድሕድ ሰብ ከቢድ ስቓይ ዘውርድ፣ ጭካነ ዝመልኦ፣ ኢሰብኦዊ ወይ ሰብኦዊ ...      tir   
2    tir_Ethi  ሕድሕድ ሰብ መሰሉን ግቡእን፣ ከምኡ’ውን ንዝቐርበሉ ዝኾነ ይኹን ገበናዊ ...      tir   
3    tir_Ethi  ንዓቕመ ኦዳምን ሄዋንን ዝበፅሑ ደቂ ተባዕትዮን ደቂ ኦንስትዮን ዘርኢ፣ ዜ...      tir   
4    tir_Ethi                       ሕድሕድ ሰብ ብማሕበር ንኽውደብ ኦይግደድን፡፡      tir   
5    tir_Ethi  እዞም መሰላትን ነፃነታትን እዚኦቶም ብዝኾነ ይኹን መንገዲ ኦንፃር ዕላማታ...      tir   
6    tir_Ethi  ሕድሕድ ሰብ ኦብ ውሽጢ ዝነብሪሉ ሃገር ናይ ምዝውዋርን ምንባርን መሰል ኦ...      tir   
7    tir_Ethi  ሕድሕድ ሰብ ብባርነት ወይ ብጊልያነት ክግዛእ የብሉን፡፡ ባርነትን ናይ ባ...      tir   
8    tir_Ethi  ሕድሕድ ሰብ ሓደ ነገር ኦብ ዝተፈፀመሉ እዋን ነቲ ጉዳይ ምፍፃም ወይ ዘይ...      tir   
9    tir_Ethi  ሕድሕድ ሰብ ካብ ንህይወቱ ሓደገኛ ዝኾኑ ግፍዕታት ኦምሊጡ ኦብ ካልኦት ሃ...      tir   
10   tir_Ethi  ሕድሕድ ሰብ ባዕሉ ካብ ዝፈጠሮ ሳይንሳዊ፣ ስነ ፅሑፋዊ ወይ ኪነታዊ ውፅኢ...      tir   
11   tir_Ethi  ሕድሕድ ሰብ ብቀጥታ ወይ ብነፃ መሪፃ ብዝተመሪፁ ወከልቱ ኦቢሉ ኦብ መንግ...      tir   

In [None]:
# Compute F1 and false positive rate
result_udhr_cld3 = compute_f1_false_positive(udhr_cld3_pred)

print("Result DataFrame:")
print(result_udhr_cld3)

# PROBLEME : tous les f1_score sont = 0 -> pb dans les labels ? 

Result DataFrame:
        label  f1_score  precision_score  recall_score  false_positive_rate
0    pnb_Arab       0.0              0.0           0.0              0.00000
1    ido_Latn       0.0              0.0           0.0              0.00000
2    hnj_Latn       0.0              0.0           0.0              0.00000
3    pap_Latn       0.0              0.0           0.0              0.00000
4    kan_Knda       0.0              0.0           0.0              0.00000
..        ...       ...              ...           ...                  ...
533  ton_Latn       0.0              0.0           0.0              0.00000
534  ltz-Latn       0.0              0.0           0.0              0.00562
535  kng_Latn       0.0              0.0           0.0              0.00000
536  kab_Latn       0.0              0.0           0.0              0.00000
537  nso_Latn       0.0              0.0           0.0              0.00000

[538 rows x 5 columns]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# To reproduce prior work, macro averages must be computed on the intersection L = intersection of languages of CLD3 and languages present in UDHR
# We already have languages in UDHR, we just have to retrieve languages present in CLD3 

# Create a list of CLD3 languages
# languages_cld3 = ??
print(languages_cld3)
print(f"Number of languages in CLD3: {len(languages_cld3)}")

# Get the intersection
languages_intersection = list(set(languages_cld3) & set(languages_udhr))
print(languages_intersection)
print(f"Number of languages in the intersection: {len(languages_intersection)}")

In [None]:
# Filter results
result_udhr_cld3_filt = result_udhr_cld3[result_udhr_cld3['label'].isin(languages_intersection)]
print(result_udhr_cld3_filt)

# Compute the macro averages on the filtered results
average_udhr_cld3_filt = result_udhr_cld3_filt.drop(columns=['label']).mean()
print("Averaged result DataFrame (computed on languages present in both CLD3 and UDHR datasets):")
print(average_udhr_cld3_filt)


## TO DO 

- Tester sur le test set 2 : FLORES
- Trouver comment tester sur le glotlid-corpus : le charger en entier (si possible) + le mettre dans le bon format (un seul df)
- Tester les modèles baselines sur les 3 datasets (avec les deux méthodes SET? / SET! ? en utilisant customlid.py pour la méthode SET! je crois)