## 1. GlotLID-M on UDHR-LID

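On fixe le seuil de confiance (threshold) $\theta = 0.5$ pour tous les modèles : une prédiction dont le score est inférieur à $\theta$ est étiquetée `undetermined`.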

In [1]:
# Imports
import fasttext
from huggingface_hub import hf_hub_download
from datasets import load_dataset
import pandas as pd
from metrics import compute_f1_false_positive

  from .autonotebook import tqdm as notebook_tqdm


Result DataFrame:
      label  f1_score  precision_score  recall_score  false_positive_rate
0  fas_Arab  0.000000              0.0           0.0             0.333333
1  eng_Latn  0.666667              0.5           1.0             0.500000
2  spa_Latn  0.000000              0.0           0.0             0.000000
3  fra_Latn  0.000000              0.0           0.0             0.000000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Fetch the GlotLID-M fastText weights from the Hugging Face Hub, then load them.
# NOTE(review): no `revision` is passed, so this presumably resolves to whatever
# `model.bin` is current on the Hub — consider pinning one for reproducible scores.
model_path_glotlid = hf_hub_download(
    repo_id="cis-lmu/glotlid",
    filename="model.bin",
    cache_dir=None,
)
glotlid_m = fasttext.load_model(model_path_glotlid)

model path: /home/onyxia/.cache/huggingface/hub/models--cis-lmu--glotlid/snapshots/74cb50b709c9eefe0f790030c6c95c461b4e3b77/model.bin


In [None]:
# Smoke test: predict the language of a short English sentence.
glotlid_m.predict("Hello, world!")
# Previously recorded result: (('__label__eng_Latn',), array([0.99850202]))
# The label should be the same; the score can differ from the current run.

(('__label__eng_Latn',), array([0.99637443]))

In [None]:
# UDHR-LID (clean version for LID), test split, as a DataFrame whose `id`
# column is exposed under the name `iso_script`.
udhr = pd.DataFrame(load_dataset('cis-lmu/udhr-lid', split='test')).rename(
    columns={'id': 'iso_script'}
)
print(udhr.head())

{'id': 'tir_Ethi', 'sentence': 'ሕድሕድ ኦብ ስራሕ ዝርከብ ሰብ ሰብኦዊ ክብሩን ክብሪ ቤተሰቡን ዝሕለወሉ ኦድላዪ እንተኾይኑ’ውን ብካልኦት ማሕበራዊ ውሕስነታት ዝድገፍ ፍትሓውን ዘተባብዕን ፃማ ሪሃፁ ናይ ምርካብ መሰል ኦለዎ፡፡', 'iso639-3': 'tir', 'iso15924': 'Ethi', 'language': 'Tigrinya'}


In [79]:
# Apply the model to predict labels

def get_glotlid_pred(x, threshold=0.5):
    """Return GlotLID-M's top label for one sentence, or 'undetermined'.

    Args:
        x: Sentence to classify (a single string).
        threshold: Minimum score the top prediction must reach to be kept.

    Returns:
        The top label with its ``__label__`` prefix removed (e.g. ``eng_Latn``),
        or ``'undetermined'`` when the top score is below ``threshold`` or the
        model returns no prediction.
    """
    # fastText's predict() handles one line at a time and raises on an embedded
    # newline, so flatten line breaks before predicting.
    labels, scores = glotlid_m.predict(x.replace("\n", " "))
    # Index the score explicitly instead of relying on the truth value of a
    # one-element array; an empty result falls through to 'undetermined'.
    if len(scores) > 0 and scores[0] >= threshold:
        return labels[0].replace("__label__", "")
    return 'undetermined'

# Build the prediction table as a new frame: `pd.DataFrame(udhr)` does not copy a
# DataFrame input by default, so adding `top_pred` to it could leak into `udhr`.
# `assign` returns a fresh object and leaves `udhr` untouched.
udhr_glotlid_pred = udhr.assign(top_pred=udhr['sentence'].apply(get_glotlid_pred))

print(udhr_glotlid_pred.head())


  iso_script                                           sentence iso639-3  \
0   tir_Ethi  ሕድሕድ ኦብ ስራሕ ዝርከብ ሰብ ሰብኦዊ ክብሩን ክብሪ ቤተሰቡን ዝሕለወሉ ...      tir   
1   tir_Ethi  ሕድሕድ ሰብ ከቢድ ስቓይ ዘውርድ፣ ጭካነ ዝመልኦ፣ ኢሰብኦዊ ወይ ሰብኦዊ ...      tir   
2   tir_Ethi  ሕድሕድ ሰብ መሰሉን ግቡእን፣ ከምኡ’ውን ንዝቐርበሉ ዝኾነ ይኹን ገበናዊ ...      tir   
3   tir_Ethi  ንዓቕመ ኦዳምን ሄዋንን ዝበፅሑ ደቂ ተባዕትዮን ደቂ ኦንስትዮን ዘርኢ፣ ዜ...      tir   
4   tir_Ethi                       ሕድሕድ ሰብ ብማሕበር ንኽውደብ ኦይግደድን፡፡      tir   

  iso15924  language  top_pred  
0     Ethi  Tigrinya  tir_Ethi  
1     Ethi  Tigrinya  tir_Ethi  
2     Ethi  Tigrinya  tir_Ethi  
3     Ethi  Tigrinya  tir_Ethi  
4     Ethi  Tigrinya  tir_Ethi  


In [31]:
# Score the GlotLID-M predictions (F1, precision, recall, false positive rate per label)
result_udhr_glotlid = compute_f1_false_positive(udhr_glotlid_pred)

print("Result DataFrame:", result_udhr_glotlid, sep="\n")

Result DataFrame:
        label  f1_score  precision_score  recall_score  false_positive_rate
0    emk_Latn  0.000000         0.000000      0.000000             0.000000
1    keo_Latn  0.000000         0.000000      0.000000             0.001009
2    aii_Syrc  0.000000         0.000000      0.000000             0.000000
3    cme_Latn  0.000000         0.000000      0.000000             0.000036
4    sey_Latn  0.526316         0.371622      0.901639             0.003358
..        ...       ...              ...           ...                  ...
751  plt_Latn  1.000000         1.000000      1.000000             0.000000
752  cot_Latn  0.991597         1.000000      0.983333             0.000000
753  zpo_Latn  0.000000         0.000000      0.000000             0.000072
754  dur_Latn  0.000000         0.000000      0.000000             0.000216
755  stp_Latn  0.000000         0.000000      0.000000             0.000072

[756 rows x 5 columns]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
# Unweighted mean of every metric column over all labels in the result table
average_udhr_glotlid = result_udhr_glotlid.drop(columns='label').mean()
print("Averaged result DataFrame:", average_udhr_glotlid, sep="\n")

Averaged result DataFrame:
f1_score               0.418994
precision_score        0.434400
recall_score           0.421252
false_positive_rate    0.000348
dtype: float64


In [33]:
# To reproduce prior work, macro averages must be computed on the intersection
# L = (languages of GlotLID-M) intersected with (languages present in UDHR).

# GlotLID labels: first column of a header-less CSV
languages_glotlid = pd.read_csv('languages_glotlid.csv', header=None)[0].tolist()
print(languages_glotlid)
print(f"Number of languages in GlotLID: {len(languages_glotlid)}")

# Distinct UDHR labels. Sorted because the iteration order of a set of strings
# is arbitrary, which made the printed list differ from one run to the next.
languages_udhr = sorted(set(udhr['iso_script']))
print(languages_udhr)
print(f"Number of languages in UDHR: {len(languages_udhr)}")

# Labels present in both sets (sorted for the same reason)
languages_intersection = sorted(set(languages_glotlid) & set(languages_udhr))
print(languages_intersection)
print(f"Number of languages in the intersection: {len(languages_intersection)}")

['aai_Latn', 'aak_Latn', 'aau_Latn', 'aaz_Latn', 'aba_Latn', 'abi_Latn', 'abk_Cyrl', 'abn_Latn', 'abq_Cyrl', 'abs_Latn', 'abt_Latn', 'abx_Latn', 'aby_Latn', 'abz_Latn', 'aca_Latn', 'acd_Latn', 'ace_Arab', 'ace_Latn', 'acf_Latn', 'ach_Latn', 'acm_Arab', 'acn_Latn', 'acr_Latn', 'acu_Latn', 'ada_Latn', 'ade_Latn', 'adh_Latn', 'adi_Latn', 'adj_Latn', 'adl_Latn', 'ady_Cyrl', 'adz_Latn', 'aeb_Arab', 'aer_Latn', 'aeu_Latn', 'aey_Latn', 'afr_Latn', 'agd_Latn', 'agg_Latn', 'agm_Latn', 'agn_Latn', 'agr_Latn', 'agt_Latn', 'agu_Latn', 'agw_Latn', 'agx_Cyrl', 'aha_Latn', 'ahk_Latn', 'aia_Latn', 'aii_Syrc', 'aim_Latn', 'ain_Latn', 'ajg_Latn', 'aji_Latn', 'ajp_Arab', 'ajz_Latn', 'akb_Latn', 'ake_Latn', 'akh_Latn', 'akp_Latn', 'ald_Latn', 'alj_Latn', 'aln_Latn', 'alp_Latn', 'alq_Latn', 'als_Latn', 'alt_Cyrl', 'aly_Latn', 'alz_Latn', 'ame_Latn', 'amf_Latn', 'amh_Ethi', 'ami_Latn', 'amk_Latn', 'amm_Latn', 'amn_Latn', 'amp_Latn', 'amr_Latn', 'amu_Latn', 'amx_Latn', 'ang_Latn', 'anm_Latn', 'ann_Latn', 'an

In [34]:
# Keep only the rows whose label is in `languages_intersection`.
# NOTE(review): the OpenLID section rebinds `languages_intersection`, so this cell
# must run right after the GlotLID intersection cell.
result_udhr_glotlid_filt = result_udhr_glotlid.loc[
    lambda scores: scores['label'].isin(languages_intersection)
]
print(result_udhr_glotlid_filt)

# Macro averages restricted to the filtered rows
average_udhr_glotlid_filt = result_udhr_glotlid_filt.drop(columns='label').mean()
print(
    "Averaged result DataFrame (computed on languages present in both GlotLID and UDHR datasets):",
    average_udhr_glotlid_filt,
    sep="\n",
)


        label  f1_score  precision_score  recall_score  false_positive_rate
2    aii_Syrc  0.000000         0.000000      0.000000             0.000000
4    sey_Latn  0.526316         0.371622      0.901639             0.003358
7    gjn_Latn  0.958678         0.950820      0.966667             0.000108
9    bul_Cyrl  1.000000         1.000000      1.000000             0.000000
11   swb_Latn  0.957265         0.949153      0.965517             0.000108
..        ...       ...              ...           ...                  ...
742  tiv_Latn  1.000000         1.000000      1.000000             0.000000
744  kab_Latn  0.974359         0.966102      0.982759             0.000072
746  cbi_Latn  0.991870         1.000000      0.983871             0.000000
751  plt_Latn  1.000000         1.000000      1.000000             0.000000
752  cot_Latn  0.991597         1.000000      0.983333             0.000000

[371 rows x 5 columns]
Averaged result DataFrame (computed on languages present in both

## 2. OpenLID on UDHR-LID

In [93]:
# Fetch the OpenLID fastText weights from the Hugging Face Hub, then load them.
# NOTE(review): no `revision` is passed, so this presumably resolves to whatever
# `model.bin` is current on the Hub — consider pinning one for reproducible scores.
model_path_openlid = hf_hub_download(
    repo_id="laurievb/OpenLID",
    filename="model.bin",
)
openlid = fasttext.load_model(model_path_openlid)

In [94]:
# Smoke test: predict the language of a short English sentence.
# NOTE(review): the recorded output for this cell is yor_Latn (score ~0.70), not
# eng_Latn — confirm whether OpenLID expects the input to be preprocessed first.
openlid.predict("Hello, world!")

(('__label__yor_Latn',), array([0.69856358]))

In [96]:
# Apply the model to predict labels

def get_openlid_pred(x, threshold=0.5):
    """Return OpenLID's top label for one sentence, or 'undetermined'.

    Args:
        x: Sentence to classify (a single string).
        threshold: Minimum score the top prediction must reach to be kept.

    Returns:
        The top label with its ``__label__`` prefix removed (e.g. ``eng_Latn``),
        or ``'undetermined'`` when the top score is below ``threshold`` or the
        model returns no prediction.
    """
    # fastText's predict() handles one line at a time and raises on an embedded
    # newline, so flatten line breaks before predicting.
    labels, scores = openlid.predict(x.replace("\n", " "))
    # Index the score explicitly instead of relying on the truth value of a
    # one-element array; an empty result falls through to 'undetermined'.
    if len(scores) > 0 and scores[0] >= threshold:
        return labels[0].replace("__label__", "")
    return 'undetermined'

# Build the prediction table as a new frame: `pd.DataFrame(udhr)` does not copy a
# DataFrame input by default, so adding `top_pred` to it could leak into `udhr`.
# `assign` returns a fresh object and leaves `udhr` untouched.
udhr_openlid_pred = udhr.assign(top_pred=udhr['sentence'].apply(get_openlid_pred))

print(udhr_openlid_pred.head())


  iso_script                                           sentence iso639-3  \
0   tir_Ethi  ሕድሕድ ኦብ ስራሕ ዝርከብ ሰብ ሰብኦዊ ክብሩን ክብሪ ቤተሰቡን ዝሕለወሉ ...      tir   
1   tir_Ethi  ሕድሕድ ሰብ ከቢድ ስቓይ ዘውርድ፣ ጭካነ ዝመልኦ፣ ኢሰብኦዊ ወይ ሰብኦዊ ...      tir   
2   tir_Ethi  ሕድሕድ ሰብ መሰሉን ግቡእን፣ ከምኡ’ውን ንዝቐርበሉ ዝኾነ ይኹን ገበናዊ ...      tir   
3   tir_Ethi  ንዓቕመ ኦዳምን ሄዋንን ዝበፅሑ ደቂ ተባዕትዮን ደቂ ኦንስትዮን ዘርኢ፣ ዜ...      tir   
4   tir_Ethi                       ሕድሕድ ሰብ ብማሕበር ንኽውደብ ኦይግደድን፡፡      tir   

  iso15924  language  top_pred  
0     Ethi  Tigrinya  tir_Ethi  
1     Ethi  Tigrinya  tir_Ethi  
2     Ethi  Tigrinya  tir_Ethi  
3     Ethi  Tigrinya  tir_Ethi  
4     Ethi  Tigrinya  tir_Ethi  


In [97]:
# Score the OpenLID predictions (F1, precision, recall, false positive rate per label)
result_udhr_openlid = compute_f1_false_positive(udhr_openlid_pred)

print("Result DataFrame:", result_udhr_openlid, sep="\n")

Result DataFrame:
        label  f1_score  precision_score  recall_score  false_positive_rate
0    emk_Latn  0.000000         0.000000           0.0             0.000000
1    aii_Syrc  0.000000         0.000000           0.0             0.000000
2    sey_Latn  0.000000         0.000000           0.0             0.000000
3    gjn_Latn  0.000000         0.000000           0.0             0.000000
4    bul_Cyrl  0.958678         0.920635           1.0             0.000181
..        ...       ...              ...           ...                  ...
455  pag_Latn  0.000000         0.000000           0.0             0.001801
456  bjn_Arab  0.000000         0.000000           0.0             0.000829
457  plt_Latn  0.990654         0.981481           1.0             0.000036
458  cot_Latn  0.000000         0.000000           0.0             0.000000
459  bak_Cyrl  0.000000         0.000000           0.0             0.002378

[460 rows x 5 columns]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [98]:
# Unweighted mean of every metric column over all labels in the result table
average_udhr_openlid = result_udhr_openlid.drop(columns='label').mean()
print("Averaged result DataFrame:", average_udhr_openlid, sep="\n")

Averaged result DataFrame:
f1_score               0.254649
precision_score        0.231135
recall_score           0.319097
false_positive_rate    0.001418
dtype: float64


In [99]:
# To reproduce prior work, macro averages must be computed on the intersection
# L = (languages of OpenLID) intersected with (languages present in UDHR).

# OpenLID labels: first column of a header-less CSV
languages_openlid = pd.read_csv('languages_openlid.csv', header=None)[0].tolist()
print(languages_openlid)
print(f"Number of languages in Openlid: {len(languages_openlid)}")

# Distinct UDHR labels. Sorted because the iteration order of a set of strings
# is arbitrary, which made the printed list differ from one run to the next.
languages_udhr = sorted(set(udhr['iso_script']))
print(languages_udhr)
print(f"Number of languages in UDHR: {len(languages_udhr)}")

# Labels present in both sets (sorted for the same reason)
languages_intersection = sorted(set(languages_openlid) & set(languages_udhr))
print(languages_intersection)
print(f"Number of languages in the intersection: {len(languages_intersection)}")

['ace_Arab', 'ace_Latn', 'acm_Arab', 'acq_Arab', 'aeb_Arab', 'afr_Latn', 'ajp_Arab', 'amh_Ethi', 'apc_Arab', 'arb_Latn', 'ars_Arab', 'ary_Arab', 'arz_Arab', 'asm_Beng', 'ast_Latn', 'awa_Deva', 'ayr_Latn', 'azb_Arab', 'azj_Latn', 'bak_Cyrl', 'bam_Latn', 'ban_Latn', 'bel_Cyrl', 'bem_Latn', 'ben_Beng', 'bho_Deva', 'bjn_Arab', 'bjn_Latn', 'bod_Tibt', 'bos_Latn', 'bug_Latn', 'bul_Cyrl', 'cat_Latn', 'ceb_Latn', 'ces_Latn', 'cjk_Latn', 'ckb_Arab', 'crh_Latn', 'cym_Latn', 'dan_Latn', 'deu_Latn', 'dik_Latn', 'dyu_Latn', 'dzo_Tibt', 'ell_Grek', 'eng_Latn', 'epo_Latn', 'est_Latn', 'eus_Latn', 'ewe_Latn', 'fao_Latn', 'fij_Latn', 'fin_Latn', 'fon_Latn', 'fra_Latn', 'fur_Latn', 'fuv_Latn', 'gla_Latn', 'gle_Latn', 'glg_Latn', 'grn_Latn', 'guj_Gujr', 'hat_Latn', 'hau_Latn', 'heb_Hebr', 'hin_Deva', 'hne_Deva', 'hrv_Latn', 'hun_Latn', 'hye_Armn', 'ibo_Latn', 'ilo_Latn', 'ind_Latn', 'isl_Latn', 'ita_Latn', 'jav_Latn', 'jpn_Jpan', 'kab_Latn', 'kac_Latn', 'kam_Latn', 'kan_Knda', 'kas_Arab', 'kas_Deva', 'ka

In [100]:
# Keep only the rows whose label is in `languages_intersection`.
# NOTE(review): the GlotLID section binds the same `languages_intersection` name,
# so this cell must run right after the OpenLID intersection cell.
result_udhr_openlid_filt = result_udhr_openlid.loc[
    lambda scores: scores['label'].isin(languages_intersection)
]
print(result_udhr_openlid_filt)

# Macro averages restricted to the filtered rows
average_udhr_openlid_filt = result_udhr_openlid_filt.drop(columns='label').mean()
print(
    "Averaged result DataFrame (computed on languages present in both Openlid and UDHR datasets):",
    average_udhr_openlid_filt,
    sep="\n",
)


        label  f1_score  precision_score  recall_score  false_positive_rate
4    bul_Cyrl  0.958678         0.920635      1.000000             0.000181
7    hat_Latn  0.520000         0.351351      1.000000             0.007815
8    guj_Gujr  1.000000         1.000000      1.000000             0.000000
11   hye_Armn  1.000000         1.000000      1.000000             0.000000
13   swe_Latn  0.983871         0.968254      1.000000             0.000072
..        ...       ...              ...           ...                  ...
441  tur_Latn  0.485597         0.320652      1.000000             0.004513
449  pap_Latn  0.607330         0.436090      1.000000             0.002708
450  ace_Latn  0.923077         0.869565      0.983607             0.000325
453  kab_Latn  0.913386         0.840580      1.000000             0.000397
457  plt_Latn  0.990654         0.981481      1.000000             0.000036

[152 rows x 5 columns]
Averaged result DataFrame (computed on languages present in both

## TO DO 

Faire juste : 
- Tester GlotLID-M / Openlid avec la méthode SET! sur UDHR ?