In [1]:
pip install pynini

In [2]:
import pynini

In [3]:
import pynini
from pynini.lib import pynutil

# -------------------------
# 1) helper: English
# -------------------------
def number_to_words_en(n: int) -> str:
    """Spell out an integer from 0 to 1000 (inclusive) in English.

    Tens and units are joined with a hyphen ("twenty-one"); a hundreds part is
    followed directly by the remainder, without "and" ("one hundred one").

    Args:
        n: Integer in the closed range [0, 1000].

    Returns:
        The English cardinal for ``n``.

    Raises:
        AssertionError: If ``n`` is outside [0, 1000].
    """
    assert 0 <= n <= 1000
    # Irregular names for 0..19; larger values are composed from these.
    small = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine",
             "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
             "seventeen", "eighteen", "nineteen"]
    # Indexed by the tens digit; slots 0 and 1 are never used (handled by `small`).
    tens_names = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

    if n < 20:
        return small[n]
    if n < 100:
        tens_digit, unit = divmod(n, 10)
        if unit == 0:
            return tens_names[tens_digit]
        return tens_names[tens_digit] + "-" + small[unit]
    if n < 1000:
        hundreds, rest = divmod(n, 100)
        if rest == 0:
            return small[hundreds] + " hundred"
        return small[hundreds] + " hundred " + number_to_words_en(rest)
    # Only 1000 remains.
    return "one thousand"

In [21]:
# -------------------------
# 2) helper: French
# -------------------------
def _fr_below_hundred(n: int) -> str:
    # names for units and teens
    units = ["zero", "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf",
             "dix", "onze", "douze", "treize", "quatorze", "quinze", "seize"]
    tens = {20: "vingt", 30: "trente", 40: "quarante", 50: "cinquante", 60: "soixante", 80: "quatre-vingt"}

    if n < 17:
        return units[n]
    if 17 <= n <= 19:
        return "dix-" + units[n-10]
    if n < 70:
        t = (n // 10) * 10
        u = n % 10
        base = tens[t]
        if u == 0:
            return base
        if u == 1:
            # 21,31,41,51,61 -> vingt-et-un, trente-et-un, ...
            return base + "-et-un"
        return base + "-" + units[u]
    if 70 <= n < 80:
        # 70 = 60 + 10, 71 = 60 + 11 (special: soixante-et-onze)
        r = n - 60
        if r == 11:
            return "soixante-et-onze"
        return "soixante-" + _fr_below_hundred(r)
    if 80 <= n < 100:
        # 80 = quatre-vingt(s)
        r = n - 80
        base = "quatre-vingt"
        if r == 0:
            return base + "s"  # quatre-vingts
        # 81 is "quatre-vingt-un" (no 'et')
        return base + "-" + _fr_below_hundred(r)
    return ""

In [22]:
def number_to_words_fr(n: int) -> str:
    """Spell out an integer from 0 to 1000 (inclusive) in French.

    Values below 100 are delegated to ``_fr_below_hundred``. For hundreds,
    "cent" takes a plural "s" only for an exact multiple of 100 greater than
    100 ("deux cents" but "deux cent un"); 100 itself is plain "cent".

    Args:
        n: Integer in the closed range [0, 1000].

    Returns:
        The French cardinal for ``n``.

    Raises:
        AssertionError: If ``n`` is outside [0, 1000].
    """
    assert 0 <= n <= 1000

    # The two endpoints have fixed spellings.
    fixed = {0: "zéro", 1000: "mille"}
    if n in fixed:
        return fixed[n]
    if n < 100:
        return _fr_below_hundred(n)

    hundreds, remainder = divmod(n, 100)
    # "cent" alone for 1xx, otherwise "<digit> cent".
    head = "cent" if hundreds == 1 else number_to_words_fr(hundreds) + " cent"

    if remainder:
        return head + " " + number_to_words_fr(remainder)
    # Exact multiple of 100: pluralize unless it is 100 itself.
    return head + "s" if hundreds > 1 else head

In [23]:
# -------------------------
# 3) Build FSTs
# -------------------------
def build_number_fst(lang="en"):
    """Compile a transducer mapping the digit strings "0".."1000" to words.

    Args:
        lang: "en" to spell numbers with ``number_to_words_en`` or "fr" to
            spell them with ``number_to_words_fr``.

    Returns:
        An optimized pynini FST whose input side is the decimal string and
        whose output side is the spelled-out number.

    Raises:
        ValueError: If ``lang`` is neither "en" nor "fr".
    """
    # Resolve the language once, before doing any work.
    if lang == "en":
        to_words = number_to_words_en
    elif lang == "fr":
        to_words = number_to_words_fr
    else:
        raise ValueError("lang must be 'en' or 'fr'")

    # Build every (digits, words) pair and compile them in one call instead of
    # unioning 1001 single-pair transducers one at a time.
    pairs = [(str(n), to_words(n)) for n in range(0, 1001)]
    return pynini.optimize(pynini.string_map(pairs))

In [24]:
# -------------------------
# 4) Save FSTs and (optionally) FAR
# -------------------------
if __name__ == "__main__":
    # build_number_fst already returns an optimized FST, so it can be written as is.
    fst_en = build_number_fst("en")
    fst_fr = build_number_fst("fr")

    # Save each grammar as an individual FST file.
    fst_en.write("cardinals_en.fst")
    fst_fr.write("cardinals_fr.fst")

    # Best effort: also bundle both grammars in a single FAR archive.
    try:
        from pynini import Far
        far = Far("cardinals.far", mode="w")
        try:
            far["cardinals_en"] = fst_en
            far["cardinals_fr"] = fst_fr
        finally:
            # Always release the archive handle, even if an assignment fails.
            far.close()
    except Exception as e:
        print("Could not write FAR (api may differ). FSTs saved individually. Error:", e)

    print("Saved cardinals_en.fst and cardinals_fr.fst (and attempted FAR).")

In [25]:
import re
import pynini
from pynini import shortestpath

def apply_fst(text, fst):
    """Apply a transducer to ``text`` and return the best output string.

    The input and output use pynini's default (byte) token type, matching
    grammars compiled with ``pynini.cross`` / ``pynini.string_map`` without an
    explicit ``token_type``. Reading such a grammar in "utf8" mode would turn
    each UTF-8 byte of an accented letter into its own character.

    Args:
        text: String to rewrite.
        fst: pynini transducer to compose the input with.

    Returns:
        The output of the shortest path through ``text @ fst``. If anything
        fails (for example the grammar does not accept ``text``), a string of
        the form ``"Error: <message>, for input:'<text>'"`` is returned
        instead of raising.
    """
    try:
        # escape() keeps "[", "]" and "\" in the input from being parsed as
        # pynini's bracketed-symbol syntax.
        lattice = pynini.accep(pynini.escape(text)) @ fst
        return pynini.shortestpath(lattice).string()
    except Exception as e:
        return f"Error: {e}, for input:'{text}'"


def normalize_sentence(sentence: str, fst):
    """Replace every run of ASCII digits in ``sentence`` with its FST output.

    The sentence is split into runs of ASCII digits and runs of other
    non-whitespace characters. Digit runs are rewritten with ``apply_fst``;
    everything else is kept as is. Tokens are re-joined with single spaces,
    and a space before "," or "." is removed.

    Only tokens matched by the digit pattern are sent to the grammar, so
    characters such as "²" (for which ``str.isdigit()`` is true) are left
    untouched.

    Args:
        sentence: Text to normalize.
        fst: pynini transducer passed to ``apply_fst`` for each digit run.

    Returns:
        The normalized sentence. When ``apply_fst`` cannot rewrite a number it
        returns an "Error: ..." string, which then appears in the result.
    """
    out_tokens = []
    # Group 1 is set only for digit runs; that decides which tokens get rewritten.
    for match in re.finditer(r"([0-9]+)|[^0-9\s]+", sentence):
        token = match.group(0)
        if match.group(1) is not None:
            out_tokens.append(apply_fst(token, fst))
        else:
            out_tokens.append(token)

    return " ".join(out_tokens).replace(" ,", ",").replace(" .", ".").strip()

In [26]:
# Reload both compiled grammars from disk, then run one demo sentence through each.
fst_fr, fst_en = map(pynini.Fst.read, ("cardinals_fr.fst", "cardinals_en.fst"))
print(normalize_sentence("J'ai 21 chiens et 141 chats.", fst_fr))
print(normalize_sentence("I have 665 dogs et 101 cats.", fst_en))

In [10]:
pip install jiwer

In [27]:
from jiwer import wer

# Évalue le WER sur un dictionnaire donné {str_nombre: référence_texte}
def evaluate_cardinal_fr_wer_from_dict(fst, test_dict):
    """Score a grammar against reference spellings using jiwer's ``wer``.

    Each key of ``test_dict`` is converted to a stripped string and normalized
    with ``normalize_sentence``; the result is compared to the stripped
    reference value. The overall score and one line per test case are printed.

    Args:
        fst: pynini transducer handed to ``normalize_sentence``.
        test_dict: Mapping of number (string or anything ``str()`` accepts)
            to its reference text.

    Returns:
        The value returned by ``wer(references, hypotheses)``.
    """
    numeros = [str(num).strip() for num in test_dict]
    refs_norm = [ref.strip() for ref in test_dict.values()]
    hyps_norm = [normalize_sentence(num_str, fst) for num_str in numeros]

    score = wer(refs_norm, hyps_norm)
    print("WER sur test set fourni (dict):", score)
    # One tab-separated line per case: input, reference, hypothesis.
    for row in zip(numeros, refs_norm, hyps_norm):
        print("{}\tREF: {}\tHYP: {}".format(*row))
    return score

In [29]:
# Example usage: a few {number string: expected French spelling} pairs.
test_cases = dict([
    ("94", "quatre-vingt-quatorze"),
    ("6", "six"),
    ("181", "cent quatre-vingt-un"),
])
evaluate_cardinal_fr_wer_from_dict(fst_fr, test_cases)

In [34]:
# Second grammar: text normalization (written form -> spoken French words) for times and money amounts
import pynini
from pynini.lib import pynutil

# Helpers pour la carte 0-59 en string -> mot français
def chiffre_map_0_59():
    """Build a transducer from the strings "0".."59" to French number words.

    Keys are plain decimal strings without leading zeros. Compound numbers are
    hyphenated, and 21/31/41/51 use the "-et-un" form.

    Returns:
        The FST produced by ``pynini.string_map`` for the 60 pairs.
    """
    # 0..19 are listed explicitly; 20..59 are generated from the tens word.
    base = [
        "zéro", "un", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf",
        "dix", "onze", "douze", "treize", "quatorze", "quinze", "seize",
        "dix-sept", "dix-huit", "dix-neuf",
    ]
    mots = list(base)
    for dizaine in ("vingt", "trente", "quarante", "cinquante"):
        mots.append(dizaine)                # x0
        mots.append(dizaine + "-et-un")     # x1
        mots.extend(dizaine + "-" + base[u] for u in range(2, 10))  # x2..x9
    return pynini.string_map([(str(i), mot) for i, mot in enumerate(mots)])

# FST pour les heures au format 24h ex: 12:47 -> "douze quarante-sept"
def build_fst_heures():
    """Build a transducer for 24-hour clock times such as "12:47".

    The hour (0-23) and the minute (0-59) are each spelled out and the ":"
    separator is rewritten to a single space, e.g.
    "12:47" -> "douze quarante-sept". Hour 0 is rendered as "minuit" and
    hour 1 as "une".

    Hours and minutes are matched as plain decimal strings without leading
    zeros (the keys are ``str(i)``), so "12:07" is not accepted.

    Returns:
        The optimized pynini FST for ``<hour>:<minute>``.
    """
    noms_heures = [
        "minuit", "une", "deux", "trois", "quatre", "cinq", "six", "sept", "huit", "neuf",
        "dix", "onze", "douze", "treize", "quatorze", "quinze", "seize", "dix-sept",
        "dix-huit", "dix-neuf", "vingt", "vingt-et-un", "vingt-deux", "vingt-trois",
    ]
    heure_map = pynini.string_map([(str(h), nom) for h, nom in enumerate(noms_heures)])
    minute_map = chiffre_map_0_59()

    # Rewrite ":" to " " directly in the concatenation; no separate rewrite
    # pass (and no sigma-star alphabet) is needed for a fixed-position separator.
    pattern = heure_map + pynini.cross(":", " ") + minute_map
    return pattern.optimize()

# FST pour les montants en euro/dollars ex: 3,16 $ -> "trois dollars et seize cents"
def build_fst_argent():
    """Build a transducer for small money amounts in dollars or euros.

    Accepted written forms, where ``N`` and ``C`` are numbers from 0 to 59
    without leading zeros (the range of ``chiffre_map_0_59``) and the space
    before the symbol is optional:

    * ``N $`` / ``N €``      -> "<N> dollars" / "<N> euros"
    * ``N,C $`` / ``N,C €``  -> "<N> dollars et <C> cents", e.g.
      "3,16 $" -> "trois dollars et seize cents"
    * ``N$C`` / ``N€C``      -> same output; kept because the previous
      version of this grammar accepted it.

    The currency word is always plural and the sub-unit is always "cents".

    Returns:
        The optimized pynini FST for the union of all forms above.
    """
    nombre = chiffre_map_0_59()
    devises = (("$", "dollars"), ("€", "euros"))
    # Optional space between the amount and the currency symbol; dropped on output.
    espace_opt = pynutil.delete(" ").ques

    formes = []
    for symbole, nom in devises:
        # Whole amount: "3 $" -> "trois dollars".
        formes.append(nombre + espace_opt + pynini.cross(symbole, " " + nom))
        # Symbol between units and cents: "3$16".
        formes.append(
            nombre
            + pynini.cross(symbole, " " + nom + " et ")
            + nombre
            + pynutil.insert(" cents")
        )
        # Decimal comma with trailing symbol: "3,16 $". The currency word has
        # to be emitted where the comma is read, because the symbol only
        # appears after the cents in the input; one branch per currency keeps
        # the two ends consistent.
        formes.append(
            nombre
            + pynini.cross(",", " " + nom + " et ")
            + nombre
            + pynutil.insert(" cents")
            + espace_opt
            + pynutil.delete(symbole)
        )
    return pynini.union(*formes).optimize()

def build_fst_normaliseur():
    """Return the union of the time grammar and the money grammar."""
    return pynini.union(build_fst_heures(), build_fst_argent())

# Smoke test: run each example through the grammar and print a readable
# message instead of a traceback when an example is not accepted.
fst_normaliseur = build_fst_normaliseur()
examples = ["12:47", "3,16 $"]
for ex in examples:
    try:
        # escape() keeps any "[" / "]" in the input literal; the default byte
        # token type matches how the grammar strings were compiled.
        treillis = pynini.accep(pynini.escape(ex)) @ fst_normaliseur
        print(f"{ex} -> {pynini.shortestpath(treillis).string()}")
    except Exception as e:
        print(f"Erreur lors de la normalisation '{ex}' : {e}")
