In [1]:
import zipfile
# Unpack the uploaded archive into the local "readability" directory.
with zipfile.ZipFile("readability.zip", mode="r") as archive:
    archive.extractall(path="readability")

In [2]:
# Install the runtime dependencies with %pip (not !pip) so they land in the
# environment of the running kernel rather than whichever pip is first on PATH.
# TODO: pin versions (pkg==x.y.z) once the working set is known, for reproducibility.
%pip install gradio transformers torch scikit-learn spacy pyphen
# Download spaCy's small French pipeline (presumably required by the
# readability feature extractor — confirm against extract_readability).
!python -m spacy download fr_core_news_sm

Collecting pyphen
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nv

In [3]:
import gradio as gr
import pandas as pd
import joblib

# Import des fonctions d’extraction
from extract_readability import extract_readability_features
from extract_plongements_camembert import extract_camembert_diff

# Mapping from raw feature column names ("diff_*") to human-readable labels.
# predict_gain passes it to DataFrame.rename before showing the feature table,
# so the values are user-facing text and intentionally stay in French.
FEATURE_LABELS = {
    "diff_LIX": "Δ LIX",
    "diff_RIX": "Δ RIX",
    "diff_REL": "Δ REL",
    "diff_KandelMoles": "Δ Kandel-Moles",
    "diff_Mesnager": "Δ Mesnager",
    "diff_characters_per_word": "Δ caractères/mot",
    "diff_syll_per_word": "Δ syllabes/mot",
    "diff_words_per_sentence": "Δ mots/phrase",
    "diff_sentences_per_paragraph": "Δ phrases/paragraphe",
    "diff_type_token_ratio": "Δ type_token_ratio",
    "diff_directspeech_ratio": "Δ proportion de discours direct",
    "diff_characters": "Δ nombre de caractères",
    "diff_syllables": "Δ nombre de syllabes",
    "diff_words": "Δ nombre de mots",
    "diff_wordtypes": "Δ nombre de mots différents",
    "diff_sentences": "Δ nombre de phrases",
    "diff_long_words": "Δ mots longs",
    "diff_complex_words": "Δ mots complexes",
    "diff_complex_words_mes": "Δ mots complexes (Mesnager)",
    "diff_tobeverb": "Δ verbes être",
    "diff_auxverb": "Δ verbes auxiliaires",
    "diff_conjunction": "Δ conjonctions",
    "diff_preposition": "Δ prépositions",
    "diff_nominalization": "Δ nominalisations",
    "diff_subordination": "Δ subordonnées",
    "diff_article": "Δ articles",
    "diff_pronoun": "Δ pronoms",
    "diff_interrogative": "Δ mots interrogatifs",
}

# Load the MLP model and the PCA from the current working directory.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code — only load .pkl files that come from a trusted source.
model = joblib.load("mlp_exp_max_rev_read_model.pkl")
pca = joblib.load("pca_model_max_rev.pkl")

# Render a graphical scale showing where the predicted value falls
def get_scale_html(value):
    """Build the HTML for a red→gray→green bar with a marker at ``value``.

    The value is mapped linearly from the range [-3, 3] onto 0–100 % of the
    bar width. Values outside that range are clamped to the nearest end so the
    marker always stays on the bar instead of being drawn off-screen.

    Args:
        value: Predicted readability gain (any real number).

    Returns:
        str: An HTML fragment containing the bar, the marker and the legend.

    Raises:
        ValueError: If ``value`` is NaN (it cannot be converted to a position).
    """
    # Linear map: -3 -> 0.0, 0 -> 0.5, +3 -> 1.0.
    norm_val = (value + 3) / 6
    # Clamp to [0, 1]; the model output is not guaranteed to stay within ±3.
    clamped = min(max(norm_val, 0.0), 1.0)
    position = int(clamped * 100)
    return f"""
    <div style="width: 100%; height: 25px; background: linear-gradient(to right, red, gray, green); position: relative; border-radius: 5px; margin-top: 10px;">
        <div style="position: absolute; left: {position}%; top: -5px; width: 0; height: 0;
                    border-left: 7px solid transparent; border-right: 7px solid transparent;
                    border-bottom: 10px solid black;"></div>
    </div>
    <div style="text-align: center; font-size: 14px; margin-top: 5px;">
        Échelle d’amélioration de lisibilité : -3 (plus difficile) → +3 (plus facile)
    </div>
    """

# Main callback: predicts the readability gain between two sentences
def predict_gain(original, simplified):
    """Predict the readability improvement from ``original`` to ``simplified``.

    Args:
        original: Text of the original sentence (may be None or empty).
        simplified: Text of the simplified sentence (may be None or empty).

    Returns:
        tuple: ``(score, scale_html, features)`` where ``score`` is the
        prediction rounded to 2 decimals, ``scale_html`` is the HTML produced
        by ``get_scale_html`` and ``features`` is a DataFrame of the non-PCA
        features with human-readable column names, rounded to 3 decimals.
        On invalid input or any failure the tuple is
        ``("Erreur : …", "", <empty DataFrame>)``.
    """
    try:
        original_text = (original or "").strip()
        simplified_text = (simplified or "").strip()

        # Without this check two empty boxes compare equal and would be
        # reported as a genuine "0.0" gain, and a single empty box would be
        # sent to the feature extractors.
        if not original_text or not simplified_text:
            return (
                "Erreur : veuillez saisir la phrase originale et la phrase simplifiée.",
                "",
                pd.DataFrame(),
            )

        if original_text == simplified_text:
            # Identical texts: no change by definition, skip the model.
            value = 0.0
            features = pd.DataFrame()
        else:
            # Embedding extraction, then dimensionality reduction with the PCA
            emb_df = extract_camembert_diff(original, simplified)
            emb_pca = pd.DataFrame(
                pca.transform(emb_df),
                columns=[f"pca_{i+1}" for i in range(pca.n_components_)],
            )

            # Readability feature extraction
            read_df = extract_readability_features(original, simplified)

            # Merge both feature families side by side
            features = pd.concat([emb_pca, read_df], axis=1)

            # Prediction, converted to a plain Python float for display
            value = float(model.predict(features)[0])

        # Only the non-PCA features are shown to the user
        readable_features_only = features[[col for col in features.columns if not col.startswith("pca_")]]
        renamed = readable_features_only.rename(columns=FEATURE_LABELS)
        return round(value, 2), get_scale_html(value), renamed.round(3)

    except Exception as e:
        # UI boundary: surface the failure in the score box rather than
        # crashing the Gradio callback.
        return f"Erreur : {e}", "", pd.DataFrame()

# Gradio interface: two text inputs, one button, and three outputs
# (score text, HTML scale, feature table) filled in by predict_gain.
with gr.Blocks() as demo:
    gr.Markdown("## Prédiction de l’amélioration de lisibilité")
    gr.Markdown("Entrez une phrase **originale** et sa version **simplifiée**.")

    # Input textboxes, laid out side by side
    with gr.Row():
        original_input = gr.Textbox(label="Phrase originale", lines=3)
        simplified_input = gr.Textbox(label="Phrase simplifiée", lines=3)

    predict_button = gr.Button("Prédire")

    # Output components, in the same order as predict_gain's return tuple
    score_output = gr.Text(label="Amélioration de lisibilité")
    scale_output = gr.HTML()
    features_output = gr.Dataframe(label="Caractéristiques utilisées (hors plongements CamemBERT)")


    # Wire the button to the prediction callback
    predict_button.click(
        predict_gain,
        inputs=[original_input, simplified_input],
        outputs=[score_output, scale_output, features_output]
    )

# share=True asks Gradio for a public share link (see the "Running on public URL" line in the output)
demo.launch(share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://73defd2fd3494d3407.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


