In [None]:
import os
import re
import zipfile

import pandas as pd

def load_medicine_dataset_local(file_path=None):
    """Load the unique medicine names from a local CIS export file.

    Args:
        file_path: Path (``str`` or ``os.PathLike``) to the data file. The
            extension, compared case-insensitively, selects the reader:
            ``.txt`` is read as tab-separated without a header row, ``.csv``
            as semicolon-separated with its first row taken as header, and
            ``.zip`` is opened and the first member whose name contains
            "CIS" and ends with ".txt" is read like a ``.txt`` file.
            When omitted, the original hard-coded location is used so that
            existing no-argument calls keep working.

    Returns:
        list: the non-null values of the second column ("Nom"), de-duplicated
        in order of first appearance.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        ValueError: if the extension is unsupported, the ZIP archive has no
            matching member, or the table does not have exactly 12 columns.
    """
    if file_path is None:
        # Historical default kept for backward compatibility; pass
        # `file_path` explicitly to run the notebook on another machine.
        file_path = r"C:\Users\hp\Desktop\4ème\projet\CIS_bdpm.txt"
    file_path = os.fspath(file_path)

    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Le fichier ANSM n'existe pas : {file_path}")

    # Compare extensions case-insensitively so "CIS_bdpm.TXT" is accepted too.
    lowered = file_path.lower()

    if lowered.endswith(".zip"):
        with zipfile.ZipFile(file_path, 'r') as archive:
            inner_files = [
                member for member in archive.namelist()
                if "CIS" in member and member.endswith(".txt")
            ]
            if not inner_files:
                raise ValueError("Aucun fichier CIS .txt trouvé dans le ZIP.")
            with archive.open(inner_files[0]) as handle:
                df = pd.read_csv(handle, sep="\t", header=None, encoding="latin-1")

    elif lowered.endswith(".txt"):
        df = pd.read_csv(file_path, sep="\t", header=None, encoding="latin-1")

    elif lowered.endswith(".csv"):
        df = pd.read_csv(file_path, sep=";", encoding="latin-1")

    else:
        raise ValueError("Format non supporté — utilise .txt, .csv ou .zip.")

    columns = [
        "CIS", "Nom", "Forme", "Voie", "Statut_ADM", "AMM",
        "Statut_Commercial", "Date_AMM", "Statut_BDM", "Proc",
        "Titulaire", "Generique"
    ]

    # Fail with an explicit message instead of pandas' generic
    # "Length mismatch" when the file layout is not the expected one.
    if df.shape[1] != len(columns):
        raise ValueError(
            f"Le fichier contient {df.shape[1]} colonnes, {len(columns)} attendues."
        )
    df.columns = columns

    meds = df["Nom"].dropna().unique().tolist()

    return meds


# Global load: MED_LIST holds the medicine names returned by the loader and is
# read as a module-level global by correct_medicine_names.
MED_LIST = load_medicine_dataset_local()


In [None]:
# ------------------------------------------------------------
# 2. CORRECTEUR MEDICAMENTS
# ------------------------------------------------------------
def correct_medicine_names(text: str) -> str:
    """Replace each whitespace-separated token by its closest medicine name.

    Every token of ``text`` is looked up in the global ``MED_LIST`` with
    ``process.extractOne`` using the ``fuzz.QRatio`` scorer. A token is
    replaced by the best candidate only when the reported score is strictly
    greater than 75; otherwise it is kept unchanged. The tokens are joined
    back with single spaces, so the original whitespace layout is not kept.

    NOTE(review): ``process`` and ``fuzz`` are not imported in the visible
    cells — presumably they come from a fuzzy-matching package imported
    elsewhere; confirm, otherwise this raises NameError on a fresh kernel.

    Args:
        text: Input text to correct.

    Returns:
        The text with sufficiently close tokens replaced by entries of
        ``MED_LIST``.
    """
    def _closest_or_same(token):
        # extractOne yields a (candidate, score, ...) tuple or a falsy value.
        hit = process.extractOne(token, MED_LIST, scorer=fuzz.QRatio)
        return hit[0] if hit and hit[1] > 75 else token

    return " ".join(_closest_or_same(token) for token in text.split())


# ------------------------------------------------------------
# 3. NORMALISATION
# ------------------------------------------------------------
def normalize_units(text):
    """Rewrite misspelled dosage units to their canonical spelling.

    Only whole unit tokens are rewritten: the misspelling must not be
    preceded by a letter (a digit is fine, so "500ng" is handled) and must
    not be followed by a letter, digit or underscore. Ordinary words that
    merely contain one of the misspellings ("angine", "vitamine", "minute")
    are therefore left untouched. All substitutions happen in a single pass,
    so the result of one rule is never re-processed by another.

    The match is case-sensitive; the keys are lowercase.

    Args:
        text: Text to normalise.

    Returns:
        The text with recognised unit misspellings replaced.
    """
    # NOTE(review): "ng" is also a real unit (nanogram); the table treats it
    # as a misspelling of "mg" — presumably an OCR/typing confusion; confirm.
    replacements = {
        "ng": "mg",
        "mng": "mg",
        "mgmg": "mg",
        "m1": "ml",
        "mi": "ml",
        "u1": "UI",
    }
    # Longest alternatives first so "mgmg" and "mng" take precedence over
    # the shorter "ng" at the same position.
    alternatives = "|".join(
        sorted(map(re.escape, replacements), key=len, reverse=True)
    )
    # (?<![^\W\d_]) = "not preceded by a letter"; (?!\w) = "end of token".
    pattern = rf"(?<![^\W\d_])(?:{alternatives})(?!\w)"
    return re.sub(pattern, lambda found: replacements[found.group(0)], text)


def normalize_posology(text):
    """Expand per-day frequency shorthand into "fois par jour" / "jour".

    Handled forms, where the day part may be written "j", "jr", "jour" or
    "jours" and must end the token:

    * ``x/<day>``  -> " fois par jour" (the "x" must not follow a letter,
      so "max/j" is not read as "ma" + "x/j");
    * ``/<day>``   -> " fois par jour";
    * ``jr`` as a standalone token (or right after a digit) -> "jour".

    Matching the whole day token avoids leftovers such as "jourour" for
    "/jour" or "jourr" for "x/jr".

    NOTE(review): a dose rate such as "500 mg/j" is still expanded to
    "500 mg fois par jour" — confirm whether that reading is intended.

    Args:
        text: Text to normalise.

    Returns:
        The text with the shorthand expanded.
    """
    day = r"(?:jours?|jr|j)(?!\w)"
    text = re.sub(rf"(?<![^\W\d_])x/{day}", " fois par jour", text)
    text = re.sub(rf"/{day}", " fois par jour", text)
    text = re.sub(r"(?<![^\W\d_])jr(?!\w)", "jour", text)
    return text


# ------------------------------------------------------------
# 4. PIPELINE COMPLET
# ------------------------------------------------------------
def correct_prescription_text(text: str) -> str:
    """Run the full correction pipeline on a prescription text.

    The text is lowercased, then passed in order through
    ``normalize_units``, ``correct_medicine_names`` and
    ``normalize_posology``; leading and trailing whitespace is stripped
    from the final result.

    Args:
        text: Raw prescription text.

    Returns:
        The corrected, stripped text.
    """
    result = text.lower()
    # Order matters: units are fixed before name matching, posology last.
    for stage in (normalize_units, correct_medicine_names, normalize_posology):
        result = stage(result)
    return result.strip()

In [None]:
def save_dataset(meds, base_path=None):
    """Write the medicine names to ``medicaments_fr.csv`` and report the path.

    Args:
        meds: Medicine names; stored as the single column "nom_medicament".
        base_path: Directory that receives the CSV file. When omitted, the
            original hard-coded project directory is used so that existing
            one-argument calls keep working.

    Returns:
        None. The file is written as UTF-8 without the index column, and a
        confirmation line with the output path is printed.
    """
    if base_path is None:
        # Historical default kept for backward compatibility; pass
        # `base_path` explicitly to run the notebook on another machine.
        base_path = r"C:\Users\hp\Desktop\4ème\projet"

    # Build the output path
    csv_path = os.path.join(base_path, "medicaments_fr.csv")

    # CSV export
    df = pd.DataFrame(meds, columns=["nom_medicament"])
    df.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"✔ Dataset CSV sauvegardé : {csv_path}")



# -------------------------------------------
# 3) Execution
# -------------------------------------------

# NOTE(review): MED_LIST is already assigned from the same loader call earlier
# in the notebook; this call reads the file a second time before exporting the
# names to CSV — confirm the reload is intended.
MED_LIST = load_medicine_dataset_local()
save_dataset(MED_LIST)