In [1]:
import pandas as pd
from sqlalchemy import create_engine
import re
from datetime import datetime

# CONFIGURATION BASE DE DONN√âES

DB_CONFIG = {
    "user": "root",
    "password": "root",
    "host": "localhost",
    "port": 8889,
    "database": "RECETTE"
}

# CONNEXION & EXTRACTION DB

class RecipeDatabaseLoader:
    def __init__(self, db_config: dict):
        self.engine = self._create_engine(db_config)

    def _create_engine(self, db_config):
        """Cr√©ation du moteur SQLAlchemy"""
        url = (
            f"mysql+pymysql://{db_config['user']}:{db_config['password']}"
            f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
        )
        return create_engine(url)

    def load_recipes(self) -> pd.DataFrame:
        """Charge les recettes + ingr√©dients depuis la DB"""

        query = """
        SELECT
            r.id,
            r.titre,
            r.description,
            r.cuisine,
            r.type_recette,
            r.difficulte,
            r.temps_preparation,
            r.temps_cuisson,
            r.image_url,
            r.vegetarien,
            JSON_ARRAYAGG(
                JSON_OBJECT(
                    'nom', i.nom
                )
            ) AS ingredients
        FROM recettes r
        LEFT JOIN recette_ingredients ri ON r.id = ri.recette_id
        LEFT JOIN ingredients i ON ri.ingredient_id = i.id
        GROUP BY r.id
        """

        print("üì• Chargement des recettes depuis la base...")
        df = pd.read_sql(query, self.engine)
        print(f"‚úÖ {len(df)} recettes r√©cup√©r√©es")

        return df

# NETTOYAGE DES DONN√âES

class RecipeDataCleaner:

    def _clean_text(self, text):
        if pd.isna(text) or text == "":
            return ""
        text = str(text).strip()
        text = re.sub(r"\s+", " ", text)
        return text

    def _validate_time(self, value):
        try:
            v = int(float(value))
            return max(v, 0)
        except:
            return 0

    def clean(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()

        # Textes
        for col in ['titre', 'description', 'cuisine', 'type_recette', 'difficulte']:
            if col in df.columns:
                df[col] = df[col].apply(self._clean_text)

        # Temps
        for col in ['temps_preparation', 'temps_cuisson']:
            if col in df.columns:
                df[col] = df[col].apply(self._validate_time)

        # Ingr√©dients ‚Üí liste Python
        if 'ingredients' in df.columns:
            df['ingredients'] = df['ingredients'].apply(
                lambda x: [] if x is None else x
            )

        return df


if __name__ == "__main__":

    loader = RecipeDatabaseLoader(DB_CONFIG)
    cleaner = RecipeDataCleaner()

    # 1. R√©cup√©ration DB
    df_raw = loader.load_recipes()

    # 2. Nettoyage
    df_clean = cleaner.clean(df_raw)

    # 3. Conversion pour NLP
    recipes = df_clean.to_dict(orient="records")

    print("\nüçΩ Exemple de recette :")
    print(recipes[0])

    print("\n‚úÖ Donn√©es pr√™tes")


üì• Chargement des recettes depuis la base...
‚úÖ 106 recettes r√©cup√©r√©es

üçΩ Exemple de recette :
{'id': 1, 'titre': 'Pasta Carbonara', 'description': 'Une d√©licieuse recette de p√¢tes carbonara traditionnelle', 'cuisine': '', 'type_recette': '', 'difficulte': 'MOYEN', 'temps_preparation': 15, 'temps_cuisson': 20, 'image_url': None, 'vegetarien': None, 'ingredients': '[{"nom": "Tomate"}, {"nom": "yaourt"}]'}

‚úÖ Donn√©es pr√™tes
