# In [None]:
import pandas as pd
import numpy as np
import json
import os
from datetime import datetime
import requests
from sklearn.preprocessing import MinMaxScaler

class ProfessionalAIDataPipeline(AIDataPipeline):
    """Advanced ETL pipeline preparing data for recommendation-model training.

    Builds on ``AIDataPipeline`` (defined outside this section) and adds a
    transformation stage (time-decayed interaction weighting, recipe feature
    cleaning and scaling, hybrid popularity score) plus a CSV export stage.
    """

    # Exponential decay rate applied per day of interaction age.
    _TIME_DECAY_RATE = 0.05
    # Base weight of each interaction type; unknown types use the default.
    _INTERACTION_WEIGHTS = {'CONSULTATION': 1, 'PARTAGE': 3, 'FAVORI_AJOUTE': 5}
    _DEFAULT_INTERACTION_WEIGHT = 1
    # Ordinal encoding of the recipe difficulty labels.
    _DIFFICULTY_LEVELS = {'FACILE': 1, 'MOYEN': 2, 'DIFFICILE': 3}
    _DEFAULT_DIFFICULTY = 'MOYEN'
    # Blend used by the hybrid popularity score (interactions vs. ratings).
    _INTERACTION_SHARE = 0.7
    _RATING_SHARE = 0.3

    def __init__(self, api_base_url='http://localhost:8080/api'):
        """Initialise the parent pipeline and the feature scaler.

        Args:
            api_base_url: Base URL forwarded to ``AIDataPipeline.__init__``.
        """
        super().__init__(api_base_url)
        self.scaler = MinMaxScaler()

    def process_and_clean_data(self, data):
        """Run the transformation stage (the "T" of ETL).

        Args:
            data: Mapping holding the raw payload. The keys read here are
                ``'interactions'``, ``'recipes'`` and ``'ratings'``; a missing
                or empty entry is treated as "no data" for that source.

        Returns:
            dict with three entries:
                ``'matrix'``: user x recipe interaction score matrix,
                ``'features'``: cleaned and scaled recipe features,
                ``'popularity'``: per-recipe popularity score, descending.
        """
        print("üõ†Ô∏è  D√©but de la phase de transformation...")

        # 1. Interaction matrix weighted by interaction type and recency.
        df_interactions = self._prepare_weighted_interactions(data.get('interactions'))

        # 2. Recipe features: fill missing values, derive and scale features.
        df_recipes = self._clean_recipe_data(data.get('recipes'))

        # 3. Hybrid popularity score (interactions + average rating).
        df_popularity = self._calculate_hybrid_popularity(df_interactions, data.get('ratings'))

        return {
            'matrix': df_interactions,
            'features': df_recipes,
            'popularity': df_popularity
        }

    def _prepare_weighted_interactions(self, interactions):
        """Build a user x recipe score matrix with exponential time decay.

        Each interaction gets ``base_weight * exp(-rate * age_in_days)`` so
        that recent interactions weigh more than old ones.

        Args:
            interactions: Sequence of records providing ``dateInteraction``,
                ``typeInteraction``, ``userId`` and ``recetteId``.

        Returns:
            DataFrame indexed by ``userId`` with one column per ``recetteId``
            (missing pairs filled with 0), or an empty DataFrame when there
            are no interactions.
        """
        if not interactions:
            return pd.DataFrame()

        df = pd.DataFrame(interactions)
        df['dateInteraction'] = pd.to_datetime(df['dateInteraction'])

        # Use the timezone of the parsed timestamps so the subtraction never
        # mixes naive and aware datetimes.
        now = datetime.now(df['dateInteraction'].iloc[0].tzinfo)
        # Clip at 0: a future-dated record (clock skew, bad data) would
        # otherwise get a decay factor above 1 and outweigh genuine activity.
        df['days_ago'] = (now - df['dateInteraction']).dt.days.clip(lower=0)
        df['time_decay'] = np.exp(-self._TIME_DECAY_RATE * df['days_ago'])

        df['base_weight'] = (
            df['typeInteraction']
            .map(self._INTERACTION_WEIGHTS)
            .fillna(self._DEFAULT_INTERACTION_WEIGHT)
        )
        df['final_score'] = df['base_weight'] * df['time_decay']

        # aggfunc='mean' is pivot_table's default, spelled out because it
        # decides how repeated (user, recipe) interactions are combined.
        return df.pivot_table(
            index='userId',
            columns='recetteId',
            values='final_score',
            aggfunc='mean',
            fill_value=0,
        )

    def _clean_recipe_data(self, recipes):
        """Clean recipe records and derive scaled content features.

        Missing preparation times are replaced by the median, missing or
        empty difficulties by ``'MOYEN'``. A ``complexity_index`` feature
        (preparation time x numeric difficulty) is added, then
        ``tempsPreparation`` and ``complexity_index`` are min-max scaled
        to [0, 1] with ``self.scaler`` (re-fitted on every call).

        Args:
            recipes: Sequence of records providing ``tempsPreparation`` and
                ``difficulte``.

        Returns:
            DataFrame of cleaned recipes with the extra ``diff_num`` and
            ``complexity_index`` columns, or an empty DataFrame when there
            are no recipes.
        """
        # Same empty-input convention as _prepare_weighted_interactions;
        # without it the column lookups below raise KeyError.
        if not recipes:
            return pd.DataFrame()

        df = pd.DataFrame(recipes)

        # Fill in missing values.
        df['tempsPreparation'] = df['tempsPreparation'].fillna(df['tempsPreparation'].median())
        df['difficulte'] = (
            df['difficulte']
            .replace('', self._DEFAULT_DIFFICULTY)
            .fillna(self._DEFAULT_DIFFICULTY)
        )

        # Feature engineering: unknown labels fall back to the default level.
        default_level = self._DIFFICULTY_LEVELS[self._DEFAULT_DIFFICULTY]
        df['diff_num'] = df['difficulte'].map(self._DIFFICULTY_LEVELS).fillna(default_level)
        df['complexity_index'] = df['tempsPreparation'] * df['diff_num']

        # Scale the numeric features into [0, 1].
        cols_to_scale = ['tempsPreparation', 'complexity_index']
        df[cols_to_scale] = self.scaler.fit_transform(df[cols_to_scale])

        return df

    def _calculate_hybrid_popularity(self, df_interactions, ratings):
        """Compute a per-recipe trending score.

        The score is the column sum of the interaction matrix. When ratings
        are supplied it becomes ``0.7 * interactions + 0.3 * mean rating``.

        Args:
            df_interactions: User x recipe matrix as returned by
                ``_prepare_weighted_interactions`` (may be empty).
            ratings: Sequence of records providing ``recetteId`` and ``note``;
                may be empty or ``None``.

        Returns:
            Series indexed by recipe id, sorted by descending score.
        """
        # Total interaction score per recipe (sum over users).
        pop_series = df_interactions.sum(axis=0)

        if ratings:
            df_ratings = pd.DataFrame(ratings)
            avg_ratings = df_ratings.groupby('recetteId')['note'].mean()
            # fill_value=0 keeps recipes present on only one side; a plain
            # `+` aligns on the index and would turn their score into NaN.
            popularity = (pop_series * self._INTERACTION_SHARE).add(
                avg_ratings * self._RATING_SHARE, fill_value=0
            )
        else:
            popularity = pop_series

        return popularity.sort_values(ascending=False)

    def run_pipeline(self, output_path='ai_training_data'):
        """Run extraction, transformation and CSV export end to end.

        Args:
            output_path: Directory receiving the CSV files; created if needed.

        Returns:
            The dict produced by ``process_and_clean_data``.
        """
        # Extraction: collect_training_data is not defined in this class and
        # is expected to come from the parent pipeline.
        raw_data = self.collect_training_data()

        # Transformation
        processed = self.process_and_clean_data(raw_data)

        # Load (export)
        os.makedirs(output_path, exist_ok=True)
        processed['matrix'].to_csv(os.path.join(output_path, "interaction_matrix.csv"))
        processed['features'].to_csv(
            os.path.join(output_path, "recipe_features_cleaned.csv"), index=False
        )
        processed['popularity'].to_csv(os.path.join(output_path, "trending_scores.csv"))

        print(f"\nüöÄ Pipeline termin√©. {len(processed['features'])} recettes pr√™tes pour l'entra√Ænement.")
        return processed

# Script entry point: instantiate the pipeline with its default API base URL.
if __name__ == "__main__":
    pipeline = ProfessionalAIDataPipeline()
    # Example usage, left disabled: call login, then run the full pipeline.
    # pipeline.login(...)
    # results = pipeline.run_pipeline()