In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 📌 Chargement du fichier Parquet avec Dask (gestion mémoire optimisée)
data_path = "../data/output/user_features.parquet"
df_dask = dd.read_parquet(data_path)

# ✅ Vérification des types de colonnes avant correction
print("📊 Types des colonnes avant correction :")
print(df_dask.dtypes)

# 🔧 Détection et conversion des colonnes mal typées en float
cols_to_convert = ["total_events", "total_views", "total_purchases", "unique_categories", "conversion_rate"]
for col in cols_to_convert:
    if col in df_dask.columns:
        if df_dask[col].dtype == "object" or df_dask[col].dtype == "string":
            print(f"⚠️ Conversion de {col} en float...")
            df_dask[col] = df_dask[col].astype(float)

# 🔄 Suppression des valeurs non numériques
df_dask = df_dask.replace(["NaN", "NULL", "inf", "-inf"], np.nan)
df_dask = df_dask.dropna()

# 📥 Conversion en Pandas après correction
df = df_dask.compute()
print("📊 Types des colonnes après correction :")
print(df.dtypes)

# 📌 Normalisation des données
df = (df - df.min()) / (df.max() - df.min())  # Normalisation Min-Max

# 📉 Réduction de dimension avec ACP
pca = PCA(n_components=3)
X_pca = pca.fit_transform(df)

# 🔹 Clustering avec K-Means pour générer des labels
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=3)
df["cluster"] = kmeans.fit_predict(X_pca)

# 🔀 Séparation Train/Test
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["cluster"]), df["cluster"], test_size=0.2, random_state=42)

# 🌳 Entraînement d'un modèle Random Forest
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf.fit(X_train, y_train)

# 📊 Évaluation du modèle
y_pred = clf.predict(X_test)
print("\n📊 Rapport de classification :\n", classification_report(y_test, y_pred))

# 🔍 Visualisation des features importantes
feature_importances = pd.Series(clf.feature_importances_, index=X_train.columns)
plt.figure(figsize=(10, 5))
feature_importances.nlargest(10).plot(kind='barh', color='skyblue')
plt.title("Top 10 des features les plus importantes")
plt.show()


📊 Types des colonnes avant correction :
total_events                   int64
total_views                  float64
total_purchases              float64
unique_categories              int64
last_event_time      string[pyarrow]
conversion_rate              float64
dtype: object
📊 Types des colonnes après correction :
total_events                   int64
total_views                  float64
total_purchases              float64
unique_categories              int64
last_event_time      string[pyarrow]
conversion_rate              float64
dtype: object


ArrowNotImplementedError: Function 'subtract_checked' has no kernel matching input types (large_string, large_string)