In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Path of the raw CSV event log that the notebook samples from.
file_to_load = "Data/2019-Oct.csv"

# Number of CSV rows read per chunk (passed to pd.read_csv).
chunksize = 50_000
# Fraction of each chunk kept in the random sample.
fraction = 0.001
# Seed handed to the sampler and reused later for the numpy random generator.
seed = 42

In [None]:
# Read the CSV lazily, `chunksize` rows at a time, parsing `event_time` as datetimes.
# NOTE(review): the chunked reader is presumably a one-shot iterator — re-run this
# cell before sampling again, otherwise no chunks are left to consume.
chunks = pd.read_csv(file_to_load, chunksize=chunksize, parse_dates=['event_time'])

In [None]:
def getRandomDataset(chunks, frac, seed):
    """Build a random sample by drawing a fraction of rows from every chunk.

    A single random generator, seeded once, is shared by all chunks. Re-seeding
    each chunk with the same integer would pick the same row offsets in every
    chunk of equal size, which is not a uniform random sample of the file.

    Parameters
    ----------
    chunks : iterable of pandas.DataFrame
        Chunks to sample from (e.g. the reader returned by ``pd.read_csv``
        with ``chunksize``).
    frac : float
        Fraction of rows kept from each chunk.
    seed : int or None
        Seed of the shared generator; the result is reproducible for a given seed.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-chunk samples, original index preserved.
        An empty frame is returned when ``chunks`` yields nothing.
    """
    import numpy as np  # local import: numpy is only imported further down in the notebook

    rng = np.random.default_rng(seed)
    samples = [chunk.sample(frac=frac, random_state=rng) for chunk in chunks]

    # pd.concat raises on an empty list; return an empty frame instead.
    if not samples:
        return pd.DataFrame()
    return pd.concat(samples)

In [None]:
# Working sample: `fraction` of the rows of every chunk, seeded with `seed`.
df = getRandomDataset(chunks, fraction, seed)

# Exploration

In [None]:
# Preview the first rows of the sample.
df.head()

In [None]:
# Number of distinct category identifiers present in the sample.
df.category_id.nunique()

In [None]:
# Summary statistics of the sample (pandas `describe`).
df.describe()

In [None]:
# Column overview of the sample (pandas `info`).
df.info()

In [None]:
def createNaAndUniqueMatrix(df):
    """Summarise missing and distinct values for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``df`` and two rows:
        "Valeurs nulles" holds ``"<count> (<percentage>%)"`` of missing values,
        "Valeurs uniques" holds the number of distinct non-missing values.
        For an empty frame the percentage is reported as 0.00%.
    """
    rows = ["Valeurs nulles", "Valeurs uniques"]
    total = len(df)

    data = {}
    for column in df.columns:
        values = df[column]
        # Count NaNs on the column itself rather than filtering the whole frame.
        na_count = int(values.isna().sum())
        # Guard against an empty frame, where the ratio would divide by zero.
        na_pct = na_count / total * 100 if total else 0.0

        # nunique() ignores missing values, like unique() applied after dropping them.
        data[column] = [f'{na_count} ({na_pct:.2f}%)', values.nunique()]

    return pd.DataFrame(data, index=rows)

In [None]:
# Missing-value and distinct-value counts for every column of the sample.
createNaAndUniqueMatrix(df)

Analysons les 3 valeurs uniques d'event_type

In [None]:
# Occurrences of each event type, missing values excluded.
frequences = df["event_type"].dropna().value_counts()

ax = frequences.plot.bar()

# Write the exact count just above each bar.
for position, count in zip(range(len(frequences)), frequences):
    ax.text(position, count + 0.1, str(count), ha="center", va="bottom")

ax.set_title("Fréquence des types d'événements")
ax.set_xlabel("Événement")
ax.set_ylabel("Fréquence")
plt.show()

On recense donc 407 757 vues, 9 237 ajouts au panier et 7 494 achats.

Quels sont les taux de conversion entre les types d'événements ?

In [None]:
# Event counts taken from the frequency table computed above.
nbViews = frequences["view"]
nbCarts = frequences["cart"]
nbPurchases = frequences["purchase"]

# Conversion rates between funnel steps, as percentages.
viewToCart = nbCarts / nbViews * 100
cartToPurchase = nbPurchases / nbCarts * 100
viewToPurchase = nbPurchases / nbViews * 100

print(f"Taux de conversion vue => panier : {viewToCart:.2f} %")
print(f"Taux de conversion panier => achat : {cartToPurchase:.2f} %")
print(f"Taux de conversion vue => achat : {viewToPurchase:.2f} %")

Combien d'utilisateurs différents ont fait un achat ?

In [None]:
# Purchase events only; every statistic below is derived from this subset.
purchaseEvents = df[df["event_type"] == "purchase"]

# Distinct buyers and average number of purchases per buyer.
usersPurchased = len(purchaseEvents["user_id"].unique())
itemsPurchasedPerUser = frequences["purchase"] / usersPurchased

# Largest number of purchases by a single user, then within a single session.
maxUserBuy = purchaseEvents["user_id"].value_counts().max()
maxSessionBuy = purchaseEvents["user_session"].value_counts().max()

print(f"Un total de {usersPurchased} utilisateurs différents ont effectué des achats sur le site")
print(f"Un utilisateur qui achète sur le site achète en moyenne {itemsPurchasedPerUser:.2f} articles")
print(f"L'utilisateur qui a le plus acheté a acheté {maxUserBuy} articles sur le mois")
print(f"La session utilisateur qui a le plus acheté a acheté {maxSessionBuy} articles en une session")

Comment évoluent les achats sur le mois ?

In [None]:

# Calendar day of every event in the sample.
datesDayToDay = pd.to_datetime(df["event_time"]).dt.date

# Every day present in the sample, in chronological order.
uniqueDates = sorted(datesDayToDay.dropna().unique())

# Purchases per day, re-indexed on `uniqueDates` so that days without any
# purchase count as 0 and the series lines up one-to-one with the dates it is
# plotted against (the grouped result alone only contains purchase days).
purchasesDayToDay = (
    df[df["event_type"] == "purchase"]
    .groupby(datesDayToDay)["event_type"]
    .count()
    .reindex(uniqueDates, fill_value=0)
)

In [None]:
# Plot the series against its own index: x and y are then aligned by
# construction, whatever the order or length of `uniqueDates`.
purchaseDays = purchasesDayToDay.index
purchaseCounts = purchasesDayToDay.values

plt.figure(figsize=(20, 5))
plt.plot(purchaseDays, purchaseCounts)

# Annotate each point with its daily count.
for x, y in zip(purchaseDays, purchaseCounts):
    plt.text(x, y + 0.3, str(y), ha="center", va="bottom", fontsize=9)

plt.title("Ventes effectués au cours du mois")
plt.xlabel("Date")
plt.ylabel("Quantité")
plt.grid(True)
# One tick per day observed in the sample.
plt.xticks(uniqueDates, rotation=45)

plt.show()

Quel est le chiffre d'affaires quotidien ?

In [None]:
# Calendar day of every event in the sample.
datesDayToDay = pd.to_datetime(df["event_time"]).dt.date

# Every day present in the sample, in chronological order.
uniqueDates = sorted(datesDayToDay.dropna().unique())

# Revenue per day (sum of purchase prices), re-indexed on `uniqueDates` so that
# days without purchases show 0 and the series lines up one-to-one with the
# dates it is plotted against.
gainsDayToDay = (
    df[df["event_type"] == "purchase"]
    .groupby(datesDayToDay)["price"]
    .sum()
    .reindex(uniqueDates, fill_value=0.0)
)

In [None]:
# Plot the series against its own index: bars and dates are then aligned by
# construction, whatever the order or length of `uniqueDates`.
revenueDays = gainsDayToDay.index
revenueValues = gainsDayToDay.values

plt.figure(figsize=(25, 5))
plt.bar(revenueDays, revenueValues)

# Annotate each bar with its amount, rounded to cents to avoid float noise.
for x, y in zip(revenueDays, revenueValues):
    plt.text(x, y + 0.3, f"{y:.2f}", ha="center", va="bottom", fontsize=9)

plt.title("Chiffre d'affaire effectué au cours du mois")
plt.xlabel("Date")
plt.ylabel("Chiffre d'affaire")
plt.grid(True)
# One tick per day observed in the sample.
plt.xticks(uniqueDates, rotation=45)
plt.show()

# Transformation

In [None]:
import pandas as pd
import numpy as np

# Builds one row of behavioural features per user (`df_user_features`) from the
# event-level sample `df`. The cell only adds or overwrites columns on `df`, so
# it can be re-run safely and earlier cells that read `event_type` keep working.

# 1. Timestamp conversion
df['timestamp'] = pd.to_datetime(df['event_time'])

# 2. Temporal components
df['hour'] = df['timestamp'].dt.hour
df['day'] = df['timestamp'].dt.dayofweek  # 0 = Monday, 6 = Sunday

# 3. Part-of-day bucketing
def time_of_day(h):
    """Map an hour of the day to 'morning', 'afternoon', 'evening' or 'night'."""
    if 6 <= h < 12:  return 'morning'
    elif 12 <= h < 18: return 'afternoon'
    elif 18 <= h < 24: return 'evening'
    else: return 'night'

df['time_period'] = df['hour'].apply(time_of_day)

# 4. One-hot encoding of the event types.
# The indicator columns are added next to `event_type` instead of replacing it:
# dropping the source column made this cell (and the exploration cells above)
# fail with a KeyError on a second run.
event_dummies = pd.get_dummies(df['event_type'], prefix='event_type', dtype=int)
for col in event_dummies.columns:
    df[col] = event_dummies[col]
# Guarantee the three expected indicators even if a type is absent from the sample.
for col in ['event_type_view', 'event_type_cart', 'event_type_purchase']:
    if col not in df:
        df[col] = 0

# 5. Main per-user aggregates
agg_behaviour = df.groupby('user_id').agg(
    total_views=('event_type_view', 'sum'),
    total_cart=('event_type_cart', 'sum'),
    total_purchase=('event_type_purchase', 'sum'),
    unique_categories=('category_id', 'nunique'),
    last_activity=('timestamp', 'max')
).reset_index()

# 6. Total spent, computed on purchase events only
total_spent = (
    df[df['event_type_purchase'] == 1]
    .groupby('user_id')['price']
    .sum()
    .rename('total_spent')
    .reset_index()
)

agg_behaviour = agg_behaviour.merge(total_spent, on='user_id', how='left')
agg_behaviour['total_spent'] = agg_behaviour['total_spent'].fillna(0)

# 7. Derived KPIs; a zero denominator becomes NaN here and is turned into 0 by
# the final fillna(0) of step 11.
agg_behaviour['avg_basket'] = agg_behaviour['total_spent'] / agg_behaviour['total_purchase'].replace(0, np.nan)
agg_behaviour['conversion_rate'] = agg_behaviour['total_purchase'] / agg_behaviour['total_views'].replace(0, np.nan)

# 8. Share of each user's events per part of the day.
# The columns are pinned to the four known periods so the feature set does not
# change shape when one period is missing from the sample.
time_dist = (
    df.groupby(['user_id', 'time_period'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=['afternoon', 'evening', 'morning', 'night'], fill_value=0)
    .pipe(lambda d: d.div(d.sum(axis=1), axis=0))
)

# 9. Mean hour of activity, averaged on the circle (hour -> angle -> mean cos/sin
# -> angle) so that 23h and 1h average to midnight rather than noon.
# cos/sin are computed once on the whole column, then averaged per user.
df['hour_rad'] = 2 * np.pi * df['hour'] / 24
hour_avg = (
    pd.DataFrame({
        'hour_cos': np.cos(df['hour_rad']).groupby(df['user_id']).mean(),
        'hour_sin': np.sin(df['hour_rad']).groupby(df['user_id']).mean(),
    })
    .rename_axis('user_id')
    .reset_index()
)
hour_avg['peak_hour'] = np.arctan2(hour_avg['hour_sin'], hour_avg['hour_cos']) * (24 / (2 * np.pi))
hour_avg['peak_hour'] = hour_avg['peak_hour'] % 24

# 10. Recency in days, relative to the most recent event of the sample
now = df['timestamp'].max()
agg_behaviour['recency_days'] = (now - agg_behaviour['last_activity']).dt.days

# 11. Final merge + clean-up
df_user_features = (
    agg_behaviour
    .merge(time_dist, on='user_id', how='left')
    .merge(hour_avg[['user_id', 'peak_hour']], on='user_id', how='left')
    .drop(columns=['last_activity'])
    .fillna(0)
)

df_user_features.head()

In [None]:
# Number of most-purchased categories that keep a column of their own
TOP_N = 20

# Purchase events only; both the ranking and the pivot are built from them.
purchase_events = df[df['event_type_purchase'] == 1]

# 1. The TOP_N categories with the most purchases
top_categories = (
    purchase_events
    .groupby('category_id')
    .size()
    .sort_values(ascending=False)
    .head(TOP_N)
    .index
)

# 2. Purchases per user and category, with every category outside the top
#    folded into a single 'other' bucket
category_or_other = purchase_events['category_id'].where(
    purchase_events['category_id'].isin(top_categories), 'other'
)
purchase_pivot = (
    purchase_events
    .assign(category_id=category_or_other)
    .groupby(['user_id', 'category_id'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

# 3. Prefix every category column with "cat_"; 'user_id' (first column after
#    reset_index) keeps its name
purchase_pivot.columns = ['user_id'] + [f"cat_{col}" for col in purchase_pivot.columns[1:]]
category_cols = [col for col in purchase_pivot.columns if col != 'user_id']

# 4. Attach the category counts to the user features; users without any
#    purchase get 0 everywhere
df_final = df_user_features.merge(purchase_pivot, on='user_id', how='left').fillna(0)

# Optional: turn the counts into each user's share of purchases per category.
# A zero row total becomes NaN before the division and is reset to 0 afterwards.
row_totals = df_final[category_cols].sum(axis=1).replace(0, np.nan)
df_final[category_cols] = df_final[category_cols].div(row_totals, axis=0).fillna(0)

print(df_final.head())

Cherchons maintenant les corrélations entre les différentes features

In [None]:
# Correlations between behavioural features only. `user_id` is still present in
# df_final at this point; it is an identifier, not a feature, so it is left out
# (errors='ignore' keeps the cell working once the column has been dropped).
corr_matrix = (
    df_final
    .drop(columns=['user_id'], errors='ignore')
    .corr(numeric_only=True)
)

fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True, ax=ax)
ax.set_title("Matrice de corrélation")
fig.tight_layout()
plt.show()

**TODO**

- Regrouper les clients par interactions panier / nombre de connexions
- Achats par heure de la journée
- PySpark
- Analyses bivariées
- Retravailler les données
- Qualitatif -> quantitatif
- Tests ANOVA et du Chi²

In [None]:
# Remove the identifier before scaling and modelling. errors='ignore' makes the
# cell safe to re-run: the column is already gone after the first execution.
df_final = df_final.drop(columns=['user_id'], errors='ignore')

In [None]:
# Summary statistics of the final feature table (pandas `describe`).
df_final.describe()

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the feature table, then apply it to the same table.
scaler = StandardScaler().fit(df_final)
df_scaled = scaler.transform(df_final)

# Modèle

In [None]:
from sklearn.decomposition import PCA

# Project the scaled features onto 3 principal components.
# random_state pins the result when scikit-learn selects its randomized SVD
# solver, so the components (and the clustering built on them) are reproducible.
pca = PCA(n_components=3, random_state=seed)
df_pca = pca.fit_transform(df_scaled)

print(f"Variance expliquée cumulée : {pca.explained_variance_ratio_.cumsum()}")
# Show only the first rows instead of dumping the whole array.
df_pca[:5]

In [None]:
import plotly.express as px

# One column per principal component.
df_plot = pd.DataFrame(df_pca[:, :3], columns=['PC1', 'PC2', 'PC3'])

# Interactive 3D scatter of the projected users.
scatter_options = dict(title='Visualisation PCA', opacity=0.7, size_max=5)
fig = px.scatter_3d(df_plot, x='PC1', y='PC2', z='PC3', **scatter_options)

fig.show()

## Recherche des hyperparamètres

In [None]:
from sklearn.neighbors import NearestNeighbors

# Neighbourhood size: number of PCA dimensions + 1.
min_samples = df_pca.shape[1] + 1

# Distances from every point to its `min_samples` nearest neighbours.
neighbors = NearestNeighbors(n_neighbors=min_samples)
neighbors_fit = neighbors.fit(df_pca)
distances, indices = neighbors_fit.kneighbors(df_pca)

# Keep the distance to the last (k-th) neighbour only, in ascending order.
distances = np.sort(distances[:, -1])

# k-distance curve
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(distances)
ax.set_xlabel("Points triés")
ax.set_ylabel(f"Distance au {min_samples}ème voisin")
ax.set_title("k-distance plot")
plt.show()

In [None]:
# Quartiles of the sorted k-distances, computed in a single call.
q25, q50, q75 = np.percentile(distances, [25, 50, 75])

print(f"Min distance: {distances.min()}")
print(f"Max distance: {distances.max()}")
print(f"Moyenne distance: {distances.mean()}")
print(f"25% quantile: {q25}")
print(f"50% quantile: {q50}")
print(f"75% quantile: {q75}")

In [None]:
from kneed import KneeLocator

# Knee-detection sensitivities to try; each may yield one candidate eps.
sensitivities = [0.5, 1]
# Candidate eps values for DBSCAN. Only usable values are stored: no None
# placeholder, no non-positive value, no duplicate — the evaluation loop passes
# each entry straight to DBSCAN.
eps_values = []

for S in sensitivities:
    kneedle = KneeLocator(range(len(distances)), distances, S=S, curve='convex', direction='increasing')
    # The x axis is range(len(distances)), so the knee is an index into `distances`.
    knee_idx = kneedle.knee
    if knee_idx is None:
        print(f"Aucun coude détecté avec S={S}")
        continue

    eps = distances[knee_idx]
    print(f"eps détecté avec S={S} : {eps:.4f}")
    # DBSCAN needs a strictly positive radius; evaluating the same eps twice is wasted work.
    if eps > 0 and eps not in eps_values:
        eps_values.append(eps)

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import DBSCAN

# Scores are computed on a random subsample of at most this many clustered points.
max_sample_size = 1000

# Best silhouette score seen so far and the eps that produced it.
best_score = -1
best_eps = None
rng = np.random.default_rng(seed=seed)

for eps in eps_values:
    # A sensitivity with no detected knee may have left a None placeholder: skip it.
    if eps is None:
        continue

    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(df_pca)
    labels = clustering.labels_

    # Label -1 marks noise points and is not a cluster.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    noise_ratio = np.mean(labels == -1)
    print(f"eps={eps:.2f} -> clusters: {n_clusters}, bruit: {noise_ratio*100:.3f}%")

    # Score the clustered points only.
    mask = labels != -1
    X_no_noise = df_pca[mask]
    labels_no_noise = labels[mask]

    if n_clusters <= 1 or len(X_no_noise) == 0:
        print("Pas assez de clusters pour calculer les scores.")
        continue

    sample_size = min(max_sample_size, len(X_no_noise))
    # Own name for the subsample positions, so the `indices` returned by
    # kneighbors in the k-distance cell is not overwritten.
    sample_idx = rng.choice(len(X_no_noise), sample_size, replace=False)
    X_sample = X_no_noise[sample_idx]
    labels_sample = labels_no_noise[sample_idx]

    # The subsample itself must contain between 2 and n_samples - 1 distinct
    # labels, otherwise the score functions raise.
    n_sampled_labels = len(np.unique(labels_sample))
    if not 1 < n_sampled_labels < len(labels_sample):
        print("Pas assez de clusters pour calculer les scores.")
        continue

    sil_score = silhouette_score(X_sample, labels_sample)
    ch_score = calinski_harabasz_score(X_sample, labels_sample)
    db_score = davies_bouldin_score(X_sample, labels_sample)

    print(f"  Silhouette Score    : {sil_score:.3f}")
    print(f"  Calinski-Harabasz   : {ch_score:.3f}")
    print(f"  Davies-Bouldin      : {db_score:.3f}")

    # The silhouette score decides which eps is kept.
    if sil_score > best_score:
        best_score = sil_score
        best_eps = eps

if best_eps is None:
    print("Aucun eps candidat n'a pu être évalué : best_eps reste indéfini.")

## Modèle final

In [None]:
# Fail early with an explicit message when the search found no usable eps,
# rather than letting DBSCAN reject eps=None with a less helpful error.
if best_eps is None:
    raise ValueError("Aucun eps valide n'a été trouvé : impossible d'entraîner le modèle final.")

# Final model, fitted with the eps that obtained the best silhouette score.
dbscan = DBSCAN(eps=best_eps, min_samples=min_samples)
clusters = dbscan.fit_predict(df_pca)

# Evaluation

## Visualisation des clusters

In [None]:

# Principal components plus the DBSCAN label of every point.
df_plot = pd.DataFrame(df_pca[:, :3], columns=['PC1', 'PC2', 'PC3'])
df_plot['cluster'] = clusters

# Legend entry per point: noise (label -1) is shown as 'grey', clusters by number.
df_plot['color'] = [
    'grey' if label == -1 else f'cluster {label}'
    for label in df_plot['cluster']
]

scatter_options = dict(
    color='color',
    title='Clusters DBSCAN',
    opacity=0.7,
    size_max=5,
    labels={'color': 'Cluster'},
)
fig = px.scatter_3d(df_plot, x='PC1', y='PC2', z='PC3', **scatter_options)

fig.show()

## Importance de chaque variable par cluster

In [None]:

import pandas as pd
import numpy as np

# Attach the labels of the FINAL model (`clusters`) to the feature table.
# `labels` only holds the result of the last eps tried during the search, which
# is not necessarily the eps that was selected.
df_clusters = df_final.copy()
df_clusters['cluster'] = clusters

# Noise points (label -1) are excluded
df_clusters = df_clusters[df_clusters['cluster'] != -1]

# Mean of every feature per cluster
cluster_means = df_clusters.groupby('cluster').mean()

# Mean of every feature over all users (noise included)
global_mean = df_final.mean()

# Relative importance: deviation of the cluster mean from the global mean.
# NOTE(review): computed on unscaled features, so large-magnitude features
# presumably dominate the ranking — consider dividing by the global std.
feature_importance = cluster_means - global_mean

# For each cluster, list the 10 features with the largest absolute deviation
for c in feature_importance.index:
    print(f"\nCluster {c}")
    print(feature_importance.loc[c].abs().sort_values(ascending=False).head(10))