Regrouper les clients par interactions panier/nombre de connexions
Achat par heure de la journée
PySpark
Analyses Bivariées
Retravailler les données
Qualitatif -> Quantitatif
Tests ANOVA et du Chi²

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Path of the CSV file that the following cells read.
file_to_load = "./DATA/2019-Oct.csv"

# Sampling settings: rows read per chunk, fraction of each chunk that is kept,
# and the random seed passed to the sampler.
chunksize = 50_000
fraction = 0.01
seed = 42

In [None]:
# Read the CSV lazily, `chunksize` rows at a time, parsing `event_time` as datetimes.
# NOTE(review): with `chunksize` set, read_csv returns a one-shot reader; once it has
# been iterated, this cell has to be re-run before the data can be sampled again.
chunks = pd.read_csv(file_to_load, chunksize=chunksize, parse_dates=['event_time'])

In [None]:
def getRandomDataset(chunks, frac, seed):
    """Return a random subset of a chunked dataset as a single DataFrame.

    Args:
        chunks: Iterable of DataFrames (for example a chunked CSV reader).
        frac: Fraction of the rows of each chunk to keep.
        seed: Random state passed to every per-chunk ``sample`` call.

    Returns:
        The sampled rows of all chunks, concatenated in chunk order.
    """
    sampled_parts = [
        part.sample(frac=frac, random_state=seed)
        for part in chunks
    ]
    return pd.concat(sampled_parts)

In [None]:
# Working dataset: a `fraction` sample of every chunk, concatenated into one DataFrame.
df = getRandomDataset(chunks, fraction, seed)

# Exploration

In [None]:
# Preview the first rows of the sampled DataFrame.
df.head()

In [None]:
# Summary statistics of the sampled DataFrame.
df.describe()

In [None]:
# Column dtypes, non-null counts and memory usage of the sampled DataFrame.
df.info()

In [None]:
def createNaAndUniqueMatrix(df):
    """Summarise missing and distinct values for every column of *df*.

    Args:
        df: DataFrame to inspect.

    Returns:
        A DataFrame with one column per column of *df* and two rows:
        "Valeurs nulles" holds the number of missing values followed by their
        share of all rows, formatted as ``"<count> (<pct>%)"``, and
        "Valeurs uniques" holds the number of distinct non-missing values.
        For a DataFrame without rows the share is reported as 0.00%.
    """
    rows = ["Valeurs nulles", "Valeurs uniques"]
    total_rows = len(df)

    data = {}

    for column in df.columns:
        values = df[column]
        # Count missing entries on the boolean mask instead of materialising
        # a filtered copy of the whole frame for every column.
        na_count = int(values.isna().sum())
        # An empty frame has no meaningful ratio; report 0 % rather than
        # dividing by zero.
        na_pct = na_count / total_rows * 100 if total_rows else 0.0

        # nunique() ignores missing values, like the explicit NaN filter did.
        data[column] = [f'{na_count} ({na_pct:.2f}%)', values.nunique()]

    return pd.DataFrame(data, index=rows)

In [None]:
# Show the missing-value / distinct-value summary for the sampled data.
createNaAndUniqueMatrix(df)

Analysons les 3 valeurs uniques d'event_type

In [None]:
# Occurrences of each event type, most frequent first. value_counts() leaves
# missing values out by default, so no separate NaN filter is needed.
frequences = df["event_type"].value_counts()

ax = frequences.plot.bar()

# Write the exact count just above each bar.
for bar_position, count in enumerate(frequences):
    ax.text(bar_position, count + 0.1, str(count), ha="center", va="bottom")

ax.set_title("Fréquence des types d'événements")
ax.set_xlabel("Événement")
ax.set_ylabel("Fréquence")
plt.show()

Sur l'échantillon de 1 %, on recense donc 407 787 vues, 9 222 mises au panier et 7 483 achats

Quels sont les taux de conversion entre les types d'événements ?

In [None]:
# Event counts taken from the frequency table built earlier in the notebook.
n_view = frequences["view"]
n_cart = frequences["cart"]
n_purchase = frequences["purchase"]

# Conversion rates between funnel steps, expressed in percent.
viewToCart = n_cart / n_view * 100
cartToPurchase = n_purchase / n_cart * 100
viewToPurchase = n_purchase / n_view * 100

print(
    f"Taux de conversion vue => panier : {viewToCart:.2f} %",
    f"Taux de conversion panier => achat : {cartToPurchase:.2f} %",
    f"Taux de conversion vue => achat : {viewToPurchase:.2f} %",
    sep="\n",
)

Combien d'utilisateurs différents ont fait un achat ?

In [None]:
# Restrict to purchase events once and reuse the subset for every statistic.
purchase_events = df[df["event_type"] == "purchase"]

# Number of distinct buyers and average number of purchase events per buyer.
usersPurchased = len(purchase_events["user_id"].unique())
itemsPurchasedPerUser = frequences["purchase"] / usersPurchased

# Largest number of purchase events for a single user / a single session.
maxUserBuy = purchase_events.groupby("user_id").size().max()
maxSessionBuy = purchase_events.groupby("user_session").size().max()

print(
    f"Un total de {usersPurchased} utilisateurs différents ont effectué des achats sur le site",
    f"Un utilisateur qui achète sur le site achète en moyenne {itemsPurchasedPerUser:.2f} articles",
    f"L'utilisateur qui a le plus acheté a acheté {maxUserBuy} articles sur le mois",
    f"La session utilisateur qui a le plus acheté a acheté {maxSessionBuy} articles en une session",
    sep="\n",
)

Comment évoluent les achats sur le mois ?

In [None]:
# Calendar day of every event (time of day dropped).
datesDayToDay = pd.to_datetime(df["event_time"]).dt.date

# Number of purchase events per day; groupby returns the days in sorted order.
purchasesDayToDay = (
    df[df["event_type"] == "purchase"]
    .groupby(datesDayToDay)["event_type"]
    .count()
)

# Take the x-axis from the grouped result itself. Deriving it from all events
# (in order of appearance) only lines up with the counts if every day has at
# least one purchase and the rows happen to be in chronological order.
uniqueDates = purchasesDayToDay.index.to_numpy()

In [None]:
# Use the Series' own index as the x-axis so that every count is drawn on the
# day it belongs to, independently of how any separate date array was built.
purchaseDays = purchasesDayToDay.index.to_numpy()
purchaseCounts = purchasesDayToDay.to_numpy()

plt.figure(figsize=(20, 5))
plt.plot(purchaseDays, purchaseCounts)

# Annotate each point with its value, slightly above the marker.
for day, count in zip(purchaseDays, purchaseCounts):
    plt.text(day, count + 0.3, str(count), ha="center", va="bottom", fontsize=9)

plt.title("Ventes effectués au cours du mois")
plt.xlabel("Date")
plt.ylabel("Quantité")
plt.grid(True)
plt.xticks(purchaseDays, rotation=45)

plt.show()

Quel est le chiffre d'affaires quotidien ?

In [None]:
# Calendar day of every event (time of day dropped).
datesDayToDay = pd.to_datetime(df["event_time"]).dt.date

# Sum of `price` over purchase events, per day; groupby returns sorted days.
gainsDayToDay = (
    df[df["event_type"] == "purchase"]
    .groupby(datesDayToDay)["price"]
    .sum()
)

# Take the x-axis from the grouped result itself. Deriving it from all events
# (in order of appearance) only lines up with the sums if every day has at
# least one purchase and the rows happen to be in chronological order.
uniqueDates = gainsDayToDay.index.to_numpy()

In [None]:
# Use the Series' own index as the x-axis so that every bar is drawn on the
# day it belongs to, independently of how any separate date array was built.
revenueDays = gainsDayToDay.index.to_numpy()
revenueValues = gainsDayToDay.to_numpy()

plt.figure(figsize=(25, 5))
plt.bar(revenueDays, revenueValues)

# Label each bar with its amount rounded to two decimals; passing the raw
# float would print its full, unrounded representation.
for day, revenue in zip(revenueDays, revenueValues):
    plt.text(day, revenue + 0.3, f"{revenue:.2f}", ha="center", va="bottom", fontsize=9)

plt.title("Chiffre d'affaire effectué au cours du mois")
plt.xlabel("Date")
plt.ylabel("Chiffre d'affaire")
plt.grid(True)
plt.xticks(revenueDays, rotation=45)

plt.show()

# Transformation

In [None]:
import numpy as np

In [None]:
# One-hot encode `event_type`: the column is replaced by one indicator column
# per distinct value, so cells that read df["event_type"] cannot be re-run
# after this one.
df = pd.get_dummies(df, columns=["event_type"])

In [None]:
# NOTE(review): this encoder is not referenced again in the visible part of the
# notebook (event_type is encoded with pd.get_dummies) — confirm it is still needed.
encoder = OneHotEncoder(sparse_output=False)

In [None]:
# Encode each time cycle as a (cos, sin) pair so that the two ends of a cycle
# (e.g. 23h and 0h) end up close to each other.
event_timestamps = df['event_time']

# Column prefix -> (position inside the cycle, length of the cycle).
time_cycles = {
    'day_year': (event_timestamps.dt.dayofyear, 365),
    'day_week': (event_timestamps.dt.weekday, 7),  # 0 = Monday
    'hour': (event_timestamps.dt.hour, 24),
    # Minute of the day: 0 .. 1439.
    'minute': (event_timestamps.dt.hour * 60 + event_timestamps.dt.minute, 1440),
}

for prefix, (position, period) in time_cycles.items():
    df[f'{prefix}_cos'] = np.cos(2 * np.pi * position / period)
    df[f'{prefix}_sin'] = np.sin(2 * np.pi * position / period)

# The raw timestamp is fully replaced by the encoded features.
df.drop(columns=['event_time'], inplace=True)

In [None]:
# Preview the DataFrame after the encoding steps.
df.head()

In [None]:
# Pairwise correlations between the numeric columns.
corr_matrix = df.corr(numeric_only=True)

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True, ax=ax)
ax.set_title("Matrice de corrélation")
fig.tight_layout()
plt.show()

In [None]:
# Feature matrix for the model: drop the product/category/brand/session columns.
# NOTE(review): `user_id` is not in the drop list, so it is scaled and fed to the
# PCA as if it were a numeric feature — confirm this is intended.
df_final = df.drop(['product_id', 'category_id', 'category_code', 'brand', 'user_session'], axis=1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature: learn the per-column statistics first, then apply
# them to the same data.
scaler = StandardScaler()
scaler.fit(df_final)
df_scaled = scaler.transform(df_final)

In [None]:
# Display the standardised feature array.
df_scaled

# Modèle

In [None]:
from sklearn.decomposition import PCA

# Project the standardised features onto their first three principal components.
pca = PCA(n_components=3)
df_pca = pca.fit_transform(df_scaled)

# Running total of the variance ratio explained by the components kept.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
print(f"Variance expliquée cumulée : {cumulative_variance}")

In [None]:
import plotly.express as px

# One named column per principal component (df_pca has exactly three).
df_plot = pd.DataFrame(df_pca[:, :3], columns=['PC1', 'PC2', 'PC3'])

# Interactive 3D scatter of the projected points.
fig = px.scatter_3d(
    df_plot,
    x='PC1',
    y='PC2',
    z='PC3',
    title='Visualisation PCA',
    opacity=0.7,
    size_max=5,
)

fig.show()

## Recherche des hyperparamètres

In [None]:
from sklearn.neighbors import NearestNeighbors

# Neighbourhood size used for the search: number of PCA dimensions + 1.
min_samples = df_pca.shape[1] + 1

neighbors = NearestNeighbors(n_neighbors=min_samples)
neighbors_fit = neighbors.fit(df_pca)
# NOTE(review): the query set is the training set itself, so each point is
# presumably returned as its own first neighbour (distance 0) — confirm
# against the scikit-learn documentation.
distances, indices = neighbors_fit.kneighbors(df_pca)

# Keep only the last column (distance to the farthest of the `min_samples`
# neighbours returned for each point) and sort it in ascending order.
distances = np.sort(distances[:, -1])

# Draw the sorted k-distance curve.
fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(distances)
ax.set_xlabel("Points triés")
ax.set_ylabel(f"Distance au {min_samples}ème voisin")
ax.set_title("k-distance plot")
plt.show()

In [None]:
# Basic statistics of the sorted k-distances.
print(f"Min distance: {distances.min()}")
print(f"Max distance: {distances.max()}")
print(f"Moyenne distance: {distances.mean()}")

# Quartiles of the k-distances.
for quantile in (25, 50, 75):
    print(f"{quantile}% quantile: {np.percentile(distances, quantile)}")

In [None]:
from kneed import KneeLocator

sensitivities = [0.5, 1]
eps_values = []

# x-axis of the k-distance curve: the rank of each point once sorted.
point_ranks = range(len(distances))

# Look for the knee of the k-distance curve with each sensitivity; the distance
# at the knee is the candidate eps.
for S in sensitivities:
    kneedle = KneeLocator(point_ranks, distances, S=S, curve='convex', direction='increasing')
    knee_idx = kneedle.knee

    if knee_idx is None:
        # NOTE(review): a None placeholder is stored when no knee is found;
        # any consumer of eps_values has to skip it.
        print(f"Aucun coude détecté avec S={S}")
        eps_values.append(None)
        continue

    eps = distances[knee_idx]
    print(f"eps détecté avec S={S} : {eps:.4f}")
    eps_values.append(eps)

In [None]:
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.cluster import DBSCAN

# Upper bound on the number of points used to compute the scores.
max_sample_size = 10000

best_score = -1
best_eps = None

# Seeded generator (same notebook-level seed as the data sampling) so that the
# scoring subsample, and therefore the selected eps, is reproducible.
rng = np.random.default_rng(seed)

# Keep usable radii only: skip the None placeholders stored when no knee was
# found, skip non-positive values (DBSCAN requires eps > 0), and evaluate each
# distinct radius once while preserving the original order.
candidate_eps = list(dict.fromkeys(
    eps for eps in eps_values if eps is not None and eps > 0
))

if not candidate_eps:
    print("Aucun eps valide à évaluer.")

for eps in candidate_eps:
    clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(df_pca)
    labels = clustering.labels_

    # Label -1 marks noise points; it is not a cluster.
    mask = labels != -1
    n_clusters = len(np.unique(labels[mask]))
    noise_ratio = np.mean(~mask)
    print(f"eps={eps:.2f} -> clusters: {n_clusters}, bruit: {noise_ratio*100:.3f}%")

    X_no_noise = df_pca[mask]
    labels_no_noise = labels[mask]

    if n_clusters <= 1 or len(X_no_noise) == 0:
        print("Pas assez de clusters pour calculer les scores.")
        continue

    # Score on a random subsample to keep the metrics affordable.
    sample_size = min(max_sample_size, len(X_no_noise))
    sample_idx = rng.choice(len(X_no_noise), sample_size, replace=False)
    X_sample = X_no_noise[sample_idx]
    labels_sample = labels_no_noise[sample_idx]

    # The metrics need at least two distinct labels; a subsample can miss the
    # small clusters entirely.
    if len(np.unique(labels_sample)) < 2:
        print("Pas assez de clusters pour calculer les scores.")
        continue

    sil_score = silhouette_score(X_sample, labels_sample)
    ch_score = calinski_harabasz_score(X_sample, labels_sample)
    db_score = davies_bouldin_score(X_sample, labels_sample)

    print(f"  Silhouette Score    : {sil_score:.3f}")
    print(f"  Calinski-Harabasz   : {ch_score:.3f}")
    print(f"  Davies-Bouldin      : {db_score:.3f}")

    # Keep the radius with the highest silhouette score.
    if sil_score > best_score:
        best_score = sil_score
        best_eps = eps

## Modèle final

In [None]:
# from sklearn.cluster import HDBSCAN
#
# for mcs in [10000, 50000, 10000]:
#     hdb = HDBSCAN(min_cluster_size=mcs, min_samples=None)
#     labels = hdb.fit_predict(df_pca)
#     n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
#     noise_pct = (list(labels).count(-1) / len(labels)) * 100
#     print(f"mcs={mcs} -> clusters: {n_clusters}, bruit: {noise_pct:.3f}%")

In [None]:
# best_eps stays None when no candidate radius produced at least two clusters;
# stop here with an explicit message instead of handing None to DBSCAN.
if best_eps is None:
    raise ValueError("Aucun eps valide n'a été sélectionné : impossible d'entraîner le DBSCAN final.")

# Final clustering with the selected radius; label -1 marks noise points.
dbscan = DBSCAN(eps=best_eps, min_samples=min_samples)
clusters = dbscan.fit_predict(df_pca)

## Visualisation des clusters

In [None]:
# PCA coordinates together with the DBSCAN label of every point.
df_plot = pd.DataFrame({
    'PC1': df_pca[:, 0],
    'PC2': df_pca[:, 1],
    'PC3': df_pca[:, 2],
    'cluster': clusters
})

# Legend entry per point: noise (label -1) gets the 'grey' entry, every other
# point the name of its cluster.
df_plot['color'] = df_plot['cluster'].apply(lambda x: 'grey' if x == -1 else f'cluster {x}')

# The values of the colour column are category labels, not colours: without an
# explicit mapping plotly gives 'grey' an arbitrary palette colour. Map it to
# actual grey; the remaining clusters keep the default palette.
fig = px.scatter_3d(df_plot, x='PC1', y='PC2', z='PC3',
                    color='color',
                    color_discrete_map={'grey': 'grey'},
                    title='Clusters DBSCAN',
                    opacity=0.7,
                    size_max=5,
                    labels={'color': 'Cluster'})

fig.show()

## Evaluation

In [None]:
# mask = clusters != -1
#
# print(f"Silhouette Score    : {silhouette_score(df_pca[mask], clusters[mask]):.3f}")
# print(f"Calinski-Harabasz   : {calinski_harabasz_score(df_pca[mask], clusters[mask]):.3f}")
# print(f"Davies-Bouldin      : {davies_bouldin_score(df_pca[mask], clusters[mask]):.3f}")