# Projet de Clustering Complet

Ce notebook simule un jeu de données, prétraite les données, applique DBSCAN et K‑Means, puis visualise et résume les résultats.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# 1. Simulate the dataset
n_rows = 13000
# Legacy global seed: every draw below consumes the same NumPy random stream,
# so the order of the np.random calls determines the generated values.
np.random.seed(42)


def _normal_columns(prefix, n_cols, **normal_kwargs):
    """Build {'<prefix>_<i>': normal draws of length n_rows} for i = 1..n_cols, drawn in order."""
    return {
        f'{prefix}_{idx}': np.random.normal(size=n_rows, **normal_kwargs)
        for idx in range(1, n_cols + 1)
    }


# Categorical columns: each row gets a label sampled with replacement
fund_types = ['Equity', 'FixedIncome', 'Mixed']
funds = ['Fund_' + str(idx) for idx in range(1, 21)]
sub_funds = ['SubFund_' + str(idx) for idx in range(1, 101)]
fund_type_col = np.random.choice(fund_types, size=n_rows)
fund_col = np.random.choice(funds, size=n_rows)
sub_fund_col = np.random.choice(sub_funds, size=n_rows)

# NAV is log-normal (scaled by 1e6); AUM is NAV times a uniform factor in [0.8, 1.2)
NAV = 1e6 * np.random.lognormal(mean=2, sigma=0.5, size=n_rows)
aum_to_nav = np.random.uniform(0.8, 1.2, size=n_rows)
AUM = NAV * aum_to_nav

# 50 long-position columns (123_*), 50 short-position columns (124_*),
# then 35 standard-normal feature columns (feat_*)
long_cols = _normal_columns('123', 50, loc=1e5, scale=5e4)
short_cols = _normal_columns('124', 50, loc=5e4, scale=2e4)
feat_cols = _normal_columns('feat', 35)

# Column order: identifiers, NAV/AUM, long positions, short positions, features
base_cols = {
    'fund_type': fund_type_col,
    'fund': fund_col,
    'sub_fund': sub_fund_col,
    'NAV': NAV,
    'AUM': AUM,
}
df = pd.DataFrame({**base_cols, **long_cols, **short_cols, **feat_cols})

df.head()

In [None]:
# 2. Prepare the feature matrix for clustering
# Columns that must never be used as features: the three categorical
# identifiers, plus 'cluster_km', which the K-Means summary cell writes back
# onto df. Without excluding it, re-running this cell after the summary cell
# would feed the cluster labels into the scaler and the models.
non_feature_cols = {'fund_type', 'fund', 'sub_fund', 'cluster_km'}
num_cols = [c for c in df.columns if c not in non_feature_cols]
X = df[num_cols].to_numpy()

# Standardize every feature (zero mean, unit variance) so that large-scale
# columns such as NAV/AUM do not dominate the distance computations.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
# 3. Dimensionality reduction ahead of DBSCAN
# Project the standardized features onto the leading principal components.
N_COMPONENTS_DBSCAN = 10
pca_10 = PCA(
    n_components=N_COMPONENTS_DBSCAN,
    random_state=42,
)
X_pca_10 = pca_10.fit_transform(X_scaled)


In [None]:
# 4. DBSCAN clustering
# A fixed eps=0.5 is much smaller than typical neighbour distances in the
# 10-dimensional space of standardized PCA scores, which leaves (nearly) every
# point labelled as noise (-1). eps is therefore derived from the data with the
# k-distance heuristic: take the distance from each point to its
# min_samples-th nearest neighbour and use a high quantile of that distribution.
min_samples = 5
k_distance_quantile = 0.90

# Querying with the training data returns each point as its own first
# neighbour (distance 0), which matches DBSCAN's min_samples convention of
# counting the point itself.
neighbors = NearestNeighbors(n_neighbors=min_samples).fit(X_pca_10)
k_distances = neighbors.kneighbors(X_pca_10)[0][:, -1]
eps = float(np.quantile(k_distances, k_distance_quantile))
print(f"DBSCAN eps (quantile {k_distance_quantile:.2f} des k-distances) : {eps:.3f}")

dbscan = DBSCAN(eps=eps, min_samples=min_samples)
labels_db = dbscan.fit_predict(X_pca_10)

# Cluster sizes; label -1 is DBSCAN's noise bucket. tolist() converts to plain
# Python ints so the dict prints without NumPy scalar wrappers.
unique, counts = np.unique(labels_db, return_counts=True)
print("DBSCAN clusters et tailles :", dict(zip(unique.tolist(), counts.tolist())))


In [None]:
# 2-D PCA view of the DBSCAN clusters
# A separate 2-component projection of the standardized features is used only
# for plotting; points are coloured by their DBSCAN label.
pca_2 = PCA(n_components=2, random_state=42)
X_pca_2 = pca_2.fit_transform(X_scaled)

fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_pca_2[:, 0], X_pca_2[:, 1], c=labels_db, s=1)
ax.set_title('Clusters DBSCAN (2 premières composantes PCA)')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()


In [None]:
# 5. K-Means clustering
k = 5
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4 (with a FutureWarning in 1.2/1.3), so relying on the
# default makes the labels and the silhouette score depend on the installed
# version.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
labels_km = kmeans.fit_predict(X_scaled)

# Silhouette is computed on the same standardized space used for fitting.
silhouette_km = silhouette_score(X_scaled, labels_km)
print("Silhouette score K‑Means :", silhouette_km)


In [None]:
# Scatter of the 2-component PCA projection, coloured by K-Means label
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(X_pca_2[:, 0], X_pca_2[:, 1], c=labels_km, s=1)
ax.set_title('Clusters K‑Means (2 premières composantes PCA)')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
plt.show()


In [None]:
# 6. Per-cluster K-Means summary of NAV and AUM
# Attach the K-Means labels to the frame, then average NAV and AUM within each
# cluster; as_index=False keeps 'cluster_km' as a regular column.
df['cluster_km'] = labels_km
summary_cols = ['NAV', 'AUM']
cluster_summary = df.groupby('cluster_km', as_index=False)[summary_cols].mean()
cluster_summary