-
Notifications
You must be signed in to change notification settings - Fork 0
/
model_non_sup.py
263 lines (200 loc) · 10.7 KB
/
model_non_sup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
from typing import List, Tuple
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Speed sklearn lib
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.cluster import MeanShift
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.mixture import BayesianGaussianMixture
# from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
def fonction_dist(X: List[float], Y: List[float]) -> List[float]:
# Calculate the distance between each point and the center of its cluster
X_ar = np.array(X)
dist = [np.linalg.norm(x - y) for x, y in zip(X_ar, Y)]
return dist
def n_cluster_KM(X: List[float], deb: int, fin: int) -> int:
print("KMeans\n")
Sum_of_squared_distances = []
K = range(deb, fin)
cluster_KM_df = pd.DataFrame(columns=['n_clusters', 'silhouette_score'])
for k in K:
KM = KMeans(n_clusters=k)
cluster_labels = KM.fit_predict(X)
Sum_of_squared_distances.append(KM.inertia_)
silhouette_avg = silhouette_score(X, cluster_labels)
print("For n_clusters =", k,
"The average silhouette_score is :", silhouette_avg)
cluster_KM_df = cluster_KM_df.append({'n_clusters': k, 'silhouette_score': silhouette_avg}, ignore_index=True)
sil_max = cluster_KM_df.silhouette_score.max()
n_clusters_algo = int(cluster_KM_df.loc[cluster_KM_df['silhouette_score'] == sil_max, 'n_clusters'])
print("Optimal clusters number : {}".format(int(n_clusters_algo)))
return n_clusters_algo
def predictions_tous_les_clusters_separe_KM(X: List[float], pourc: float, centers_KM: np.ndarray,
y_pred_KM: np.ndarray) -> pd.DataFrame:
dist = fonction_dist(X, centers_KM[y_pred_KM])
KM_y_pred2 = pd.DataFrame({'centers': centers_KM[y_pred_KM][:, 0], 'dist': dist})
KM_y_pred2[KM_y_pred2.dist >= KM_y_pred2.groupby('centers').dist.transform('quantile', pourc)] = 1
KM_y_pred2[KM_y_pred2.dist != 1] = 0
return KM_y_pred2
def Kmeans(X: List[float], deb: int, fin: int) -> Tuple[np.ndarray, int, float, str, np.ndarray, float, float]:
n_clusters_algo = n_cluster_KM(X, deb, fin)
KM = KMeans(n_clusters=n_clusters_algo)
KM = KM.fit(X)
y_pred_KM = KM.predict(X)
centers_KM = KM.cluster_centers_
sil_max = silhouette_score(X, y_pred_KM)
calin_max = calinski_harabasz_score(X, y_pred_KM)
davies_max = davies_bouldin_score(X, y_pred_KM)
print("KMeans done")
name = 'KMeans'
return y_pred_KM, n_clusters_algo, sil_max, name, centers_KM, calin_max, davies_max
def predictions_tous_les_clusters_separe_MS(X: List[float], pourc: float, cluster_centers_MS: np.ndarray,
y_pred_MS: np.ndarray) -> pd.DataFrame:
dist = fonction_dist(X, cluster_centers_MS[y_pred_MS])
MS_y_pred_df = pd.DataFrame({'centers': cluster_centers_MS[y_pred_MS][:, 0], 'dist': dist})
MS_y_pred_df[MS_y_pred_df.dist >= MS_y_pred_df.groupby('centers').dist.transform('quantile', pourc)] = 1
MS_y_pred_df[MS_y_pred_df.dist != 1] = 0
return MS_y_pred_df
def Meanshift(X: List[float]) -> Tuple[np.ndarray, int, float, str, np.ndarray, float, float]:
print("Mean Shift\n")
# we use the function of sklearn to find the hyperparameter of the MeanShift
from sklearn.cluster import estimate_bandwidth
nb_bandwidth = estimate_bandwidth(X)
print(nb_bandwidth)
"""if nb_bandwidth < 2.0:
ms = MeanShift(bandwidth = 2.0, n_jobs=-1)
else:
ms = MeanShift(bandwidth = nb_bandwidth, n_jobs=-1)"""
ms = MeanShift(bandwidth=nb_bandwidth, n_jobs=-1)
ms.fit(X)
y_pred_MS = ms.predict(X)
cluster_centers_MS = ms.cluster_centers_
sil_max = silhouette_score(X, y_pred_MS)
nb_cluster_algo = int(len(cluster_centers_MS))
calin_max = calinski_harabasz_score(X, y_pred_MS)
davies_max = davies_bouldin_score(X, y_pred_MS)
print("MeanShift done")
name = 'MShift'
return y_pred_MS, nb_cluster_algo, sil_max, name, cluster_centers_MS, calin_max, davies_max
def nb_cluster_MG(X: List[float], deb: int, fin: int) -> int:
K = range(deb, fin)
cluster_MG_df = pd.DataFrame(columns=['n_clusters', 'silhouette_score'])
for k in K:
MG = GaussianMixture(n_components=k)
MG.fit(X)
cluster_labels_MG = MG.predict(X)
bic = MG.bic(X)
silhouette_avg = silhouette_score(X, cluster_labels_MG)
print("For n_clusters =", k,
"The average silhouette_score is :", silhouette_avg,
"The BIC is :", bic)
cluster_MG_df = cluster_MG_df.append({'n_clusters': k, 'silhouette_score': silhouette_avg, 'bic': bic},
ignore_index=True)
cluster_MG_df['bic_gradient'] = np.gradient(cluster_MG_df.bic)
# Difference between i and i+1
for i in range(0, (len(cluster_MG_df) - 1)):
cluster_MG_df.loc[i + 1, 'diff_bic_gradient'] = (cluster_MG_df.loc[(i + 1), 'bic_gradient']) - (
cluster_MG_df.loc[(i), 'bic_gradient'])
# Absolute value and replace NaN by 0
cluster_MG_df.diff_bic_gradient = abs(cluster_MG_df.diff_bic_gradient)
cluster_MG_df = cluster_MG_df.fillna(0)
# we look for the position of each different nb_cluster according to sil_score and gradient of the bic
cluster_MG_df = cluster_MG_df.sort_values('silhouette_score')
cluster_MG_df = cluster_MG_df.reset_index()
cluster_MG_df['sil_pos'] = (cluster_MG_df.index + 1)
cluster_MG_df = cluster_MG_df.sort_values('bic', ascending=False)
cluster_MG_df = cluster_MG_df.reset_index()
cluster_MG_df['bic_pos'] = (cluster_MG_df.index + 1)
cluster_MG_df = cluster_MG_df.drop(columns=['level_0', 'index'])
cluster_MG_df = cluster_MG_df.sort_values('diff_bic_gradient')
cluster_MG_df = cluster_MG_df.reset_index()
cluster_MG_df['diff_bic_gradient_pos'] = (cluster_MG_df.index + 1)
cluster_MG_df = cluster_MG_df.drop(columns=['index'])
cluster_MG_df = cluster_MG_df.sort_values('n_clusters')
for index, row in cluster_MG_df.iterrows():
cluster_MG_df.loc[index, 'comb_lin'] = row['sil_pos'] + row['diff_bic_gradient_pos']
# we take n_clusters max if there is an equality of points
a = cluster_MG_df[cluster_MG_df.comb_lin == np.max(cluster_MG_df.comb_lin)]
comb_lin_max = a[a.n_clusters == np.max(a.n_clusters)].index.values
n_clusters_algo = int(cluster_MG_df.loc[comb_lin_max, 'n_clusters'])
print("Number of clusters : {}".format(int(n_clusters_algo)))
return n_clusters_algo
def prediction_tous_les_clusters_separes_MG(pourc: float, y_pred_MG: np.ndarray, Y_score_MG: np.ndarray) -> pd.DataFrame:
# we put in a dataframe the cluster and its probability for each point
y_pred_MG_df = pd.DataFrame({'cluster': y_pred_MG, 'proba': -Y_score_MG})
y_pred_MG_df[y_pred_MG_df.proba > y_pred_MG_df.groupby('cluster').proba.transform('quantile', pourc)] = 1
y_pred_MG_df[y_pred_MG_df.proba != 1] = 0
return y_pred_MG_df
def Mixturegaussian(X: List[float], deb: int, fin: int) -> Tuple[np.ndarray, int, float, str, np.ndarray, float, float]:
print("Mixture Gaussian\n")
# we look for the optimal number of clusters according to the score silhouette and the gradient of the bic
n_clusters_algo = nb_cluster_MG(X, deb, fin)
MG = GaussianMixture(n_components=int(n_clusters_algo))
MG.fit(X)
y_pred_MG = MG.predict(X)
Y_score_MG = MG.score_samples(X)
sil_max = silhouette_score(X, y_pred_MG)
calin_max = calinski_harabasz_score(X, y_pred_MG)
davies_max = davies_bouldin_score(X, y_pred_MG)
print("Mixture Gaussian done")
name = 'MixtureGaussian'
return y_pred_MG, n_clusters_algo, sil_max, name, Y_score_MG, calin_max, davies_max
def nb_clusters_BMG(X: List[float]) -> pd.Series:
K: List[float] = [0.0001, 0.001, 0.01, 0.1, 1, 10]
cluster_BMG_df = pd.DataFrame(columns=['weight_concentration_prior', 'silhouette_score'])
for k in K:
BMG = BayesianGaussianMixture(n_components=50, weight_concentration_prior=k, max_iter=200)
BMG.fit(X)
cluster_labels_BMG = BMG.predict(X)
silhouette_avg = silhouette_score(X, cluster_labels_BMG)
print("For weight_concentration_prior =", k,
"The average silhouette_score is :", silhouette_avg)
cluster_BMG_df = cluster_BMG_df.append({'weight_concentration_prior': k, 'silhouette_score': silhouette_avg},
ignore_index=True)
# Display of the curve
fig, ax = plt.subplots()
ax.plot(cluster_BMG_df.weight_concentration_prior, cluster_BMG_df.silhouette_score, 'bx-')
ax.set_title('silhouette')
ax.set(xlabel='k', ylabel='silhouette_score')
# we look for the number of clusters corresponding to the max value of silhouette score
sil_max = cluster_BMG_df.silhouette_score.max()
n_weight_concentration_prior = cluster_BMG_df.loc[
cluster_BMG_df['silhouette_score'] == sil_max, 'weight_concentration_prior']
return n_weight_concentration_prior
def prediction_tous_les_clusters_separes_BMG(pourc: float, y_labels_BGM: np.ndarray,
y_score_BGM: np.ndarray) -> pd.DataFrame:
# we put the probabilities and the cluster corresponding to each point in a dataframe
y_pred_BGM_df_score = pd.DataFrame({'cluster': y_labels_BGM, 'proba': y_score_BGM})
y_pred_BGM_df_score[
y_pred_BGM_df_score.proba >= y_pred_BGM_df_score.groupby('cluster').proba.transform('quantile', pourc)] = 1
y_pred_BGM_df_score[y_pred_BGM_df_score.proba != 1] = 0
return y_pred_BGM_df_score
def Bayesian_mixture_gaussian(X: List[float]) -> Tuple[np.ndarray, int, float, str, np.ndarray, float, float]:
print("Bayesian Mixture Gaussian\n")
n_weight_concentration_prior = nb_clusters_BMG(X)
BGM = BayesianGaussianMixture(n_components=50, weight_concentration_prior=float(n_weight_concentration_prior))
BGM.fit(X)
y_score_BGM = BGM.score_samples(X)
# we determine the clusters of each transaction
y_labels_BGM = BGM.predict(X)
# we change the value of the labels so that they follow each other
nouveaux_labels = 0
unique_labels = np.unique(y_labels_BGM)
for i in unique_labels:
y_labels_BGM[y_labels_BGM == i] = nouveaux_labels
nouveaux_labels += 1
sil_max = silhouette_score(X, y_labels_BGM)
n_cluster_algo = len(np.unique(y_labels_BGM))
calin_max = calinski_harabasz_score(X, y_labels_BGM)
davies_max = davies_bouldin_score(X, y_labels_BGM)
print("Bayesian Mixture Gaussian done")
name = 'BayesianMixtureGaussian'
return y_labels_BGM, n_cluster_algo, sil_max, name, y_score_BGM, calin_max, davies_max
def dbscan():
pass