In [1]:
import numpy as np
from cluster_algorithms import base_kmeans
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.spatial import Voronoi, voronoi_plot_2d
import time

In [2]:
data_files_path = '../data_files/data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM2.bkg.VProbes_EGAM7.GRL_v97/'
file_name       = 'data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM2.bkg.VProbes_EGAM7.GRL_v97_et0_eta0.npz'

plots_path      = '../clustering_plots/'
my_seed         = 13

In [3]:
def add_subplot_axes(ax,rect,axisbg='w'):
    fig = plt.gcf()
    box = ax.get_position()
    width = box.width
    height = box.height
    inax_position  = ax.transAxes.transform(rect[0:2])
    transFigure = fig.transFigure.inverted()
    infig_position = transFigure.transform(inax_position)    
    x = infig_position[0]
    y = infig_position[1]
    width *= rect[2]
    height *= rect[3]  # <= Typo was here
    subax = fig.add_axes([x,y,width,height],facecolor=axisbg)
    x_labelsize = subax.get_xticklabels()[0].get_size()
    y_labelsize = subax.get_yticklabels()[0].get_size()
    x_labelsize = rect[2]*0.5
    y_labelsize = rect[3]*0.5
    #subax.xaxis.set_tick_params(labelsize=x_labelsize)
    #subax.yaxis.set_tick_params(labelsize=y_labelsize)
    return subax

def plot_div_evo(al_object, breg_div, tag, path=plots_path):
    plt.figure(figsize=(10,8))    
    ax = plt.gca()
    ax.plot(range(al_object.get_last_iter()), al_object.get_sum_total_div(), '--o', c='g')
    ax.set_title('Total sum of the %s divergence' %(breg_div), fontsize=18)
    ax.set_ylabel(r'$D_{\phi}[C: D]$', fontsize=10)
    ax.set_xlabel(r'Iteractions', fontsize=10)
    ax.set_xticks(np.arange(1, al_object.get_last_iter()+ 1))
    plt.grid()
    ax2 = add_subplot_axes(ax, rect=[.3, .3, .6, .6])
    ax2.plot(range(al_object.get_last_iter()), al_object.get_sum_total_div(), '--o', c='g')
    ax2.set_ylabel(r'$D_{\phi}[C: D]$', fontsize=15)
    ax2.set_xlabel(r'Iteractions', fontsize=15)
    #ax2.set_xticks(np.arange(1, al_object.get_last_iter()+ 1))
    ax2.set_xlim([0, 8])
    ax2.grid()
    plt.savefig(path+'sum_total_divergence_ev_'+tag, dpi=100)
    plt.close()

def plot_voronoi2D_diagram(al_object, X, classes, divergence, tag, path=plots_path):
    
    centers = al_object.get_centroids()
    # Get the Voronoi diagrams
    vor = Voronoi(centers)
    ax_lim = [np.min(X, axis=0), np.max(X, axis=0)]
    fig, axes = plt.subplots(1, 1, figsize=(10,8))
    # Draw data using target to colorize them
    dict_label = {
        0 : ('red','Background'),
        1 : ('blue','Signal')
    }
    for i in np.unique(classes):
        axes.scatter(X[classes==i, 0], X[classes==i, 1], c=dict_label[i][0],
                     edgecolor='k', s=35, alpha=.5, label=dict_label[i][1])
    # Draw the centroids
    axes.plot(centers[:,0], centers[:,1], '^', c='black', markersize=15, label='Final Centroids')
    # Draw voronoi
    voronoi_plot_2d(vor, ax=axes, line_colors='darkorange', line_width=3, show_points=False, show_vertices=True)
    plt.title('Obtained Clusters for %s divergence' %(divergence), fontsize=18)
    plt.grid()
    plt.legend(loc='best', fontsize='x-large')
    plt.xlim([ax_lim[0][0], ax_lim[1][0]])
    plt.ylim([ax_lim[0][1], ax_lim[1][1]])
    plt.xticks(fontsize=13)
    plt.yticks(fontsize=13)
    plt.xlabel(r'$\langle\mu\rangle$', fontsize=15)
    plt.ylabel(r'$E_T$', fontsize=13)
    plt.savefig(path+'voronoi_diagram_'+tag, dpi=100)
    plt.close()

In [4]:
jpsi_data = dict(np.load(data_files_path+file_name))
jpsi_data.keys()

dict_keys(['features', 'etBins', 'etaBins', 'etBinIdx', 'etaBinIdx', 'data', 'target'])

In [5]:
list_of_features = list(jpsi_data['features'])
print(list_of_features)

['avgmu', 'L2Calo_ring_0', 'L2Calo_ring_1', 'L2Calo_ring_2', 'L2Calo_ring_3', 'L2Calo_ring_4', 'L2Calo_ring_5', 'L2Calo_ring_6', 'L2Calo_ring_7', 'L2Calo_ring_8', 'L2Calo_ring_9', 'L2Calo_ring_10', 'L2Calo_ring_11', 'L2Calo_ring_12', 'L2Calo_ring_13', 'L2Calo_ring_14', 'L2Calo_ring_15', 'L2Calo_ring_16', 'L2Calo_ring_17', 'L2Calo_ring_18', 'L2Calo_ring_19', 'L2Calo_ring_20', 'L2Calo_ring_21', 'L2Calo_ring_22', 'L2Calo_ring_23', 'L2Calo_ring_24', 'L2Calo_ring_25', 'L2Calo_ring_26', 'L2Calo_ring_27', 'L2Calo_ring_28', 'L2Calo_ring_29', 'L2Calo_ring_30', 'L2Calo_ring_31', 'L2Calo_ring_32', 'L2Calo_ring_33', 'L2Calo_ring_34', 'L2Calo_ring_35', 'L2Calo_ring_36', 'L2Calo_ring_37', 'L2Calo_ring_38', 'L2Calo_ring_39', 'L2Calo_ring_40', 'L2Calo_ring_41', 'L2Calo_ring_42', 'L2Calo_ring_43', 'L2Calo_ring_44', 'L2Calo_ring_45', 'L2Calo_ring_46', 'L2Calo_ring_47', 'L2Calo_ring_48', 'L2Calo_ring_49', 'L2Calo_ring_50', 'L2Calo_ring_51', 'L2Calo_ring_52', 'L2Calo_ring_53', 'L2Calo_ring_54', 'L2Calo_ri

In [6]:
var_indexes = [list_of_features.index('avgmu'),
               list_of_features.index('L2Calo_et'),]

In [7]:
data_      = jpsi_data['data'][:, var_indexes]
my_filter  = (data_[:,0] <= 80)
sgn_filter = jpsi_data['target'][my_filter]==1
bkg_filter = jpsi_data['target'][my_filter]==0
data_      = data_[my_filter,:]
y          = jpsi_data['target'][my_filter]
print(data_.shape)

(233397, 2)


In [8]:
sgn_choices_filter = np.random.choice(data_[sgn_filter].shape[0], size=800)
bkg_choices_filter = np.random.choice(data_[bkg_filter].shape[0], size=800)
choices_filter     = np.concatenate((sgn_choices_filter,bkg_choices_filter))

In [9]:
data_ = data_[choices_filter]
y     = jpsi_data['target'][choices_filter]
print(data_.shape)

(1600, 2)


In [10]:
GeV = 1e3
epsilon = 1e-1

In [11]:
data_[:, 1] = data_[:, 1]/GeV
#data_[data_[:,0] == 0, 0] = data_[data_[:,0] == 0, 0] + epsilon

In [12]:
n_clusters = [3, 4, 5]
n_folds    = 10
divs       = ['euclidean', 'exp', 'itakura-saito', 'gen_kl', 'gen_kls', 'gen_js']

In [13]:
cluster_measures = {
    'silhouette_score'        : silhouette_score,
    'davies_bouldin_score'    : davies_bouldin_score,
    'calinski_harabasz_score' : calinski_harabasz_score
}

In [14]:
kf = KFold(n_splits=n_folds, random_state=13)

In [15]:
CVO = list(kf.split(data_))

In [16]:
cv_dict = {}

In [17]:
for idiv in divs:
    cv_dict[idiv] = {}
    for idx, ifold in enumerate(CVO):
        trn_id, tst_id = ifold
        scaler         = MinMaxScaler(feature_range=(epsilon, 1))
        scaler.fit(data_[trn_id])
        norm_data      = scaler.transform(data_)
        cv_dict[idiv][idx] = {}
        for icluster in n_clusters:
            #print('Clustering with %i clusters using %s divergence in %i Fold...' %(icluster, idiv, idx))
            cv_dict[idiv][idx][icluster] = {}
            kmeans = base_kmeans(n_clusters=icluster)
            kmeans.fit(norm_data, n_iter=50, tol=1e-3, breg_div=idiv)
            plot_div_evo(kmeans, breg_div=idiv, tag='%s_%i_fold_%i_cluster' %(idiv, idx, icluster))
            plot_voronoi2D_diagram(kmeans, X=norm_data, classes=y, divergence=idiv,
                                   tag='%s_%i_fold_%i_cluster' %(idiv, idx, icluster))
            predicted_labels = kmeans.predict_cluster(norm_data[tst_id])
            for imeasure in cluster_measures.keys():
                cv_dict[idiv][idx][icluster][imeasure] = cluster_measures[imeasure](norm_data[tst_id],
                                                                                    predicted_labels)
            

The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion criteria was reached... Stopping!
The conversion crite

In [18]:
info_cluster_dict = {
    'bregman_divergence'      : [],
    'n_cluster'               : [],
    'silhouette_score'        : [],
    'davies_bouldin_score'    : [],
    'calinski_harabasz_score' : [],
}


In [19]:
for idiv in cv_dict.keys():
    for ifold in cv_dict[idiv].keys():
        for icluster in cv_dict[idiv][ifold].keys():
            info_cluster_dict['bregman_divergence'].append(idiv)
            info_cluster_dict['n_cluster'].append(icluster)
            for jmeasure in cluster_measures.keys():
                info_cluster_dict[jmeasure].append(cv_dict[idiv][ifold][icluster][jmeasure])

In [20]:
import pandas as pd

In [21]:
clus_df = pd.DataFrame(info_cluster_dict)

In [22]:
my_measure = list(cluster_measures.keys())

In [23]:
clus_df.head()

Unnamed: 0,bregman_divergence,n_cluster,silhouette_score,davies_bouldin_score,calinski_harabasz_score
0,euclidean,3,0.345203,0.93193,112.62897
1,euclidean,4,0.316847,1.031959,110.99183
2,euclidean,5,0.347138,0.828939,124.870509
3,euclidean,3,0.364067,0.803067,119.916896
4,euclidean,4,0.342222,0.911218,121.725491


In [24]:
cv_table = clus_df.groupby(['bregman_divergence', 'n_cluster'])[my_measure].agg(['mean', 'std'])

In [25]:
cv_table

Unnamed: 0_level_0,Unnamed: 1_level_0,silhouette_score,silhouette_score,davies_bouldin_score,davies_bouldin_score,calinski_harabasz_score,calinski_harabasz_score
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std
bregman_divergence,n_cluster,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
euclidean,3,0.36336,0.023375,0.899728,0.059736,120.668823,12.720407
euclidean,4,0.321915,0.035726,0.955676,0.056967,113.563773,9.683367
euclidean,5,0.334202,0.014201,0.921786,0.0758,116.645391,17.751143
exp,3,0.363984,0.024711,0.894871,0.058142,119.74404,15.017527
exp,4,0.319762,0.020256,0.952975,0.071213,107.420983,21.439648
exp,5,0.319289,0.017826,0.913565,0.063124,110.363211,19.13278
gen_js,3,0.361253,0.021104,0.920596,0.063394,119.200341,9.719072
gen_js,4,0.314517,0.02499,0.962027,0.086403,109.638078,13.872358
gen_js,5,0.30383,0.022487,0.990487,0.07759,104.886211,17.391176
gen_kl,3,0.358561,0.021804,0.913772,0.05019,117.725168,9.799305


* As melhores divergências foram a Euclidiana e a Exponencial;
* Itakura-saito obteve os piores resultados em todas os índices;

In [27]:
cv_table.to_excel('../data_files/clusterization_table.xlsx')