In [None]:
import numpy as np
from cluster_algorithms import base_kmeans
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from scipy.spatial import Voronoi, voronoi_plot_2d
import time

In [None]:
# Location of the input sample: directory first, then the .npz file inside it.
# The two strings are concatenated as-is when loading, so the directory must
# keep its trailing slash.
data_files_path = '../data_files/data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM2.bkg.VProbes_EGAM7.GRL_v97/'
file_name = 'data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM2.bkg.VProbes_EGAM7.GRL_v97_et0_eta0.npz'

# Directory for clustering figures (presumably the save target; confirm usage).
plots_path = '../clustering_plot/'

# Seed value intended to make the stochastic steps of the notebook repeatable.
my_seed = 13

In [None]:
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file handle open. Copy every array into a plain dict inside a `with` block so
# the handle is released as soon as the data is in memory.
with np.load(data_files_path + file_name) as npz_file:
    jpsi_data = dict(npz_file)

# Show which arrays the archive provides.
jpsi_data.keys()

In [None]:
# Turn the 'features' entry into a plain Python list so that column positions
# can be looked up by name with list.index().
list_of_features = list(jpsi_data['features'])
print(list_of_features)

In [None]:
# Column positions of the two variables used as clustering coordinates,
# in the order (avgmu, L2Calo_et).
var_indexes = [list_of_features.index(feature_name)
               for feature_name in ('avgmu', 'L2Calo_et')]

In [None]:
# Keep only the two selected columns; column 0 is avgmu, column 1 is L2Calo_et
# (the order fixed by var_indexes).
data_ = jpsi_data['data'][:, var_indexes]

# Range cut on the first column.
my_filter = data_[:, 0] <= 80

# Class masks are built on the target *after* the cut, so they line up
# row-for-row with the filtered data below.
target_after_cut = jpsi_data['target'][my_filter]
sgn_filter = target_after_cut == 1
bkg_filter = target_after_cut == 0

data_ = data_[my_filter]
print(data_.shape)

In [None]:
# Draw 400 signal and 400 background rows (with replacement, the default of
# `choice`) from the filtered data.
#
# The draw is made over the row positions where each class mask is True, so the
# resulting indices address rows of `data_` directly. Drawing from
# range(n_class_rows) instead would yield positions inside the per-class
# subsets, which select arbitrary rows once applied to the full `data_`.
#
# A generator seeded with `my_seed` makes the sample reproducible across runs.
sampling_rng = np.random.RandomState(my_seed)
sgn_choices_filter = sampling_rng.choice(np.flatnonzero(sgn_filter), size=400)
bkg_choices_filter = sampling_rng.choice(np.flatnonzero(bkg_filter), size=400)
choices_filter = np.concatenate((sgn_choices_filter, bkg_choices_filter))

In [None]:
# `choices_filter` holds row positions of the already-filtered `data_`, so the
# labels have to come from the target array after the same `my_filter` cut.
# Indexing the unfiltered target would pair each point with another event's label.
y = jpsi_data['target'][my_filter][choices_filter]

# NOTE: this overwrites `data_` with the sampled rows, so the cell is not
# idempotent; re-run the filtering cell before running it again.
data_ = data_[choices_filter]
print(data_.shape)

In [None]:
# Divisor applied to the L2Calo_et column (presumably an MeV -> GeV conversion;
# confirm the units of the input file).
GeV = 1000.0

# Small offset added to avgmu entries that are exactly zero.
epsilon = 0.01

In [None]:
# Rescale the second column (L2Calo_et) by GeV.
# NOTE: this modifies `data_` in place, so re-running the cell rescales again.
data_[:, 1] = data_[:, 1] / GeV

# Rows whose first column (avgmu) is exactly zero are shifted by epsilon
# (presumably to keep the values strictly positive; confirm the motivation).
zero_mu_rows = data_[:, 0] == 0
data_[zero_mu_rows, 0] = data_[zero_mu_rows, 0] + epsilon

In [None]:
# Build the clusterer with 5 clusters. The seed comes from the notebook-level
# `my_seed` rather than a repeated literal, so changing the seed in the
# configuration cell changes it here as well.
kmeans = base_kmeans(n_clusters=5, seed=my_seed)

In [None]:
%%time
# Fit the clusterer on the sampled two-column data with breg_div='euclidean'.
# n_iter and tol are passed straight to base_kmeans.fit (presumably an iteration
# cap and a convergence tolerance; confirm in cluster_algorithms).
# The %%time cell magic reports how long the cell takes and must stay on the
# first line of the cell.
kmeans.fit(data_, n_iter=50, tol=1e-5, breg_div='euclidean')

In [None]:
# Display the value returned by get_sum_total_div() (presumably one total
# divergence per fit iteration; confirm in cluster_algorithms).
kmeans.get_sum_total_div()

In [None]:
# Convergence curve: the values from get_sum_total_div() plotted against
# range(get_last_iter()). The x-axis label previously read 'Iteractions'.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(range(kmeans.get_last_iter()), kmeans.get_sum_total_div(), '--o', c='g')
ax.set_title('Total sum of the divergences', fontsize=15)
ax.set_ylabel(r'$D_{\phi}[C: D]$', fontsize=13)
ax.set_xlabel('Iterations', fontsize=13)
ax.grid()
plt.show()

In [None]:
# Fetch both centroid sets once instead of calling each getter per coordinate.
first_centroids = kmeans.get_first_centroids()
final_centroids = kmeans.get_centroids()

plt.figure(figsize=(10,8))
# All sampled points in a single colour (this view does not use the target).
plt.plot(data_[:,0], data_[:,1], 'o', label='Data Points')
# Centroids before and after fitting.
plt.plot(first_centroids[:,0], first_centroids[:,1], '*',
         markersize=10, label='Initial Centroids')
plt.plot(final_centroids[:,0], final_centroids[:,1], '^',
         markersize=10, label='Final Centroids')
plt.xlabel(r'$\langle\mu\rangle$', fontsize=13)
plt.ylabel(r'$E_T$', fontsize=13)
plt.legend(loc='best', fontsize='x-large')
plt.show()

In [None]:
# Final centroids as returned by the fitted clusterer.
centers = kmeans.get_centroids()
# Build the Voronoi diagram whose generating points are the centroids.
vor = Voronoi(centers)

In [None]:
# Per-column [minimum, maximum] of the sampled data: ax_lim[0] holds the column
# minima and ax_lim[1] the column maxima.
ax_lim = [data_.min(axis=0), data_.max(axis=0)]

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10,8))

# Data points, coloured by their target label.
axes.scatter(data_[:, 0], data_[:, 1], c=y, cmap='Set1',
             edgecolor='k', s=50, alpha=.95)
# Final centroids on top of the data.
axes.plot(centers[:,0], centers[:,1], '^', c='black', markersize=15, label='Final Centroids')
# Voronoi diagram of the centroids, drawn on the same axes.
voronoi_plot_2d(vor, ax=axes, show_vertices=True)

axes.grid()
axes.legend(loc='best', fontsize='x-large')
# Set the limits to the data range after drawing the Voronoi diagram, so these
# limits are the ones that apply.
axes.set_xlim([ax_lim[0][0], ax_lim[1][0]])
axes.set_ylim([ax_lim[0][1], ax_lim[1][1]])
axes.set_xlabel(r'$\langle\mu\rangle$', fontsize=13)
axes.set_ylabel(r'$E_T$', fontsize=13)
plt.show()