# Information Geometry

Author: Micael Veríssimo de Araújo (micael.verissimo@lps.ufrj.br)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from sklearn.cluster import KMeans

# #------------------------------------------------------------------------------
# # accept a dataframe, remove outliers, return cleaned data in a new dataframe
# # see http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
# #------------------------------------------------------------------------------
# def remove_outlier(df_in, col_name):
#     q1 = df_in[col_name].quantile(0.25)
#     q3 = df_in[col_name].quantile(0.75)
#     iqr = q3-q1 #Interquartile range
#     fence_low  = q1-1.5*iqr
#     fence_high = q3+1.5*iqr
#     df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
#     return df_out

In [None]:
# Dataset tag shared by the data directory name and the data file name.
_dataset_tag = 'data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM2.bkg.VProbes_EGAM7.GRL_v97'

# Directory holding the .npz files and the specific file to analyse
# (the file name carries an '_et0_eta0' suffix after the dataset tag).
data_files_path = '../data_files/' + _dataset_tag + '/'
file_name = _dataset_tag + '_et0_eta0.npz'

# Output directory for plots (not referenced by the cells visible here).
plots_path = '../plots_clusterizacao/'

# Seed passed as random_state to the clustering step.
my_seed = 13

In [None]:
# Read every array stored in the .npz archive into a plain dict.
# np.load returns a lazily-reading NpzFile that keeps the underlying file
# open; dict() materialises all arrays, and the context manager then closes
# the file handle instead of leaking it for the rest of the session.
with np.load(data_files_path + file_name) as npz_file:
    jpsi_data = dict(npz_file)

# Show which arrays the archive provides.
jpsi_data.keys()

As variáveis presentes neste data set são:

In [None]:
# Materialise the 'features' entry as a Python list so that .index() lookups
# by name are available. Presumably these are the column names of
# jpsi_data['data'], since the positions found here are later used to select
# its columns.
list_of_features = [*jpsi_data['features']]
print(list_of_features)

Para o processo de clusterização serão utilizadas $2$ variáveis: $\langle \mu \rangle$ e $E_T$.

In [None]:
# Features used for clustering, in the column order the later cells rely on:
# column 0 -> 'avgmu', column 1 -> 'L2Calo_et'.
selected_features = ('avgmu', 'L2Calo_et')

# Position of each selected feature inside list_of_features
# (list.index raises ValueError if a name is missing).
var_indexes = [list_of_features.index(feature_name)
               for feature_name in selected_features]

# Alternative selections left disabled in the original notebook:
#   + list(range(list_of_features.index('L2Calo_ring_0'), list_of_features.index('L2Calo_ring_99')))
#   list_of_features.index('et')
#   list_of_features.index('phi')
#   + list(range(list_of_features.index('L2Calo_ring_88'), list_of_features.index('L2Calo_ring_99')))

In [None]:
# Show the resolved column positions of the selected features.
print(var_indexes)

In [None]:
# Keep only the selected feature columns (column 0 is 'avgmu').
data_ = jpsi_data['data'][:, var_indexes]

# Row mask: events whose 'avgmu' value (column 0) is at most 60.
mu_filter = data_[:, 0] <= 60
data_ = data_[mu_filter]

# Class masks built from the target of the surviving rows, so they are
# aligned with data_ after the cut. Presumably target == 1 marks signal and
# target == 0 marks background, as the variable names suggest.
kept_target = jpsi_data['target'][mu_filter]
sgn_filter = kept_target == 1
bkg_filter = kept_target == 0

print(data_.shape)

In [None]:
# fig = plt.figure(figsize=(10,8))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(data_[:,0], data_[:,1], data_[:,2], s=10, alpha=0.6, edgecolors='w')
# #ax.scatter(np.sum(bkg_data[:,1:], axis=1), bkg_data[:,0], s=10, alpha=0.6, edgecolors='w')

# #ax.set_xlabel(r'$E_{T_{HAD}}$')
# ax.set_ylabel(r'$E_T$')
# ax.set_zlabel(r'$\eta$')
# ax.set_xlabel(r'$\langle\mu\rangle$')

# plt.show()

In [None]:
# Draw a class-balanced random subsample of data_: up to 300 rows where
# sgn_filter is True followed by up to 300 rows where bkg_filter is True.
n_per_class = 300

# Seeded generator so that re-running the notebook selects the same events.
rng = np.random.RandomState(my_seed)

# sgn_filter / bkg_filter are boolean masks aligned with the rows of data_.
# flatnonzero turns them into the actual row positions of each class; drawing
# from range(number_of_rows_in_class) instead would just pick arbitrary
# leading rows of data_ regardless of their class.
sgn_rows = np.flatnonzero(sgn_filter)
bkg_rows = np.flatnonzero(bkg_filter)

# Sample without replacement to avoid duplicated events; the size is capped
# so a class with fewer than n_per_class rows does not raise.
sgn_choices_filter = rng.choice(sgn_rows,
                                size=min(n_per_class, sgn_rows.size),
                                replace=False)
bkg_choices_filter = rng.choice(bkg_rows,
                                size=min(n_per_class, bkg_rows.size),
                                replace=False)

# Row positions into data_: selected signal rows first, then background rows.
choices_filter = np.concatenate((sgn_choices_filter, bkg_choices_filter))

In [None]:
# Restrict the data to the sampled rows.
data_ = data_[choices_filter, :]

# choices_filter holds row positions of the mu-filtered data_, so the target
# must go through the same mu_filter before being indexed; indexing the
# unfiltered target would pair rows with labels of different events.
y = jpsi_data['target'][mu_filter][choices_filter]

print(data_.shape)

## Clusterização Utilizando Divergências de Bregman

As divergências de Bregman são definidas da seguinte forma:

**Definição** (Bregman, 1967; Censor and Zenios, 1998) Seja $\phi : S \to \mathbb{R}$, $S = \text{dom}(\phi)$ uma função estritamente convexa definida em um conjunto convexo $S \subset \mathbb{R}^d$ tal que $\phi$ é diferenciável em seu interior relativo $(\text{ri}(S))$, assumindo $\text{ri}(S)$ não vazio. A divergência de Bregman $D_{\phi} : S\times \text{ri}(S) \to [0,\infty)$ é definida como:

$$D_{\phi}(x,y) = \phi(x) - \phi(y) - \langle x-y, \nabla\phi(y)\rangle$$

<img src="../misc/bregman_divs_table.png"  style="width: 20cm;"/>

# Usando Ringer

In [None]:
# K-means with 3 clusters on the sampled data.
# - n_jobs is no longer passed: the parameter was deprecated in
#   scikit-learn 0.23 and removed in 1.0, where it raises a TypeError.
# - n_init is pinned to 10 (the historical default) so that the number of
#   restarts does not silently change with the installed scikit-learn version.
# - random_state makes the centroid initialisation reproducible.
km = KMeans(n_clusters=3, n_init=10, random_state=my_seed)
km.fit(data_)

In [None]:
# Coordinates of the fitted cluster centres; the columns follow the feature
# order of data_ (they are plotted below as column 0 vs column 1).
centers = km.cluster_centers_
print(centers)

In [None]:
# fig = plt.figure(figsize=(15,9))
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(centers[:,0], centers[:,1], centers[:,2], s=100, alpha=.9, edgecolors='w')

# ax.set_ylabel(r'$E_{T}$', fontsize=15)
# ax.set_zlabel(r'$\eta$', fontsize=15)
# ax.set_xlabel(r'$\langle\mu\rangle$', fontsize=15)

# plt.show()

In [None]:
# Plot the cluster centres as stars: column 0 on x, column 1 on y.
centers_ax = plt.gca()
centers_ax.plot(centers[:, 0], centers[:, 1], '*')
centers_ax.set_xlabel(r'$\langle\mu\rangle$', fontsize=15)
centers_ax.set_ylabel(r'$E_T$', fontsize=15)
plt.show()

In [None]:
# #this will tell us to which cluster does the data observations belong.
# new_labels = km.labels_
# # Plot the identified clusters and compare with the answers
# # set up a figure twice as wide as it is tall
# fig = plt.figure(figsize=(10,8))

# #===============
# #  First subplot
# #===============
# # set up the axes for the first plot
# ax1 = fig.add_subplot(1, 2, 1, projection='3d')
# #===============
# # Second subplot
# #===============
# # set up the axes for the second plot
# ax2 = fig.add_subplot(1, 2, 2, projection='3d')

# #fig, axes = plt.subplots(1, 2, figsize=(16,8), projection='3d')
# ax1.scatter(data_[:, 0], data_[:, 1], data_[:,2], c=y, cmap='gist_rainbow',
# edgecolor='k', s=50, alpha=.2)
# ax2.scatter(data_[:, 0], data_[:, 1], data_[:,2], c=new_labels, cmap='jet',
# edgecolor='k', s=50, alpha=.2)
# ax1.set_xlabel(r'$\langle\mu\rangle$', fontsize=18)
# ax1.set_ylabel(r'$E_T$', fontsize=18)
# ax1.set_ylabel(r'$\eta$', fontsize=18)
# ax2.set_xlabel(r'$\langle\mu\rangle$', fontsize=18)
# ax2.set_ylabel(r'$E_T$', fontsize=18)
# ax2.set_ylabel(r'$\eta$', fontsize=18)
# ax1.tick_params(direction='in', length=10, width=5, colors='k', labelsize=20)
# ax2.tick_params(direction='in', length=10, width=5, colors='k', labelsize=20)
# ax1.set_title('Actual', fontsize=18)
# ax2.set_title('Predicted', fontsize=18)

In [None]:
# Cluster index assigned by k-means to each row of data_.
new_labels = km.labels_

# Side-by-side comparison: true classes (left) vs. k-means clusters (right).
fig, axes = plt.subplots(1, 2, figsize=(16, 8))

# Per-panel settings: (axes, point colours, colormap, opacity, legend title).
panel_specs = (
    (axes[0], y, 'inferno', .7, "Classes"),
    (axes[1], new_labels, 'jet', .2, "Clusters"),
)

drawn = []
for panel_ax, colours, cmap_name, opacity, legend_title in panel_specs:
    points = panel_ax.scatter(data_[:, 0], data_[:, 1], c=colours,
                              cmap=cmap_name, edgecolor='k', s=50,
                              alpha=opacity)
    # One legend entry per distinct colour value.
    panel_ax.legend(*points.legend_elements(),
                    loc="best", title=legend_title, fontsize='x-large')
    panel_ax.set_xlabel(r'$\langle\mu\rangle$', fontsize=18)
    panel_ax.set_ylabel(r'$E_T$', fontsize=18)
    panel_ax.tick_params(direction='in', length=10, width=5, colors='k',
                         labelsize=20)
    drawn.append(points)

# Keep the scatter handles under the names the notebook originally defined.
scarter, scarter1 = drawn

axes[0].set_title('Actual', fontsize=18)
axes[1].set_title('Predicted', fontsize=18)

In [None]:
# Plot of the sampled events: column 0 on x, column 1 on y.
sample_ax = plt.figure(figsize=(10, 8)).gca()
sample_ax.plot(data_[:, 0], data_[:, 1], 'o')
sample_ax.set_xlabel(r'$\langle\mu\rangle$', fontsize=15)
sample_ax.set_ylabel(r'$E_T$', fontsize=15)
plt.show()

In [None]:
from scipy import stats

In [None]:
# Standardise column 0 of data_ (z-score) and histogram it with a
# logarithmic count axis.
a = stats.zscore(data_[:, 0])

zscore_ax = plt.figure(figsize=(10, 8)).gca()
zscore_ax.hist(a, bins=50)
zscore_ax.set_yscale('log')
plt.show()

In [None]:
# Histogram of the raw column 0 of data_, with the number of bins chosen by
# the square-root rule.
mu_hist_ax = plt.figure(figsize=(10, 8)).gca()
mu_hist_ax.hist(data_[:, 0], bins='sqrt')
plt.show()

In [None]:
# Scatter of the sampled events with the axes swapped relative to the earlier
# plots: column 1 ('L2Calo_et') on x and column 0 ('avgmu') on y.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
ax.scatter(data_[:, 1], data_[:, 0], s=10, alpha=0.6, edgecolors='w')

# Axis labels now name the columns actually plotted; the previous labels
# (E_T_HAD and eta) referred to variables that are not part of data_.
ax.set_xlabel(r'$E_T$')
ax.set_ylabel(r'$\langle\mu\rangle$')

plt.show()