-
Notifications
You must be signed in to change notification settings - Fork 6
/
hdb.py
178 lines (154 loc) · 6.26 KB
/
hdb.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
print(__doc__)
from sklearn import metrics
import numpy as np
from read_data import read_data
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from itertools import cycle
import os
from sklearn.neighbors import DistanceMetric
def hdb(txtfile, event, weighted, noise):
run = '10jets_overlap'
##############################################################################
# Read in sample data
X_all, e_all = read_data(txtfile)
X = X_all[event]
e = e_all[event].ravel()
##############################################################################
# Set variables depending on noise and weighting decisions
# Filter out noise
if not noise:
X = X[e > 1e-7]
e = e[e > 1e-7]
X_unweighted = X
e_unweighted = e
# Upweight events with high energy
if weighted:
# Weight by E/E_noise (actually that would be too high)
NOISE_LVL = 0.08 # GeV
X_rep = []
e_rep = []
for p, energy in zip(X, e):
n_replications = int(np.ceil((energy / NOISE_LVL)))
# if energy > 1:
# print 'energy = {}, n_replications = {}'.format(energy, n_replications)
X_rep.extend([p] * (n_replications))
e_rep.extend([energy] * (n_replications))
X = np.array(X_rep)
e = np.array(e_rep)
# Set other params
#min_cluster_size=1250
min_cluster_size=2000
folder = 'weighted'
else: # no weighting
min_cluster_size=5
folder = 'unweighted'
# Sort output into correct directory
if noise:
folder = os.path.join(folder, 'with_noise', run)
else:
folder = os.path.join(folder, 'no_noise', run)
# Make sure that the output directory exists, otherwise create it
if not os.path.exists(folder):
os.makedirs(folder)
##############################################################################
# Compute HDBSCAN
import hdbscan
# from hdbscan.dist_metrics import WMinkowskiDistance
# # MINKOWSKI = DistanceMetric.get_metric('wminkowski', p=2, w=e)
# MINKOWSKI = WMinkowskiDistance(p=2, w=e)
# def minkowski(X, Y=None):
# try:
# return MINKOWSKI.pairwise(X, Y)
# except ValueError:
# return MINKOWSKI.pairwise(X.reshape(1, -1), Y.reshape(1, -1) if Y is not None else None)
clusterer = hdbscan.HDBSCAN(
min_cluster_size=min_cluster_size,
gen_min_span_tree=True,
allow_single_cluster=True,
#min_samples = 10
)
clusterer.fit(X)
##############################################################################
# Output cluster info
labels = clusterer.labels_
n_clusters_ = len(set(labels))
if -1 in set(labels): # '-1' is just the group of unclustered objects
n_real_clusters = n_clusters_ - 1
else:
n_real_clusters = n_clusters_
print('Estimated number of clusters: %d' % (n_real_clusters))
# Find center of mass energy and position for cluster
cluster_centers = []
cluster_energies = []
for l in set(labels): # !! WARNING !!: these should be computed with unweighted quantities
#_e = np.unique(e[labels == l])
#_X = np.unique(np.array([(x, y) for x, y in zip(X[labels == l, 0], X[labels == l, 1])]))
b = np.array(list(set(zip(X[labels == l, 0], X[labels == l, 1], e[labels == l]))))
_X = b[:, :2]
_e = b[:, -1].ravel()
cme = sum(_e)
cm = np.sum(_X * _e[:, np.newaxis], axis=0) / cme
#cme = sum(e[labels == l])
#cm = np.sum(X[labels == l] * e[labels == l][:, np.newaxis], axis=0) / cme
print 'Cluster {}: Center = {}, Energy = {}'.format(l, cm, cme)
cluster_centers.append(cm)
cluster_energies.append(cme)
##############################################################################
# Produce HDBSCAN-specific plots
#plot_hdb(clusterer, folder)
##############################################################################
# Plot result
# -- display event
plot_eta_phi(X, X_unweighted, e, e_unweighted, n_clusters_, labels, cluster_centers, folder)
# ---------------------------------------------------------------------
def plot_eta_phi(X, X_unweighted, e, e_unweighted, n_clusters_, labels, cluster_centers, folder):
binx = np.linspace(-3.0, 3.0, 61)
biny = np.linspace(-3.1, 3.1, 63)
#plt.hist2d(X[:, 0], X[:, 1], weights=e.ravel(), bins=(binx, biny), cmap='rainbow', norm=LogNorm())
plt.hist2d(X_unweighted[:, 0], X_unweighted[:, 1], weights=e_unweighted.ravel(), bins=(binx, biny), cmap='rainbow', norm=LogNorm())
cb = plt.colorbar()
cb.set_label('Energy (GeV)')
plt.xlabel(r'$\eta$')
plt.ylabel(r'$\phi$')
# -- display clustering features
for k in range(n_clusters_):
class_members = labels == k
cluster_center = cluster_centers[k]
if sum(class_members) == 0:
#pass
plt.plot(cluster_center[0], cluster_center[1], 'kx', mew=2, markersize=14)
else:
plt.plot(cluster_center[0], cluster_center[1], 'ko', markerfacecolor='None', #markeredgecolor='k',
markersize=14)
for x in X[class_members]:
plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], color='black',alpha=0.2)
plt.savefig(os.path.join(folder, 'etaphi.pdf'))
plt.show()
# ---------------------------------------------------------------------
def plot_hdb(clusterer, folder):
clusterer.minimum_spanning_tree_.plot(edge_cmap='rainbow',
edge_alpha=0.6,
node_size=40,
edge_linewidth=2)
plt.savefig(os.path.join(folder, 'minimum_spanning_tree.jpg'))
plt.clf()
# clusterer.single_linkage_tree_.plot(cmap='winter')
# plt.savefig(os.path.join(folder, 'single_linkage_tree.pdf'))
# plt.clf()
clusterer.condensed_tree_.plot(select_clusters=True, cmap='winter')
plt.savefig(os.path.join(folder, 'condensed_tree.pdf'))
plt.clf()
# ---------------------------------------------------------------------
if __name__ == '__main__':
import sys
import argparse
# Read in arguments
parser = argparse.ArgumentParser()
parser.add_argument('--txtfile', help="path to the text file with data", default='JetGenerator/clu15_2jets_withNoise.txt')
parser.add_argument('--event', help="int, number of the event to consider", type=int, default=1)
parser.add_argument('--weighted', help="boolean that determines whether to weight the points by their energy", default=False, action='store_true')
parser.add_argument('--noise', help="boolean that determines whether to add noise", default=False, action='store_true')
args = parser.parse_args()
sys.exit(hdb(args.txtfile, args.event, args.weighted, args.noise))