forked from scikit-hep/probfit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tutorial.py
562 lines (392 loc) · 14.2 KB
/
tutorial.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
# -*- coding: utf-8 -*-
# <nbformat>3.0</nbformat>
# <headingcell level=1>
# dist_fit Basic Tutorial
# <codecell>
from probfit import *
import numpy as np
import iminuit
import UserDict
from math import sqrt, exp, pi
# <headingcell level=2>
# Lets start with a simple straight line
# <rawcell>
# We can't really call this a fitting package without being able to fit a straight line, right?
# <codecell>
#lets make a straight line with gaussian(mu=0,sigma=1) error
#NOTE(review): linspace, randn, errorbar and hist are not imported anywhere in
#this file -- presumably the notebook was launched with pylab enabled, which
#injects numpy/matplotlib names into the global namespace; confirm before
#running as a plain script.
x = linspace(0,10,20)
y = 3*x+15+ randn(20)
#constant unit error bar on every point
err = np.array([1]*20)
errorbar(x,y,err,fmt='.');
# <codecell>
#lets define our line
#first argument has to be independent variable
#arguments after that are shape parameters
def line(x, m, c):
    """Straight-line model: independent variable *x*, slope *m*, intercept *c*.

    The first argument is the independent variable; the rest are the shape
    parameters the fitter will vary.
    """
    return c + x * m
#We can make it faster but for this example this is plentily fast.
#We will talk about speeding things up later(you will need cython)
# <codecell>
describe(line)
# <codecell>
#cost function
chi2 = Chi2Regression(line,x,y,err)
# <codecell>
#Chi^2 regression is just a callable object nothing special about it
describe(chi2)
# <codecell>
#minimize it
#yes it gives you a heads up that you didn't give it initial value
#we can ignore it for now
minimizer = RTMinuit.Minuit(chi2) #see RTMinuit tutorial on how to give initial value/range/error
minimizer.migrad(); #very stable robust minimizer
#you can look at your terminal to see what it is doing
# <codecell>
#lets see our results
print minimizer.values
print minimizer.errors
# <codecell>
#or a pretty printing
minimizer.html_results()
#you can also call display(minimizer.html_results()) if this is in the middle of your code
# <rawcell>
# Parabolic error
# is calculated using the second derivative at the minimum
# This is good in most cases where the uncertainty is symmetric not much correlation
# exists. Migrad usually got this accurately but if you want ot be sure
# call minimizer.hesse() after migrad
# Minos Error
# is obtained by scanning chi^2 and likelihood profile and find the the point
# where chi^2 is increased by 1 or -ln likelihood is increased by 0.5
# This error is generally asymmetric and is correct in all case.
# This is quite computationally expensive if you have many parameter.
# call minimizer.minos('variablename') after migrad to get this error
# <codecell>
#let's visualize our line
chi2.draw(minimizer)
#looks good
# <codecell>
#Sometime we want the error matrix
print 'error matrix:\n', minimizer.error_matrix()
#or correlation matrix
print 'correlation matrix:\n', minimizer.error_matrix(correlation=True)
#or a pretty html
#despite the method named error_matrix it's actually a correlation matrix
display(minimizer.html_error_matrix())
# <headingcell level=2>
# Simple gaussian distribution fit
# <rawcell>
# In high energy physics, we usually want to fit a distribution to a histogram. Let's look at simple gaussian distribution
# <codecell>
#First lets make our data
data = randn(10000)*4+1
#sigma = 4 and mean = 1
hist(data,bins=100,histtype='step');
# <rawcell>
# Here is our PDF/model
# <codecell>
#normalized gaussian
def myPDF(x, mu, sigma):
    """Gaussian PDF with mean *mu* and width *sigma*, normalized on the real line."""
    z = (x - mu) / sigma
    norm = 1.0 / (sqrt(2 * pi) * sigma)
    return norm * exp(-0.5 * z * z)
# <codecell>
#build your cost function here we use binned likelihood
cost = BinnedLH(myPDF,data)
# <codecell>
#Let's fit
minimizer = RTMinuit.Minuit(cost,sigma=3.) #notice here that we give initial value to sigma
#up parameter determine where it determine uncertainty
#1(default) for chi^2 and 0.5 for likelihood
minimizer.set_up(0.5)
minimizer.migrad() #very stable minimization algorithm
#like in all binned fit with long zero tail. It will have to do something about the zero bin
#dist_fit handle them gracefully but will give you a head up
# <codecell>
#let's see the result
print 'Value:', minimizer.values
print 'Error:', minimizer.errors
# <codecell>
#That printout can get out of hand quickly
display(minimizer.html_results())
#and correlation matrix
#will not display well in firefox(tell them to fix writing-mode:)
display(minimizer.html_error_matrix())
# <codecell>
#you can see if your fit make any sense too
cost.draw(minimizer)#uncertainty is given by symetric poisson
# <codecell>
#how about making sure the error making sense
draw_contour(cost,minimizer,'mu')
# <codecell>
#2d contour error
#you can notice that it takes sometime to draw
#we will this is because our PDF is defined in Python
#we will show how to speed this up later
draw_contour2d(cost,minimizer,'mu','sigma');
# <headingcell level=2>
# How about Chi^2
# <rawcell>
# Let's explore another popular cost function chi^2
# <codecell>
#we will use the same data as in the previous example
np.random.seed(0)
data = randn(10000)*4+1
#sigma = 4 and mean = 1
hist(data,bins=100,histtype='step');
# <codecell>
#And the same PDF: normalized gaussian
def myPDF(x, mu, sigma):
    """Gaussian probability density at *x* for mean *mu*, standard deviation *sigma*."""
    residual = x - mu
    return exp(-residual * residual / (2.0 * sigma ** 2)) / (sigma * sqrt(2 * pi))
# <codecell>
#binned chi^2 fit only makes sense for extended fit
extended_pdf = Extend(myPDF)
# <codecell>
#very useful method to look at function signature
describe(extended_pdf) #you can look at what your pdf means
# <codecell>
#Chi^2 distribution fit is really bad for distribution with long tail
#since when bin count=0... poisson error=0 and blows up chi^2
#so give it some range
chi2 = BinnedChi2(extended_pdf,data,range=(-7,10))
minimizer = RTMinuit.Minuit(chi2,sigma=1)
minimizer.migrad()
# <codecell>
chi2.draw(minimizer)
# <codecell>
#and usual deal
display(minimizer.html_results())
display(minimizer.html_error_matrix())
# <headingcell level=2>
# Unbinned Likelihood and How to speed things up
# <rawcell>
# Unbinned likelihood is computationally very very expensive.
# It's now a good time that we talk about how to speed things up with cython
# <codecell>
#same data
np.random.seed(0)
data = randn(10000)*4+1
#sigma = 4 and mean = 1
hist(data,bins=100,histtype='step');
# <codecell>
#We want to speed things up with cython
#load cythonmagic
%load_ext cythonmagic
# <codecell>
%%cython
cimport cython
from libc.math cimport exp,M_PI,sqrt
#same gaussian distribution but now written in cython
#(%%cython is an IPython cell magic: this cell is compiled to C on the fly)
@cython.binding(True)#IMPORTANT:this tells cython to dump function signature too
def cython_PDF(double x,double mu,double sigma):
    #these are c add multiply etc not python so it's fast
    return 1/sqrt(2*M_PI)/sigma*exp(-(x-mu)**2/2./sigma**2)
# <codecell>
#cost function
ublh = UnbinnedML(cython_PDF,data)
minimizer = RTMinuit.Minuit(ublh,sigma=2.)
minimizer.set_up(0.5)#remember this is likelihood
minimizer.migrad()#yes amazingly fast
ublh.show(minimizer)
display(minimizer.html_results())
display(minimizer.html_error_matrix())
# <codecell>
#remember how slow it was?
#now it's super fast(and it's even unbinned likelihood)
draw_contour(ublh,minimizer,'mu')
# <codecell>
#but you really don't have to write your own gaussian
#there are tons of builtin function written in cython for you
print describe(gaussian)
print type(gaussian)
# <codecell>
ublh = UnbinnedML(gaussian,data)
minimizer = RTMinuit.Minuit(ublh,sigma=2.)
minimizer.set_up(0.5)#remember this is likelihood
minimizer.migrad()#yes amazingly fast
ublh.show(minimizer)
display(minimizer.html_results())
display(minimizer.html_error_matrix())
# <headingcell level=2>
# But... We can't normalize everything analytically and how to generate toy sample from PDF
# <rawcell>
# When fitting distribution to a PDF. One of the common problem that we run into is normalization.
# Not all function is analytically integrable on the range of our interest.
# Let's look at crystal ball function
# <codecell>
#lets first generate a crystal ball sample
#dist_fit has builtin toy generation capability
#lets introduce crystal ball function
#http://en.wikipedia.org/wiki/Crystal_Ball_function
#it's simply gaussian with power law tail
#normally found in energy deposited in crystals
#impossible to normalize analytically
#and normalization will depend on shape parameters
describe(crystalball)
# <codecell>
np.random.seed(0)
bound = (-1,2)
data = gen_toy(crystalball,10000,bound=bound,alpha=1.,n=2.,mean=1.,sigma=0.3,quiet=False)
#quiet = False tells it to plot out original function
#toy histogram and poisson error from both orignal distribution and toy
# <codecell>
#To fit this we need to tell normalized our crystal ball PDF
#this is done with trapezoid rule with simple cache mechanism
#can be done by Normalize functor
ncball = Normalize(crystalball,bound)
#this can also bedone with declerator
#@normalized_function(bound)
#def myPDF(x,blah):
# return blah
print 'unnorm:', crystalball(1.0,1,2,1,0.3)
print ' norm:', ncball(1.0,1,2,1,0.3)
# <codecell>
#it has the same signature as the crystalball
describe(ncball)
# <codecell>
#now we can fit as usual
ublh = UnbinnedML(ncball,data)
minimizer = RTMinuit.Minuit(ublh,
alpha=1.,n=2.1,mean=1.2,sigma=0.3)
minimizer.set_up(0.5)#remember this is likelihood
minimizer.migrad()#yes amazingly fast Normalize is written in cython
ublh.show(minimizer)
#crystalball function is nortorious for its sensitivity on n parameter
#dist_fit give you a heads up where it might have float overflow
# <rawcell>
# Bonus: one thing we normally ask about asymmetric peaking distribution like this is full-width-half-max
# http://en.wikipedia.org/wiki/Full_width_at_half_maximum
# <codecell>
lw,rw = fwhm_f(ncball,(-1,2),minimizer.args)
# <codecell>
ublh.draw(minimizer)
axvspan(lw,rw,alpha=0.2,color='green');
# <headingcell level=2>
# What if things went wrong
# <codecell>
#crystalball is notoriously sensitive to initial parameters
#now is a good time to show what happens when things... don't fit
ublh = UnbinnedML(ncball,data)
#FIX: use `iminuit`, the module imported at the top (RTMinuit is its old name)
minimizer = iminuit.Minuit(ublh)#NO initial value
minimizer.set_up(0.5)#remember this is likelihood
minimizer.migrad()#yes amazingly fast
#Remember there is a heads up
# <codecell>
ublh.show(minimizer)#it is bound to fail
# <codecell>
minimizer.migrad_ok(), minimizer.matrix_accurate()
#checking these two methods gives you a good sign
# <codecell>
#fix it by giving it initial value/error/limit or fixing parameters; see the iminuit tutorial
#now we can fit as usual
ublh = UnbinnedML(ncball,data)
minimizer = iminuit.Minuit(ublh,
                           alpha=1.,n=2.1,mean=1.2,sigma=0.3)
minimizer.set_up(0.5)#remember this is likelihood
minimizer.migrad()#yes amazingly fast. Normalize is written in cython
ublh.show(minimizer)
# <headingcell level=2>
# How do we guess initial value?
# <rawcell>
# This is a hard question but visualization can help us
# <codecell>
#try_uml overlays the PDF evaluated at the given parameters on the data,
#so you can eyeball whether the starting point is sane
besttry = try_uml(ncball,data,alpha=1.,n=2.1,mean=1.2,sigma=0.3)
# <codecell>
#or you can try multiple candidate values per parameter (a list means "try each")
#too many will just confuse you
besttry = try_uml(ncball,data,alpha=1.,n=2.1,mean=[1.2,1.1],sigma=[0.3,0.5])
print besttry #and you can find which one gives you the minimal unbinned likelihood
# <headingcell level=2>
# Extended Fit: 2 Gaussian with Polynomial Background
# <codecell>
#toy sample: two gaussian peaks on top of a parabolic background
peak1 = randn(3000)*0.2
peak2 = randn(5000)*0.1+4
#background shaped like (x+2)^2 generated on (-2,5)
bg = gen_toy(lambda x : (x+2)**2, 20000,(-2,5))
all_data = np.concatenate([peak1,peak2,bg])
hist((peak1,peak2,bg,all_data),bins=200,histtype='step',range=(-2,5));
# <codecell>
%%cython
cimport cython
#NOTE(review): this imports from `dist_fit` while the top of the file imports
#`probfit` (dist_fit is probfit's former name) -- confirm which package name is
#actually installed before running
from dist_fit import Normalize, gaussian
@cython.binding(True)
def poly(double x,double a,double b, double c):
    #plain quadratic background shape (not normalized by itself)
    return a*x*x+b*x+c
#remember linear function is not normalizable for -inf ... +inf
nlin = Normalize(poly,(-1,5))
#our extended PDF for 3 types of signal
@cython.binding(True)
def myPDF(double x,
          double a, double b, double c, double nbg,
          double mu1, double sigma1, double nsig1,
          double mu2, double sigma2, double nsig2):
    #each term is yield * normalized shape, so the fit extracts nbg/nsig1/nsig2
    #as event counts (extended likelihood)
    cdef double NBG = nbg*nlin(x,a,b,c)
    cdef double NSIG1 = nsig1*gaussian(x,mu1,sigma1)
    cdef double NSIG2 = nsig2*gaussian(x,mu2,sigma2)
    return NBG + NSIG1 + NSIG2
# <codecell>
print describe(myPDF)
# <codecell>
#lets use what we just learned
#for complicated function good initial value(and initial step) is crucial
#if it doesn't converge try play with initial value and initial stepsize(error_xxx)
besttry = try_binlh(myPDF,all_data,
a=1.,b=2.,c=4.,nbg=20000.,
mu1=0.1,sigma1=0.2,nsig1=3000.,
mu2=3.9,sigma2=0.1,nsig2=5000.,extended=True, bins=300, bound=(-1,5) )
print besttry
# <codecell>
binlh = BinnedLH(myPDF,all_data, bins=200, extended=True, range=(-1,5))
#too lazy to type initial values from what we try?
#use keyword expansion **besttry
#need to put in initial step size for mu1 and mu2
#with error_mu* otherwise it won't converge(try it yourself)
minimizer = RTMinuit.Minuit(binlh, error_mu1=0.1, error_mu2=0.1, **besttry)
# <codecell>
minimizer.migrad()
# <codecell>
binlh.show(minimizer)
# <codecell>
display(minimizer.html_results())
minimizer.html_error_matrix()
# <headingcell level=2>
# Advance: Custom cost function and Simultaneous Fit
# <rawcell>
# Sometimes, what we want to fit is the sum of likelihood/chi^2 of two PDFs sharing some parameters.
# dist_fit doesn't provide a built-in facility to do this but it can be built easily.
#
# In this example, we will fit two gaussian distributions where we know that the widths are the same
# but the peaks are at different places.
# <codecell>
np.random.seed(0)
#Lets make two gaussians: both unit width, means at +3 and -2
data1 = randn(10000)+3
data2 = randn(10000)-2
hist([data1,data2],bins=100,histtype='step',label=['data1','data2']);
# <codecell>
#remember there is nothing special about the builtin cost functions
#except some utility methods like draw and show
ulh1 = UnbinnedML(gaussian,data1)
ulh2 = UnbinnedML(gaussian,data2)
#both expose the same parameter names taken from gaussian's signature
print describe(ulh1)
print describe(ulh2)
# <codecell>
#you can also use cython to do this
class CustomCost:
    """Simultaneous-fit cost function.

    Sums the unbinned likelihoods of two datasets whose PDFs share the
    width parameter *sigma* but have independent means *mu1* and *mu2*.
    """

    def __init__(self, pdf1, data1, pdf2, data2):
        # one unbinned likelihood term per dataset
        self.ulh1 = UnbinnedML(pdf1, data1)
        self.ulh2 = UnbinnedML(pdf2, data2)

    # __call__ is the important part: the minimizer reads the fit parameters
    # from this signature, so sigma is automatically shared across both terms.
    def __call__(self, mu1, mu2, sigma):
        cost_first = self.ulh1(mu1, sigma)
        cost_second = self.ulh2(mu2, sigma)
        return cost_first + cost_second
# <codecell>
simul_lh = CustomCost(gaussian,data1,gaussian,data2)
# <codecell>
#FIX: use `iminuit`, the module imported at the top (RTMinuit is its old name)
minimizer = iminuit.Minuit(simul_lh,sigma=0.5)
minimizer.set_up(0.5)#remember it's likelihood
minimizer.migrad()
# <codecell>
display(minimizer.html_results())
display(minimizer.html_error_matrix())
results = minimizer.values
# <codecell>
#overlay each fitted gaussian on its dataset; note the shared sigma
draw_compare_hist(gaussian,[results['mu1'],results['sigma']],data1,normed=True);
draw_compare_hist(gaussian,[results['mu2'],results['sigma']],data2,normed=True);