In [1]:
import numpy as np
import matplotlib.pylab as plt
import math as math
from scipy.optimize import fmin
from iminuit import Minuit, describe, Struct
from scipy.spatial import distance
from scipy.sparse import vstack
import time
import seaborn as sns
%matplotlib  notebook

In [2]:
def nn(data0,data1,r=None,nneighbors=None):
    """Evaluate a nearest-neighbour density proxy of ``data1`` at each point of ``data0``.

    Exactly one of ``r`` / ``nneighbors`` selects the estimator:

    * ``r``: the fraction of ``data1`` points whose distance is strictly
      smaller than ``r``, divided by ``r``.
    * ``nneighbors``: ``1 / d_k**2`` where ``d_k`` is the distance to the
      ``nneighbors``-th closest point of ``data1`` (``inf`` if that distance
      is zero).

    Parameters
    ----------
    data0, data1 : array-like
        Indexed as ``[coordinate][point]``; only coordinates 0 and 1 are used.
    r : float, optional
        Search radius.
    nneighbors : int, optional
        Rank of the neighbour whose distance is used (1 = closest).

    Returns
    -------
    numpy.ndarray or int
        One value per point of ``data0``, or ``-1`` when neither ``r`` nor
        ``nneighbors`` is given.

    Raises
    ------
    ValueError
        If both ``r`` and ``nneighbors`` are given, or if ``nneighbors`` is
        not between 1 and the number of points in ``data1``.
    """
    if r is not None and nneighbors is not None:
        # Raising keeps the notebook kernel alive; exit() would shut it down.
        raise ValueError("nn: pass either r or nneighbors, not both")
    if r is None and nneighbors is None:
        return -1

    data0 = np.asarray(data0)
    data1 = np.asarray(data1)
    ref_x = data1[0]
    ref_y = data1[1]
    nref = len(ref_x)

    if nneighbors is not None:
        if not 1 <= nneighbors <= nref:
            raise ValueError("nn: nneighbors=%r must be between 1 and %d" % (nneighbors, nref))
        kth = nneighbors-1
    else:
        rsq = r*r

    ret_list=[]
    for d in data0.transpose():
        diffx=d[0]-ref_x
        diffy=d[1]-ref_y
        diff= diffx*diffx + diffy*diffy  # squared distances to every reference point
        if r is not None:
            count = np.count_nonzero(diff<rsq)
            ret_list.append(float(count)/(float(nref)*r))
        else:
            # Only the k-th smallest squared distance is needed, so a
            # partial sort is enough (and leaves `diff` untouched).
            radius2 = np.partition(diff,kth)[kth]
            ret_list.append(1./(radius2))
    return np.array(ret_list)

In [3]:
# Add code to the following function so that it takes in two datasets, loops over one of them, and finds
# information about the nearest neighbors in the other dataset, based on a flag. 

def nncdist(data0,data1,r=None,nneighbors=None):
    """``cdist``-based counterpart of ``nn``: density proxy of ``data1`` at each point of ``data0``.

    The full pairwise distance matrix is built with
    ``scipy.spatial.distance.cdist``, so memory grows with
    ``len(points0) * len(points1)``.

    * ``r``: fraction of ``data1`` points strictly closer than ``r``,
      divided by ``r``.
    * ``nneighbors``: ``1 / d_k**2`` with ``d_k`` the distance to the
      ``nneighbors``-th closest point of ``data1`` -- the same quantity
      ``nn`` returns.

    Parameters
    ----------
    data0, data1 : array-like
        Indexed as ``[coordinate][point]``.
    r : float, optional
        Search radius.
    nneighbors : int, optional
        Rank of the neighbour whose distance is used (1 = closest).

    Returns
    -------
    numpy.ndarray or int
        One value per point of ``data0``, or ``-1`` when neither ``r`` nor
        ``nneighbors`` is given.

    Raises
    ------
    ValueError
        If both ``r`` and ``nneighbors`` are given, or if ``nneighbors`` is
        not between 1 and the number of points in ``data1``.
    """
    if r is not None and nneighbors is not None:
        # Raising keeps the notebook kernel alive; exit() would shut it down.
        raise ValueError("nncdist: pass either r or nneighbors, not both")
    if r is None and nneighbors is None:
        return -1

    # cdist wants one row per point, the inputs carry one row per coordinate.
    points0 = np.asarray(data0).transpose()
    points1 = np.asarray(data1).transpose()
    nref = len(points1)

    if r is not None:
        dist = distance.cdist(points0,points1,'euclidean')
        counts = (dist<r).sum(axis=1)
        return counts/(float(nref)*r)

    if not 1 <= nneighbors <= nref:
        raise ValueError("nncdist: nneighbors=%r must be between 1 and %d" % (nneighbors, nref))
    kth = nneighbors-1
    dist2 = distance.cdist(points0,points1,'sqeuclidean')
    # k-th smallest squared distance in every row; no full sort required.
    radius2 = np.partition(dist2,kth,axis=1)[:,kth]
    return 1./radius2

In [4]:
def normal(x,mean,width):
    """Gaussian probability density with centre ``mean`` and standard deviation ``width``, evaluated at ``x``."""
    prefactor = 1.0/(width*np.sqrt(2*np.pi))
    exponent = -(x-mean)**2/(2*(width**2))
    return prefactor*(np.exp(exponent))

# A product of two Gaussians
def signal_2D(npts,means,sigmas):
    """Draw ``npts`` normally distributed values per (mean, sigma) pair.

    Returns an array with one row per pair, in the order of ``means`` /
    ``sigmas``.
    """
    return np.array([np.random.normal(mu,sd,npts) for mu,sd in zip(means,sigmas)])
    

# Flat in 2D
def background_2D(npts,lovals,hivals):
    """Draw ``npts`` uniform values per (lo, hi) pair, each within ``[lo, hi)``.

    Returns an array with one row per pair, in the order of ``lovals`` /
    ``hivals``.
    """
    rows = [lo + (hi-lo)*np.random.random(npts) for lo,hi in zip(lovals,hivals)]
    return np.array(rows)

# Helper function to generate signal and background at the same time
def gen_sig_and_bkg(npts,means,sigmas,lovals,hivals):
    """Generate a signal sample and a background sample and join them.

    ``npts[0]`` signal points come from ``signal_2D(npts[0], means, sigmas)``
    and ``npts[1]`` background points from
    ``background_2D(npts[1], lovals, hivals)``.  The result has two rows
    (coordinates 0 and 1); in each row the signal values come first and the
    background values follow.
    """
    signal = signal_2D(npts[0],means,sigmas)
    background = background_2D(npts[1],lovals,hivals)
    joined = [np.concatenate((signal[axis],background[axis])) for axis in (0,1)]
    return np.array(joined)

In [5]:
# Test the tools to generate the datasets: a narrow signal peak, a flat
# background, and the two stacked into one sample.
sigpts = signal_2D(2000,[5.0,7.0],[0.1,0.1])
# x= / y= keywords: accepted by old and new seaborn alike, whereas newer
# releases no longer take the data vectors positionally.
sns.jointplot(x=sigpts[0],y=sigpts[1],kind='hex')
bkgpts = background_2D(10000,[3.5,5],[6,9])
sns.jointplot(x=bkgpts[0],y=bkgpts[1],kind='hex')
# Row-wise concatenation: signal points first, background points after.
data1 = np.hstack([sigpts,bkgpts])
sns.jointplot(x=data1[0],y=data1[1],kind='hex')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<seaborn.axisgrid.JointGrid at 0x7efed7d24cd0>

In [48]:

# Mean of the Poisson distribution from which calc_pull draws the number of
# background events of each toy dataset.
nbkg = 1000
# Per-coordinate means that calc_pull passes on for the signal samples.
sigmeans = [5.0,7.0]
# Per-coordinate lower / upper bounds that calc_pull passes on for the
# background samples.
bkglos = [3.5,5]
bkghis = [6,9]

def calc_pull(iterations, nsig, nMC_sig, nMC_bkg, nneigh,cdist_bool,sigwidths):
    """Fit the signal fraction of repeated toy datasets and collect the pulls.

    Every iteration
      1. draws Poisson-fluctuated signal / background counts (means ``nsig``
         and the module-level ``nbkg``) and generates a toy dataset,
      2. generates fresh reference samples of ``nMC_sig`` signal and
         ``nMC_bkg`` background points,
      3. evaluates ``nn(..., nneighbors=nneigh)`` of the toy data against
         both reference samples,
      4. minimises the negative log-likelihood of the two-component mixture
         in the signal fraction with Minuit.

    Parameters
    ----------
    iterations : int
        Number of toy datasets.
    nsig : number
        Poisson mean of the signal count per toy.
    nMC_sig, nMC_bkg : int
        Sizes of the signal / background reference samples.
    nneigh : int
        Forwarded to ``nn`` as ``nneighbors``.
    cdist_bool :
        Accepted for call compatibility but never read; ``nn`` is always used.
    sigwidths : sequence
        Per-coordinate signal widths.

    Returns
    -------
    tuple
        ``(pulls, generated fractions, fitted fractions, fit uncertainties,
        iterations)``.  The four lists hold one entry per iteration whose fit
        Minuit reports as valid, so they can be combined element-wise.  A
        pull is ``(expected fraction - fitted fraction) / uncertainty`` with
        the expected fraction ``nsig / (nsig + nbkg)``.
    """
    pull_frac_list=[]
    frac_list = []
    fit_frac = []
    fit_frac_uncert = []
    frac_org = nsig/float(nsig+nbkg)

    for _ in range(iterations):

        nsig_iteration = np.random.poisson(nsig)
        nbkg_iteration = np.random.poisson(nbkg)
        data = gen_sig_and_bkg([nsig_iteration,nbkg_iteration],sigmeans,sigwidths,bkglos,bkghis)

        signal_points= signal_2D(nMC_sig,sigmeans,sigwidths)
        background_points = background_2D(nMC_bkg,bkglos,bkghis)
        frac_iteration = float(nsig_iteration)/(float(nbkg_iteration+nsig_iteration))

        signal_prob=nn(data,signal_points, nneighbors=nneigh)
        background_prob = nn(data,background_points, nneighbors=nneigh)

        # The argument has to stay named `frac`: the Minuit call below
        # addresses the parameter by that name (frac=, limit_frac=, ...).
        def probability(frac):
            prob = frac*signal_prob/(nMC_sig) + ((1-frac)*background_prob)/(nMC_bkg)
            # Points with non-positive probability are left out of the sum.
            return -np.log(prob[prob>0]).sum()

        m1=Minuit(probability,frac= 0.2,limit_frac=(0.001,1),error_frac=0.001,errordef = 0.5,print_level=0)
        m1.migrad()

        if (m1.get_fmin().is_valid):
            param=m1.values
            err=m1.errors
            # Record the generated fraction together with the fit results so
            # a failed fit cannot shift the lists against each other.
            frac_list.append(frac_iteration)
            fit_frac.append(param["frac"])
            fit_frac_uncert.append(err["frac"])
            pull_frac=(frac_org-param["frac"])/err["frac"]
            pull_frac_list.append(pull_frac)

    return pull_frac_list, frac_list, fit_frac, fit_frac_uncert,iterations

In [47]:

# Scan of the pull mean / spread over signal size, reference-sample size,
# number of neighbours and signal width, using the k-nearest-neighbour fit.
means1=[]
stds1=[]

sig_list=[200]
MC_sig_list=[1000]
MC_bkg=5000
width_list=[0.1]
n=[5]

niterations = 100


pulls_list ={}
for sig in sig_list:
    for MC_sig in MC_sig_list:
        for nneighbors in n:
            for w in width_list:
                sig_widths=[w,w]
                start = time.time()
                print('number of signal points: %s \t number of MC point: %s \tradius: %s \t width: %s' % (sig,MC_sig,nneighbors,w))
                pulls,org_vals1,fit_vals1,fit_uncerts1,iteration= calc_pull(niterations, sig, MC_sig, MC_bkg, nneighbors,False,sig_widths)
                # MC_sig is this cell's own loop variable; the bare name `MC`
                # is only ever bound by a different cell.
                name='MC' +str(MC_sig) + 'rad'+str(nneighbors)
                pulls_list[name]=pulls

                parameters_mean = {'signal': sig, 'MC_points': MC_sig, 'nearest neighbors': nneighbors, 'mean pulls': np.mean(pulls), 'width':w}
                means1.append(parameters_mean)
                parameters_std = {'signal': sig, 'MC_points': MC_sig, 'nearest neighbors': nneighbors, 'mean stds': np.std(pulls),'width':w}
                stds1.append(parameters_std)
                print("time to calc %d iterations: %f s" % (niterations, time.time() - start))

print(means1)
print(stds1)
print('--------------------')



number of signal points: 200 	 number of MC point: 1000 	radius: 5 	 width: 0.1
time to calc 100 iterations: 31.479975 s
[{'width': 0.1, 'signal': 200, 'mean pulls': -0.66196329235060813, 'nearest neighbors': 5, 'MC_points': 1000}]
[{'width': 0.1, 'signal': 200, 'mean stds': 0.87096959011795005, 'nearest neighbors': 5, 'MC_points': 1000}]
--------------------


In [149]:
# Per-toy (fitted - generated) / uncertainty, followed by the uncertainties.
print((np.asarray(fit_vals1) - np.asarray(org_vals1)) / np.asarray(fit_uncerts1))

print(fit_uncerts1)

[ -6.70670191e-01   3.94918057e-01   2.93046217e-01   5.18563921e-01
  -6.48608093e-01  -4.05612958e-02   6.62553251e-03  -2.21812098e-01
  -4.04786319e-01   7.35861427e-02  -3.71475759e-01  -4.75222664e-01
  -1.59505203e-01   1.01113222e-01  -4.19501990e-01  -1.88787049e-01
  -1.60855411e-01  -3.59891930e-01  -2.18376388e-01   6.11224722e-01
   3.28266090e-02  -1.18742911e-01   3.82673675e-01   3.92622562e-01
  -9.87550622e-02   5.85535200e-02   2.58547619e-01  -5.30360529e-01
   6.48685774e-01   5.19675318e-02   2.73268705e-01   4.10717344e-01
   1.39063744e-01  -2.64225195e-01  -3.48537227e-01   1.00104409e-01
  -7.47247077e-02   3.00911898e-01   1.99019539e-01  -1.41305576e-01
  -4.72153280e-02  -1.93904877e-01  -3.37735579e-01  -5.39824366e-01
  -2.99373348e-01   2.47846853e-01  -2.03190471e-01  -5.50819365e-01
  -2.23411547e-01  -5.00890911e-02  -4.11352941e-01   5.54741011e-01
  -6.45313558e-02   1.54737596e-01  -5.31831641e-01  -3.98308804e-02
  -5.43542699e-01   1.18892209e-01

In [150]:
# Histogram (20 bins) of `pulls`, i.e. of whatever list the last executed
# calc_pull call left under that name.
plt.figure()
plt.hist(pulls,bins=20)

<IPython.core.display.Javascript object>

(array([   4.,    8.,   16.,   24.,   41.,   66.,   65.,   89.,   98.,
         119.,  117.,   98.,   91.,   65.,   38.,   25.,   16.,    9.,
           4.,    7.]),
 array([-2.60312363, -2.32532947, -2.0475353 , -1.76974114, -1.49194698,
        -1.21415281, -0.93635865, -0.65856449, -0.38077032, -0.10297616,
         0.174818  ,  0.45261216,  0.73040633,  1.00820049,  1.28599465,
         1.56378882,  1.84158298,  2.11937714,  2.39717131,  2.67496547,
         2.95275963]),
 <a list of 20 Patch objects>)

In [76]:
colors=['b','g','y','r']
markers=['o','^','+']
marker_sizes=[5,10,60]


def plot_vs_neighbors(records, value_key, width, title):
    """Plot ``value_key`` against 'nearest neighbors' for one signal width.

    Opens a new figure and draws one marker per entry of ``records`` whose
    'width' equals ``width``.  The colour encodes 'MC_points' and the marker
    encodes 'signal'; both are ranked from the values present in the selected
    records, so no list from another cell is needed.  Each
    'MC=... signal=...' combination appears once in the legend.
    """
    selected = [item for item in records if item['width']==width]
    mc_values = sorted(set(item['MC_points'] for item in selected))
    sig_values = sorted(set(item['signal'] for item in selected))

    plt.figure()
    seen = set()
    for item in selected:
        label='MC=' + str(item['MC_points']) + ' signal=' + str(item['signal'])
        color = colors[mc_values.index(item['MC_points']) % len(colors)]
        marker = markers[sig_values.index(item['signal']) % len(markers)]
        # An empty label keeps repeated combinations out of the legend.
        shown = "" if label in seen else label
        seen.add(label)
        plt.plot(item['nearest neighbors'],item[value_key], color=color, marker=marker,label=shown)

    plt.legend(loc='lower right')
    plt.title(title)


# One figure per (quantity, width); iterating over width_list avoids indexing
# widths that the scan did not contain.
for records, value_key, heading in [(means1, 'mean pulls', "Means"),
                                    (stds1, 'mean stds', "Standard Deviations")]:
    for width in width_list:
        plot_vs_neighbors(records, value_key, width, heading + " with widths of "+str(width))



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<matplotlib.text.Text at 0x7ff7d5d07990>

In [125]:
# Radius-based scan of the pull mean / spread over signal size, MC size,
# search radius and signal width.
# NOTE(review): the names means800 / stds800 suggest an earlier 800-signal
# configuration; the values scanned here are the ones in sig_list below.
means800=[]
stds800=[]

sig_list=[200,400]
MC_list=[10000,100000]
rad_list=[0.01,0.05,0.1,0.15,0.20]
width_list=[0.06,.15]

# NOTE(review): the key built below contains only MC and rad, so entries of
# different sig / width values overwrite one another -- confirm whether
# pulls_list is consumed anywhere before changing the key.
pulls_list ={}
for sig in sig_list:
    for MC in MC_list:
        for rad in rad_list:
            for w in width_list:
                # Bind the name that is actually passed to calc_pull, so the
                # fitted width matches the one printed and recorded below.
                sig_widths=[w,w]
                start = time.time()
                print('number of signal points: %s \t number of MC point: %s \tradius: %s \t width: %s' % (sig,MC,rad,w))
                # NOTE(review): this call passes six arguments and a radius,
                # while calc_pull as defined in this notebook takes seven
                # (separate nMC_sig / nMC_bkg) and a neighbour count.  It
                # presumably targets an earlier radius-based calc_pull --
                # confirm the intended arguments before re-running.
                pulls,org_vals1,fit_vals1,fit_uncerts1,iteration= calc_pull(10, sig, MC, rad,False,sig_widths)
                name='MC' +str(MC) + 'rad'+str(rad)
                pulls_list[name]=pulls

                parameters_mean = {'signal': sig, 'MC_points': MC, 'radius': rad, 'mean pulls': np.mean(pulls), 'width':w}
                means800.append(parameters_mean)
                parameters_std = {'signal': sig, 'MC_points': MC, 'radius': rad, 'mean stds': np.std(pulls),'width':w}
                stds800.append(parameters_std)
                print("time to calc 10 iterations: %f s" % (time.time() - start))

print(means800)
print(stds800)
print('--------------------')



number of signal points: 200 	 number of MC point: 10000 	radius: 0.01 	 width: 0.06
time to calc 10 iterations: 0.938094 s
number of signal points: 200 	 number of MC point: 10000 	radius: 0.01 	 width: 0.15
time to calc 10 iterations: 0.917374 s
number of signal points: 200 	 number of MC point: 10000 	radius: 0.05 	 width: 0.06
time to calc 10 iterations: 0.937858 s
number of signal points: 200 	 number of MC point: 10000 	radius: 0.05 	 width: 0.15
time to calc 10 iterations: 0.956699 s
number of signal points: 200 	 number of MC point: 10000 	radius: 0.1 	 width: 0.06
time to calc 10 iterations: 0.972548 s
number of signal points: 200 	 number of MC point: 10000 	radius: 0.1 	 width: 0.15
time to calc 10 iterations: 0.939491 s
number of signal points: 200 	 number of MC point: 10000 	radius: 0.15 	 width: 0.06
time to calc 10 iterations: 0.977794 s
number of signal points: 200 	 number of MC point: 10000 	radius: 0.15 	 width: 0.15
time to calc 10 iterations: 0.993756 s
number of 

In [152]:
colors=['b','g','y','r']
markers=['o','^','+']
marker_sizes=[5,10,60]


def plot_vs_radius(records, value_key):
    """Plot ``value_key`` against 'radius' for every entry of ``records``.

    Opens a new figure.  The colour encodes 'MC_points' (by position in
    ``MC_list``), the marker encodes 'signal' (by position in ``sig_list``)
    and the marker size encodes 'width' (by position in ``width_list``).
    Each 'MC=... width=... signal=...' combination appears once in the
    legend.
    """
    color_by_mc = dict(zip(MC_list, colors))
    marker_by_signal = dict(zip(sig_list, markers))
    size_by_width = dict(zip(width_list, marker_sizes))

    plt.figure()
    seen = set()
    for item in records:
        label='MC=' + str(item['MC_points']) + ' width='+str(item['width'])+ ' signal=' + str(item['signal'])
        # Track every label inside the loop so repeated combinations (one per
        # radius) are drawn with an empty label and stay out of the legend.
        shown = "" if label in seen else label
        seen.add(label)
        plt.plot(item['radius'],item[value_key],
                 color=color_by_mc[item['MC_points']],
                 marker=marker_by_signal[item['signal']],
                 markersize=size_by_width[item['width']],
                 label=shown)

    plt.legend(loc='center left', bbox_to_anchor=(0.6,0.5))


plot_vs_radius(means800, 'mean pulls')
plot_vs_radius(stds800, 'mean stds')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f1b205f5790>