In [1]:
from uproot_io import Events, View
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import math
#titlesize = 20
#labelsize = 14
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

In [2]:
# Open the ROOT file through uproot_io's Events class; every later cell reads
# its per-particle arrays from this module-level `events` object.
# NOTE(review): this is an absolute, machine-specific Windows path, so the
# notebook will not run unchanged elsewhere — consider a configurable data dir.
events = Events(r"C:\Users\andre\Desktop\Physics Project\bigdata\CheatedRecoFile_1.root")

In [None]:
# training data

def fit_line(x, y):
    """Least-squares straight-line fit of ``y`` against ``x``.

    Solves ``y ~= m * x + c`` with ``np.linalg.lstsq`` and returns the
    gradient ``m`` and intercept ``c`` as a 2-tuple.
    """
    # Design matrix: one column holding x, one column of ones for the intercept.
    design = np.column_stack((x, np.ones_like(x)))
    solution, *_ = np.linalg.lstsq(design, y, rcond=None)
    gradient, intercept = solution
    return gradient, intercept

def calculate_residuals(x, y, m, c):
    """Perpendicular distance of each point ``(x, y)`` from the line ``y = m*x + c``.

    The result is always non-negative and has the same shape as the
    broadcast of ``x`` and ``y``.
    """
    signed_offset = (-m * x) + y - c      # vertical offset from the line
    normaliser = np.sqrt(m ** 2 + 1)      # turns vertical offset into perpendicular
    return abs(signed_offset / normaliser)

# PDG codes labelled as showers (photon, electron, positron); all else is a track.
_SHOWER_PDG = (22, -11, 11)

# Feature row used when no usable line fit exists for a particle, in the order:
# proximal-hit fraction, end spread, hits per length, end-energy fraction,
# mean connection distance.
_FALLBACK_FEATURES = (0.1, 1.5, 2, 0.25, 1)


def _event_x_extent(hit_arrays):
    """Return ``max - min`` over all values in the non-empty arrays given.

    Starts from +/-infinity rather than fixed sentinels so that coordinates of
    any magnitude are handled. Returns ``-inf`` when every array is empty.
    """
    lowest, highest = np.inf, -np.inf
    for hits in hit_arrays:
        if len(hits) > 0:
            lowest = min(lowest, np.min(hits))
            highest = max(highest, np.max(hits))
    return highest - lowest


def _ransac_line(x, z, tolerance, rng=None, sample_size=10, max_iterations=100):
    """RANSAC-style search for the line ``z = m*x + c`` with the most inliers.

    Each iteration fits a line to ``sample_size`` randomly drawn hits (via the
    notebook's ``fit_line``) and counts the hits whose perpendicular distance
    (via ``calculate_residuals``) is below ``tolerance``. Returns ``(m, c)`` of
    the best candidate, or ``None`` if no candidate had a single inlier.

    ``rng`` may be any object with a NumPy-style ``choice`` method; ``None``
    draws from the global ``np.random`` state.
    """
    chooser = np.random if rng is None else rng
    n_draw = min(sample_size, len(x))
    best_count = 0
    best_model = None
    for _ in range(max_iterations):
        picks = chooser.choice(len(x), size=n_draw, replace=False)
        m, c = fit_line(x[picks], z[picks])
        n_inliers = np.count_nonzero(calculate_residuals(x, z, m, c) < tolerance)
        if n_inliers > best_count:
            best_count = n_inliers
            best_model = (m, c)
    return best_model


def _topology_features(x, z, adc, vx, vz, m, c):
    """Compute the five-feature row for one particle from its fitted line.

    ``x``/``z`` are the hit coordinates, ``adc`` the per-hit ADC values,
    ``(vx, vz)`` the particle vertex and ``(m, c)`` the fitted line.
    Returns ``[proximal fraction, end spread, hits per length,
    end-energy fraction, mean connection distance]``.
    """
    # Rotate into longitudinal (l, along the line) and transverse (t, across
    # it) coordinates; hits lying exactly on the line have t == 0.
    angle = -math.atan(m)
    z_shifted = z - c
    l = x * np.cos(angle) - z_shifted * np.sin(angle)
    t = x * np.sin(angle) + z_shifted * np.cos(angle)

    # Feature 1: fraction of hits within 5% of the transverse range of the line.
    margin = np.abs(t.max() - t.min()) * 0.05
    proximal_fraction = np.count_nonzero(np.abs(t) <= margin) / len(t)

    # Feature 2: transverse standard deviation of whichever end is wider, the
    # first 30% or the last 30% of the hits (taken in stored hit order).
    tail_var = np.var(t[int(0.7 * len(t)):])
    head_var = np.var(t[:int(0.3 * len(t))])
    end_spread = np.sqrt(head_var if head_var > tail_var else tail_var)

    # Feature 3: off-axis hits per unit longitudinal length. Falls back to 2
    # when all hits sit on one side of the line or the length is zero.
    above = np.sum(t > 0)
    below = np.sum(t < 0)
    length = l.max() - l.min()
    if above == 0 or below == 0 or length == 0:
        hits_per_length = 2
    else:
        hits_per_length = (above + below) / length

    # Feature 4: fraction of the summed ADC in the final 10% of hits, ordered
    # along l. The axis is flipped when the vertex lies beyond the largest l,
    # so that the vertex sits at the low-l end.
    vertex_l = vx * np.cos(angle) - (vz - c) * np.sin(angle)
    if l.max() < vertex_l:
        l = -l
    n_paired = min(len(l), len(adc))  # tolerate a length mismatch, as zip() did
    order = np.argsort(l[:n_paired], kind="stable")
    adc_sorted = np.asarray(adc)[:n_paired][order]
    total_adc = np.sum(adc_sorted)
    if total_adc == 0:
        # Avoid 0/0 -> NaN, which a downstream classifier cannot ingest.
        energy_fraction = _FALLBACK_FEATURES[3]
    else:
        energy_fraction = np.sum(adc_sorted[int(0.9 * len(adc_sorted)):]) / total_adc

    # Feature 5: mean distance between consecutive hits in stored order.
    mean_step = np.mean(np.sqrt(np.diff(x) ** 2 + np.diff(z) ** 2))

    return [proximal_fraction, end_spread, hits_per_length, energy_fraction, mean_step]


def BDTdata(event_numbers=None, rng=None):
    """Build the feature rows and track/shower labels for a set of events.

    Reads per-particle arrays from the module-level ``events`` object. A
    particle is kept only if it has more than 10 hits, a non-zero PDG code,
    and purity and completeness both above 0.75. Each kept particle adds one
    label (0 for PDG 22/11/-11, otherwise 1) and one five-value feature row;
    if no line can be fitted, the fixed fallback row is used instead.

    Parameters
    ----------
    event_numbers : iterable, optional
        Event numbers to process. Defaults to the module-level ``training``.
    rng : object with a NumPy-style ``choice`` method, optional
        Random source for the line-fit sampling, e.g.
        ``np.random.default_rng(seed)`` for reproducible output. ``None``
        uses the global ``np.random`` state.

    Returns
    -------
    (data, label) : tuple of lists
        ``data[k]`` is the feature row matching ``label[k]``.
    """
    if event_numbers is None:
        event_numbers = training

    data = []
    label = []

    for n in event_numbers:
        idx = np.where(events.event_number == n)[0]

        # Inlier tolerance scales with the x-extent of the whole event:
        # 15% of one tenth of (max x - min x) over all its particles.
        extent = _event_x_extent(events.reco_hits_x_w[i] for i in idx)
        tolerance = 0.15 * (0.1 * extent)

        for i in idx:
            x = events.reco_hits_x_w[i]
            pdg = events.mc_pdg[i]
            purity = events.purity[i]
            complete = events.completeness[i]

            # Cuts made for clean data.
            if not (len(x) > 10 and pdg != 0 and purity > 0.75 and complete > 0.75):
                continue

            label.append(0 if pdg in _SHOWER_PDG else 1)  # 0 = shower, 1 = track

            x = np.asarray(x)
            z = np.asarray(events.reco_hits_w[i])
            model = _ransac_line(x, z, tolerance, rng)
            if model is None:
                data.append(list(_FALLBACK_FEATURES))
                continue

            m, c = model
            data.append(
                _topology_features(
                    x,
                    z,
                    events.reco_adcs_w[i],
                    events.reco_particle_vtx_x[i],
                    events.reco_particle_vtx_w[i],
                    m,
                    c,
                )
            )

    return data, label