In [3]:
import pandas as pd 
import uproot as ur
import numpy as np

In [4]:
def root_to_pd(file_path, tree_name, branch_names):
    """Read selected branches of a ROOT tree into a pandas object.

    Parameters
    ----------
    file_path : path of the ROOT file, passed straight to ``ur.open``.
    tree_name : key of the tree inside the file.
    branch_names : branch names handed to ``tree.arrays``.

    Returns
    -------
    The result of ``tree.arrays(branch_names, library="pd")``.

    Raises
    ------
    FileNotFoundError
        If the file cannot be found. The message is still printed, but the
        error is re-raised instead of falling through to an unrelated
        ``UnboundLocalError`` on the never-assigned file handle.
    """
    # opening root file
    try:
        root_file = ur.open(file_path)
    except FileNotFoundError:
        print("File not found!")
        raise
    print("Reading ROOT file")

    # The context manager closes the file once the branches have been read,
    # including when the tree lookup or the read itself raises.
    with root_file:
        root_tree = root_file[tree_name]
        branch_values = root_tree.arrays(branch_names, library="pd")

    return branch_values
    

def read_fwd_pd(file_path):
    """Load the "fwd" tree and return it with short column names and a binary label.

    The three ``fstHits.mXYZ`` coordinate branches and ``reco.mQATruth`` are
    read via ``root_to_pd``. Each ``reco.mQATruth`` entry is replaced by 1 when
    it contains exactly one element and by 0 otherwise, then the columns are
    renamed to ``fX``, ``fY``, ``fZ`` and ``mQATruth``.
    """
    # Branch name -> short column name; the key order is the read order.
    column_aliases = {
        "fstHits.mXYZ.fX": "fX",
        "fstHits.mXYZ.fY": "fY",
        "fstHits.mXYZ.fZ": "fZ",
        "reco.mQATruth": "mQATruth",
    }
    frame = root_to_pd(file_path, "fwd", list(column_aliases))

    # Collapse each truth entry to a flag: exactly one element -> 1, else 0.
    frame["reco.mQATruth"] = [
        int(len(entry) == 1) for entry in frame["reco.mQATruth"]
    ]

    frame.rename(columns=column_aliases, inplace=True)
    return frame
    

# Load the "fwd" tree from fwdtree.root (path is relative to the working
# directory) into a DataFrame with columns fX, fY, fZ and mQATruth.
pd_frame = read_fwd_pd("fwdtree.root")

# Write a CSV snapshot of the frame next to the notebook.
# NOTE(review): the coordinate columns hold per-row arrays, so they are
# presumably serialized as text rather than numeric columns — confirm before
# relying on data.csv as a round-trippable format.
pd_frame.to_csv("data.csv")
# Last expression of the cell: show the first rows as the cell output.
pd_frame.head()

Reading ROOT file


Unnamed: 0,fX,fY,fZ,mQATruth
0,"[2.9763989448547363, 4.305664539337158, 4.3056...","[5.708104133605957, 8.257354736328125, 8.25735...","[151.7640380859375, 178.7950439453125, 168.745...",1
1,"[20.780773162841797, 23.651390075683594, 0.197...","[1.1487510204315186, 1.3074373006820679, 6.434...","[153.1590576171875, 180.1900634765625, 151.764...",0
2,"[-12.367856979370117, -14.72852611541748, -7.7...","[8.59738540649414, 10.23837947845459, -9.40523...","[151.7640380859375, 180.1900634765625, 165.260...",0
3,"[8.442672729492188, 11.049135208129883, 11.049...","[3.929877519607544, 5.143128395080566, 5.14312...","[151.7640380859375, 178.7950439453125, 168.745...",1
4,"[0.3816095292568207, 0.3816095292568207, -8.95...","[6.4261794090271, 6.4261794090271, 2.538786411...","[151.7640380859375, 178.7950439453125, 165.262...",0


In [5]:
# import scikit-learn library
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

# Seed NumPy's legacy global RNG so the split and model fits below are
# reproducible. np.random.seed has to be *called*: the previous assignment
# form (`np.random.seed = (42)`) rebound the function to the int 42 and
# seeded nothing.
np.random.seed(42)

# One row per DataFrame row: [fX array, fY array, fZ array, mQATruth flag].
pd_np = pd_frame.to_numpy()

# Concatenate each row's coordinate arrays and its trailing label into a
# single flat vector; rows can differ in length, hence dtype=object.
pd_flattened = np.array([np.hstack(x) for x in pd_np], dtype=object)

# Features are everything except the last element (the label).
X = [arr[:-1] for arr in pd_flattened]
# Longest feature vector; used as the common width when padding.
max_length = max(len(x) for x in X)
def pad_sequences(sequences, maxlen, value=0):
    """Right-pad each sequence with ``value`` up to ``maxlen`` and stack them.

    Returns a NumPy array built from the padded rows. A sequence longer than
    ``maxlen`` yields a negative pad width, which ``np.pad`` rejects.
    """
    padded_rows = []
    for row in sequences:
        n_missing = maxlen - len(row)
        padded_rows.append(
            np.pad(row, (0, n_missing), mode='constant', constant_values=value)
        )
    return np.array(padded_rows)

# Right-pad every feature vector with zeros to the common width so the
# samples form a rectangular matrix.
X_padded = pad_sequences(X, max_length)
# The label is the last element of each flattened row (the mQATruth flag).
y = [arr[-1] for arr in pd_flattened]

# Hold out 20% of the samples for testing. random_state pins the shuffle so
# the split does not depend on the state of the global NumPy RNG.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded, y, test_size=0.2, random_state=42
)

# random_state likewise fixes the solver's internal shuffling, so the
# reported score is the same on every run.
clf = LinearSVC(max_iter=1000, random_state=42)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out set; shown as the cell output.
clf.score(X_test, y_test)





In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest on the same train/test split as the linear SVM cell.
# random_state fixes the bootstrap sampling and feature selection so the
# score is reproducible across runs. Note that this rebinds `clf`, replacing
# the previously fitted LinearSVC.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
# Mean accuracy on the held-out set; shown as the cell output.
clf.score(X_test, y_test)


0.9629629629629629

In [None]:
from imblearn.over_sampling import SMOTE 
from collections import Counter
# Build a SMOTE over-sampler; random_state=42 fixes its random choices.
smote = SMOTE(sampling_strategy='auto', random_state=42)
# Resample the padded feature matrix and labels. This depends on X_padded
# and y from the LinearSVC cell; the recorded NameError output shows the cell
# was run in a kernel where that cell had not been executed first.
# NOTE(review): this resamples the full data set rather than X_train/y_train;
# if the result is split into train/test afterwards, synthetic samples can
# leak into the test set — confirm the intended usage.
X_resampled, y_resampled = smote.fit_resample(X_padded, y)




NameError: name 'X_padded' is not defined