In [None]:
from IceCube.Essential import *
from IceCube.Model import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import pdb

In [None]:
def BoostedDecisionTree(X, y, max_depth=2, n_estimators=400, learning_rate=0.5):
    """Fit an AdaBoost ensemble of decision trees and evaluate it on its own training data.

    Parameters
    ----------
    X : array-like
        Feature matrix handed to ``clf.fit``.
    y : array-like
        Class labels handed to ``clf.fit``.
    max_depth : int, default 2
        ``max_depth`` of the ``DecisionTreeClassifier`` used as weak learner.
    n_estimators : int, default 400
        ``n_estimators`` of the ``AdaBoostClassifier``.
    learning_rate : float, default 0.5
        ``learning_rate`` of the ``AdaBoostClassifier``.

    Returns
    -------
    y_hat
        ``clf.predict(X)`` evaluated on the training inputs.
    score
        ``clf.decision_function(X)`` evaluated on the training inputs.
    clf : AdaBoostClassifier
        The fitted classifier.
    """
    dt = DecisionTreeClassifier(max_depth=max_depth)
    boost_kwargs = dict(
        n_estimators=n_estimators, learning_rate=learning_rate, random_state=SEED)
    try:
        # Newer scikit-learn releases renamed ``base_estimator`` to ``estimator``
        # and later dropped the old keyword entirely.
        clf = AdaBoostClassifier(estimator=dt, **boost_kwargs)
    except TypeError:
        # Older releases only know the original keyword.
        clf = AdaBoostClassifier(base_estimator=dt, **boost_kwargs)

    # Train classifier on training set
    clf.fit(X, y)

    # Evaluate on the SAME data the classifier was fitted on: the accuracy
    # logged below is a training accuracy, not a held-out test accuracy.
    score = clf.decision_function(X)
    y_hat = clf.predict(X)

    accuracy = accuracy_score(y, y_hat)
    LOGGER.info(f"Train accuracy: {accuracy * 100:.2f}%")

    return y_hat, score, clf


def draw_hist(x, density=True, nbins=30, gnn_error=None, fit_error=None):
    """Histogram ``x`` separately for events where each of two estimates has the smaller error.

    Parameters
    ----------
    x : array-like
        Per-event quantity to histogram; it is indexed with boolean masks
        built from the two error arrays.
    density : bool, default True
        Forwarded to ``plt.hist``.
    nbins : int, default 30
        Number of bins, forwarded to ``plt.hist`` as ``bins``.
    gnn_error, fit_error : array-like, optional
        Per-event errors of the two estimates being compared. When omitted,
        the module-level ``error`` and ``errorx`` are used, which is the
        original behaviour. Passing them explicitly avoids depending on
        whatever state those globals are in when the cell is run.

    Notes
    -----
    Events where both errors are exactly equal appear in neither histogram.
    """
    if gnn_error is None:
        gnn_error = error
    if fit_error is None:
        fit_error = errorx

    gnn_better = x[gnn_error < fit_error]
    fit_better = x[gnn_error > fit_error]

    plt.figure()
    plt.hist(gnn_better, bins=nbins, color='blue', label='GNN', histtype='step', density=density)
    plt.hist(fit_better, bins=nbins, color='orange', label='Fit', histtype='step', density=density)
    plt.legend()


In [None]:
# Batches used for this evaluation (swap in [1] for a quick single-batch run).
BATCHES_TEST = [*range(11, 21)]

# Ground truth: target angles passed through angles2vector; the nx/ny/nz
# columns of the result are used as the true direction.
true_df = angles2vector(get_target_angles(BATCHES_TEST))
print(true_df.head(5))
n = true_df[["nx", "ny", "nz"]].to_numpy()

# Reconstructed directions (this estimate is labelled 'GNN' in the plots).
reco_df = get_reco_angles(BATCHES_TEST)
print(reco_df.head(5))
n_hat = reco_df[["x", "y", "z"]].to_numpy()

# Subtract from n_hat its component along e, then renormalise row by row.
# The small constant guards against division by zero. The result is the
# estimate labelled 'Fit' in the plots.
e = reco_df[["ex", "ey", "ez"]].to_numpy()
xe = (n_hat * e).sum(axis=1)
print(xe.shape)
proj = n_hat - e * xe[:, None]
proj = proj / (np.linalg.norm(proj, axis=1, keepdims=True) + 1e-8)

# Errors of the reconstructed direction with respect to the truth.
error, az_error, ze_error = angle_errors(n, n_hat)
print(f"error, az_error, ze_error = {error.mean()}, {az_error.mean()}, {ze_error.mean()}")

# Errors of the projected direction with respect to the truth.
errorx, az_errorx, ze_errorx = angle_errors(n, proj)
print(f"error, az_error, ze_error = {errorx.mean()}, {az_errorx.mean()}, {ze_errorx.mean()}")

# True where the projected direction has the smaller error.
idx = errorx < error

In [None]:
# Overlay the error distributions of both estimates together with the
# subsets of events where each estimate has the smaller error.
plt.figure()
hist_style = dict(bins=30, histtype='step', density=False)
for values, colour, label in (
    (error, 'blue', 'GNN'),
    (error[error < errorx], 'black', 'GNN better'),
    (errorx, 'orange', 'Fit'),
    (errorx[error > errorx], 'red', 'Fit better'),
):
    plt.hist(values, color=colour, label=label, **hist_style)
plt.legend()

In [None]:
# fit_error is treated here as the goodness of fit.
# TODO: also try adding the statistical uncertainty of the fit
# (i.e. sigma_coefficients).
Nbins = 100

# One entry per candidate feature; each gets its own figure from draw_hist,
# in this order.
feature_views = [
    np.log10(np.sqrt(reco_df["fit_error"]) / reco_df["hits"] + 1e-6),
    np.sin(reco_df["zenith"]) ** 2,
    np.clip(reco_df["hits"], 0, 1000),
    np.log10(reco_df["sumc"] + 1e-6),
    np.log10(reco_df["dt"] + 1e-3),
    reco_df["unique_x"],
    reco_df["unique_y"],
    reco_df["unique_z"],
    np.arccos(xe),
]
for view in feature_views:
    draw_hist(view, nbins=Nbins)

In [None]:
# Build the BDT input matrix from reco_df. Column order:
#   0 fit_error, 1 sumc, 2 hits, 3 zenith, 4 ez, 5 dt, 6 unique_x, 7 unique_z
reco = reco_df[["fit_error", "sumc", "hits", "zenith", "ez", "dt", "unique_x", "unique_z"]].to_numpy()
# NOTE(review): the histogram cell plots log10(sqrt(fit_error) / hits + 1e-6),
# whereas this feature has no sqrt. Confirm which form the saved model was
# trained on.
reco[:, 0] = np.log10(reco[:, 0] / reco[:, 2] + 1e-6)
reco[:, 1] = np.log10(reco[:, 1] + 1e-6)
reco[:, 3] = np.sin(reco[:, 3]) ** 2
reco[:, 5] = np.log10(reco[:, 5] + 1e-3)

# Store the angle under a new name instead of rebinding `xe`. Overwriting
# `xe` would make this cell (and any other cell that calls np.arccos(xe))
# apply arccos twice when re-run.
xe_angle = np.arccos(xe)

# load the model and predict
LOGGER.info("Loading BDT model...")
# NOTE: unpickling can execute arbitrary code; only load model files that
# come from a trusted source.
with open(os.path.join(MODEL_PATH, 'BDT_clf.DeepHighLR.sklearn'), 'rb') as model_file:
    clf = pickle.load(model_file)
LOGGER.info("Predicting...")
X = np.concatenate([reco, np.abs(xe_angle[:, np.newaxis])], axis=1)
y_hat = clf.predict(X)
score = clf.decision_function(X)

# Evaluate accuracy
accuracy = accuracy_score(idx, y_hat)
LOGGER.info(f"Test accuracy: {accuracy * 100:.2f}%")

# Take errorx wherever the classifier predicts the positive class, error
# elsewhere. The prediction is coerced to a boolean mask so that integer
# class labels are not misread as row indices, and the result is written to
# a new array so the global `error` stays intact for later cells and for
# draw_hist.
use_fit = np.asarray(y_hat).astype(bool)
error_combined = np.where(use_fit, errorx, error)
LOGGER.info(f"error -> {error_combined.mean()}")

In [None]:
# Recompute both error sets so this cell starts from unmodified values,
# whatever earlier cells did to `error`.
error, az_error, ze_error = angle_errors(n, n_hat)
LOGGER.info(f"error, az_error, ze_error = {error.mean()}, {az_error.mean()}, {ze_error.mean()}")

errorx, az_errorx, ze_errorx = angle_errors(n, proj)
LOGGER.info(f"error, az_error, ze_error = {errorx.mean()}, {az_errorx.mean()}, {ze_errorx.mean()}")

# Decision-function value above which errorx is used instead of error.
SCORE_THRESHOLD = -0.00001
criteria = score > SCORE_THRESHOLD

# Build the combined error in a new array. Overwriting `error` in place
# would make error == errorx for every selected event, and draw_hist (which
# reads the global `error`/`errorx`) would then drop those events from both
# of its histograms.
error_selected = np.where(criteria, errorx, error)
LOGGER.info(f"error -> {error_selected.mean()}")

draw_hist(np.clip(score, -0.01, 0), density=True)