In [None]:
from IceCube.Essential import *
from IceCube.Model import *
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import pdb

In [None]:
def BoostedDecisionTree(X, y, max_depth=2, n_estimators=400, learning_rate=0.5):
    """Fit an AdaBoost ensemble of shallow decision trees and score all of ``X``.

    The samples are split 70/30 into train and held-out sets (seeded with the
    module-level ``SEED``). The classifier is fitted on the training part only,
    and train / held-out accuracies are printed.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    max_depth : int, default 2
        Maximum depth of each ``DecisionTreeClassifier`` weak learner.
    n_estimators : int, default 400
        Forwarded to ``AdaBoostClassifier``.
    learning_rate : float, default 0.5
        Forwarded to ``AdaBoostClassifier``.

    Returns
    -------
    y_pred
        ``clf.predict(X)`` for *every* row of ``X`` (training and held-out
        rows alike), so metrics computed from it are not a pure test score.
    score
        ``clf.decision_function(X)`` for every row of ``X``.
    clf
        The fitted ``AdaBoostClassifier``.
    """
    # Split data into training and held-out sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

    dt = DecisionTreeClassifier(max_depth=max_depth)
    boost_kwargs = dict(n_estimators=n_estimators, learning_rate=learning_rate, random_state=SEED)
    try:
        # scikit-learn >= 1.2 names the weak learner ``estimator``;
        # ``base_estimator`` was removed in 1.4.
        clf = AdaBoostClassifier(estimator=dt, **boost_kwargs)
    except TypeError:
        # Older scikit-learn only knows the legacy keyword.
        clf = AdaBoostClassifier(base_estimator=dt, **boost_kwargs)

    # Train classifier on the training set only
    clf.fit(X_train, y_train)

    # Score and classify the full input (train + held-out) for the caller
    score = clf.decision_function(X)
    y_pred = clf.predict(X)

    # Report accuracy separately on the training and the held-out rows
    y_pred_train = clf.predict(X_train)
    accuracy = accuracy_score(y_train, y_pred_train)
    print("Train Accuracy: {:.2f}%".format(accuracy * 100))

    y_pred_test = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred_test)
    print("Test  Accuracy: {:.2f}%".format(accuracy * 100))

    return y_pred, score, clf

In [None]:
# --- ground truth ---------------------------------------------------------
# presumably [2] selects batch 2, matching pred_2.parquet below -- verify
true_df = angles2vector(get_target_angles([2]))
print(true_df.head(5))
n = true_df[["nx", "ny", "nz"]].to_numpy()

# --- reconstructed directions ---------------------------------------------
reco_df = pd.read_parquet("/root/autodl-tmp/kaggle/working/prediction/pred_2.parquet")
# wrap azimuth into [0, 2*pi)
reco_df["azimuth"] = np.remainder(reco_df["azimuth"], 2 * np.pi)
print(reco_df.head(5))
n_hat = reco_df[["x", "y", "z"]].to_numpy()

# --- alternative direction: remove the component of n_hat along e ----------
e = reco_df[["ex", "ey", "ez"]].to_numpy()
# row-wise dot product n_hat . e
xe = (n_hat * e).sum(axis=1)
print(xe.shape)
# subtract the component along e (a true projection if e is unit-length --
# TODO confirm), then renormalise; the 1e-8 guards against a zero-length row
proj = n_hat - e * xe[:, None]
proj_norm = np.linalg.norm(proj, axis=1, keepdims=True)
proj /= proj_norm + 1e-8

# --- per-event errors of both candidates ------------------------------------
error, az_error, ze_error = angle_errors(n, n_hat)
print(f"error, az_error, ze_error = {error.mean()}, {az_error.mean()}, {ze_error.mean()}")

errorx, az_errorx, ze_errorx = angle_errors(n, proj)
print(f"error, az_error, ze_error = {errorx.mean()}, {az_errorx.mean()}, {ze_errorx.mean()}")

# classification target: True where the projected direction has the smaller error
idx = errorx < error

In [None]:
# plot errors: full distributions plus the subsets where each method wins
plt.figure()
hist_specs = (
    (error, 'blue', 'GNN'),
    (error[error < errorx], 'black', 'GNN better'),
    (errorx, 'orange', 'Fit'),
    (errorx[error > errorx], 'red', 'Fit better'),
)
for values, colour, tag in hist_specs:
    plt.hist(values, bins=30, color=colour, label=tag, histtype='step', density=False)
plt.legend()

In [None]:
def draw_hist(x, density=True):
    """Overlay histograms of ``x`` for the two "which method wins" subsets.

    The subsets are chosen with the notebook-level arrays ``error`` and
    ``errorx`` as they are at call time: events with ``error < errorx`` are
    drawn in blue ('GNN'), events with ``error > errorx`` in orange ('Fit').
    Ties appear in neither histogram.

    Parameters
    ----------
    x : indexable by a boolean mask of the same length as ``error``
        Per-event quantity to histogram.
    density : bool, default True
        Forwarded to ``plt.hist``.
    """
    subsets = (
        (x[error < errorx], 'blue', 'GNN'),
        (x[error > errorx], 'orange', 'Fit'),
    )

    plt.figure()
    for values, colour, tag in subsets:
        plt.hist(values, bins=30, color=colour, label=tag, histtype='step', density=density)
    plt.legend()

In [None]:
# Candidate BDT inputs, each clipped to [0, clip_max] before plotting
clipped_features = (
    ("fit_error", 1000),
    ("good_hits", 1000),
    ("azimuth", 2 * np.pi),
    ("zenith", 2 * np.pi),
)
for feature_name, clip_max in clipped_features:
    draw_hist(np.clip(reco_df[feature_name], 0, clip_max))

# Unclipped quantities
draw_hist(reco_df["ez"])
draw_hist(xe)

In [None]:
# control: True -> fit a new BDT and save it; False -> load the saved one
TRAIN = True

# inputs: four per-event columns from reco_df plus the dot product xe
X = np.concatenate([
    reco_df[["fit_error", "good_hits", "zenith", "ez"]].to_numpy(),
    xe[:, np.newaxis]], axis=1)

model_file = os.path.join(MODEL_PATH, 'BDT_clf.sklearn')

if TRAIN:
    # train the model
    y_pred, score, clf = BoostedDecisionTree(X, idx, max_depth=3, n_estimators=800, learning_rate=0.8)
    # save the model; the context manager guarantees the file is flushed and closed
    with open(model_file, 'wb') as model_fh:
        pickle.dump(clf, model_fh)
else:
    # load the model and predict
    # NOTE(review): pickle.load can execute arbitrary code -- only load files
    # written by this notebook / a trusted source.
    with open(model_file, 'rb') as model_fh:
        clf = pickle.load(model_fh)
    y_pred = clf.predict(X)
    score  = clf.decision_function(X)

# Evaluate accuracy (over all rows of X, i.e. including rows used for training)
accuracy = accuracy_score(idx, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Per-event error when the BDT picks the fit direction wherever it predicts True.
# Built as a new array: overwriting `error` in place would corrupt the GNN
# baseline that draw_hist() and later cells read, and would make re-running
# this cell change its own result.
error_bdt = np.where(np.asarray(y_pred, dtype=bool), errorx, error)
print(f"error -> {error_bdt.mean()}")

In [None]:
# Clip the BDT decision score to [-0.005, 0] so the histogram zooms in on that range
score_clipped = np.clip(score, -0.005, 0)
draw_hist(score_clipped, density=True)

In [None]:
# Recompute both baselines from scratch so this cell does not depend on any
# earlier in-place edits to `error`.
error, az_error, ze_error = angle_errors(n, n_hat)
print(f"error, az_error, ze_error = {error.mean()}, {az_error.mean()}, {ze_error.mean()}")

errorx, az_errorx, ze_errorx = angle_errors(n, proj)
print(f"error, az_error, ze_error = {errorx.mean()}, {az_errorx.mean()}, {ze_errorx.mean()}")

# Use the fit direction only where the BDT decision score exceeds a hand-picked cut
criteria = score > -0.0002
# Combine into a new array rather than overwriting `error`, so the GNN
# baseline stays intact for any later cell or re-run of draw_hist().
error_cut = np.where(criteria, errorx, error)
print(f"error -> {error_cut.mean()}")

In [None]:
# Number of events the BDT assigns to the fit direction, and the total event count
n_fit_selected = np.count_nonzero(y_pred)
n_events = len(y_pred)
n_fit_selected, n_events