In [None]:
# Imports
from tqdm.notebook import tqdm
import os
from os.path import  join
from datetime import datetime
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from pennylane import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from glob import glob
import seaborn as sns
import pandas as pd
from xgboost import XGBClassifier
import torch

from qmlhep.data_handling.dataset import ParticlePhysics
from qmlhep.utils.helper import GridSearch, NestablePool, get_random_numbers
from qmlhep.qml import AdamModel, OptunaModel
from qmlhep.config import analisys_results_path, figures_path, use_gpu
from qmlhep.utils.helper import get_features

# Import fig style
from qmlhep.utils.plot_results import *

## Baseline Notebook

Author: Miguel Caçador Peixoto

## Measuring the baseline performance using an XGBoost classifier

This will use the training dataset with all datapoints and features.

In [None]:
# Build the train and test sets with the full range of features and datapoints.
# Non-feature columns are removed by name, so the feature list does not depend
# on the column order of the frame returned by all_data_Dataframe().
train = ParticlePhysics("train", standardization="ML").all_data_Dataframe()
train = train.drop(columns=['name'])
features = train.columns.drop(['label', 'weights'])

# The weights are copied because they are renormalized in place below and
# should not alias the column still stored in `train`.
X_train, y_train, w_train = train[features], train['label'], train['weights'].copy()

# Renormalize weights: each class (label 1, label 0) ends up summing to half
# the number of events, so both classes carry the same total weight.
w_train[y_train == 1] = (w_train[y_train == 1] / w_train[y_train == 1].sum()) * w_train.shape[0] / 2
w_train[y_train == 0] = (w_train[y_train == 0] / w_train[y_train == 0].sum()) * w_train.shape[0] / 2

# Same preparation for the test split, reusing the feature list from training
test = ParticlePhysics("test", standardization="ML").all_data_Dataframe()
test = test.drop(columns=['name'])
X_test, y_test, w_test = test[features], test['label'], test['weights'].copy()

# Renormalize weights (same per-class scheme as for the training split)
w_test[y_test == 1] = (w_test[y_test == 1] / w_test[y_test == 1].sum()) * w_test.shape[0] / 2
w_test[y_test == 0] = (w_test[y_test == 0] / w_test[y_test == 0].sum()) * w_test.shape[0] / 2

# Choose the XGBoost tree method from the `use_gpu` flag in the project config.
# NOTE(review): 'gpu_hist' is deprecated in XGBoost >= 2.0 in favour of
# tree_method='hist' with device='cuda' — confirm against the installed version.
tree_method = 'gpu_hist' if use_gpu else 'hist'

# Warn when a GPU is present but the config keeps training on the CPU
if not use_gpu and torch.cuda.is_available():
    print("GPU is available but set to False in config.py. It's very likely that this will take a long time!")

In [None]:
# Hyper-parameters of the baseline classifier, gathered in one place
xgb_params = dict(
    n_estimators=100,
    learning_rate=1e-5,
    objective='binary:logistic',
    eval_metric='auc',
    use_label_encoder=False,
    n_jobs=-1,
    tree_method=tree_method,
)
clf = XGBClassifier(**xgb_params)

# Fit on the training set, passing the per-event weights as sample weights
clf.fit(X_train, y_train, sample_weight=w_train)

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba)
y_pred = clf.predict_proba(X_test)[:, 1]

# Weighted AUC on the test set
auc = roc_auc_score(y_test, y_pred, sample_weight=w_test)

# Weighted ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred, sample_weight=w_test)
plt.figure(figsize=(10, 10))
# Both curves carry a label: calling plt.legend() with no labelled artists
# only emits a warning and draws nothing.
plt.plot(fpr, tpr, label='XGBoost (AUC = {:.3f})'.format(auc))
plt.plot([0, 1], [0, 1], 'k--', label='Random guess')
plt.xlabel('False positive rate', fontsize=MEDIUM_SIZE+5)
plt.ylabel('True positive rate', fontsize=MEDIUM_SIZE+5)
plt.title('ROC curve (AUC = {:.3f})'.format(auc), fontsize=BIGGER_SIZE)

# Set the tick label font size
plt.tick_params(axis='both', which='major', labelsize=TICK_SIZE)

plt.legend(loc='best')

plt.tight_layout()

# Save figure
plt.savefig(join(figures_path, 'baseline_xgb.pdf'))


## Feature Selection Methods

### PCA Performance (Top 5)

In [None]:
# Validation split requested with PCA=True
# (presumably PCA-transformed features — confirm in ParticlePhysics)
pca_validation = ParticlePhysics("validation", PCA=True)
df = pca_validation.all_data_Dataframe()

In [None]:
# Set the event weights and labels aside, then strip every non-feature column
# from df so that only candidate features remain.
weights, label = df["weights"], df["label"]
non_feature_columns = ["weights", "label", "name"]
df.drop(columns=non_feature_columns, inplace=True)

In [None]:
# Renormalize the weights so each class (label 1, label 0) sums to half the
# number of events.
n_events = weights.shape[0]
is_one = label == 1
is_zero = label == 0
weights[is_one] = (weights[is_one] / weights[is_one].sum()) * n_events / 2
weights[is_zero] = (weights[is_zero] / weights[is_zero].sum()) * n_events / 2

# Weighted AUC obtained when each column is used on its own as the score.
# NOTE(review): a column anti-correlated with the label gets AUC < 0.5 and
# ranks low here — confirm that is the intended ordering.
book = {}
for feature in df.columns:
    auc = roc_auc_score(label, df[feature], sample_weight=weights)
    book[feature] = auc

# Five highest-AUC columns, shown as a table
auc_table = pd.DataFrame.from_dict(book, orient="index", columns=["AUC"])
auc_table.sort_values(by="AUC", ascending=False).head(5)

#### SBS (k <= 5)

In [None]:
# Print the alphabetically sorted output of get_features(k) for k = 1..5
for k in range(1, 6):
    selected = get_features(k)
    print(sorted(selected))

In [None]:
# Training split restricted to the columns returned by get_features(5)
top5_features = get_features(5)
df = ParticlePhysics("train", features=top5_features).all_data_Dataframe()

In [None]:
# Set the event weights and labels aside, then strip every non-feature column
# from df so that only the selected features remain.
weights, label = df["weights"], df["label"]
non_feature_columns = ["weights", "label", "name"]
df.drop(columns=non_feature_columns, inplace=True)

Top 5

In [None]:
# Renormalize the weights so each class (label 1, label 0) sums to half the
# number of events.
n_events = weights.shape[0]
is_one = label == 1
is_zero = label == 0
weights[is_one] = (weights[is_one] / weights[is_one].sum()) * n_events / 2
weights[is_zero] = (weights[is_zero] / weights[is_zero].sum()) * n_events / 2

# Weighted AUC obtained when each selected feature is used alone as the score.
# NOTE(review): a feature anti-correlated with the label gets AUC < 0.5 and
# ranks low here — confirm that is the intended ordering.
book = {}
for feature in df.columns:
    auc = roc_auc_score(label, df[feature], sample_weight=weights)
    book[feature] = auc

# Five highest-AUC features, shown as a table
auc_table = pd.DataFrame.from_dict(book, orient="index", columns=["AUC"])
auc_table.sort_values(by="AUC", ascending=False).head(5)

## Top 5 features by AUC 

In [None]:
# Full training split, with every available feature
df = ParticlePhysics("train").all_data_Dataframe()

# Set the event weights and labels aside and keep only feature columns in df
weights, label = df["weights"], df["label"]
non_feature_columns = ["weights", "label", "name"]
df.drop(columns=non_feature_columns, inplace=True)

# Renormalize the weights so each class (label 1, label 0) sums to half the
# number of events.
n_events = weights.shape[0]
is_one = label == 1
is_zero = label == 0
weights[is_one] = (weights[is_one] / weights[is_one].sum()) * n_events / 2
weights[is_zero] = (weights[is_zero] / weights[is_zero].sum()) * n_events / 2

# Weighted AUC obtained when each feature is used on its own as the score.
# NOTE(review): a feature anti-correlated with the label gets AUC < 0.5 and
# ranks low here — confirm that is the intended ordering.
book = {}
for feature in df.columns:
    auc = roc_auc_score(label, df[feature], sample_weight=weights)
    book[feature] = auc

# Five highest-AUC features, shown as a table
auc_table = pd.DataFrame.from_dict(book, orient="index", columns=["AUC"])
auc_table.sort_values(by="AUC", ascending=False).head(5)