In [None]:
import pickle
import sys
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import qmlhep
from os.path import join
from qmlhep.config import  others_path
from qmlhep.data_handling.dataset import ParticlePhysics
import seaborn as sns

# Silence all warnings for the rest of the session.
# The second filter is already covered by the first (FutureWarning is a
# subclass of Warning); it is kept so the installed filter list is unchanged.
import warnings

warnings.simplefilter("ignore")
warnings.simplefilter("ignore", category=FutureWarning)

# Dataset size reduction

This notebook trains logistic regression models on three versions of the data: the dataset reduced with *KMeans*, a dataset reduced by random undersampling, and the original dataset.

This notebook is also used to plot the figures that appear in the article.

Author: Maria Gabriela Jordão Oliveira

In [None]:
# Open the book - to get the features.
# `book` is indexed by the number of features; each entry is used further down
# as a column selector on the data frames (presumably a list of feature
# names - verify against the script that wrote SBS.pkl).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load SBS.pkl from a trusted source.
with open(join(others_path, 'SBS.pkl'), 'rb') as f:
    book = pickle.load(f)

# Load the full train and validation sets (standardization='ML', seed 42).
# The 'name' column is removed with a non-mutating drop so that re-running the
# cell never depends on the state of a previously modified frame.
train_data = (
    ParticlePhysics(category='train', standardization='ML', random_seed=42)
    .all_data_Dataframe()
    .drop(columns=['name'])
)

validation_data = (
    ParticlePhysics(category='validation', standardization='ML', random_seed=42)
    .all_data_Dataframe()
    .drop(columns=['name'])
)


# Plot details
# Font sizes (in points) shared by the figures of this notebook.
SMALL_SIZE = 12
MEDIUM_SIZE = 20
BIGGER_SIZE = 22
LEGEND_SIZE = 14
TICK_SIZE = 16

# Apply the seaborn theme exactly once. `sns.set` resets every theme option it
# is not given, so a second call that only passes `font_scale` would silently
# switch the style back to seaborn's default ("darkgrid") and discard the
# "whitegrid" choice. Font sizes are controlled explicitly through the
# matplotlib rc settings below instead of through `font_scale`.
sns.set(style="whitegrid")

plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("legend", fontsize=LEGEND_SIZE)  # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

### Original dataset - comparison

In [None]:
# ROC curves of a logistic regression trained on the full training set and
# evaluated on the full validation set, one curve per number of features.
fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--')  # diagonal = random classifier
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.01])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')

Y_train = train_data['label']
Y_val = validation_data['label']

# Class-balance the sample weights: within each class (label 0 and label 1)
# the weights are rescaled to sum to half the number of rows.
# This does not depend on the selected features, so it is done once, and on
# copies, so that train_data / validation_data are left untouched.
W_train = train_data['weights'].copy()
W_val = validation_data['weights'].copy()
for weights, labels in ((W_train, Y_train), (W_val, Y_val)):
    for cls in (0, 1):
        in_cls = labels == cls
        weights[in_cls] = (weights[in_cls] / weights[in_cls].sum()) * weights.shape[0] / 2

# to save auc values (reused by the comparison plots in the following cells)
aucs_original = []
for n_features in [1, 2, 3, 4, 5]:

    features = book[n_features]
    X_train = train_data[features]
    X_val = validation_data[features]

    # Logistic Regression
    clf = LogisticRegression(n_jobs=-1)
    clf.fit(X_train, Y_train, sample_weight=W_train)

    # Second column of predict_proba = score of the second class in clf.classes_
    Y_pred = clf.predict_proba(X_val)
    y_scores = Y_pred[:, 1]

    fpr, tpr, _ = roc_curve(Y_val, y_scores, sample_weight=W_val)
    roc_auc = auc(fpr, tpr)
    aucs_original.append(roc_auc)

    ax.plot(fpr, tpr, label=f'#Features = {n_features}, AUC = {round(roc_auc,4)}')


ax.grid(True)
ax.legend(loc="lower right")
fig.savefig('roc_curve_lr_full_dataset_oficial.pdf', format='pdf')

### Random undersampling - comparison

In [None]:
# AUC versus number of features for randomly undersampled train/validation
# sets of several sizes. For each size and feature count the AUC is averaged
# over 10 random seeds; the error bar is the standard deviation over seeds.
# Requires `aucs_original` from the full-dataset cell above.
plt.figure(figsize=(10,10))
for n_datapoints in [50,250,500,2500]:
    aucs = []
    error = []
    for n_features in [1,2,3,4,5]:
        features = book[n_features]
        aucs_random_sampling = []

        # r is the random seed for the r th iteration
        # NOTE(review): the sampled frames depend only on (n_datapoints, r); if
        # ParticlePhysics is deterministic for a given seed they could be
        # loaded once per seed instead of once per feature count - confirm.
        for r in range(10):
            datat = ParticlePhysics(
                category="train", standardization="ML", random_seed=r, n_datapoints=2*n_datapoints
            ).all_data_Dataframe()
            datat.drop(columns=["name"], inplace=True)

            datav = ParticlePhysics(
                category="validation", standardization="ML", random_seed=r, n_datapoints=2*n_datapoints
            ).all_data_Dataframe()
            datav.drop(columns=["name"], inplace=True)

            X_resampled, Y_resampled = datat[features], datat["label"]
            X_resampledV, Y_resampledV = datav[features], datav["label"]

            # Renormalize weights: within each class (label 0 and label 1) the
            # weights are rescaled to sum to half the number of rows.
            # Work on copies so the sampled frames are not modified through a view.
            W_resampled = datat["weights"].copy()
            W_resampledV = datav["weights"].copy()
            for weights, labels in ((W_resampled, Y_resampled), (W_resampledV, Y_resampledV)):
                for cls in (0, 1):
                    in_cls = labels == cls
                    weights[in_cls] = (weights[in_cls] / weights[in_cls].sum()) * weights.shape[0] / 2

            # Logistic Regression
            clf = LogisticRegression()
            clf.fit(X_resampled, Y_resampled, sample_weight=W_resampled)

            Y_pred = clf.predict_proba(X_resampledV)
            y_scores = Y_pred[:, 1]

            fpr, tpr, _ = roc_curve(Y_resampledV, y_scores, sample_weight=W_resampledV)
            roc_auc = auc(fpr, tpr)
            aucs_random_sampling.append(roc_auc)

        aucs.append(np.mean(aucs_random_sampling))
        error.append(np.std(aucs_random_sampling))
    plt.errorbar([1,2, 3, 4, 5], aucs, yerr=error, label=f'{2*n_datapoints} datapoints')

# Reference curve: AUCs obtained with the full dataset.
plt.plot([1,2, 3, 4, 5], aucs_original, label='Original dataset', linestyle='--')

plt.grid(True)
plt.xlabel('Number of features', fontsize = MEDIUM_SIZE)
plt.ylabel('AUC', fontsize = MEDIUM_SIZE)
plt.xticks(fontsize = TICK_SIZE)
plt.yticks(fontsize = TICK_SIZE)
plt.legend(fontsize = MEDIUM_SIZE)
plt.savefig("plot_roc_undersampling.pdf")
      

### KMeans reduction - train dataset
The validation dataset is reduced with random undersampling.

In [None]:
# AUC versus number of features for models trained on the KMeans-reduced
# training sets and evaluated on randomly undersampled validation sets.
# For each cluster count and feature count the AUC is averaged over 10
# validation seeds; the error bar is the standard deviation over seeds.
# Requires `aucs_original` from the full-dataset cell above.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load kmeans_dataset_train.pkl from a trusted source.
with open(join(others_path,'kmeans_dataset_train.pkl'),'rb') as f:
    samples=pickle.load(f)
samples=pd.DataFrame.from_dict(samples)
plt.figure(figsize=(10,10))
for centrus in [50,250,500,2500]:
    aucs_random_val = []
    error_random_val = []

    # choose the number of centrus (independent of the feature count, so the
    # row selection is done once per cluster count)
    samples_k = samples[samples['#Clusters'] == centrus]

    for n_features in [1,2,3,4,5]:
        features = book[n_features]

        # Choose the features
        X_resampled = pd.DataFrame(samples_k['X_train'].iloc[0])[features]
        Y_resampled = samples_k['Y_train'].iloc[0]
        W_resampled = samples_k['W_train1'].iloc[0]

        # Logistic Regression
        # The training data does not depend on the validation seed, so the
        # model is fitted once here and reused for every seed below.
        clf = LogisticRegression()
        clf.fit(X_resampled, Y_resampled, sample_weight=W_resampled)

        aucs = []
        # r is the random seed of r th iteration
        for r in range(10):
            datav = ParticlePhysics(
                category="validation", standardization="ML", random_seed=r, n_datapoints=2*centrus
            ).all_data_Dataframe()
            datav.drop(columns=["name"], inplace=True)

            X_resampledV, Y_resampledV = datav[features], datav["label"]

            # Renormalize weights: within each class (label 0 and label 1) the
            # weights are rescaled to sum to half the number of rows.
            # Work on a copy so the sampled frame is not modified through a view.
            W_resampledV = datav["weights"].copy()
            for cls in (0, 1):
                in_cls = Y_resampledV == cls
                W_resampledV[in_cls] = (W_resampledV[in_cls] / W_resampledV[in_cls].sum()) * W_resampledV.shape[0] / 2

            Y_pred = clf.predict_proba(X_resampledV)
            y_scores = Y_pred[:, 1]

            fpr, tpr, _ = roc_curve(Y_resampledV, y_scores, sample_weight=W_resampledV)
            roc_auc = auc(fpr, tpr)
            aucs.append(roc_auc)

        aucs_random_val.append(np.mean(aucs))
        error_random_val.append(np.std(aucs))

    plt.errorbar([1,2, 3, 4, 5], aucs_random_val, yerr=error_random_val, label=f'{2*centrus} datapoints')

# Reference curve: AUCs obtained with the full dataset.
plt.plot([1,2, 3, 4, 5], aucs_original, label='Original dataset', linestyle='--')

plt.grid(True)
# NOTE(review): this changes the global default font size and persists for any
# figure created after this cell.
plt.rcParams['font.size'] = MEDIUM_SIZE
plt.xlabel('Number of features', fontsize = MEDIUM_SIZE)
plt.ylabel('AUC', fontsize = MEDIUM_SIZE)
plt.xticks(fontsize = TICK_SIZE)
plt.yticks(fontsize = TICK_SIZE)
plt.legend(fontsize = MEDIUM_SIZE)
plt.savefig("plot_roc_1g_random_val.pdf",  format='pdf')
      