<a href="https://colab.research.google.com/github/nikita-0209/ml_quark_gluon/blob/main/get_roc_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the np.load / np.save calls in later
# cells use paths under /content/drive/MyDrive. google.colab is imported here,
# so this cell is tied to a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics, model_selection, linear_model, discriminant_analysis

In [None]:
def get_perf_stats(x, y, clf):
    """Score a fitted classifier on one evaluation set.

    Parameters
    ----------
    x : array-like
        Features handed to ``clf.score`` and ``clf.decision_function``.
    y : array-like
        True labels for the rows of ``x``.
    clf : fitted estimator
        Must provide ``score`` and ``decision_function``.

    Returns
    -------
    tuple
        ``(acc, auc, imtafe)``: ``acc`` is ``clf.score(x, y)``, ``auc`` is the
        ROC AUC of the decision values, and ``imtafe`` is ``1 / fpr`` taken at
        the ROC point whose true-positive rate is the smallest one that is
        still >= 0.5. ``imtafe`` is 1 when no ROC point reaches a TPR of 0.5.
    """
    acc = clf.score(x, y)
    # Replace NaN/inf decision values with finite numbers before building the ROC.
    measures = np.nan_to_num( clf.decision_function(x) )
    auc = metrics.roc_auc_score( y, measures )
    fpr, tpr, _ = metrics.roc_curve( y, measures )
    fpr = np.asarray( fpr )
    tpr = np.asarray( tpr )

    reaches_half = tpr >= 0.5
    if not reaches_half.any():
        # Explicit fallback instead of a blanket `except:` that would also hide
        # unrelated failures (missing helper, interrupt, bad input, ...).
        imtafe = 1
    else:
        # Among the points with TPR >= 0.5 the one nearest to 0.5 is the one
        # with the smallest TPR; argmin returns its first occurrence.
        nearest = np.argmin( tpr[reaches_half] )
        # A zero FPR gives inf here, which nan_to_num maps to a large finite value.
        imtafe = np.nan_to_num( 1 / fpr[reaches_half][nearest] )
    return acc, auc, imtafe
    
def find_nearest( array, value ):
    """Return the element of ``array`` with the smallest absolute distance to ``value``.

    ``array`` is converted with ``np.asarray``; when several elements are
    equally close, the first one (as chosen by ``argmin``) is returned.
    """
    candidates = np.asarray( array )
    distances = np.abs( candidates - value )
    return candidates[ distances.argmin() ]

def k_fold_evaluation(x, y, loss, n_splits=10, alpha=0.0001, eta0=1, random_state=None):
    """Cross-validate a linear classifier and return its fold-averaged metrics.

    Every column of ``x`` is standardised (zero mean, unit standard deviation)
    with statistics of the full array, i.e. before the folds are split. A fresh
    classifier is fitted on each training fold and scored on the held-out fold
    with ``get_perf_stats``.

    Parameters
    ----------
    x : numpy.ndarray
        Feature array; its rows are selected with the fold indices.
    y : numpy.ndarray
        Targets, indexed the same way as ``x``.
    loss : str
        ``'log'``, ``'log_loss'``, ``'hinge'`` or ``'squared_hinge'`` build an
        ``SGDClassifier`` with that loss (the string is forwarded unchanged;
        recent scikit-learn releases spell the logistic loss ``'log_loss'``).
        ``'lda'`` builds ``LinearDiscriminantAnalysis(solver='lsqr')``.
    n_splits : int
        Number of shuffled folds.
    alpha : float
        ``alpha`` of the SGD classifier (L2 penalty); unused for ``'lda'``.
    eta0 : float
        ``eta0`` of the SGD classifier's adaptive learning rate; unused for ``'lda'``.
    random_state : int or None
        Seed forwarded to ``KFold`` and ``SGDClassifier``. The default ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    numpy.ndarray
        Mean over the folds of the ``(acc, auc, imtafe)`` tuples.

    Raises
    ------
    RuntimeError
        If ``loss`` is not one of the supported names. Raised before any data
        is processed.
    """
    sgd_losses = ('log', 'log_loss', 'hinge', 'squared_hinge')
    if loss not in sgd_losses and loss != 'lda':
        raise RuntimeError(
            f"unsupported loss {loss!r}; expected one of {sgd_losses + ('lda',)}"
        )

    x_norm = x - x.mean(0, keepdims=True)
    scale = x_norm.std(0, keepdims=True)
    # A constant column has zero spread; dividing by it would turn the column
    # into NaN. Scaling by 1 leaves it at zero instead.
    scale[scale == 0] = 1
    x_norm /= scale

    kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    result = []
    for train, test in kf.split(x_norm):
        if loss == 'lda':
            clf = discriminant_analysis.LinearDiscriminantAnalysis(solver='lsqr')
        else:
            clf = linear_model.SGDClassifier(
                loss=loss, penalty='l2', alpha=alpha, learning_rate='adaptive',
                eta0=eta0, max_iter=200, random_state=random_state,
            )
        clf.fit(x_norm[train], y[train])
        result.append(get_perf_stats(x_norm[test], y[test], clf))
    return np.array(result).mean(0)

def k_fold_evaluation_detailed(x, y, loss, n_splits=10, alpha=0.0001, eta0=1, random_state=None):
    """Cross-validate a linear classifier, keeping per-fold scores and labels.

    Every column of ``x`` is standardised (zero mean, unit standard deviation)
    with statistics of the full array, i.e. before the folds are split. A fresh
    classifier is fitted on each training fold; the held-out fold is scored
    with ``get_perf_stats`` and its raw decision values and targets are kept.

    Parameters
    ----------
    x : numpy.ndarray
        Feature array; its rows are selected with the fold indices.
    y : numpy.ndarray
        Targets, indexed the same way as ``x``.
    loss : str
        ``'log'``, ``'log_loss'``, ``'hinge'`` or ``'squared_hinge'`` build an
        ``SGDClassifier`` with that loss (the string is forwarded unchanged;
        recent scikit-learn releases spell the logistic loss ``'log_loss'``).
        ``'lda'`` builds ``LinearDiscriminantAnalysis(solver='lsqr')``.
    n_splits : int
        Number of shuffled folds.
    alpha : float
        ``alpha`` of the SGD classifier (L2 penalty); unused for ``'lda'``.
    eta0 : float
        ``eta0`` of the SGD classifier's adaptive learning rate; unused for ``'lda'``.
    random_state : int or None
        Seed forwarded to ``KFold`` and ``SGDClassifier``. The default ``None``
        keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(result, test_scores, test_labels)``: ``result`` is the mean over the
        folds of the ``(acc, auc, imtafe)`` tuples; ``test_scores`` and
        ``test_labels`` are lists with one array per fold holding
        ``clf.decision_function`` on the held-out rows and the matching ``y``.

    Raises
    ------
    RuntimeError
        If ``loss`` is not one of the supported names. Raised before any data
        is processed.
    """
    sgd_losses = ('log', 'log_loss', 'hinge', 'squared_hinge')
    if loss not in sgd_losses and loss != 'lda':
        raise RuntimeError(
            f"unsupported loss {loss!r}; expected one of {sgd_losses + ('lda',)}"
        )

    x_norm = x - x.mean(0, keepdims=True)
    scale = x_norm.std(0, keepdims=True)
    # A constant column has zero spread; dividing by it would turn the column
    # into NaN. Scaling by 1 leaves it at zero instead.
    scale[scale == 0] = 1
    x_norm /= scale

    kf = model_selection.KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    result = []
    test_scores = []
    test_labels = []
    for train, test in kf.split(x_norm):
        if loss == 'lda':
            clf = discriminant_analysis.LinearDiscriminantAnalysis(solver='lsqr')
        else:
            clf = linear_model.SGDClassifier(
                loss=loss, penalty='l2', alpha=alpha, learning_rate='adaptive',
                eta0=eta0, max_iter=200, random_state=random_state,
            )
        clf.fit(x_norm[train], y[train])
        result.append(get_perf_stats(x_norm[test], y[test], clf))
        # Raw (un-sanitised) decision values, kept so ROC curves can be rebuilt later.
        test_scores.append( clf.decision_function( x_norm[test] ) )
        test_labels.append( y[test] )
    result = np.array(result).mean(0)
    return result, test_scores, test_labels

In [None]:
# Array passed as the target `y` to k_fold_evaluation_detailed in the cells below.
labels = np.load("/content/drive/MyDrive/lhc/nopid_rTrue_pFalse_tTrue_tw1.0_cf_False_maskTrue_cmaskFalse_nc_100_MD_1000_DF_1000_NH_4_NL_4_NHL_2_temp_0.10_opt_adam_bsize_128_lr_0.00005_SB_1.0_N_1jetclr_labs_1.npy")

In [None]:
# Load the stored array and keep index 0 along its second axis; the result is
# passed as the feature array `x` to k_fold_evaluation_detailed below.
# NOTE(review): what index 0 of axis 1 stands for is not visible here — confirm.
reps = np.load("/content/drive/MyDrive/lhc/nopid_rTrue_pFalse_tTrue_tw1.0_cf_False_maskTrue_cmaskFalse_nc_100_MD_1000_DF_1000_NH_4_NL_4_NHL_2_temp_0.10_opt_adam_bsize_128_lr_0.00005_SB_1.0_N_1jetclr_reps_1.npy")[:, 0]

# Squared Hinge

In [None]:
%%time
# Cross-validated evaluation with the squared-hinge SGD classifier.
# resx1 holds the fold-averaged (acc, auc, imtafe); scoresx1 / labelsx1 hold the
# per-fold decision values and targets that the save cell below writes out.
alpha = 0.0001
resx1, scoresx1, labelsx1 = k_fold_evaluation_detailed(reps, labels, 'squared_hinge', alpha=alpha, eta0=0.0001)
print(f'acc = {resx1[0]:.5f}, auc = {resx1[1]:.5f}, imtafe = {resx1[2]:.5f}')

CPU times: user 1min 37s, sys: 1.87 s, total: 1min 39s
Wall time: 1min 36s


In [None]:
# Print the metrics currently held in resx1 (same statement as the last line of
# the squared-hinge evaluation cell).
print(f'acc = {resx1[0]:.5f}, auc = {resx1[1]:.5f}, imtafe = {resx1[2]:.5f}')

acc = 0.77179, auc = 0.84396, imtafe = 17.71581


In [None]:
# Join the per-fold arrays with concatenate rather than stacking them into one
# rectangular array: folds can differ in length when the sample count is not a
# multiple of n_splits, and stacking then fails or yields an object array.
# ravel keeps the saved arrays one-dimensional.
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_sqhinge_scores.npy", np.ravel( np.concatenate( scoresx1 ) ) )
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_sqhinge_labels.npy", np.ravel( np.concatenate( labelsx1 ) ) )

# Log

In [None]:
%%time
# Cross-validated evaluation with the logistic-loss SGD classifier.
# NOTE(review): this rebinds resx1 / scoresx1 / labelsx1, the same names the
# squared-hinge cell assigns, so those earlier results are no longer reachable
# after this cell runs.
# NOTE(review): recent scikit-learn releases name this loss 'log_loss' — confirm
# against the installed version.
alpha = 0.0001
resx1, scoresx1, labelsx1 = k_fold_evaluation_detailed(reps, labels, 'log', alpha=alpha, eta0=0.01)
print(f'acc = {resx1[0]:.5f}, auc = {resx1[1]:.5f}, imtafe = {resx1[2]:.5f}')

acc = 0.77256, auc = 0.84447, imtafe = 18.05863
CPU times: user 3min 5s, sys: 1.82 s, total: 3min 7s
Wall time: 3min 5s


In [None]:
# Join the per-fold arrays with concatenate rather than stacking them into one
# rectangular array: folds can differ in length when the sample count is not a
# multiple of n_splits, and stacking then fails or yields an object array.
# ravel keeps the saved arrays one-dimensional.
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_log_scores.npy", np.ravel( np.concatenate( scoresx1 ) ) )
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_log_labels.npy", np.ravel( np.concatenate( labelsx1 ) ) )

# LDA

In [None]:
%%time
loss = 'lda'
resx3, scoresx3, labelsx3 = k_fold_evaluation_detailed(reps, labels, 'lda')
print(f'acc = {resx3[0]:.5f}, auc = {resx3[1]:.5f}, imtafe = {resx3[2]:.5f}')

acc = 0.77060, auc = 0.84244, imtafe = 17.50099
CPU times: user 2min 38s, sys: 10.9 s, total: 2min 49s
Wall time: 2min 9s


In [None]:
# Join the per-fold arrays with concatenate rather than stacking them into one
# rectangular array: folds can differ in length when the sample count is not a
# multiple of n_splits, and stacking then fails or yields an object array.
# ravel keeps the saved arrays one-dimensional.
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_lda_scores.npy", np.ravel( np.concatenate( scoresx3 ) ) )
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_lda_labels.npy", np.ravel( np.concatenate( labelsx3 ) ) )

# Hinge

In [None]:
%%time
# Cross-validated evaluation with the hinge-loss SGD classifier.
# Note that alpha here (0.000001) is smaller than the 0.0001 used in the
# squared-hinge and log cells.
alpha = 0.000001
resx4, scoresx4, labelsx4 = k_fold_evaluation_detailed(reps, labels, 'hinge', alpha=alpha, eta0=0.0001)
print(f'acc = {resx4[0]:.5f}, auc = {resx4[1]:.5f}, imtafe = {resx4[2]:.5f}')

acc = 0.76422, auc = 0.83812, imtafe = 16.36567
CPU times: user 1min 24s, sys: 1.78 s, total: 1min 26s
Wall time: 1min 23s


In [None]:
# Join the per-fold arrays with concatenate rather than stacking them into one
# rectangular array: folds can differ in length when the sample count is not a
# multiple of n_splits, and stacking then fails or yields an object array.
# ravel keeps the saved arrays one-dimensional.
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_hinge_scores.npy", np.ravel( np.concatenate( scoresx4 ) ) )
np.save( "/content/drive/MyDrive/lhc/roc_curve_data/nopid/jetclr_hinge_labels.npy", np.ravel( np.concatenate( labelsx4 ) ) )