# Rare decay search

In [None]:
# Later cells use plt, pyplot, numpy, hist, xlabel, legend and show without
# importing them; they rely on this magic to put those names in the namespace.
%pylab inline

In [None]:
import inspect

import pandas
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import utils

In [None]:
!pip install hep_ml

# Load dataset and split into training / test

`training.csv` is a mixture of simulated signal and real background.
It has the following columns.

`test.csv` has the following columns:



In [None]:
# Full training sample; `id` stays an ordinary column here and is excluded
# from the feature list further down.
train_ada = pandas.read_csv('reference/training.csv', sep=',')
# Sample that is scored for the submission; `id` is loaded as the index so it
# can be written out next to the predictions in the final cells.
test_ada = pandas.read_csv('reference/test.csv', sep=',', index_col='id')

In [None]:
print ("Training full sample columns:", ", ".join(train_ada.columns), "\nShape:", train_ada.shape)

In [None]:
print ("Test full sample columns:", ", ".join(test_ada.columns), "\nShape:", test_ada.shape)
test_ada.head()

# Train simple model using part of the training sample

In [None]:
# Hold out 30% of training.csv for validation; the seed fixes the split.
# Note: `test` is this labelled hold-out, not the submission sample `test_ada`.
train, test = train_test_split(train_ada, train_size=0.7, test_size=0.3, random_state=13)

Let's choose the features to train the model on

In [None]:
# Columns that must not be used as features: the row id, the label, and the
# columns reserved for the mass-correlation / agreement checks.
excluded_columns = {'id', 'signal', 'mass', 'production', 'min_ANNmuon'}

# Keep the frame's own column order instead of iterating a set difference.
# Set iteration order over strings changes between interpreter runs, and the
# trees pick feature subsets by column position (max_features=10), so an
# unstable order would make the fitted model differ on every kernel restart.
variables = [column for column in train_ada.columns if column not in excluded_columns]
print(variables)

In [None]:
%%time
clf = AdaBoostClassifier(n_estimators=150, 
                         learning_rate=0.009, 
                         random_state=13,
                         base_estimator=DecisionTreeClassifier(max_depth=20, 
                                                               min_samples_leaf=40,
                                                               max_features=10,
                                                               random_state=13))
clf.fit(train[variables], train['signal'])

# Check model quality on the held-out 30% of the training sample


In [None]:
def plot_metrics(y_true, y_pred):
    """
    Draw the ROC curve of a set of predictions on the current figure.

    The area under the curve is shown in the legend entry.

    Parameters
    ----------
    y_true : array-like
        True labels of the samples
    y_pred : array-like
        Predicted scores for the same samples
    """

    auc_value = roc_auc_score(y_true, y_pred)
    # The thresholds returned by roc_curve are not needed for the plot.
    false_positive_rate, true_positive_rate, _ = roc_curve(y_true, y_pred)

    curve_label = 'ROC AUC=%f' % auc_value
    plt.plot(false_positive_rate, true_positive_rate, label=curve_label)
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.legend()
    plt.title("ROC Curve")

In [None]:
# Column 1 of predict_proba is the score of the second class in clf.classes_
# (signal == 1, assuming the labels are 0/1).
y_pred = clf.predict_proba(test[variables])[:, 1]

plot_metrics(test['signal'], y_pred)
# Sanity check: one prediction per held-out row.
test.shape, y_pred.shape

ROC AUC is just a part of the solution, you also have to make sure that

- the classifier output is not correlated with the mass
- the classifier performs similarly on MC and real data of the normalization channel


### Mass correlation check

In [None]:
df_corr_check = pandas.read_csv("reference/check_correlation.csv")

In [None]:
df_corr_check.shape

In [None]:
# Use the continuous class-1 probability, as in every other check in this
# notebook. Hard 0/1 labels from `predict` would make the threshold scan and
# the mass-correlation metric below operate on a two-valued output instead of
# the actual classifier response.
y_pred = clf.predict_proba(df_corr_check[variables])[:, 1]

In [None]:
# NOTE: In the original file, a routine called `efficiencies` was defined here.
#       It was never used, however, and referred to undefined variables, including self.*

In [None]:
# Efficiency as a function of mass for a single cut value of 0.5.
# Disabled alternative scan: thresholds=[0.2, 0.4, 0.5, 0.6, 0.8]
eff = utils.get_efficiencies(y_pred, df_corr_check.mass, thresholds=[0.5])

In [None]:
eff.keys()

In [None]:
# One curve per entry of `eff`: element 0 of the value goes on the x axis,
# element 1 on the y axis, and the key is used as the legend label.
for label_name, eff_data in eff.items():
    curve_label = "global eff  %.1f" % label_name
    plt.plot(eff_data[0], eff_data[1], label=curve_label)

plt.xlabel('mass')
plt.ylabel('Efficiency')
plt.legend();

In [None]:
corr_metric = utils.check_correlation(y_pred, df_corr_check['mass'])
print (corr_metric)

## MC vs Real difference

In [None]:
df_agreement = pandas.read_csv('reference/check_agreement.csv')

In [None]:
df_agreement.columns

In [None]:
df_agreement[variables].head()

In [None]:
agreement_probs = clf.predict_proba(df_agreement[variables])[:, 1]

# Build the two row selections once and reuse them for scores and weights.
agreement_signal = df_agreement['signal'].values
agreement_weight = df_agreement['weight'].values
agreement_is_data = agreement_signal == 0
agreement_is_mc = agreement_signal == 1

ks = utils.compute_ks(agreement_probs[agreement_is_data],
                      agreement_probs[agreement_is_mc],
                      agreement_weight[agreement_is_data],
                      agreement_weight[agreement_is_mc])

# 0.09 is the acceptance limit applied to the KS value in this notebook.
print('KS metric:', ks, "is OK:", ks < 0.09)

In [None]:
def plot_ks(X_agreement, y_pred):
    """
    Plot the weighted prediction distributions of data and MC.

    Rows with ``signal == 1`` are treated as MC and rows with ``signal == 0``
    as data. Data rows with a negative weight are moved to the MC histogram
    with the sign of their weight flipped, so both histograms are filled with
    non-negative weights only.

    Parameters
    ----------
    X_agreement : DataFrame
        DataFrame with the agreement data
        Must include the columns "signal" and "weight"
    y_pred : array-like
        The prediction, one value per row of ``X_agreement`` in row order
    """

    y_pred = numpy.asarray(y_pred)
    signal = X_agreement['signal'].values
    weight = X_agreement['weight'].values

    # Select rows by position with boolean masks. Indexing the prediction
    # array with the frame's index labels is only correct when the frame has
    # a default 0..n-1 index.
    mc_mask = signal == 1
    data_mask = signal == 0

    mc_prob = y_pred[mc_mask]
    mc_weight = weight[mc_mask]

    data_prob = y_pred[data_mask]
    data_weight = weight[data_mask]

    negative = data_weight < 0
    non_negative = data_weight >= 0

    # Negative-weight data entries are added to the MC side with weight -w.
    mc_weight = numpy.concatenate([mc_weight, -data_weight[negative]])
    mc_prob = numpy.concatenate([mc_prob, data_prob[negative]])

    data_prob = data_prob[non_negative]
    data_weight = data_weight[non_negative]

    plt.hist(data_prob, weights=data_weight, color='r', histtype='step', density=True, bins=60, label='data')
    plt.hist(mc_prob, weights=mc_weight, color='b', histtype='step', density=True, bins=60, label='mc')

    plt.xlabel("prediction")
    plt.legend(loc=2)

    plt.show()

In [None]:
plot_ks(df_agreement, agreement_probs)

### Let's see if adding some noise can improve the agreement

In [None]:
agreement_probs_noise = utils.add_noise(clf.predict_proba(df_agreement[variables])[:, 1])

In [None]:
# Same KS comparison as for the raw scores, now on the noise-perturbed scores.
noise_signal = df_agreement['signal'].values
noise_weight = df_agreement['weight'].values
noise_is_data = noise_signal == 0
noise_is_mc = noise_signal == 1

ks_noise = utils.compute_ks(agreement_probs_noise[noise_is_data],
                            agreement_probs_noise[noise_is_mc],
                            noise_weight[noise_is_data],
                            noise_weight[noise_is_mc])

print('KS metric:', ks_noise, "is OK:", ks_noise < 0.09)

In [None]:
plot_ks(df_agreement, agreement_probs_noise)

### Check ROC with noise

In [None]:
test.shape

In [None]:
# Re-score the held-out split and pass the scores through utils.add_noise
# (called without `level`, i.e. with whatever default utils defines) to see
# how the perturbation changes the ROC curve.
y_pred = utils.add_noise(clf.predict_proba(test[variables])[:, 1])

plot_metrics(test['signal'], y_pred)
# Sanity check: one prediction per held-out row.
test.shape, y_pred.shape

# Train the model using the whole training sample

In [None]:
%%time
# Refit the same `clf` object on all of training.csv. From here on `clf` is no
# longer the model that was validated on the 70/30 split above.
clf.fit(train_ada[variables], train_ada['signal'])

In [None]:
# In-sample check: these are the rows the model was just fitted on, so this
# ROC curve is optimistic and is not an estimate of generalisation.
train_ada_probs = clf.predict_proba(train_ada[variables])[:, 1]
plot_metrics(train_ada['signal'], train_ada_probs)

Compute prediction and add noise

In [None]:
# Score the submission sample with the model fitted on the full training set.
y_ada = clf.predict_proba(test_ada[variables])[:, 1]
# The agreement and ROC checks above called add_noise without `level`;
# NOTE(review): confirm those checks still hold at level=0.17.
y_pred = utils.add_noise(y_ada, level=0.17)

# Prepare submission file

In [None]:
def save_submission(y_pred, index, filename='result'):
    """
    Saves the submission to a gzip-compressed csv file

    The file has two columns, ``id`` and ``prediction``, and no index column.

    Parameters
    ----------
    y_pred : array-like
        The prediction (list, NumPy array, Series, ...)
    index : array-like
        The id-index corresponding to the prediction
    filename : str
        The base name of the submission file (i.e. excluding the extension)

    Returns
    -------
    filename : str
        The file name of the submission file (base name plus ``.csv.gz``)
    """

    filename = '{}.csv.gz'.format(filename)
    submission = pandas.DataFrame({'id': index,
                                   'prediction': y_pred})
    submission.to_csv(filename,
                      sep=',',
                      index=False,
                      compression='gzip')
    # len() instead of .shape so plain lists are accepted as well; the
    # docstring promises array-like input, not only NumPy arrays.
    print("Saved file: ", filename, "\nShape:", (len(y_pred), 2))
    return filename

In [None]:
save_submission(y_pred, test_ada.index, "sample_submission")