In [1]:
import modin.pandas as pd
import nums
import nums.numpy as nps
nums.init()

Automatically increasing RLIMIT_NOFILE to max value of 1048576




Using driver node ip as head node.
head node 172.31.4.230
total cpus 46.0
device_grid (0, 0) 0=node:172.31.4.230/cpu:1


# Preparation

### Load and preprocess dataset with Modin.

In [2]:
%%time
higgs_train = pd.read_csv("training.zip")
higgs_train.loc[higgs_train['Label'] == 'b', 'Label'] = 0
higgs_train.loc[higgs_train['Label'] == 's', 'Label'] = 1
higgs_train = higgs_train.drop(columns=['EventId'])
columns = higgs_train.columns.values
X_columns, y_column = columns[:-1], columns[-1:]

CPU times: user 1.25 s, sys: 182 ms, total: 1.43 s
Wall time: 1.74 s


### Convert Modin DataFrame to NumS BlockArray.

In [3]:
%%time
X_train = nums.from_modin(higgs_train[X_columns].astype(float))
weights = X_train[:, -1]
X_train = X_train[:, :-1]
# Drop weight column from names.
X_columns = X_columns[:-1]
y_train = nums.from_modin(higgs_train[y_column].astype(int)).reshape(-1)

CPU times: user 593 ms, sys: 154 ms, total: 747 ms
Wall time: 557 ms


# Exploration

### Compute principal components of dataset.

In [4]:
%%time
# Compute PCA via SVD.
C = nps.cov(X_train, rowvar=False)
V, S, VT = nps.linalg.svd(C)
assert nps.allclose(V, VT.T)
pc = X_train @ V

CPU times: user 250 ms, sys: 99 ms, total: 349 ms
Wall time: 1.9 s


### Compute eigen values from singular values, and explained variance from eigen values.

In [5]:
eigen_vals = S**2 / (X_train.shape[0] - 1)
explained_variance = eigen_vals / nps.sum(eigen_vals)
for i, val in enumerate(nps.cumsum(explained_variance).get()[:10]):
    print(i, val)

0 0.9410306560525487
1 0.9940381637372117
2 0.999623062463564
3 0.9999181743384
4 0.9999972582402462
5 0.9999984533073455
6 0.9999992569556513
7 0.9999996406346515
8 0.999999853779554
9 0.999999918295548




### Order features by explained variance.

In [6]:
components = VT
sorted_variance = nps.argsort(-nps.sum(nps.abs(components[:2]), axis=0)).get()
for col in X_columns[sorted_variance]:
    print(col)

PRI_jet_leading_pt
PRI_jet_leading_eta
PRI_jet_leading_phi
DER_mass_jet_jet
PRI_jet_subleading_pt
DER_deltaeta_jet_jet
DER_lep_eta_centrality
PRI_jet_subleading_phi
PRI_jet_subleading_eta
DER_prodeta_jet_jet
DER_mass_MMC
PRI_met_sumet
DER_sum_pt
PRI_jet_all_pt
DER_pt_h
DER_mass_transverse_met_lep
PRI_met
PRI_tau_pt
PRI_lep_pt
DER_pt_tot
DER_mass_vis
PRI_jet_num
DER_met_phi_centrality
DER_deltar_tau_lep
DER_pt_ratio_lep_tau
PRI_lep_eta
PRI_tau_eta
PRI_met_phi
PRI_tau_phi
PRI_lep_phi




# Modelling

### Import scikit-learn models from nums.

In [7]:
from nums.sklearn import (train_test_split, 
                          StandardScaler, 
                          GaussianNB, 
                          LogisticRegression, 
                          SVC, 
                          MLPClassifier, 
                          GradientBoostingClassifier, 
                          RandomForestClassifier)
print("Models imported.")

Models imported.


### Define the performance metric.

In [8]:
def metric(ytrue, ypred, weights):
    """ Approximate Median Significance defined as:
        AMS = sqrt(
                2 { (s + b + b_r) log[1 + (s/(b+b_r))] - s}
              )
    where b_r = 10, b = background, s = signal, log is natural logarithm """
    # True-positive rate.
    s = nps.sum(weights[(ytrue == 1) & (ytrue == ypred)])
    # False-positive rate.
    b = nps.sum(weights[(ytrue == 1) & (ytrue != ypred)])
    br = 10.0
    radicand = 2 * ((s + b + br) * nps.log(1.0 + s / (b + br)) - s)
    return nps.sqrt(radicand)
print("Metric defined.")

Metric defined.


### Conduct a search over a small set of feature sets, preprocessors, and models.

In [9]:
%%time
scores = []
for drop_features in [0, 3]:
    if drop_features > 0:
        feature_mask = nps.zeros(shape=sorted_variance.shape, dtype=bool)
        feature_mask[sorted_variance[:-drop_features]] = True
        Xt, Xv, yt, yv, wt, wv = train_test_split(X_train[:, feature_mask], y_train, weights)
    else:
        Xt, Xv, yt, yv, wt, wv = train_test_split(X_train, y_train, weights)
    numfeatstr = "num_feats=%s" % Xt.shape[1]
    for p_cls in [StandardScaler, None]:
        if p_cls is None:
            ppstr = "preproc=None"
            pXt = Xt
            pXv = Xv
        else:
            ppstr = "preproc=" + p_cls.__name__
            p_inst = p_cls()
            pXt = p_inst.fit_transform(Xt)
            pXv = p_inst.fit_transform(Xv)

        # Tree-based Ensemble Methods
        for n_estimators in [10]:
            for max_depth in [2]:
                for max_features in [None]:
                    m = RandomForestClassifier(n_estimators=n_estimators, 
                                               max_depth=max_depth, max_features=max_features)
                    m.fit(pXt, yt)
                    scores.append([(numfeatstr + ", "
                                    + ppstr + ", "
                                    + m.__class__.__name__
                                    + ("(%s, %s, %s)" % (n_estimators, max_depth, max_features))),
                                   (m.predict(pXv), yv, wv)])
                    for learning_rate in [.4]:
                        for subsample in [.9]:
                            m = GradientBoostingClassifier(n_estimators=n_estimators, 
                                                           max_depth=max_depth, 
                                                           max_features=max_features,
                                                           learning_rate=learning_rate,
                                                           subsample=subsample)
                            m.fit(pXt, yt)
                            scores.append([(numfeatstr + ", "
                                            + ppstr + ", "
                                            + m.__class__.__name__ 
                                            + ("(%s, %s, %s, %s, %s)" % (n_estimators, 
                                                                max_depth, 
                                                                max_features,
                                                                learning_rate,
                                                                subsample))),
                                           (m.predict(pXv), yv, wv)])
print("Training %s pipelines." % len(scores))

drop_features 0
drop_features 3
Training 8 pipelines.
CPU times: user 128 ms, sys: 19.7 ms, total: 148 ms
Wall time: 885 ms


### Run performance metric and sort the pipelines by their performance.

In [10]:
%%time
results = []
for res in scores:
    results.append((res[0], metric(*res[1]).get()))
for res in sorted(results, key=lambda x: -x[-1]):
    print(*res)

num_feats=27, preproc=None, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.2594896089016074
num_feats=30, preproc=None, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.2501577148553882
num_feats=27, preproc=NumsStandardScaler, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.2485718059653912
num_feats=30, preproc=NumsStandardScaler, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.2297814074454088
num_feats=30, preproc=None, NumsRandomForestClassifier(10, 2, None) 0.9982379850930554
num_feats=30, preproc=NumsStandardScaler, NumsRandomForestClassifier(10, 2, None) 0.995172325721902
num_feats=27, preproc=None, NumsRandomForestClassifier(10, 2, None) 0.9865626382441374
num_feats=27, preproc=NumsStandardScaler, NumsRandomForestClassifier(10, 2, None) 0.9789352787033203
CPU times: user 776 ms, sys: 162 ms, total: 937 ms
Wall time: 9.39 s


# Sources
- https://www.kaggle.com/c/higgs-boson/code
- https://nycdatascience.com/blog/student-works/top2p-higgs-boson-machine-learning/