# Import and Initialize Modin and NumS.

In [1]:
import modin.pandas as pd
import nums
import nums.numpy as nps
nums.init()

2021-11-04 00:06:52,186	INFO services.py:1265 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Using driver node ip as head node.
head node 192.168.0.168
total cpus 4.0
device_grid (0, 0) 0=node:192.168.0.168/cpu:1


# Load and Preprocess Dataset with Modin.

In [2]:
%%time
higgs_train = pd.read_csv("training.zip")
higgs_train.loc[higgs_train['Label'] == 'b', 'Label'] = 0
higgs_train.loc[higgs_train['Label'] == 's', 'Label'] = 1
higgs_train = higgs_train.drop(columns=['EventId'])
columns = higgs_train.columns.values
X_columns, y_column = columns[:-1], columns[-1:]

CPU times: user 1.51 s, sys: 176 ms, total: 1.68 s
Wall time: 2.77 s


# Convert Modin DataFrame to NumS BlockArray

In [3]:
X_train = nums.from_modin(higgs_train[X_columns].astype(float))
weights = X_train[:, -1]
X_train = X_train[:, :-1]
# Drop weight column from names.
X_columns = X_columns[:-1]
y_train = nums.from_modin(higgs_train[y_column].astype(int)).reshape(-1)

In [4]:
drop_missing = False
impute_missing = False
keep_columns = []
kept_column_names = []
for j in range(X_train.shape[1]):
    mask = X_train[:, j] == -999.0
    num_nans = nps.sum(mask)
    frac = num_nans / X_train.shape[0]
    if drop_missing and frac > 0.7:
        print("drop", j, X_columns[j])
        continue
    keep_columns.append(j)
    kept_column_names.append(X_columns[j])
    if impute_missing and frac > 0.0:
        X_train[mask, j] = nps.nanmean(X_train[:, j])
        print("impute", j, X_columns[j])
X_train = X_train[:, keep_columns]

In [5]:
%%time
# Compute PCA via SVD.
C = nps.cov(X_train, rowvar=False)
V, S, VT = nps.linalg.svd(C)
assert nps.allclose(V, VT.T)
pc = X_train @ V

CPU times: user 66 ms, sys: 5.43 ms, total: 71.4 ms
Wall time: 131 ms


In [7]:
eigen_vals = S**2 / (X_train.shape[0] - 1)
explained_variance = eigen_vals / nps.sum(eigen_vals)
for i, val in enumerate(nps.cumsum(explained_variance).get()[:10]):
    print(i, val)

0 0.9410306560525489
1 0.9940381637372118
2 0.9996230624635641
3 0.9999181743384001
4 0.9999972582402463
5 0.9999984533073456
6 0.9999992569556514
7 0.9999996406346516
8 0.9999998537795541
9 0.9999999182955481




In [8]:
components = VT
sorted_importance = nps.argsort(-nps.sum(nps.abs(components[:2]), axis=0)).get()
for col in X_columns[sorted_importance]:
    print(col)

PRI_jet_leading_pt
PRI_jet_leading_eta
PRI_jet_leading_phi
DER_mass_jet_jet
PRI_jet_subleading_pt
DER_deltaeta_jet_jet
DER_lep_eta_centrality
PRI_jet_subleading_phi
PRI_jet_subleading_eta
DER_prodeta_jet_jet
DER_mass_MMC
PRI_met_sumet
DER_sum_pt
PRI_jet_all_pt
DER_pt_h
DER_mass_transverse_met_lep
PRI_met
PRI_tau_pt
PRI_lep_pt
DER_pt_tot
DER_mass_vis
PRI_jet_num
DER_met_phi_centrality
DER_deltar_tau_lep
DER_pt_ratio_lep_tau
PRI_lep_eta
PRI_tau_eta
PRI_met_phi
PRI_tau_phi
PRI_lep_phi




In [9]:
from nums.sklearn import (train_test_split, 
                          StandardScaler, 
                          GaussianNB, 
                          LogisticRegression, 
                          SVC, 
                          MLPClassifier, 
                          GradientBoostingClassifier, 
                          RandomForestClassifier)

In [11]:
# Final metric.
def metric(ytrue, ypred, weights):
    """ Approximate Median Significance defined as:
        AMS = sqrt(
                2 { (s + b + b_r) log[1 + (s/(b+b_r))] - s}
              )
    where b_r = 10, b = background, s = signal, log is natural logarithm """
    # Max AMS on training set is 67.7
    # return 1.0 - nps.sum(nps.abs(ytrue - ypred))/ytrue.shape[0]
    # True-positive rate.
    s = nps.sum(weights[(ytrue == 1) & (ytrue == ypred)])
    # False-positive rate.
    b = nps.sum(weights[(ytrue == 1) & (ytrue != ypred)])
    br = 10.0
    radicand = 2 * ((s + b + br) * nps.log(1.0 + s / (b + br)) - s)
    return nps.sqrt(radicand)

print(metric(y_train, y_train, weights).get())

67.71112289505676


In [18]:
%%time
scores = []
for drop_features in [0, 3]:
    if drop_features > 0:
        feature_mask = nps.zeros(shape=sorted_importance.shape, dtype=bool)
        feature_mask[sorted_importance[:-drop_features]] = True
        Xt, Xv, yt, yv, wt, wv = train_test_split(X_train[:, feature_mask], y_train, weights)
    else:
        Xt, Xv, yt, yv, wt, wv = train_test_split(X_train, y_train, weights)
    print("drop_features", drop_features)
    numfeatstr = "num_feats=%s" % Xt.shape[1]
    for p_cls in [StandardScaler, None]:
        if p_cls is None:
            ppstr = "preproc=None"
            pXt = Xt
            pXv = Xv
        else:
            ppstr = "preproc=" + p_cls.__name__
            p_inst = p_cls()
            pXt = p_inst.fit_transform(Xt)
            pXv = p_inst.fit_transform(Xv)

        # Tree-based Ensemble Methods
        for n_estimators in [10]:
            for max_depth in [2]:
                for max_features in [None]:
                    m = RandomForestClassifier(n_estimators=n_estimators, 
                                               max_depth=max_depth, max_features=max_features)
                    m.fit(pXt, yt)
                    scores.append([(numfeatstr + ", "
                                    + ppstr + ", "
                                    + m.__class__.__name__
                                    + ("(%s, %s, %s)" % (n_estimators, max_depth, max_features))),
                                   (m.predict(pXv), yv, wv)])
                    for learning_rate in [.4]:
                        for subsample in [.9]:
                            m = GradientBoostingClassifier(n_estimators=n_estimators, 
                                                           max_depth=max_depth, 
                                                           max_features=max_features,
                                                           learning_rate=learning_rate,
                                                           subsample=subsample)
                            m.fit(pXt, yt)
                            scores.append([(numfeatstr + ", "
                                            + ppstr + ", "
                                            + m.__class__.__name__ 
                                            + ("(%s, %s, %s, %s, %s)" % (n_estimators, 
                                                                max_depth, 
                                                                max_features,
                                                                learning_rate,
                                                                subsample))),
                                           (m.predict(pXv), yv, wv)])
print("Training %s pipelines." % len(scores))

drop_features 0
drop_features 3
Training 8 pipelines.
CPU times: user 880 ms, sys: 159 ms, total: 1.04 s
Wall time: 19.7 s


Required resources for this actor or task: {node:192.168.0.168: 0.000100}, {CPU: 1.000000}
Available resources on this node: {0.000000/4.000000 CPU, 29.992548 GiB/29.992548 GiB memory, 1.000000/1.000000 GPU, 14.996274 GiB/14.996274 GiB object_store_memory, 1.000000/1.000000 accelerator_type:GTX, 0.999600/1.000000 node:192.168.0.168}
In total there are 0 pending tasks and 3 pending actors on this node.


In [19]:
%%time
for res in scores:
    res[-1] = metric(*res[-1]).get()

CPU times: user 1.63 s, sys: 323 ms, total: 1.95 s
Wall time: 26.7 s


In [20]:
for res in sorted(scores, key=lambda x: -x[-1]):
    print(*res)

num_feats=27, preproc=None, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.269384930400543
num_feats=30, preproc=None, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.258370485103042
num_feats=27, preproc=NumsStandardScaler, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.2471542638097393
num_feats=30, preproc=NumsStandardScaler, NumsGradientBoostingClassifier(10, 2, None, 0.4, 0.9) 1.2183461009279437
num_feats=30, preproc=None, NumsRandomForestClassifier(10, 2, None) 0.9869143385415494
num_feats=27, preproc=None, NumsRandomForestClassifier(10, 2, None) 0.9841719268750823
num_feats=27, preproc=NumsStandardScaler, NumsRandomForestClassifier(10, 2, None) 0.9841086576817014
num_feats=30, preproc=NumsStandardScaler, NumsRandomForestClassifier(10, 2, None) 0.9746846523215774


# Sources
- https://www.kaggle.com/c/higgs-boson/code
- https://nycdatascience.com/blog/student-works/top2p-higgs-boson-machine-learning/