In [33]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from numba import jit
import os
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import time
import pprint

In [2]:
os.chdir("C:\\Users\\Micah\\Documents\\nist_data")

In [3]:
def read_pseudo(path):
    return np.core.records.fromfile(str(path), formats = 'i8, (2048)i2', names = 'index, wave')

In [4]:
def read_EventList(path):
    return np.core.records.fromfile(str(path), formats = 'i8, i2, i2, (6)i2, i2, i2', names = 'index, type, num_protons, T0s_and_amps, earliest_T0, max_val')

##### EventList = (index, type, number of protons, 6 T0s and amplitudes, earliest T0, max value)
# Event label convention
 0. Pure noise
 1. Single proton event
 2. Double proton event
 3. Triple proton event
 4. Cosmic event
 5. Electron event
 6. Electron + proton
 7. Cosmic + proton
 8. Decay-in-flight proton

In [5]:
X = read_pseudo('sim100k_wfs.dat')['wave']

In [6]:
X_train = X[:10000]
X_test = X[10000:]

In [7]:
EventList = read_EventList('sim100k_proEventList.dat')

##### For now, focus on the labels from the Event List. They will be stored as Y

In [8]:
Y_train = np.zeros(10000)
Y_test = np.zeros(90000)
for i in range(10000):
    Y_train[i] = EventList[i][1]
for i in range(10000, 100000):
    Y_test[i-10000] = EventList[i][1]

##### Make grid of hyperparameters that we want to tune, so the algorithm is maximally effective at predicting

In [9]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint.pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [10]:
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, Y_train)
end = time.time()

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 204.3min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 397.0min finished


Elapsed time: 26743.27


In [12]:
rf_random.best_params_

{'n_estimators': 2000,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': False}

In [14]:
rf_random.best_score_

0.9807

In [9]:
# Number of trees in random forest
n_estimators = [2000, 4000]
# Number of features to consider at every split
max_features = ['sqrt']
# Maximum number of levels in tree
max_depth = [25, 30, 35]
# Minimum number of samples required to split a node
min_samples_split = [4,5,6,7]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1]
# Method of selecting samples for training each tree
bootstrap = [False]

param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint.pprint(param_grid)

{'bootstrap': [False],
 'max_depth': [25, 30, 35],
 'max_features': ['sqrt'],
 'min_samples_leaf': [1],
 'min_samples_split': [4, 5, 6, 7],
 'n_estimators': [2000, 4000]}


In [10]:
rf = RandomForestClassifier()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)

In [11]:
grid_search.fit(X_train, Y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 109.1min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 396.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [13]:
grid_search.best_params_

{'bootstrap': False,
 'max_depth': 35,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 2000}

In [14]:
grid_search.best_score_

0.9808

In [9]:
rf = RandomForestClassifier(n_estimators = 2000, min_samples_split = 4, min_samples_leaf = 1, max_features = 'sqrt', max_depth = 35, bootstrap = False)

In [10]:
rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
                       max_depth=35, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=4,
                       min_weight_fraction_leaf=0.0, n_estimators=2000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
Y_pred = rf.predict(X_test)

In [35]:
cf_matrix = confusion_matrix(Y_test, Y_pred)
print(cf_matrix)

[[ 1958     0     0     0     0     0     0     0     0]
 [  120 80862     0     0     0     0     0     0    71]
 [    0   150  4869     0     0     0     0     0     0]
 [    0     0    21   193     0     0     0     0     0]
 [    2    34     0     0    99    20     0    13    44]
 [    2    90    19     4    23    37     0     0    57]
 [    0    18    90    15     7     4    23    45     0]
 [    0     6     1     0    30     0     3   165     4]
 [   34   733     0     0     0     0     0     0   134]]
