In [1]:
%matplotlib inline
import gc
import math
import pandas as pd
import numpy as np
import time, multiprocessing
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from tqdm import tqdm
import seaborn as sns
from pathlib import Path

import pickle
import logging

from rerf.rerfClassifier import rerfClassifier

from scipy import stats
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

gc.enable()

import warnings
warnings.simplefilter('ignore')

In [2]:
def sort_keep_balance(X,y,block_lengths):
    # Sort data and labels into blocks that preserve class balance
    # X: data matrix
    # y : 1D class labels
    # block_lengths : Block sizes to sort X,y into that preserve class balance
    clss,counts = np.unique(y, return_counts=True)
    ratios = counts / sum(counts)
    class_idxs = [np.where(y==i)[0] for i in clss]

    sort_idxs = []
    
    prior_idxs = np.zeros(len(clss)).astype(int)
    for n in block_lengths:
        get_idxs = np.rint(n*ratios).astype(int)
        for idxs,prior_idx,next_idx in zip(class_idxs,prior_idxs,get_idxs):
            sort_idxs.append(idxs[prior_idx:next_idx])
        prior_idxs = get_idxs
        
    sort_idxs = np.hstack(sort_idxs)
    
    return((X[sort_idxs,:], y[sort_idxs]))

In [14]:
DATA_PATH = Path('/mnt/c/Users/Ronan Perry/Documents/JHU/jovo-lab/rerf/morf_real_data/')

with open(DATA_PATH / 'slice_axial_smri_ageXsex.p', 'rb') as f:
    X,y,idx = pickle.load(f)
    
img_height = X.shape[1]
img_width = X.shape[2]

X = X.reshape(X.shape[0], -1)
n_features = X.shape[1]

n_test = 8000

In [20]:
ns = list(idx.keys())

In [21]:
X2,y2 = sort_keep_balance(X,y,ns + [len(y)])

In [26]:
#test_data = np.genfromtxt('./ECG200_TEST.txt')
#train_data = np.genfromtxt('./ECG200_TRAIN.txt')
n_train = 100

y_train = y2[:n_train]
X_train = X2[:n_train,:]

y_test = y2[n_test:]
X_test = X2[n_test:,:]

In [8]:
ncores=1
num_runs=1
n_est=100
mx = 10
mn = 1

## sqrt(n_features) = 99
feature_range = [10,30,60,100,150]
patch_max = [5,8,11,15,20]

names = {"RF":"#fb9a99", 
         "SPORF":"#ff7f00", 
         "MORF":"#e31a1c"}

clfs = [RandomForestClassifier(n_estimators=n_est, max_features='auto', n_jobs=ncores),
       rerfClassifier(n_estimators = n_est, projection_matrix = "RerF",
            max_features = 28, n_jobs = ncores),
       rerfClassifier(
            projection_matrix="S-RerF", 
            max_features = 150,
            n_estimators=n_est,
            n_jobs=ncores,
            image_height=img_height, 
            image_width=img_width, 
            patch_height_max=5,
            patch_width_max=15,
            patch_height_min=1,
            patch_width_min=1,
        )]

In [28]:
clf = clfs[2]

clf.fit(X_train, y_train)

rerfClassifier(feature_combinations=1.5, image_height=91, image_width=109,
               max_depth=None, max_features=150, min_parent=1, n_estimators=100,
               n_jobs=1, oob_score=False, patch_height_max=5,
               patch_height_min=1, patch_width_max=15, patch_width_min=1,
               projection_matrix='S-RerF', random_state=None)

In [33]:
out = clf.predict(X_test)

In [34]:
lhat = np.mean(np.not_equal(out, y_test).astype(int))

## Full pipeline

In [55]:
logdir = Path('./')

write_path = logdir / 'brain_data_mf.csv'

logging.basicConfig(filename=logdir / 'mf_logging.log',
                        format='%(asctime)s:%(levelname)s:%(message)s',
                        level=logging.DEBUG
                        )
logging.info('NEW MF Brain Data RUN')

In [56]:
#ns = np.linspace(2,np.log10(len(y_train)),5)
#ns = np.power(10,ns).astype(int)
ns = [100,400,1000,3000,8000]

X_train,y_train = sort_keep_balance(X_train,y_train,ns)

In [68]:
parameters = [{'max_features':feature_range},
              {'max_features':feature_range},
              {'max_features':feature_range, 'patch_height_max': patch_max, 'patch_width_max':patch_max}]
skf = StratifiedShuffleSplit(n_splits=3)

best_clfs = []

for name,params,clf in zip(names.keys(),parameters,clfs):
    logging.info(f'Grid Search Classifier {name}')
    gs = GridSearchCV(clf, params, cv=skf, verbose=1)
    gs.fit(X_train, y_train)
    best_clfs.append(gs.best_estimator_)
    logging.info(f'Best Parameters: {str(gs.get_params())}')

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Exception ignored in: <function _releaseLock at 0x7f4fa3e50510>
Traceback (most recent call last):
  File "/home/rflperry/miniconda3/envs/rerf/lib/python3.7/logging/__init__.py", line 221, in _releaseLock
    def _releaseLock():
KeyboardInterrupt


BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.

In [67]:
## Prep output file:

write_path = 'mf_brain_data.csv'

with open(write_path, 'w') as f:
    f.write("classifier,n,Lhat,trainTime,testTime,iterate\n")

runList = [(n, clf, iterate) for n in ns\
                             for clf in zip(best_clfs, [key for key in names])
                             for iterate in range(1, num_runs + 1)]

for n, clf, iterate in tqdm(runList):
    logging.info(f'Run:n={n},clf={clf[1]}')
    gc.collect()
    Xn = X_train[:n, :]
    yn = y_train[:n]

    trainStartTime = time.time()
    clf[0].fit(Xn, yn)
    trainEndTime = time.time()
    trainTime = trainEndTime - trainStartTime

    testStartTime = time.time()
    out = clf[0].predict(X_test)
    testEndTime = time.time()
    testTime = testEndTime - testStartTime

    lhat = np.mean(np.not_equal(out, y_test).astype(int))

    ####("variable,Lhat,trainTime,testTime,iterate")
    with open(write_path, 'a') as f:
        f.write(f"{clf[1]}, {n}, {lhat:2.9f}, {trainTime:2.9f}, {testTime:2.9f}, {iterate}\n")

    gc.collect()

100%|██████████| 5/5 [01:38<00:00, 21.86s/it]
