In [1]:
import pandas as pd
from scipy import stats
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline
import seaborn as sns

import pickle
import time
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

from rerf.rerfClassifier import rerfClassifier

import warnings
warnings.simplefilter('ignore')

In [2]:
def correlated_samples(nchs, n, cls=0, num_noise_chs=3, i=1):
    # The desired mean values of the sample.
    mu = np.array([5.0, 
                    0.0, 
                    10.0])
    
    y_noise = np.random.random((num_noise_chs, n))

    # The desired covariance matrix.
    if cls == 0:
        r = np.array([
            [  3, -2.75*i, -2.00*i],
            [ -2.75*i,  5,  1.50*i],
            [ -2.00*i,  1.50*i,  1]
        ])
    elif cls == 1:
        r = np.array([
            [  3, 0, 0],
            [ 0,  10, 0],
            [ 0,  0,  1.5]
        ])
    
    # Generate the random samples.
    y = np.random.multivariate_normal(mu, r, size=n).T
    
    y = np.vstack((y[0,:], y_noise[0,:], y[1,:], 
                        y_noise[1,:], y[2,:], y_noise[2,:]))
    
    return y

In [3]:
def simulate_data(cov_factor, ns):
    np.random.seed(1234)

    # length of data sequence
    test_size = 0.5

    # simulated data parameters
    T = 100
    nchs = 6

    # initialize data structures for train/test data
    X_train, Y_train = np.empty(shape=(0,nchs,T)), np.empty(shape=(0))
    X_test, Y_test = np.empty(shape=(0,nchs,T)), np.empty(shape=(0))

    # simulate over varying sizes of data sequence
    for n in ns:
        y = []
        X = []

        # generate correlated multi-variate time series
        for i in range(n):
            _x = correlated_samples(nchs, T, cls=0, i=cov_factor)
            X.append(_x)
            y.append(0)
        for i in range(n):
            _x = correlated_samples(nchs, T, cls=1, i=cov_factor)
            X.append(_x)
            y.append(1)
        X = np.array(X)
        y = np.array(y)

        # perform training test split
        _X_train, _X_test, _y_train, _y_test = train_test_split(X, y, 
                                                            test_size=test_size, 
                                                            random_state=42)
        X_train = np.vstack((X_train, _X_train))
        Y_train = np.hstack((Y_train, _y_train))
        X_test = np.vstack((X_test, _X_test))
        Y_test = np.hstack((Y_test, _y_test))

    X_train = X_train.reshape(X_train.shape[0], -1)
    X_test = X_test.reshape(X_test.shape[0], -1)
    return X_train, Y_train, X_test, Y_test

In [5]:
names = {"MT-MORF": "red"}

ncores=20
num_runs=3
n_est=100  # number of estimators

classifiers = [
    rerfClassifier(projection_matrix="MT-MORF",
                   max_features='auto',
                   n_jobs=ncores,
                    n_estimators=n_est,
                    oob_score=False,
                    random_state=0,
                    image_height=nchs,
                    image_width=T,
                    patch_height_max=3,
                    patch_height_min=1,
                    patch_width_max=20,
                    patch_width_min=5
                   )
    ]

In [8]:
ns = np.array([50,100,200,400,1000,2000,4000])

runList = [(n, clf, run) for n in ns\
                       for clf in zip(classifiers, [key for key in names])\
                       for run in range(num_runs)]

In [11]:
# Train each classifier on each dataset size, then test
# Prep output file:
fname = f'./mt-morf_impulse_experiment_covariances.csv'
f = open(fname, 'w+')
f.write("classifier,covariace, n,Lhat,trainTime,testTime,iterate\n")
f.flush()
    
for cov_factor in [1e-5, 0.1, 0.3, 0.5, 0.7, 0.9]:
    X_train, y_train, X_test, y_test = simulate_data(cov_factor)

    for n, clf, iteration in tqdm(runList):
        # print(clf)
        if X_train.ndim == 3:
            X_train = X_train.reshape(-1, X_train.shape[-1]).T
        if X_test.ndim == 3:
            X_test = X_test.reshape(-1, X_test.shape[-1]).T
        X = X_train[:n]
        y = Y_train[:n]

        trainStartTime = time.time()
        clf[0].fit(X, y)
        trainEndTime = time.time()
        trainTime = trainEndTime - trainStartTime

        testStartTime = time.time()
        out = clf[0].predict(X_test)
        testEndTime = time.time()
        testTime = testEndTime - testStartTime

        lhat = np.mean(np.not_equal(out, Y_test).astype(int))

        ####("variable,Lhat,trainTime,testTime,iterate")
        f.write(f"{clf[1]}, {cov_factor}, {n}, {lhat:2.9f}, {trainTime:2.9f}, {testTime:2.9f}, {iteration}\n")
        f.flush()

    f.close()


  0%|          | 0/21 [00:00<?, ?it/s][A
  5%|▍         | 1/21 [00:00<00:02,  8.29it/s][A
 14%|█▍        | 3/21 [00:00<00:01,  9.14it/s][A
 24%|██▍       | 5/21 [00:00<00:01,  9.55it/s][A
 29%|██▊       | 6/21 [00:00<00:01,  9.56it/s][A
 33%|███▎      | 7/21 [00:00<00:01,  8.17it/s][A
 38%|███▊      | 8/21 [00:00<00:01,  7.02it/s][A
 43%|████▎     | 9/21 [00:01<00:01,  6.18it/s][A
 48%|████▊     | 10/21 [00:01<00:02,  4.25it/s][A
 52%|█████▏    | 11/21 [00:01<00:02,  3.72it/s][A
 57%|█████▋    | 12/21 [00:02<00:02,  3.39it/s][A
 62%|██████▏   | 13/21 [00:03<00:04,  1.82it/s][A
 67%|██████▋   | 14/21 [00:05<00:06,  1.07it/s][A
 71%|███████▏  | 15/21 [00:06<00:06,  1.02s/it][A
 76%|███████▌  | 16/21 [00:09<00:07,  1.60s/it][A
 81%|████████  | 17/21 [00:12<00:07,  1.98s/it][A
 86%|████████▌ | 18/21 [00:15<00:06,  2.23s/it][A
 90%|█████████ | 19/21 [00:21<00:06,  3.49s/it][A
 95%|█████████▌| 20/21 [00:28<00:04,  4.46s/it][A
100%|██████████| 21/21 [00:34<00:00,  1.66s/it

ValueError: I/O operation on closed file.

In [None]:
dat = pd.read_csv(fname)

d1 = pd.DataFrame(columns = ['classifier', 'n', 'Lhat', 'color'])

k = 0
for ni in np.unique(dat['n']):
    for cl in np.unique(dat['classifier']):
        tmp = dat[np.logical_and(dat['classifier'] == cl,dat['n'] == ni)][['n', 'Lhat']]
        d1.loc[k] = [cl] + list(tmp.mean()) + [names[cl]]
        k += 1


sns.set(style="darkgrid", rc={'figure.figsize':[12,8], 'figure.dpi': 300})
fig, ax = plt.subplots(figsize = (8,6))

for key in names.keys():
    grp = d1[d1['classifier'] == key]
    ax = grp.plot(ax=ax, kind='line', x='n', y='Lhat', label=key, \
            c = names[key], alpha =0.65)
    #ax.set_yscale('log')

plt.legend(loc='best',title='Algorithm')
plt.title('Algorithm Comparison')
plt.ylabel('Mean Test Error')
plt.xlabel('Number of Training Samples')
#plt.savefig('./s-rerf_impulse_experiment.pdf',dpi=300,format='pdf')
plt.show()