# Simulation counterbalancing vs. regression
**To do:**
- Create correlated data (perhaps in a separate function?)

In [255]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_selection import f_classif
from skbold.preproc import ConfoundRegressor, MajorityUndersampler
%matplotlib inline

In [282]:
# --- Simulation settings -------------------------------------------------
iters = 5      # number of independent simulated datasets
n_samp = 100   # samples per dataset (split evenly over the two classes)
n_feat = 5     # number of features
n_fold = 10    # number of stratified CV folds
std = 1

skf = StratifiedKFold(n_splits=n_fold)

results = {'accuracy': np.zeros(iters)}

# 'cb'      -> counterbalance the training set w.r.t. the confound `c`
# 'regress' -> prepend a ConfoundRegressor step to the pipeline
# any other value -> no confound control at all
confound_control = 'cb'
acc = np.zeros((iters, n_fold))

# Report progress about five times per run. The max(1, ...) guard prevents a
# modulo-by-zero when fewer than five iterations are requested.
print_every = max(1, iters // 5)

# NOTE: `counterbalance` is defined in a later cell of this notebook; that
# cell has to be executed before this one.
for i in range(iters):

    if i % print_every == 0:
        print("Iteration %i" % i)

    # Target: first half of the samples is class 0, second half class 1.
    n_half = int(n_samp / 2)
    y = np.repeat([0, 1], repeats=n_half)
    # Confound: the target shifted circularly by 10 positions, so it agrees
    # with y for most, but not all, samples.
    c = np.roll(y, 10)

    data = np.random.randn(n_samp, n_feat)
    #data[c == 1, :] += 1
    data[y == 1, :] += 0.5
    X = data

    for ii, (train_idx, test_idx) in enumerate(skf.split(X, y)):

        steps = [
            ('scaler', StandardScaler()),
            ('svm', SVC(kernel='linear'))
        ]

        if confound_control == 'regress':
            # NOTE(review): the confound handed to the regressor is `y`, not
            # `c` -- confirm this is intended, since `c` is the simulated
            # confound used by the counterbalancing branch below.
            steps.insert(0, ('confoundreg',
                             ConfoundRegressor(confound=y, fit_idx=train_idx,
                                               cross_validate=False)))

        # Bind the estimator to the same name that is fitted and scored below;
        # previously it was stored as `pipeline` while `pipe` was used.
        pipe = Pipeline(steps)

        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        if confound_control == 'cb':
            # Subsample the training fold so that every (y, c) combination is
            # equally frequent; the test fold is left untouched.
            idx = counterbalance(y_train, c[train_idx], verbose=False)
            X_train, y_train = X_train[idx], y_train[idx]

        #print(f_classif(X_train, y_train)[0])
        pipe.fit(X_train, y_train)
        acc[i, ii] = pipe.score(X_test, y_test)

    # Mean accuracy over folds for this simulated dataset.
    results['accuracy'][i] = acc[i, :].mean()

print(np.mean(results['accuracy']))

Iteration 0
Iteration 1
Iteration 2
Iteration 3
Iteration 4
0.684


In [118]:
def counterbalance(y, c, verbose=False):
    """Subsample indices so that every (y, c) combination is equally frequent.

    For each label in `y` and each label in `c`, the same number of samples
    is drawn at random (without replacement). That number is the size of the
    smallest (y label, c label) cell, so if any combination does not occur at
    all, no samples are selected.

    Parameters
    ----------
    y : array-like, 1-D
        Target labels.
    c : array-like, 1-D
        Confound labels, same length as `y`. Labels are compared by equality,
        so they do not need to be contiguous non-negative integers.
    verbose : bool, optional
        If True, print the proportion of samples that was discarded.

    Returns
    -------
    numpy.ndarray
        Sorted integer positions (into `y` / `c`) of the retained samples.
    """
    y = np.asarray(y)
    c = np.asarray(c)

    if y.size == 0:
        # Nothing to balance; also avoids taking the minimum of an empty list.
        return np.array([], dtype=int)

    y_labels = np.unique(y)
    c_labels = np.unique(c)

    # Count every (y, c) cell explicitly. np.bincount on the raw labels would
    # report zero for any integer missing from the range 0..max (e.g. labels
    # {1, 2}), wrongly forcing the minimum to 0 and discarding all samples.
    cell_counts = [np.count_nonzero((y == ylab) & (c == clab))
                   for ylab in y_labels for clab in c_labels]
    overall_minimum = min(cell_counts)

    full_idx = np.arange(c.size)
    idx = []

    for ylab in y_labels:

        in_class = y == ylab

        for clab in c_labels:

            cell_idx = full_idx[in_class & (c == clab)]
            if cell_idx.size == 0:
                # Only reachable when overall_minimum is 0; nothing to draw.
                continue

            sample = np.random.choice(cell_idx, overall_minimum,
                                      replace=False)
            idx.append(sample)

    new_idx = np.sort(np.concatenate(idx))
    if verbose:
        # Proportion of samples dropped by the counterbalancing.
        print((full_idx.size - new_idx.size) / full_idx.size)
    return new_idx