In [5]:
import sys
import numpy as np
import os
import h5py
import multiprocessing as mp
from glob import glob
# from sklearn import cross_validation
from sklearn.model_selection import LeaveOneGroupOut
import pickle


# if > 0 use multiprocessing, otherwise regular single-process
# execution:

run_on_rhino = True
# run_on_rhino = False
# experiment = 'FR1'
experiment = 'catFR1'

GOOD_ELECS_ONLY = False
# GOOD_ELECS_ONLY = True

if run_on_rhino:
    rhino_mount = ''
else:
    rhino_mount = '/home/ctw/fusemounts/rhino'



if GOOD_ELECS_ONLY:
    class_path_suffix = '_goodchans'
else:
    class_path_suffix = ''



In [6]:
output_root = '/scratch/cweidema/bootcamp/tmpdat'
paths = {
    'power': {
        'catFR1': (rhino_mount + output_root +
                   '/RAM/RAM_catFR/' +
                   'RAM_catFR1_power/encoding/hdf5_files_subj/'),
        'FR1': (rhino_mount + output_root +
                '/RAM/RAM_FR/' +
                'RAM_FR1_power/encoding/hdf5_files_subj/')},
    'classifier': {
        'catFR1': (rhino_mount + output_root +
                   '/RAM/RAM_FRcatFR/encoding'+
                   class_path_suffix+'/' +
                   'RAM_catFR1lolo_classifiers_allfreqs/'+
                   'RAM_catFR1lolo_classifiers_allfreqs_withshuffles/'),
    'FR1': (rhino_mount + output_root +
            '/RAM/RAM_FRcatFR/encoding'+
            class_path_suffix+'/' +
            'RAM_FR1lolo_classifiers_allfreqs/'+
            'RAM_FR1lolo_classifiers_allfreqs_withshuffles/'),
    }
}


# power_files_all = {}
# for exp in paths['power']:
#     power_files_all[exp] = np.sort(glob(os.path.join(
#         paths['power'][exp], '*_pow.hdf5')))

# power_files_bothexp = {}
# for exp in power_files_all:
#     power_files_bothexp[exp] = []

# for pf in power_files_all['catFR1']:
#     pf_FR = pf.replace('catFR', 'FR')
#     if pf_FR in power_files_all['FR1']:
#         power_files_bothexp['catFR1'].append(pf)
#         power_files_bothexp['FR1'].append(pf_FR)

exp = 'catFR1'
power_files_bothexp = {exp: np.sort(
    glob(os.path.join(
        paths['power'][exp], '*_pow.hdf5')))}
        
        
classifier_path = paths['classifier'][experiment]
subjpaths = power_files_bothexp[experiment]

time_bins = [(0.0, 1.6)]


In [7]:
cs = [np.logspace(np.log10(1e-6), np.log10(1e4), 22)[6]]

lr_params = {'penalty': 'l2', 'dual': False, 'tol': 0.0001, 'C': np.nan,
             'fit_intercept': True, 'intercept_scaling': 1,
             'class_weight': 'balanced', 'random_state': None,
             'solver': 'liblinear', 'max_iter': 2000,
             'multi_class': 'ovr', 'verbose': False}


In [8]:
def train_test_lr(lr_params, features, target, train, test, indx=-1):
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(**lr_params).fit(
        features[train], target[train])
    classifier_info = {
        'trained_classifier': lr,
        'train': train,
        'test': test,
        'decision_function': lr.decision_function(features[test]),
        'predict': lr.predict(features[test]),
        'log_probas': lr.predict_log_proba(features[test]),
        'probas': lr.predict_proba(features[test]),
        'score': lr.score(features[test], target[test]),
        'params': lr.get_params(),
        'coef': lr.coef_,
        'intercept': lr.intercept_,
        'indx': indx}
    return(classifier_info)


def train_test_lr_shuffles(lr_params, features, target, train, test,
                           shuffles=1000, indx=-1):
    from sklearn.linear_model import LogisticRegression
    shuffled_classifiers = []
    shuffled_classifier_info = []
    for i in range(shuffles):
        trgt = np.copy(target[train])
        np.random.shuffle(trgt)
        lr = LogisticRegression(**lr_params).fit(
            features[train], trgt)
        classifier_info = {
            'trained_classifier': lr,
            'train': train,
            'test': test,
            'decision_function': lr.decision_function(features[test]),
            'predict': lr.predict(features[test]),
            'log_probas': lr.predict_log_proba(features[test]),
            'probas': lr.predict_proba(features[test]),
            'score': lr.score(features[test], target[test]),
            'params': lr.get_params(),
            'coef': lr.coef_,
            'intercept': lr.intercept_,
            'train_target': trgt,
            'indx': indx}
        shuffled_classifiers.append(lr)
        shuffled_classifier_info.append(classifier_info)
    return(shuffled_classifiers, shuffled_classifier_info)


In [9]:
def get_valchans(subjdat):
    subj = subjdat['data'].attrs['subject']
    elfile = glob(rhino_mount+'/data/eeg/'+subj+
                  '*/docs/*electrode_categories.txt')
    if len(elfile) == 0:
        elfile = glob(rhino_mount+'/scratch/pwanda/electrode_categories/'+
                      subj+'*_electrode_categories.txt')
    if len(elfile) == 0:
        elfile = glob(rhino_mount+'/scratch/pwanda/electrode_categories/' +
                      'electrode_categories_' + subj + '*.txt')
    if len(elfile) == 2:
        subj_with_duplicates = ['R1171M', 'R1191J', 'R1290M', 'R1288P',
                                'R1230J']
        if subj in subj_with_duplicates:
            elfile.pop(0)
    if len(elfile) != 1:
        raise ValueError('Invalid elfile for subj '+subj+': '+str(elfile))
    elfile = elfile[0]
    with open(elfile, 'r') as ef:
        lines = [mystr.strip().replace(' ', '').upper()
                 for mystr in ef.readlines()
                 if len(mystr.strip().replace(' ', ''))> 0]
    if 'tal_struct' in subjdat.keys():
        bppairs = [bp.strip().replace(' ', '').upper().split('-')
                   for bp in subjdat['tal_struct/code']]
    elif 'h5info' in subjdat.keys():
        bppairs = [bp.strip().replace(' ', '').upper().split('-')
                   for bp in subjdat['h5info/bipolar_info/contact_name']]
    else:
        raise ValueError('Missing electrode info for '+str(subjdat))
    badchans = np.array([np.any([e in lines for e in bp])
                         for bp in bppairs])
    return(~badchans)


In [10]:
def get_features(data, time_bins=time_bins, sessfilt=None,
                 freqsfilt=None, channels=None, valid_chan_filter=None):
    # valid_chan_filter = ~np.all(~np.isfinite(data[:, 0, :, 0]), 1)
    if valid_chan_filter is None:
        valid_chan_filter = ~np.all(~np.isfinite(data[0, :, :, 0]), -1)
    else:
        valid_chan_filter &= ~np.all(~np.isfinite(data[0, :, :, 0]), -1)
    # valid_evs_filter = np.isfinite(np.min(data[valid_chan_filter, 0, :, 0], 0))
    valid_evs_filter = np.isfinite(np.min(data[0, valid_chan_filter, :, 0], 0))
    # make sure to delete complete lists:
    invalid_sess_list = [(s, l) for s, l in zip(
        data.dims[2]['session'][~valid_evs_filter],
        data.dims[2]['list'][~valid_evs_filter])]
    # it's a bit inefficient to loop through all of the elements of
    # the list but creating a list of unique (session, list) tuples is
    # a bit tricky in a way that works generally is a bit tricky
    # (tried with np.unique and set) and in generally the
    # invalid_sess_list list should be short:
    for inval_sess, inval_list in invalid_sess_list:
        valid_evs_filter[(data.dims[2]['session'][:] == inval_sess) &
                         (data.dims[2]['list'][:] == inval_list)] = False
    # data_valid = data_valid[:, :, valid_evs_filter]
    if freqsfilt is None:
        # freqsfilt = np.ones(data.shape[1], np.bool)
        freqsfilt = np.ones(data.shape[0], np.bool)
    # timebin_features = (len(data.dims[0]['channels'][valid_chan_filter]) * np.sum(freqsfilt))
    if sessfilt is not None:
        valid_evs_filter &= sessfilt
    timebin_features = np.sum(valid_chan_filter) * np.sum(freqsfilt)

    # if sessfilt is None:
    #     # sessfilt = np.ones(data.shape[2], np.bool)
    #     sessfilt = valid_evs_filter
    # else:
    #     sessfilt = valid_evs_filter & sessfilt
    sessdat = data[:, :, valid_evs_filter]
    # sessdat = sessdat[valid_chan_filter]
    sessdat = sessdat[:, valid_chan_filter]
    # sessdat = sessdat[:, freqsfilt]
    sessdat = sessdat[freqsfilt]
    features = {}
    # features['data'] = np.ones((len(data.dims[2]['recalled']),
    features['data'] = np.ones((np.sum(valid_evs_filter),
                                timebin_features * len(time_bins))) * np.nan
    features['time_bin_labels'] = []
    features['channels'] = []
    features['freqs'] = []
    if channels is None:
        channels = data.dims[0][0]
    for t, tb in enumerate(time_bins):
        time_filter = ((data.dims[3]['time'].value >= tb[0]) &
                       (data.dims[3]['time'].value < tb[1]))
        features['data'][
            :, (t * timebin_features):((t + 1) * timebin_features)] = np.mean(
            sessdat[:, :, :, time_filter], 3).reshape(
                (np.sum(valid_chan_filter) *
                 np.sum(freqsfilt),
                 # len(data.dims[1]['freqs']),
                 np.sum(valid_evs_filter))).T
        #          len(data.dims[2]['recalled']))).T
        features['time_bin_labels'].extend(
            [str(tb[0]) + '-' + str(tb[1])] * timebin_features)
        for chan in channels[valid_chan_filter]:
            features['channels'].extend(
                # [chan] * (len(data.dims[1]['freqs'])))
                [chan] * (np.sum(freqsfilt)))
            features['freqs'].extend(data.dims[1][0][freqsfilt])
    return(features, valid_chan_filter, valid_evs_filter)


In [28]:
def run_classification(subjpath, train_test_func, train_test_func_shuffles,
                       classifier_params,
                       classifier_path, time_bins, shuffles=1000):
    classifier_path_pickle = classifier_path + 'pickle_files/'
    classifier_path_hdf5 = classifier_path + 'hdf5_files/'
    if not os.path.exists(classifier_path_pickle +
                          str(classifier_params['C']) + '/'):
        try:
            os.makedirs(classifier_path_pickle+
                        str(classifier_params['C']) + '/')
        except OSError as e:
            print('no problem', e)
    if not os.path.exists(classifier_path_hdf5 +
                          str(classifier_params['C']) + '/'):
        try:
            os.makedirs(classifier_path_hdf5 +
                        str(classifier_params['C']) + '/')
        except OSError as e:
            print('no problem', e)
    subjdat = h5py.File(subjpath, 'r')
    if 'session' in subjdat['data'].attrs:
        session = subjdat['data'].attrs['session']
        out_path_pickle = str(
            classifier_path_pickle + str(classifier_params['C']) +
            '/' + str(subjdat['data'].attrs['subject']) +
            '_' + str(subjdat['data'].attrs['session']) +
            '_classifier_info.pickle')
        out_path_hdf5 = str(classifier_path_hdf5 + str(classifier_params['C']) +
                         '/' + str(subjdat['data'].attrs['subject']) +
                         '_' + str(subjdat['data'].attrs['session']) +
                         '_classifier_info.hdf5')
    else:
        out_path_pickle = str(
            classifier_path_pickle + str(classifier_params['C']) +
            '/' + str(subjdat['data'].attrs['subject']) + 
            '_classifier_info.pickle')
        out_path_hdf5 = str(
            classifier_path_hdf5 + str(classifier_params['C']) + '/' +
            str(subjdat['data'].attrs['subject']) + 
            '_classifier_info.hdf5')
    if os.path.exists(out_path_pickle) or os.path.exists(out_path_hdf5):
        return
    try:
        out_hdf5 = h5py.File(out_path_hdf5, 'w-', libver='latest')
    except IOError:
        print('Cannot create', out_path_hdf5)
        return
    sess_list_labels = np.array(['{:02d}'.format(s)+'-'+'{:02d}'.format(l)
                                 for s,l in zip(
                                         subjdat['data'].dims[2]['session'][:],
                                         subjdat['data'].dims[2]['list'][:])])
    sessfilt = (np.array([np.sum((subjdat['data'].dims[2]['session'][:] == s) &
                                (subjdat['data'].dims[2]['list'][:] == l))
                         for s,l in zip(subjdat['data'].dims[2]['session'][:],
                                        subjdat['data'].dims[2]['list'][:])])
                == 12)
    # remove practice lists:
    sessfilt &= subjdat['data'].dims[2]['session'][:] >= 0
    # sess_list_labels = sess_list_labels[sessfilt]
    # sess_list_labels = sess_list_labels
    logo = LeaveOneGroupOut()
    # lolo = cross_validation.LeaveOneLabelOut(sess_list_labels)
    # recalled = subjdat['data'].dims[2]['recalled'][sessfilt] == 1
    recalled = subjdat['data'].dims[2]['recalled'][:] == 1
    res = {}
    res_shuffles = {}
    means = {}
    stds = {}
    if GOOD_ELECS_ONLY:
        valid_chan_filter = get_valchans(subjdat)
    else:
        valid_chan_filter = None
    chans = None
    if 'h5info' in subjdat.keys():
        if 'bipolar_info' in subjdat['h5info'].keys():
            chans = subjdat['h5info/bipolar_info/contact_name'][:]
    elif 'tal_struct' in subjdat.keys():
        # if 'code' in subjdat['tal_struct'].keys():           
        chans = subjdat['tal_struct/code'][:]
    for tb in time_bins:
        means[tb] = []
        stds[tb] = []
        res[tb] = []
        res_shuffles[tb] = []
        features, valid_chan_filter, valid_evs_filter = get_features(
            subjdat['data'], time_bins=[tb], sessfilt=sessfilt,
            # freqsfilt=subjdat['freqs'][:] > 9)
            freqsfilt=subjdat['freqs'][:] > 0, channels=chans,
            valid_chan_filter=valid_chan_filter)
        # for study_list, (train, test) in enumerate(lolo):
        for study_list, (train, test) in enumerate(
                logo.split(features['data'], recalled[valid_evs_filter],
                                  groups=sess_list_labels[valid_evs_filter])):
            print(study_list, '/', logo.get_n_splits(
                features['data'], recalled[valid_evs_filter],
                groups=sess_list_labels[valid_evs_filter]))
            features_norm = features['data'].copy()
            means[tb].append([])
            stds[tb].append([])
            assert(np.all([sess in np.unique(subjdat['events/session'][
                valid_evs_filter][train])
                           for sess in np.unique(subjdat['events/session'][
                                   valid_evs_filter][test])]))
            for sess in np.unique(
                    subjdat['events/session'][valid_evs_filter]):
                scalesessfilt = subjdat['events/session'][valid_evs_filter] == sess
                sessmeans = features_norm[train][scalesessfilt[train]].mean(0)
                sessstds = features_norm[train][scalesessfilt[train]].std(0)
                means[tb][-1].append(sessmeans)
                stds[tb][-1].append(sessstds)
                features_norm[scalesessfilt] -= sessmeans
                features_norm[scalesessfilt] /= sessstds
            res[tb].append(
                train_test_func(
                    classifier_params, features_norm,
                    recalled[valid_evs_filter], train,
                    test, study_list))
            res_shuffles[tb].append(
                train_test_func_shuffles(
                    classifier_params, features_norm,
                    recalled[valid_evs_filter], train,
                    test, shuffles, study_list))
    try:
        pickle.dump([res, res_shuffles], open(out_path_pickle, 'w'), -1)
        # make file read only to avoid accidental loss:
        os.chmod(out_path_pickle, 0o444)
    except (IOError, TypeError):
        print('Cannot create ', out_path_pickle)
    features_out = out_hdf5.create_dataset(
        'features', data=features['data'],
        compression='gzip', compression_opts=9)
    features_out.dims[0].label = 'events'
    # out_hdf5.copy(subjdat['events'], out_hdf5, name='events')
    out_hdf5.copy(subjdat['events'], out_hdf5, name='events')
    for key in out_hdf5['events'].keys():
        features_out.dims.create_scale(out_hdf5['events/' + key], key)
        features_out.dims[0].attach_scale(out_hdf5['events/' + key])
    # create dimension for features:
    if 'tal_struct' in subjdat.keys():
        out_hdf5.copy(subjdat['tal_struct'], out_hdf5, name='tal_struct')
    if 'h5info' in subjdat.keys():
        out_hdf5.copy(subjdat['h5info'], out_hdf5, name='h5info')
    recalled = out_hdf5.create_dataset(
        'target', data=recalled, compression="gzip", compression_opts=9)
    out_hdf5['channels'] = features['channels']
    out_hdf5['valid_chan_filter'] = valid_chan_filter
    out_hdf5['valid_evs_filter'] = valid_evs_filter    
    out_hdf5['freqs'] = features['freqs']
    # out_hdf5['time_bins'] = features['time_bin_labels']
    out_hdf5['time_bins'] = time_bins
    features_out.dims[1].label = 'time_bins*channels*frequencies'
    features_out.dims.create_scale(out_hdf5['channels'], 'channels')
    features_out.dims[1].attach_scale(out_hdf5['channels'])
    features_out.dims.create_scale(out_hdf5['freqs'], 'freqs')
    features_out.dims[1].attach_scale(out_hdf5['freqs'])
    # features_out.dims.create_scale(out_hdf5['time_bins'], 'time_bins')
    # features_out.dims[1].attach_scale(out_hdf5['time_bins'])
    #
    for param in classifier_params:
        if param == 'random_state':
            features_out.attrs[param] = str(classifier_params[param])
        else:
            features_out.attrs[param] = classifier_params[param]
    for c_info in res[time_bins[0]][0]:
        if c_info == 'trained_classifier':
            continue
        elif c_info == 'params':
            for param in res[time_bins[0]][0][c_info]:
                if param == 'random_state':
                    features_out.attrs[param] = str(
                        res[time_bins[0]][0][c_info][param])
                else:
                    features_out.attrs[param] = res[
                        time_bins[0]][0][c_info][param]
            continue
        out_hdf5.create_dataset(
            c_info, data = [[r[c_info] for r in res[tb]] for tb in time_bins],
            compression="gzip", compression_opts=9)
        out_hdf5.create_dataset(
            c_info + '_shuffles',
            data = [[[r[c_info] for r in rtb[1]] for rtb in res_shuffles[tb]]
                    for tb in time_bins],
            compression="gzip", compression_opts=9)
    for tb in time_bins:
        out_hdf5.create_dataset('means/'+str(tb), data=means[tb],
                                compression="gzip", compression_opts=9)
        out_hdf5.create_dataset('stds/'+str(tb), data=stds[tb],
                                compression="gzip", compression_opts=9)
    out_hdf5.create_dataset(
        'train_targs_shuffles',
        data = [[[r['train_target'] for r in rtb[1]]
                 for rtb in res_shuffles[tb]]
                for tb in time_bins],
        compression="gzip", compression_opts=9)
    out_hdf5.close()
    # make file read only to avoid accidental loss:
    os.chmod(out_path_hdf5, 0o444)


In [29]:
for subjpath in subjpaths:
    for c in cs:
        print(c, subjpath)
        lr_params['C'] = c
        run_classification(subjpath, train_test_lr, train_test_lr_shuffles,
                           lr_params, classifier_path, time_bins)



0.0007196856730011514 /scratch/cweidema/bootcamp/tmpdat/RAM/RAM_catFR/RAM_catFR1_power/encoding/hdf5_files_subj/R1374T_pow.hdf5
0 / 26
1 / 26
2 / 26
3 / 26
4 / 26
5 / 26
6 / 26
7 / 26
8 / 26
9 / 26
10 / 26
11 / 26
12 / 26
13 / 26
14 / 26
15 / 26
16 / 26
17 / 26
18 / 26
19 / 26
20 / 26
21 / 26
22 / 26
23 / 26
24 / 26
25 / 26
Cannot create  /scratch/cweidema/bootcamp/tmpdat/RAM/RAM_FRcatFR/encoding/RAM_catFR1lolo_classifiers_allfreqs/RAM_catFR1lolo_classifiers_allfreqs_withshuffles/pickle_files/0.0007196856730011514/b'R1374T'_classifier_info.pickle
0.0007196856730011514 /scratch/cweidema/bootcamp/tmpdat/RAM/RAM_catFR/RAM_catFR1_power/encoding/hdf5_files_subj/R1375C_pow.hdf5
0 / 52
1 / 52
2 / 52
3 / 52
4 / 52
5 / 52
6 / 52
7 / 52
8 / 52
9 / 52
10 / 52
11 / 52
12 / 52
13 / 52
14 / 52
15 / 52
16 / 52
17 / 52
18 / 52
19 / 52
20 / 52
21 / 52
22 / 52
23 / 52
24 / 52
25 / 52
26 / 52
27 / 52
28 / 52
29 / 52
30 / 52
31 / 52
32 / 52
33 / 52
34 / 52
35 / 52
36 / 52
37 / 52
38 / 52
39 / 52
40 / 52
4