## Initial Package Import and Simple Module definition

Modified from ZFTurbo

In [None]:
#%matplotlib inline


import shutil
import json

import datetime
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from scipy.io import loadmat
from operator import itemgetter
import random
import os
import time
import glob
import re
from multiprocessing import Process
import copy

#importing CSP modules
import mne
from mne.decoding import CSP



#Importing old and new Kfold
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold as NewKF
from sklearn.model_selection import StratifiedKFold as StratKF

#Importing GroupKfold, only available since version 0.18
from sklearn.model_selection import GroupKFold


#Importing function for scaling data before PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale

#Importing PCA packages
from sklearn.decomposition import PCA, KernelPCA

#Importing FFT package
from scipy.fftpack import fft

#Importing crossvalidation metrics and Gridsearch
from sklearn import cross_validation, metrics
from sklearn.model_selection import GridSearchCV

#Importing wrapper to use XGB with Gridsearch

from xgboost.sklearn import XGBClassifier

#Importing plotting packages (optional)

import matplotlib.pylab as plt

from pandas.tools.plotting import scatter_matrix

from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import GroupShuffleSplit

#Oversampling

from imblearn.over_sampling import SMOTE

from collections import Counter, OrderedDict

from imblearn.combine import SMOTETomek

###Band Frequency filtering###
from scipy.signal import cheby2, butter, lfilter


from numpy import inf


#Defining general modules used in the classification

# Seed both the standard-library and the NumPy global random generators with
# the same fixed value.
random.seed(2016)
np.random.seed(2016)


def natural_key(string_):
    """Return a sort key that orders embedded numbers numerically.

    The string is split on runs of digits; digit runs become ``int`` and
    everything else stays a ``str``, e.g. ``'a10b'`` -> ``['a', 10, 'b']``.
    """
    key = []
    for chunk in re.split(r'(\d+)', string_):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk)
    return key


def create_feature_map(features):
    """Write the feature-map file ``xgb.fmap`` in the current directory.

    One line is written per feature: ``<index>\\t<name>\\tq``, where the index
    is the position of the feature in *features*.

    The file is opened with a context manager so the handle is closed even
    if a write fails part-way through.
    """
    with open('xgb.fmap', 'w') as outfile:
        outfile.writelines(
            '{0}\t{1}\tq\n'.format(i, feat) for i, feat in enumerate(features)
        )


def get_importance(gbm, features):
    """Return ``(feature, score)`` pairs from *gbm*, highest score first.

    The feature map ``xgb.fmap`` is (re)written from *features* and passed to
    ``gbm.get_fscore``; the resulting mapping is sorted by its values in
    descending order.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(scores.items())
    # list.sort is stable, like sorted(), so ties keep their original order.
    ranked.sort(key=itemgetter(1), reverse=True)
    return ranked


def intersect(a, b):
    """Return the elements present in both *a* and *b* as a list.

    Duplicates are dropped and the order of the result is unspecified.
    """
    left = set(a)
    right = set(b)
    return list(left.intersection(right))


def print_features_importance(imp):
    """Print each ``(name, score)`` entry of *imp* as two lines.

    The first line is ``# <score>``; the second is a ready-to-paste
    ``output.remove('<name>')`` statement.
    """
    for entry in imp:
        name, score = entry[0], entry[1]
        print("# " + str(score))
        print('output.remove(\'' + name + '\')')


def mat_to_pandas(path):
    """Load one ``.mat`` recording and return ``(frame, sequence, samp_freq)``.

    * ``frame`` is a DataFrame built from the ``data`` field of the
      ``dataStruct`` struct, with the first row of ``channelIndices`` as
      column labels.
    * ``sequence`` is the raw ``dataStruct['sequence']`` array when that field
      exists, otherwise the sentinel ``-1``.
    * ``samp_freq`` is element ``[0, 0]`` of the ``iEEGsamplingRate`` field.
    """
    struct = loadmat(path)['dataStruct']
    field_names = struct.dtype.names

    # Unwrap every field of the 1x1 struct array once.
    fields = {}
    for field in field_names:
        fields[field] = struct[field][0, 0]

    sampling_rate = fields['iEEGsamplingRate'][0, 0]
    sequence = struct['sequence'] if 'sequence' in field_names else -1
    frame = pd.DataFrame(fields['data'], columns=fields['channelIndices'][0])
    return frame, sequence, sampling_rate

def mat_to_pandas_seq(path):
    """Return only the ``sequence`` field of the ``dataStruct`` in *path*.

    The raw ``dataStruct['sequence']`` array is returned when the field
    exists; otherwise the sentinel ``-1`` is returned.
    """
    struct = loadmat(path)['dataStruct']
    if 'sequence' not in struct.dtype.names:
        return -1
    return struct['sequence']

def create_submission(score, test, prediction, feature_model, short_size, new_test):
    """Write a ``File,Class`` submission CSV to the current directory.

    The file name encodes *score*, *feature_model*, *short_size*, *new_test*
    and the current time (minute resolution).

    Parameters
    ----------
    score : anything convertible with ``str``; used only in the file name.
    test : mapping/DataFrame whose ``'Id'`` entry iterates over integer ids
        encoded as ``patient * 100000 + file_number``.
    prediction : indexable; ``prediction[k]`` is written for the k-th id.
    feature_model, short_size, new_test : used only in the file name.

    The file is written inside a ``with`` block so it is closed even when an
    id or prediction lookup raises part-way through.
    """
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + 'model_'+str(feature_model)+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'_'+ str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    with open(sub_file, 'w') as f:
        f.write('File,Class\n')
        for row, file_id in enumerate(test['Id']):
            # Undo the patient * 100000 + file_number encoding of the id.
            patient = file_id // 100000
            fid = file_id % 100000
            f.write('new_' + str(patient) + '_' + str(fid) + '.mat'
                    + ',' + str(prediction[row]) + '\n')


def get_features(train, test):
    """Return the sorted column names shared by *train* and *test*, minus ``'Id'``.

    Raises ``ValueError`` (from ``list.remove``) when ``'Id'`` is not a shared
    column.
    """
    train_columns = list(train.columns.values)
    test_columns = list(test.columns.values)
    shared = intersect(train_columns, test_columns)
    shared.remove('Id')
    # 'file_size' is deliberately kept as a feature (its removal is disabled).
    shared.sort()
    return shared

    


## Module for Creating Features and Saving to CSV Files

#### One file per patient per test/training

In [None]:
#Modules to read train and test data.
#short_dataset can be True or False. True reads the .mat files from ./data/train_<id> (./data/test_<id>); False reads them from ./train_<id> (./test_<id>).

#patient_id:
#fini
#fend
#fovr
#feature_model


def create_simple_csv_train(patient_id, feature_model, num_features, fini, fend, fovr,
                            short_dataset=False, new_test=False):
    """Extract features from one patient's training ``.mat`` files into a CSV.

    The output file is
    ``simple_train_<patient_id>_<feature_model>_short_<short_dataset>_new_test_<new_test>_fini_<fini>_fend_<fend>_fovr_<fovr>.csv``
    with the columns ``Id, sequence_id, sequence_num, patient_id``, then
    ``ch_<i>_band_<j>`` for 16 channels x *num_features*, then
    ``file_size, result``.

    Parameters
    ----------
    patient_id : patient number; selects the source folder.
    feature_model : feature set selector passed on to ``feature_eng``.
    num_features : number of feature columns declared per channel in the
        header. It must match what ``feature_eng`` emits for *feature_model*.
    fini, fend, fovr : passed on to ``feature_eng`` and used in the file name.
    short_dataset : read from ``./data/train_<id>`` instead of ``./train_<id>``.
    new_test : only used in the output file name here.

    Only files marked ``safe == 1`` in ``train_and_test_data_labels_safe.csv``
    are processed. Files that fail to load are reported and skipped.

    All rows are built first and the CSV is written in one go at the end, so
    a failure never leaves a header-only or half-open file behind.
    """
    # Suffix shared by the train and test feature files of one configuration.
    feature_file = ('_' + str(feature_model) + '_short_' + str(short_dataset)
                    + '_new_test_' + str(new_test)
                    + '_fini_' + str(fini) + '_fend_' + str(fend) + '_fovr_' + str(fovr))

    # Short or full dataset.
    source_dir = "./data/train_" if short_dataset else "./train_"

    # Header: fixed id columns, one column per (channel, feature), then labels.
    columns = ''.join('ch_' + str(i) + '_' + "band_" + str(j) + ","
                      for i in range(16) for j in range(num_features))
    header = "Id,sequence_id,sequence_num,patient_id," + columns + "file_size,result\n"

    files = sorted(glob.glob(source_dir + str(patient_id) + "/*.mat"), key=natural_key)
    print ('train files'+ str(patient_id), len(files))

    # Counters used for the sequence ids and for the debugging output.
    pos1 = 0
    neg1 = 0
    sequence_id_pre = int(patient_id)*1000
    sequence_id_inter = int(patient_id)*1000
    total_pre = 0
    total_inter = 0
    seq1 = 0

    # Keep only the files flagged as 'safe'; a set gives O(1) membership tests.
    new_train = pd.read_csv('train_and_test_data_labels_safe'+'.csv')
    selection = new_train[new_train['safe'] == 1].drop('safe', axis=1)
    safe_images = set(selection['image'].values)

    rows = []
    for fl in files:
        base_name = os.path.basename(fl)
        if base_name not in safe_images:
            continue

        # File names look like <patient>_<number>_<label>.mat
        arr = base_name[:-4].split("_")
        patient = int(arr[0])
        file_num = int(arr[1])
        result = int(arr[2])

        # Files are grouped in runs of six per class; the id combines the
        # number of completed runs of both classes.
        if result == 1:
            total_pre += 1
            sequence_id = int(patient_id)*1000+int((total_pre-1) // 6) + int((total_inter-1) // 6) + 1
        elif result == 0:
            total_inter += 1
            sequence_id = int(patient_id)*1000+int((total_pre) // 6) + int((total_inter-1) // 6)
        else:
            # Without this branch sequence_id would be stale or undefined.
            print('Unexpected label {} in {}, skipping...'.format(result, fl))
            continue

        new_id = int(patient*100000 + file_num)
        try:
            tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
            seq1 = int(sequence_from_mat[0][0][0][0])
        except Exception:
            # Best effort: report the unreadable file and move on.
            print('Some error here {}...'.format(fl))
            continue

        # Position (1..6) expected from the file number, for a sanity check.
        # NOTE(review): (new_id % 1000) % 6 differs from file_num % 6 once
        # file numbers reach 1000 -- confirm this is intended (print only).
        sequence_validator = (new_id % 1000) % 6
        if sequence_validator == 0:
            sequence_validator = 6

        if seq1 != sequence_validator:
            print('sequence mismatch!',seq1, sequence_validator)
        else:
            print('sequence match ',seq1, sequence_validator)

        print(sequence_id)
        row = str(new_id) + "," + str(sequence_id) + "," + str(seq1) + ","+str(patient)

        sizesignal = int(tables.shape[0])

        # feature_eng appends ",<value>" items for one channel to the row.
        for f in sorted(list(tables.columns.values)):
            row = feature_eng(tables[f], row, feature_model, sizesignal, samp_freq, fini, fend, fovr)

        row += "," + str(os.path.getsize(fl)) + "," + str(result) + "\n"
        rows.append(row)

        print('total preictal: ', total_pre,' total interictal: ', total_inter,' sequence local: ', seq1)
        if (total_pre % 6 == 0) and result == 1:
            pos1 += 1
            print('Positive ocurrence sequence finished', pos1)
            if seq1 == 6:
                sequence_id_pre += 1
                print ('sequence preictal next',sequence_id_pre)

        if (total_inter % 6 == 0) and result == 0:
            neg1 += 1
            print('Negative ocurrence sequence finished', neg1)
            if seq1 == 6:
                sequence_id_inter += 1
                print ('sequence interictal next',sequence_id_inter)

    with open("simple_train_" + str(patient_id) + feature_file + ".csv", "w") as out:
        out.write(header)
        out.writelines(rows)

    print('Train CSV for patient {} has been completed...'.format(patient_id))


def create_simple_csv_test(patient_id, feature_model, num_features, fini, fend, fovr,
                           short_dataset=False, new_test=False):
    """Extract features from one patient's test ``.mat`` files into a CSV.

    The output file is
    ``simple_test_<patient_id>_<feature_model>_short_<short_dataset>_new_test_<new_test>_fini_<fini>_fend_<fend>_fovr_<fovr>.csv``
    with the columns ``Id, patient_id``, then ``ch_<i>_band_<j>`` for
    16 channels x *num_features*, then ``file_size``.

    Parameters
    ----------
    patient_id : patient number; selects the source folder.
    feature_model : feature set selector passed on to ``feature_eng``.
    num_features : number of feature columns declared per channel in the
        header. It must match what ``feature_eng`` emits for *feature_model*.
    fini, fend, fovr : passed on to ``feature_eng`` and used in the file name.
    short_dataset : read from ``./data/test_<id>...`` instead of
        ``./test_<id>...``.
    new_test : when true, ``_new`` is appended to the source folder name.

    Files that fail to load are reported and skipped. The CSV is written
    inside a ``with`` block so the handle is always closed.
    """
    feature_file = ('_' + str(feature_model) + '_short_' + str(short_dataset)
                    + '_new_test_' + str(new_test)
                    + '_fini_' + str(fini) + '_fend_' + str(fend) + '_fovr_' + str(fovr))

    source_dir = "./data/test_" if short_dataset else "./test_"
    new_label = "_new" if new_test else ''

    files = sorted(glob.glob(source_dir + str(patient_id) + new_label + "/*.mat"), key=natural_key)
    print ('test files'+ str(patient_id), len(files))

    columns = ''.join('ch_' + str(i) + '_' + "band_" + str(j) + ","
                      for i in range(16) for j in range(num_features))

    with open("simple_test_" + str(patient_id) + feature_file + ".csv", "w") as out:
        out.write("Id,patient_id,")
        out.write(columns + "file_size\n")

        rows = []
        for fl in files:
            # Drop the first four characters and the '.mat' extension.
            # NOTE(review): this presumably strips a 'new_' prefix, i.e. it
            # assumes new-style test file names -- confirm for new_test=False.
            id_str = os.path.basename(fl)[4:-4]
            arr = id_str.split("_")
            patient = int(arr[0])
            file_num = int(arr[1])
            new_id = int(patient*100000 + file_num)
            try:
                tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
            except Exception:
                # Best effort: report the unreadable file and move on.
                print('Some error here {}...'.format(fl))
                continue

            row = str(new_id) + "," + str(patient)
            sizesignal = int(tables.shape[0])

            # feature_eng appends ",<value>" items for one channel to the row.
            for f in sorted(list(tables.columns.values)):
                row = feature_eng(tables[f], row, feature_model, sizesignal, samp_freq, fini, fend, fovr)

            row += "," + str(os.path.getsize(fl)) + "\n"
            rows.append(row)

        # Rows are written only after every file was processed, as before.
        out.writelines(rows)

    print('Test CSV for patient {} has been completed...'.format(patient_id))


In [None]:

def create_subset_train(feature_model):
    """Copy a random ~10% of the training sequences into ``./data_random``.

    For each of ``train_1``, ``train_2`` and ``train_3`` the feature file
    ``simple_<folder>_<feature_model>_short_False_new_test_True.csv`` is read,
    one tenth of the distinct ``sequence_id`` values is drawn per class
    (``result`` 1 and 0), and every ``.mat`` file belonging to a drawn
    sequence is copied from ``./<folder>/`` to ``./data_random/<folder>/``.

    The file number is recovered with ``Id % 100000`` because ids are encoded
    as ``patient * 100000 + file_number``; the previous ``% 1000`` picked the
    wrong file for numbers of 1000 and above. The target folder is created
    when missing.
    """
    folder_list = ['train_1', 'train_2', 'train_3']

    for folder_label in folder_list:
        short_size = False
        new_test = True

        # First digit found in the folder name is the patient number.
        patient_id = int(list(filter(str.isdigit, folder_label))[0])

        # Length of the '<kind>_<patient>' prefix used in the feature file name.
        var = 6 if len(folder_label) > 7 else 7

        files = pd.read_csv('simple_'+ folder_label[0:var]+'_'+str(feature_model)+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
        sequence_ids = files['sequence_id']

        # One row per sequence, used to split the sequence ids by class.
        files_unique = files.drop_duplicates(subset=['sequence_id'])
        files_1 = files_unique[files_unique['result'] == 1]['sequence_id']
        files_0 = files_unique[files_unique['result'] == 0]['sequence_id']

        print(files_1.shape)
        print(files_0.shape)

        n_samples_1 = int(len(files_1)/10)
        n_samples_0 = int(len(files_0)/10)

        # NOTE(review): np.random.choice samples WITH replacement by default,
        # so fewer than n distinct sequences may be drawn -- confirm whether
        # replace=False was intended.
        files_1_rand = np.random.choice(files_1, size=n_samples_1)
        files_0_rand = np.random.choice(files_0, size=n_samples_0)

        target_dir = './data_random/' + folder_label
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        # Preictal (1) sequences first, then interictal (0), as before.
        for label, chosen in ((1, files_1_rand), (0, files_0_rand)):
            for file_id in files[sequence_ids.isin(chosen)]['Id']:
                id_file = str(file_id % 100000)
                mat_name = str(patient_id)+'_'+id_file+'_'+str(label)+'.mat'
                source_file = './'+folder_label+'/'+mat_name
                target_file = target_dir+'/'+mat_name
                print(target_file)

                shutil.copyfile(source_file, target_file)
    

In [None]:
# Notebook cell: build the random training subset for feature model 3.
# Only feature_model is passed on; create_subset_train sets its own local
# short_size and new_test, so the values below do not affect that call.
feature_model=3
short_size=False
num_features=6
new_test=True


create_subset_train(feature_model)

In [None]:


def create_subset_test(feature_model):
    """Copy a random ~10% of the test files into ``./data_random``.

    For each of ``test_1_new``, ``test_2_new`` and ``test_3_new`` the feature
    file ``simple_test_<patient>_<feature_model>_short_False_new_test_True.csv``
    is read, one tenth as many ids as it has rows are drawn from its ``Id``
    column, and the matching ``new_<patient>_<number>.mat`` files are copied
    from ``./<folder>/`` to ``./data_random/<folder>/``.

    The file number is recovered with ``Id % 100000`` because ids are encoded
    as ``patient * 100000 + file_number``; the previous ``% 1000`` picked the
    wrong file for numbers of 1000 and above. The target folder is created
    when missing.
    """
    folder_list = ['test_1_new', 'test_2_new', 'test_3_new']

    for folder_label in folder_list:
        short_size = False
        new_test = True

        # First digit found in the folder name is the patient number.
        patient_id = int(list(filter(str.isdigit, folder_label))[0])

        # Length of the '<kind>_<patient>' prefix used in the feature file name.
        var = 6 if len(folder_label) > 7 else 7

        files = pd.read_csv('simple_'+ folder_label[0:var]+'_'+str(feature_model)+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')

        files_total = files['Id']
        n_samples = int(len(files_total)/10)

        # NOTE(review): np.random.choice samples WITH replacement by default,
        # so the same file may be drawn twice -- confirm whether
        # replace=False was intended.
        files_rand = np.random.choice(files_total, size=n_samples)

        target_dir = './data_random/' + folder_label
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        for file_id in files_rand:
            id_file = str(file_id % 100000)
            mat_name = 'new_'+str(patient_id)+'_'+id_file+'.mat'
            source_file = './'+folder_label+'/'+mat_name
            target_file = target_dir+'/'+mat_name
            print(target_file)

            shutil.copyfile(source_file, target_file)


In [None]:
# Notebook cell: build the random test subset for feature model 3.
# Only feature_model is passed on; create_subset_test sets its own local
# short_size and new_test, so the values below do not affect that call.
feature_model=3
short_size=False
num_features=6
new_test=True


create_subset_test(feature_model)

## Feature Engineering

In [None]:
#eng_number is the feature-model selector (called feature_model elsewhere); it takes the integer values handled by the branches below.

def _band_power_features(data_sensor, out_str, fs, fini, fend, fovr,
                         frng=4, order=5, attenuation=20.0):
    """Append the mean power of each band-pass filtered copy of the signal."""
    # Nyquist frequency; band edges are normalised by it for the filter design.
    fnyq = fs / 2.0
    for f in range(fini, fend - fovr, frng - fovr):
        # An array (not a list) so the division works for any numeric fs.
        band = np.array([f, f + frng]) / fnyq
        b, a = cheby2(order, attenuation, band, btype='band')
        data_filter = lfilter(b, a, data_sensor)
        out_str += "," + str(np.square(data_filter).mean())
    return out_str


def _fft_peak_features(data_sensor, out_str, sizesignal, width, count):
    """Append the signal mean and *count* averages of *width* FFT bins each."""
    yf1 = fft(data_sensor)
    # Floor division keeps the slice bound an int; 2.0 avoids integer division.
    fftpeak = 2.0 / sizesignal * np.abs(yf1[0:sizesignal // 2])
    out_str += "," + str(data_sensor.mean())
    for k in range(count):
        out_str += "," + str(fftpeak[k * width:(k + 1) * width].mean())
    return out_str


def feature_eng(data_sensor, out_str, eng_number, sizesignal, fs, fini5,fend5,fovr5):
    """Append the features of one channel to *out_str* and return the result.

    Every feature is appended as ``",<value>"``.

    Parameters
    ----------
    data_sensor : 1-D signal of one channel (must provide ``.mean()``).
    out_str : string the features are appended to.
    eng_number : feature model selector:
        0 - signal mean (1 value);
        1 - signal mean + 4 averages of 5 FFT bins (5 values);
        2 - signal mean + 12 averages of 3 FFT bins (13 values);
        3 - mean band power in 4 Hz bands from 7 to 30 Hz (6 values);
        4 - mean band power in 4 Hz bands from 4 to 40 Hz (9 values);
        5 - mean band power in 4 Hz bands defined by *fini5*, *fend5* and
            *fovr5*.
        Any other value returns *out_str* unchanged.
    sizesignal : number of samples in *data_sensor*.
    fs : sampling frequency, used to normalise the band edges.
    fini5, fend5, fovr5 : start frequency, end frequency and band overlap;
        only used when ``eng_number == 5``.

    Band powers use a 5th-order Chebyshev type II band-pass filter with 20 dB
    stop-band attenuation. The FFT is computed only for the modes that use it.
    """
    if eng_number == 5:
        out_str = _band_power_features(data_sensor, out_str, fs, fini5, fend5, fovr5)
    elif eng_number == 4:
        out_str = _band_power_features(data_sensor, out_str, fs, 4, 40, 0)
    elif eng_number == 3:
        out_str = _band_power_features(data_sensor, out_str, fs, 7, 30, 0)
    elif eng_number == 2:
        out_str = _fft_peak_features(data_sensor, out_str, sizesignal, 3, 12)
    elif eng_number == 1:
        out_str = _fft_peak_features(data_sensor, out_str, sizesignal, 5, 4)
    elif eng_number == 0:
        out_str += "," + str(data_sensor.mean())

    return out_str

## Module for Reading Test and Train Feature Files and Concatenating

In [None]:


def read_test_train(feature_model, short_size=False, new_test=False):
    """Load and concatenate the feature CSVs of patients 1-3.

    Reads ``simple_train_<p>_<feature_model>_short_<short_size>_new_test_<new_test>.csv``
    and the matching ``simple_test_<p>_...`` files for p = 1, 2, 3.
    Training rows with ``file_size <= 55000`` are dropped, the rest is
    shuffled and re-indexed (the old index is kept as an ``index`` column).

    Returns ``(train, test, features)`` where *features* comes from
    ``get_features(train, test)``.

    NOTE(review): these names have no ``_fini_/_fend_/_fovr_`` part, unlike
    the files written by ``create_simple_csv_train`` -- confirm which files
    this is meant to read.
    """
    patients = (1, 2, 3)
    suffix = ('_' + str(feature_model) + '_short_' + str(short_size)
              + '_new_test_' + str(new_test) + '.csv')

    print("Load train.csv...")
    train = pd.concat([pd.read_csv('simple_train_' + str(p) + suffix)
                       for p in patients])
    # Remove all zeroes files
    big_enough = train['file_size'] > 55000
    train = train[big_enough].copy()
    # Shuffle rows since they are ordered, then rebuild the broken index
    shuffled_order = np.random.permutation(len(train))
    train = train.iloc[shuffled_order].reset_index()

    print("Load test.csv...")
    test = pd.concat([pd.read_csv('simple_test_' + str(p) + suffix)
                      for p in patients])

    print("Process tables...")
    features = get_features(train, test)
    return train, test, features
    

 

In [None]:


def read_test_train_per_patient(feature_model, short_size=False, new_test=False, fini=4, fend=40, fovr=0):
    """Load the feature CSVs of patients 1-3 without merging them.

    Reads ``simple_train_<p>_<feature_model>_short_<short_size>_new_test_<new_test>_fini_<fini>_fend_<fend>_fovr_<fovr>.csv``
    and the matching ``simple_test_<p>_...`` files for p = 1, 2, 3.
    For each patient, training rows with ``file_size <= 55000`` are dropped,
    the rest is shuffled and re-indexed (the old index is kept as an
    ``index`` column).

    Returns ``[[train1, test1, features1], [train2, ...], [train3, ...]]``.
    """
    patients = (1, 2, 3)
    suffix = ('_' + str(feature_model) + '_short_' + str(short_size)
              + '_new_test_' + str(new_test)
              + '_fini_' + str(fini) + '_fend_' + str(fend) + '_fovr_' + str(fovr)
              + '.csv')

    print("Load train.csv...")
    trains = [pd.read_csv('simple_train_' + str(p) + suffix) for p in patients]
    # Remove all zeroes files
    trains = [frame[frame['file_size'] > 55000].copy() for frame in trains]
    # Shuffle rows since they are ordered (patient 1, then 2, then 3)
    trains = [frame.iloc[np.random.permutation(len(frame))] for frame in trains]
    # Reset broken index
    trains = [frame.reset_index() for frame in trains]

    print("Load test.csv...")
    tests = [pd.read_csv('simple_test_' + str(p) + suffix) for p in patients]

    print("Process tables...")
    feature_sets = [get_features(tr, te) for tr, te in zip(trains, tests)]
    return [[tr, te, feats] for tr, te, feats in zip(trains, tests, feature_sets)]
    

 

## Running Creation of Feature Files (!!!)

In [None]:


#if __name__ == '__main__':
#    print('XGBoost: {}'.format(xgb.__version__))

def generating_files(mode,feature_model, short_size, num_features, new_test,fini, fend, fovr):
    """Create the train and test feature CSVs in parallel worker processes.

    *mode* selects the patients: ``0`` processes patients 1, 2 and 3;
    ``1``, ``2`` or ``3`` processes only that patient. Any other value does
    nothing. All train workers are started first, then all test workers, and
    the call returns once every worker has finished.

    The remaining arguments are forwarded to ``create_simple_csv_train`` and
    ``create_simple_csv_test``.
    """
    # (mode, patients) pairs, matched with == in this order.
    patients_by_mode = ((0, (1, 2, 3)), (1, (1,)), (2, (2,)), (3, (3,)))
    patients = ()
    for key, ids in patients_by_mode:
        if mode == key:
            patients = ids
            break

    if not patients:
        return

    shared_args = (feature_model, num_features, fini, fend, fovr, short_size, new_test)

    # Do reading and processing of MAT files in parallel
    workers = []
    for target in (create_simple_csv_train, create_simple_csv_test):
        for patient in patients:
            worker = Process(target=target, args=(patient,) + shared_args)
            worker.start()
            workers.append(worker)

    for worker in workers:
        worker.join()

#create_simple_csv_test(1,feature_model,num_features,short_size,new_test)

In [None]:

# Notebook cell: generate the feature files.
# feature_model 5 uses the fini/fend/fovr band settings below; mode 3 makes
# generating_files process patient 3 only (train and test).
feature_model=5
short_size=False
num_features=9
new_test=True
fini=4
fend=40
fovr=0
mode=3

generating_files(mode,feature_model, short_size, num_features, new_test,fini, fend, fovr)

## Reading Feature Files

In [None]:
# Notebook cell: load the feature files for feature model 4.
# num_features is set here but not passed to read_test_train.
feature_model=4
short_size=False
num_features=9
new_test=True

if __name__ == '__main__':
    print('XGBoost: {}'.format(xgb.__version__))
    
    # Concatenated train/test tables of all patients plus the shared feature names.
    train, test, features = read_test_train(feature_model, short_size, new_test)
    print('Length of train: ', len(train))
    print('Length of test: ', len(test))
    print('Features [{}]: {}'.format(len(features), sorted(features)))
    
#   print ('train',train['sequence_id'])

In [None]:
def run_train_predict(train, test, features, target,params_model, num_features, channels, csp_n=4,
                      seq_number=1,csp_init=0,csp_end=0.1, 
                      nfolds=3, random_state=2016,
                      mode=0, PCAkey=False, PCAgraph=False,
                      PCAkeyGS=False, SEQoriginal=False,
                     Oversampling=False, GridSearch=False, pred_per_patient=False, CSPkey=False, CSPkey1=False):
    
    
#    print(train)
#    print(train.shape)
#    print(train.columns.values)
#    print(type(train['result']))
    
    function_params = OrderedDict()
    function_params["nfolds"]=nfolds
    function_params["random_state"]= random_state
    function_params["PCAkey"] = PCAkey
    function_params["PCAgraph"]= PCAgraph
    function_params["PCAkeyGS"]= PCAkeyGS
    function_params["SEQoriginal"]= SEQoriginal
    function_params["Oversampling"]= Oversampling
    function_params["GridSearch"]= GridSearch
    
    np.set_printoptions(suppress=True)
    
    #print('train type', type(train),'train', train, 'train index', train.index.values)
    #print('test type',type(test),'test', test, 'test index', test.index.values)

    #train=train.iloc[0:120]
    #test=test.iloc[0:100]
    
    #print('test',test)
    

    
    #train_seq=train['Id']
    
    #if seq_number==6:
    #    pandas_number=0
    #else:
    #    pandas_number=seq_number
    
    #print('pandas number', pandas_number)
    
    #train=train[(train_seq % 1000) % 6 == pandas_number]
    
    #train_seq_rev=train['Id'].values.tolist()#[0:120]
    #result_seq_rev=np.int64(train[target].values.tolist())#[0:120])
    
    #for item111 in train_seq_rev:
        
    #    if (item111 % 1000) % 6 != pandas_number:
            
    #        print('error pandas numerb',(item111 % 1000) % 6, pandas_number)
        
    #    else:
            
    #        print('all good!', (item111 % 1000) % 6, pandas_number)
    
    #file_name_train_sequence=[]
    #seq_number_list=[]
   
    #for i,f_id in enumerate(train_seq_rev):
                           
    #   real_f_id=f_id % 100000 
    #   name_file1="./train_"+str(mode)+'/'+str(mode)+'_'+str(real_f_id)+'_'+str(result_seq_rev[i])+'.mat'
    #   file_name_train_sequence.append(name_file1)
    #   print(name_file1)
    #  
    #   try:
    #       sequence_from_mat_seq = mat_to_pandas_seq(name_file1)
    #   except:
    #       print('Some error here {}...'.format(name_file1))
    #       seq_number_list.append(0)
    #       continue
       
    #   seq_number_list.append(int(sequence_from_mat_seq[0][0][0][0]))
    #   print('another done', i)

#    pandas_array=np.array(list(np.loadtxt('pandas_sequences_train_3.txt', delimiter=',')))
    
#    print('pandas_array', len(pandas_array), pandas_array)

#    total_sequences_lists=[result_seq_rev, seq_number_list]
        
#   pandas_sequences=pd.Dataframe(total_sequences_lists, columns=['Id_number', 'sequences'])
    
    
    #pandas_sequences=pd.DataFrame(np.array(seq_number_list),columns=['sequences'], index=train.index)
    
    #print('pandas_sequences',pandas_sequences.shape, pandas_sequences)
    print('train shape',train.shape)
    train=train[train['sequence_num']==seq_number]
    
#    for i in pandas_sequences['Id_number'].values.tolist():
        
#        for j in train.index.values.tolist():
            
#            if 
#            train_list.append()
    
#    train=train[train['Id']==Id_number &
                
#                pandas_sequences[pandas_sequences.isin(train['Id'])]

    
#    np.savetxt('pandas_sequences.txt', pandas_sequences, delimiter=',')

      
    
    print(#'train_seq',len(train_seq_rev),
          'train shape',train.shape, 'train', train.shape)
    
    #print('train type', type(train),'train', train, 'train index', train.index.values)
    #print('test type',type(test),'test', test, 'test index', test.index.values)
    
    unique_seq = train.drop_duplicates(subset=['sequence_id'])
    unique_seq_y = unique_seq['result'].values
    
    print('unique seq y', len(unique_seq_y) )
    
    n_samples=len(unique_seq_y)
    print('length',n_samples)
    unique_seq_X = np.zeros(n_samples)
    
    print('unique seq X', len(unique_seq_X)  )
    
    
    print('train pre', train.shape) 

    yfull_train = dict()
    yfull_test = copy.deepcopy(test[['Id']].astype(object))

    unique_sequences = np.array(train['sequence_id'].unique())
    print('unique sequences pre', unique_sequences.shape)

    groups1=np.fix(unique_sequences/1000)
    
    groups2=groups1.astype(int)
    #    print('groups', groups2)
        
        
        
    gkf = GroupKFold(n_splits=3)
    test1=gkf.split(unique_sequences, groups=groups2)
    test2=gkf.split(unique_sequences, groups=groups2)
    
    #random_state=random_state
    print('unique sequences', unique_sequences.shape)
    #    splitKF = KFold(len(unique_sequences), n_folds=nfolds, shuffle=True, random_state=random_state)
    #    kf = NewKF(n_splits=nfolds, shuffle=True, random_state=random_state)
    kf = StratKF(n_splits=nfolds, shuffle=False, random_state=random_state)
    
    num_fold = 0
    num_fold1=0
    

    
    if SEQoriginal:
        sequences_full=np.mod(train['sequence_id'].values,1000)
        print('sequences full', sequences_full.shape)
        unique_sequences2=np.mod(unique_sequences,1000)
        unique_sequences_fold=pd.Series(sequences_full, index=train['sequence_id'].index)
#        print('unique_sequences_fold', unique_sequences_fold)
    
        unique_sequences = np.unique(unique_sequences2)
        print('unique sequences pre', unique_sequences.shape)

    else:
        unique_sequences_fold=pd.Series(train['sequence_id'], index=train['sequence_id'].index)

    
    
    
    num_boost_round = 1000
    early_stopping_rounds = 50
    
    
    eta = 0.1
    max_depth = 4
    subsample = 0.9
    colsample_bytree = 0.9
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    
    params = OrderedDict()
    params["objective"]= "binary:logistic"
    params["booster"]= "gbtree"
    params["eval_metric"]= "auc"
    params["eta"] = eta
    params["tree_method"]='exact'
    params["max_depth"]= max_depth
    params["subsample"] =subsample
    params["colsample_bytree"]= colsample_bytree
    params["silent"] =1
    params["seed"] =random_state
    params["gamma"] =0.1
    params["min_child_weight"] =2
    params["scale_pos_weight"]=2
    params["seed"]=27

    xgboost_to_xgb={
    
    "learning_rate" : "eta",
    "reg_alpha" : "alpha",
   
    "reg_lambda" : "lambda" }

#    Parameters from previous run, if any

    if type(params_model) is OrderedDict:
        
        for item in params_model:
        
            params[item]=params_model[item]
            
        print(params)
    
    
    
    
    
#   Using best parameters to train model 


        
        
    if CSPkey1 and mode!=0:
        
        seq_chosen=seq_number
        csp_components=csp_n
        
        print('test length ',test.shape[0])
        
        for item in range(test.shape[0]):
            
            if item==0:
                
                test_3d_pre=test[features].iloc[[item]].drop(['file_size','patient_id'],1).values
                test_3d=np.reshape(test_3d_pre,newshape=(num_features, channels), order='F')
                continue
                
            test_3d_pre = test[features].iloc[[item]].drop(['file_size','patient_id'],1).values
            test_red=np.reshape(test_3d_pre,newshape=(num_features, channels), order='F')
            
            test_3d=np.dstack((test_3d,test_red))
          
        
        
        csp_test=test_3d.transpose((2,1,0))
        print('csp_test',csp_test.shape) 
                        
    if GridSearch and CSPkey:
        
        pass
    
    if GridSearch and CSPkey==False and CSPkey1==False:
        
        pass
            
            
    if GridSearch and CSPkey1 and mode!=0:
        
        for item in range(train.shape[0]):
            
            if item==0:
                
                train_3d_pre=train[features].iloc[[item]].drop(['file_size','patient_id'],1).values
                train_3d=np.reshape(train_3d_pre,newshape=(num_features, channels), order='F')
                continue
                
            train_3d_pre = train[features].iloc[[item]].drop(['file_size','patient_id'],1).values
            train_red=np.reshape(train_3d_pre,newshape=(num_features, channels), order='F')
            
            train_3d=np.dstack((train_3d,test_red))
          
        
        
        csp_train_gs=train_3d.transpose((2,1,0))
        print('csp_train_gs',csp_train_gs.shape, csp_train_gs )
        
        CSPtest=CSP(n_components=csp_components, transform_into='csp_space')
        
        target_gs=train[target].values.astype(np.int64)
        
        print('target gs',target_gs.shape, target_gs)
        
        CSPtest.fit(csp_train_gs,target_gs)
        
        csp_train_gs_final=CSPtest.transform(csp_train_gs)

        #csp_train_gs_final=CSPtest.fit_transform(train[fea.values,target_gs)
        
        print('csp_train_gs_final',csp_train_gs_final.shape, csp_train_gs_final)
        
        
        train_Id=train['Id'].values.astype(np.int64)
        train_seq_Id=train['sequence_id'].values.astype(np.int64)
        train_patient_Id=train['patient_id'].values.astype(np.int64)
        train_result=train[target].values.astype(np.int64)
        
        
        csp_train_gs_final_index=np.column_stack((train_Id, train_seq_Id, train_patient_Id,
                                                  csp_train_gs_final, train_result,
                                                  train.index.values.astype(np.int64)))
        
        print('csp_train_gs_final_index',csp_train_gs_final_index)
        
        csp_train_gs_f_index=csp_train_gs_final_index[~np.any(np.isinf(csp_train_gs_final_index), axis=1)]
        
        #print('csp_train_gs_f_index',csp_train_gs_f_index)
        index_csp_train_gs=csp_train_gs_f_index[:,csp_train_gs_f_index.shape[1]-1].astype(np.int64)
        
        #print('index_csp_train_gs',index_csp_train_gs)
        
        csp_train_gs_f=np.delete(csp_train_gs_f_index,csp_train_gs_f_index.shape[1]-1, 1)
        
        #print('csp_train_gs_f',csp_train_gs_f)
        
        features_names=['feature'+str(i) for i in range(channels)]
        
        train_gs_columns=['Id','sequence_id', 'patient_id']+features_names+['result']
        
        
        train_gs_f=pd.DataFrame(csp_train_gs_f, index=index_csp_train_gs, columns =train_gs_columns)
        
        #print('train gs f',train_gs_f)
    
        splitKF=kf.split(unique_seq_X, unique_seq_y)
        
        #print('splitKF',splitKF)
    
        best_param=param_search_embedded(nfolds, train_gs_columns, target, splitKF, unique_sequences, 
                                         unique_sequences_fold,train_gs_f, PCAkeyGS)
    
        print('after best_param', best_param)
        
        
        for key in best_param:
            if key in xgboost_to_xgb:   
                best_param[xgboost_to_xgb[key]]=best_param[key]
                del best_param[key]
                
        #print ('substitution', best_param)
        params={key : best_param.get(key, value) for key, value in params.items()}
        
        print (params)
        
        
        
        
        
    
    if CSPkey and mode!=0:
        
        seq_chosen=seq_number
        csp_components=csp_n

#        train_seq_Id=train['sequence_id'].values.astype(np.int64)
        
#        files_id_test=test['Id'].tolist()
#        
#        print('files_number',len(files_id_test))
#        
#        file_name_test=[]
#        
#        for i,f_id in enumerate(files_id_test):
#            
##            if train_seq_Id[i]==seq_number:
#                
#            real_f_id=f_id % 100000   
#            file_name_test.append("./test_"+str(mode)+'_new/new_'+str(mode)+'_'+str(real_f_id)+'.mat')        
#
#
#        print('files_id_test',len(files_id_test),'file_name_test',len(file_name_test))
#        
#        test_div=20
#        
#        parts_test=int(len(file_name_test)/test_div)+1
#        
#        print('parts_test', parts_test)
#        
#        csp_test=[]
#        
#        for k in range(parts_test):
#        
#            track1=0
#            
#            ki=k
#            if k==(parts_test-1):
#                kfin=len(file_name_test)
#                print('kfin',kfin)
#            else:
#                kfin=(k+1)*test_div
#                print('kfin',kfin)
#
#            for i, fl1 in enumerate(file_name_test[ki*test_div:kfin]):
#            #    print(i)
#
#                if i==track1:
#        
#                    print('checking')
#     
#
#                    tables, sequence_from_mat, samp_freq = mat_to_pandas(fl1)
#                
#                    csp_left=int(csp_init*10*60*samp_freq)
#                    csp_right=int(csp_end*10*60*samp_freq)
#                
#                    print('csp left right', csp_left, csp_right)
#        
#        
#
#                    data_csp_test=np.transpose(tables.values[csp_left:csp_right,:])
#                
#                
#          
#                    print('done!')
#        
#                    continue
#              
#   
#                try:
#                    tables1, sequence_from_mat1, samp_freq1 = mat_to_pandas(fl1)
#                except:
#                    print('Some error here {}...'.format(fl1))
#                    continue
#                
#                csp_left=int(csp_init*10*60*samp_freq1)
#                csp_right=int(csp_end*10*60*samp_freq1)
#            
#                print('csp left right', csp_left, csp_right)
#            
#        
#                temp_matrix=np.transpose(tables1.values[csp_left:csp_right,:])
#                data_csp_test=np.dstack((data_csp_test,temp_matrix))
#                print('data_csp_test',data_csp_test.shape)
#
##            csp_test.append(data_csp_test.transpose((2,0,1)))
#
#            outfile = "csp_test_"+str(mode) +"_"+ "part_"+str(k)+ ".txt"
#            data_csp_test.transpose((2,0,1)).tofile(outfile)
#
#            del data_csp_test
            
#            print('csp_test',len(csp_test))     
    
        
        
        
        if GridSearch:
            
            seq_chosen=seq_number
            csp_components=csp_n
        

        
            files_id_train=train['Id'].tolist()
            results_id_train=train[target].tolist()
        
            print('files_number',len(files_id_train))
            
       
            file_name_train=[]
        
            for i,f_id in enumerate(files_id_train):
                
                if (f_id % 1000) % 6 == 0:
                     train_seq_Id=6
                else:
                     train_seq_Id=(f_id % 1000) % 6
                
                if train_seq_Id==seq_chosen:
                
                    real_f_id=f_id % 100000   
                    file_name_train.append("./train_"+str(mode)+'/'+str(mode)+'_'
                                      +str(real_f_id)+'_'+str(results_id_train[i])+'.mat')        


            print('files_id_train',len(files_id_train),'file_name_train',len(file_name_train))
        
       
        
            track1=0

            for i, fl1 in enumerate(file_name_train):
                #    print(i)

                if i==track1:
        
                    print('checking')
     

                    tables, sequence_from_mat, samp_freq = mat_to_pandas(fl1)
                
                    csp_left=int(csp_init*10*60*samp_freq)
                    csp_right=int(csp_end*10*60*samp_freq)
                
                    print('csp left right', csp_left, csp_right)
        
        
                    if int(sequence_from_mat[0][0][0][0])==seq_chosen:
                
                        data_csp_train=tables.values[csp_left:csp_right,:]
                        print('done!')
                        continue
                    
                    else:
                        track1+=1
                        continue
                
   
                try:
                    tables1, sequence_from_mat1, samp_freq1 = mat_to_pandas(fl1)
                except:
                    print('Some error here {}...'.format(fl1))
                    continue
                
                csp_left=int(csp_init*10*60*samp_freq1)
                csp_right=int(csp_end*10*60*samp_freq1)
            
                print('csp left right', csp_left, csp_right)
                
                if int(sequence_from_mat1[0][0][0][0])==seq_chosen:
        
                    temp_matrix=tables1.values[csp_left:csp_right,:]
                    data_csp_train=np.dstack((data_csp_train,temp_matrix))
                    print('data_csp_train',data_csp_train.shape)

            csp_train_gs=data_csp_train.transpose((2,1,0))

            print('csp_test',csp_train.shape, csp_train_gs)
            
            
            
        
            CSPtest=CSP(n_components=csp_components)
        
            target_gs=train[target].values.astype(np.int64)
        
            print('target gs',target_gs.shape, target_gs)
        
            CSPtest.fit(csp_train_gs,target_gs)
        
            csp_train_gs_final=CSPtest.transform(csp_train_gs)

            #csp_train_gs_final=CSPtest.fit_transform(train[fea.values,target_gs)
        
            print('csp_train_gs_final',csp_train_gs_final.shape, csp_train_gs_final)
        
        
            train_Id=train['Id'].values.astype(np.int64)
            train_seq_Id=train['sequence_id'].values.astype(np.int64)
            train_patient_Id=train['patient_id'].values.astype(np.int64)
            train_result=train[target].values.astype(np.int64)
        
        
            csp_train_gs_final_index=np.column_stack((train_Id, train_seq_Id, train_patient_Id,
                                                  csp_train_gs_final, train_result,
                                                  train.index.values.astype(np.int64)))
        
            print('csp_train_gs_final_index',csp_train_gs_final_index)
            
            csp_train_gs_f_index=csp_train_gs_final_index[~np.any(np.isinf(csp_train_gs_final_index), axis=1)]
        
            #print('csp_train_gs_f_index',csp_train_gs_f_index)
            index_csp_train_gs=csp_train_gs_f_index[:,csp_train_gs_f_index.shape[1]-1].astype(np.int64)
        
            #print('index_csp_train_gs',index_csp_train_gs)
        
            csp_train_gs_f=np.delete(csp_train_gs_f_index,csp_train_gs_f_index.shape[1]-1, 1)
        
            #print('csp_train_gs_f',csp_train_gs_f)
        
            features_names=['feature'+str(i) for i in range(channels)]
        
            train_gs_columns=['Id','sequence_id', 'patient_id']+features_names+['result']
        
        
            train_gs_f=pd.DataFrame(csp_train_gs_f, index=index_csp_train_gs, columns =train_gs_columns)
            
            #print('train gs f',train_gs_f)
    
            splitKF=kf.split(unique_seq_X, unique_seq_y)
        
            #print('splitKF',splitKF)
    
            best_param=param_search_embedded(nfolds, train_gs_columns, target, splitKF, unique_sequences, 
                                         unique_sequences_fold,train_gs_f, PCAkeyGS)
    
            print('after best_param', best_param)
        
        
            for key in best_param:
                if key in xgboost_to_xgb:   
                    best_param[xgboost_to_xgb[key]]=best_param[key]
                    del best_param[key]
                
            #print ('substitution', best_param)
            params={key : best_param.get(key, value) for key, value in params.items()}
        
            print (params)
        
        
        
        
        
        
        
        
        
        
        
        

    
    for train_seq_index, test_seq_index in kf.split(unique_seq_X, unique_seq_y):
        num_fold += 1
        print('Start fold {} from {}'.format(num_fold, nfolds))
        train_seq = unique_sequences[train_seq_index]
        valid_seq = unique_sequences[test_seq_index]
        print('Length of train people: {}'.format(len(train_seq)))
        print('Length of valid people: {}'.format(len(valid_seq)))
        
#        print('train_seq',train_seq)
#        print('valid_seq',valid_seq)

        X_train, X_valid = train[unique_sequences_fold.isin(train_seq)][features], train[unique_sequences_fold.isin(valid_seq)][features]
        y_train, y_valid = train[unique_sequences_fold.isin(train_seq)][target], train[unique_sequences_fold.isin(valid_seq)][target]
        X_test = test[features]
        
        X_train_seq, X_valid_seq =train[unique_sequences_fold.isin(train_seq)]['sequence_id'],\
                                    train[unique_sequences_fold.isin(valid_seq)]['sequence_id']
        
        print('X_train index',X_train.index)
        print('X_valid index',y_train.index)
        print('X_test index', X_test.index.shape)

        print('Length train:', len(X_train))
        print('Length valid:', len(X_valid))
        
        print('X_train_seq', X_train_seq.shape)
        print('X_valid_seq', X_valid_seq.shape)
        
#       Scaling for PCA


        scaler = MinMaxScaler()   
        
        Xtrain_scaled=pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        Xvalid_scaled=pd.DataFrame(scaler.fit_transform(X_valid), columns=X_valid.columns, index=X_valid.index )

        Xtest_scaled=pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)


        if PCAgraph:
            
            pcatest=KernelPCA(n_components=20)
            pcatest.fit(Xtrain_scaled)
            var1=np.cumsum(np.round(pcatest.explained_variance_ratio_, decimals=4)*100)
            f1 = plt.figure()
            print(var1)
            plt.plot(var1)
            plt.show()

    
        if PCAkey:      
        
#       PCA transformation 
            pcatest=PCA(n_components=20)
            X_train_f=pd.DataFrame(pcatest.fit_transform(Xtrain_scaled), index=Xtrain_scaled.index)
            X_valid_f=pd.DataFrame(pcatest.fit_transform(Xvalid_scaled), index=Xvalid_scaled.index)

            X_test_f=pd.DataFrame(pcatest.fit_transform(Xtest_scaled), index=Xtest_scaled.index)


        
        if CSPkey1 and mode!=0:
            
            print('train length ', X_train.shape[0])
            
            for item in range(X_train.shape[0]):
            
                if item==0:
                
                    X_train_3d_pre=X_train[features].iloc[[item]].drop(['file_size','patient_id'],1).values
                    X_train_3d=np.reshape(X_train_3d_pre,newshape=(num_features, channels), order='F')
                    continue
                
                X_train_3d_pre = X_train[features].iloc[[item]].drop(['file_size','patient_id'],1).values
                X_train_red=np.reshape(X_train_3d_pre,newshape=(num_features, channels), order='F')
                X_train_3d=np.dstack((X_train_3d,X_train_red))
          
        
        
            csp_train=X_train_3d.transpose((2,1,0))
            
            
            print('valid length ',X_valid.shape[0])
            
            for item in range(X_valid.shape[0]):
            
                if item==0:
                
                    X_valid_3d_pre=X_valid[features].iloc[[item]].drop(['file_size','patient_id'],1).values
                    X_valid_3d=np.reshape(X_valid_3d_pre,newshape=(num_features, channels), order='F')
                    continue
                
                X_valid_3d_pre = X_valid[features].iloc[[item]].drop(['file_size','patient_id'],1).values
                X_valid_red=np.reshape(X_valid_3d_pre,newshape=(num_features, channels), order='F')
                X_valid_3d=np.dstack((X_valid_3d,X_valid_red))
          
        
        
            csp_valid=X_valid_3d.transpose((2,1,0))
            
            y_csp_train=y_train.values
            y_csp_valid=y_valid.values
            
            
            print('csp_train',csp_train.shape)
            print('csp_valid',csp_valid.shape)
            print('csp_test',csp_test.shape) 
            
            CSPtest=CSP(n_components=csp_components)
        
            csp_train_final=CSPtest.fit_transform(csp_train, y_csp_train)
            csp_valid_final=CSPtest.transform(csp_valid)
            csp_test_final=CSPtest.transform(csp_test)
            
            print('csp train final' ,csp_train_final.shape, 'csp valid final', csp_valid_final.shape)
            print('csp test final' ,csp_test_final.shape)
            
            csp_train_final_index=np.column_stack((csp_train_final, X_train.index.values))
            csp_valid_final_index=np.column_stack((csp_valid_final, X_valid.index.values))
            csp_test_final_index=np.column_stack((csp_test_final, X_test.index.values))
            
            csp_test_final_index[csp_test_final_index == inf] = 100000
            csp_test_final_index[csp_test_final_index == -inf] = 100000
            
            print('csp train final_index' ,csp_train_final_index.shape, 'csp valid final_index', csp_valid_final_index.shape)
            print('csp test final index' ,csp_test_final_index.shape)

            
            csp_train_f_index=csp_train_final_index[~np.any(np.isinf(csp_train_final_index), axis=1)] 
            csp_valid_f_index=csp_valid_final_index[~np.any(np.isinf(csp_valid_final_index), axis=1)]
            csp_test_f_index=csp_test_final_index[~np.any(np.isinf(csp_test_final_index), axis=1)]
            
            print('csp train f index' ,csp_train_f_index.shape, 'csp valid f index', csp_valid_f_index.shape)
            print('csp test f index' ,csp_test_f_index.shape)
            
            index_csp_train=csp_train_f_index[:,csp_train_f_index.shape[1]-1].astype(np.int64)
            index_csp_valid=csp_valid_f_index[:,csp_valid_f_index.shape[1]-1].astype(np.int64)
            index_csp_test=csp_test_f_index[:,csp_test_f_index.shape[1]-1].astype(np.int64)
            
            print('index csp train' ,index_csp_train.shape, 'index csp valid', index_csp_valid.shape)
            print('index_csp_test' ,index_csp_test.shape)
            
            
            csp_train_f=np.delete(csp_train_f_index,csp_train_f_index.shape[1]-1, 1)
            csp_valid_f=np.delete(csp_valid_f_index,csp_valid_f_index.shape[1]-1, 1)
            csp_test_f=np.delete(csp_test_f_index,csp_test_f_index.shape[1]-1, 1)
            
            print('csp train f' ,csp_train_f.shape, 'csp valid f', csp_valid_f.shape)
            print('csp test f' ,csp_test_f.shape)
            
            X_train_f=pd.DataFrame(csp_train_f, index=index_csp_train)
            X_valid_f=pd.DataFrame(csp_valid_f, index=index_csp_valid)

            X_test_f=pd.DataFrame(csp_test_f, index=index_csp_test)
            
            
            #print('X_train_f',X_train_f.shape,'X_valid_f', X_valid_f.shape, 'X_test_f', X_test_f.shape )
            #print('y_train',y_train.shape, 'y_valid', y_valid.shape)
            
            print('X_train_f',X_train_f.shape,'X_valid_f', X_valid_f.shape, 'X_test_f', X_test_f.shape )
            print('y_train',y_train.shape, 'y_valid', y_valid.shape)
            
        else:
        
            X_train_f=X_train
            X_valid_f=X_valid
            X_test_f=X_test
            
            
        
        if CSPkey and mode!=0:
            
            seq_chosen=seq_number
            csp_components=csp_n            
            
            #X_train, X_valid = train[(unique_sequences_fold.isin(train_seq)) & ((X_train_seq % 1000) % 6 == seq_chosen)][features],\
                                #train[(unique_sequences_fold.isin(valid_seq))&((X_valid_seq % 1000) % 6 == seq_chosen)][features]
            #y_train, y_valid = train[(unique_sequences_fold.isin(train_seq)) & ((X_train_seq % 1000) % 6 == seq_chosen)][target],\
                                #train[(unique_sequences_fold.isin(valid_seq)) &((X_valid_seq % 1000) % 6 == seq_chosen)][target]
            
            print('X_train csp',X_train.shape)
            print('X_valid csp',X_valid.shape)
            print('y_train csp', y_train.shape)
            print('y_valid csp', y_valid.shape)
            
            
            #taking the 'Id' files from fold
            
            #files_id_train=train[unique_sequences_fold.isin(train_seq)&((X_train_seq % 1000) % 6 == seq_chosen)]['Id'].tolist()
            files_id_train=train[unique_sequences_fold.isin(train_seq)]['Id'].tolist()
            results_id_train=y_train.tolist()
            
            
            #files_id_valid=train[unique_sequences_fold.isin(valid_seq)&((X_valid_seq % 1000) % 6 == seq_chosen)]['Id'].tolist()
            files_id_valid=train[unique_sequences_fold.isin(valid_seq)]['Id'].tolist()
            results_id_valid=y_valid.tolist()
            
            
            print('files_id_train',len(files_id_train),'results_id_train',len(results_id_train))
            print('files_id_valid',len(files_id_valid),'results_id_valid',len(results_id_valid))
            
            file_name_train=[]
            file_name_valid=[]
            
            for i,f_id in enumerate(files_id_train):
                
#                sequence_validator=(X_train_seq[i] % 1000) % 6
                
#                print('sequence validator', sequence_validator)
                
                
#                if sequence_validator==seq_chosen:
                
                real_f_id=f_id % 100000 
                file_name_train.append("./train_"+str(mode)+'/'+str(mode)+'_'+str(real_f_id)+'_'
                                           +str(results_id_train[i])+'.mat')        


            print('files_id_train',len(files_id_train),'results_id_train',len(results_id_train),'file_name_train',len(file_name_train))

            
            
            for i,f_id in enumerate(files_id_valid):
                
#                sequence_validator=(X_valid_seq[i] % 1000) % 6
                
#                if sequence_validator==seq_chosen:
                
                real_f_id=f_id % 100000 
                file_name_valid.append("./train_"+str(mode)+'/'+str(mode)+'_'+str(real_f_id)+'_'
                                           +str(results_id_valid[i])+'.mat')        


            print('files_id_valid',len(files_id_valid),'results_id_valid',len(results_id_valid),'file_name',len(file_name_valid))

            
    
#            result_list_train=[]
#            result_list_valid=[]
            track1=0
        
            for i, fl1 in enumerate(file_name_train):
                #    print(i)
                
#                result = results_id_train[i]
                if i==track1:
                    print('checking')
                    
                    tables, sequence_from_mat, samp_freq = mat_to_pandas(fl1)
                    csp_left=int(csp_init*10*60*samp_freq)
                    csp_right=int(csp_end*10*60*samp_freq)

                    data_csp_train=np.transpose(tables.values[csp_left:csp_right,:])
                    
                    if int(sequence_from_mat[0][0][0][0])!=seq_chosen:
                        print('error train seq here!',int(sequence_from_mat[0][0][0][0]), seq_chosen, fl1)
        
#                    result_list_train.append(result)
        
                    print('done!')
        
                    continue
              
   
                try:
                    tables1, sequence_from_mat1, samp_freq1 = mat_to_pandas(fl1)
                    csp_left=int(csp_init*10*60*samp_freq)
                    csp_right=int(csp_end*10*60*samp_freq)
                    
                except:
                    print('Some error here {}...'.format(fl))
                    continue
    
#                if sequence_from_mat1==seq_chosen:
                if int(sequence_from_mat1[0][0][0][0])!=seq_chosen:
                        print('error train seq here!',int(sequence_from_mat1[0][0][0][0]), seq_chosen, fl1)
        
                temp_matrix=np.transpose(tables1.values[csp_left:csp_right,:])
                data_csp_train=np.dstack((data_csp_train,temp_matrix))
#                result_list_train.append(result)
                
#                print(data_csp_train.shape)
                    
                    
                    
            for i, fl1 in enumerate(file_name_valid):
                #    print(i)
                
#                result = results_id_valid[i]
                if i==track1:
        
                    print('checking')
                  
        
                    tables, sequence_from_mat, samp_freq = mat_to_pandas(fl1)
                    csp_left=int(csp_init*10*60*samp_freq)
                    csp_right=int(csp_end*10*60*samp_freq)

                    data_csp_valid=np.transpose(tables.values[csp_left:csp_right,:])
                
                    if int(sequence_from_mat[0][0][0][0])!=seq_chosen:
                        print('error valid seq here!',int(sequence_from_mat[0][0][0][0]), seq_chosen, fl1)
        
#                    result_list_valid.append(result)
        
                    print('done!')
        
                    continue
              
   
                try:
                    tables1, sequence_from_mat1, samp_freq1 = mat_to_pandas(fl1)
                    csp_left=int(csp_init*10*60*samp_freq)
                    csp_right=int(csp_end*10*60*samp_freq)
                except:
                    print('Some error here {}...'.format(fl1))
                    continue
    
#                if sequence_from_mat1==seq_chosen:
                if int(sequence_from_mat1[0][0][0][0])!=seq_chosen:
                    print('error valid seq here!',int(sequence_from_mat1[0][0][0][0]), seq_chosen, fl1)

                temp_matrix=np.transpose(tables1.values[csp_left:csp_right,:])
                data_csp_valid=np.dstack((data_csp_valid,temp_matrix))
#                result_list_valid.append(result)
                
#                print(data_csp_valid.shape)

#            y_csp_train=np.array(result_list_train)
#            y_csp_valid=np.array(result_list_valid)

            y_csp_train=y_train.values
            y_csp_valid=y_valid.values
                   
            
            print('y_csp_train',y_csp_train.shape,'y_csp_valid', y_csp_valid.shape)
            print('y_csp_train',y_csp_train,'y_csp_valid', y_csp_valid)


            #print(data_csp.shape)    

            csp_train=data_csp_train.transpose((2,0,1))
            csp_valid=data_csp_valid.transpose((2,0,1))
            
#            print('csp_train',csp_train.shape,'csp_valid', csp_valid.shape)
#            print('csp_train',type(csp_train),'csp_valid', type(csp_valid))

            #print(csp_data.shape)     
    
            CSPtest=CSP(n_components=csp_components)
        
            csp_train_final=CSPtest.fit_transform(csp_train, y_csp_train)
            csp_valid_final=CSPtest.transform(csp_valid)
            
            
            track1=0
            
            print('starting reading test files bits')
            
            
            
            
            files_id_test=test['Id'].tolist()
        
            print('files_number',len(files_id_test))
        
            file_name_test=[]
        
            for i,f_id in enumerate(files_id_test):
            
#            if train_seq_Id[i]==seq_number:
                
                real_f_id=f_id % 100000   
                file_name_test.append("./test_"+str(mode)+'_new/new_'+str(mode)+'_'+str(real_f_id)+'.mat')        


            print('files_id_test',len(files_id_test),'file_name_test',len(file_name_test))
            
            test_div=20
            
            parts_test=int(len(file_name_test)/test_div)+1
            
            print('parts_test', parts_test)
            
            csp_test=[]
            
            track=0
            
            for k in range(parts_test):
            
                track1=0
                
                
                ki=k
                if k==(parts_test-1):
                    kfin=len(file_name_test)
                    print('kfin',kfin)
                else:
                    kfin=(k+1)*test_div
                    print('kfin',kfin)
    
                for i, fl1 in enumerate(file_name_test[ki*test_div:kfin]):
                #    print(i)
    
                    if i==track1:
            
                        print('checking')
         
    
                        tables, sequence_from_mat, samp_freq = mat_to_pandas(fl1)
                    
                        csp_left=int(csp_init*10*60*samp_freq)
                        csp_right=int(csp_end*10*60*samp_freq)
                    
 #                       print('csp left right', csp_left, csp_right)
            
            
    
                        data_csp_test=np.transpose(tables.values[csp_left:csp_right,:])
                    
                    
              
                        print('done!')
            
                        continue
                  
       
                    try:
                        tables1, sequence_from_mat1, samp_freq1 = mat_to_pandas(fl1)
                    except:
                        print('Some error here {}...'.format(fl1))
                        continue
                    
                    csp_left=int(csp_init*10*60*samp_freq1)
                    csp_right=int(csp_end*10*60*samp_freq1)
                
#                    print('csp left right', csp_left, csp_right)
                
            
                    temp_matrix=np.transpose(tables1.values[csp_left:csp_right,:])
                    data_csp_test=np.dstack((data_csp_test,temp_matrix))
#                    print('data_csp_test',data_csp_test.shape)
                    
                    del temp_matrix
                
                temp_test_csp=CSPtest.transform(data_csp_test.transpose((2,0,1)))
                
#                print('test shape bit',temp_test_csp.shape)
                
                if k==track:
                    csp_test_final=temp_test_csp
                    continue
                    
                csp_test_final=np.concatenate((csp_test_final,temp_test_csp))
    
    #            csp_test.append(data_csp_test.transpose((2,0,1)))
    
#                outfile = "csp_test_"+str(mode) +"_"+ "part_"+str(k)+ ".txt"
#                data_csp_test.transpose((2,0,1)).tofile(outfile)
    
                del data_csp_test
                del temp_test_csp
            
#            print('csp_test',len(csp_test))             
        
            
            
                                 
#            for k in range(parts_test):
#                
#                if track1==k:
#                    
#                    outfile = "csp_test_"+str(mode) +"_"+ "part_"+str(k)+ ".txt"
#                    temp_test_csp=np.fromfile(outfile)
#                    
#                    csp_test_final=CSPtest.transform(temp_test_csp)
#                    continue
#                    
#                outfile = "csp_test_"+str(mode) +"_"+ str(patient_id) + "part_"+str(k)+ ".csv"
#                temp_test_csp=np.fromfile(outfile)
#                temp_test_csp1=CSPtest.transform(temp_test_csp)
#                csp_test_final=np.concatenate((csp_test_final,temp_test_csp1))
                print('done! ',k)
            
            print('csp train final' ,csp_train_final.shape,
                  'csp valid final', csp_valid_final.shape, 'csp_test_final', csp_test_final.shape)
            
            csp_train_final_index=np.column_stack((csp_train_final, y_train.values, X_train.index.values))
            csp_valid_final_index=np.column_stack((csp_valid_final, y_valid.values, X_valid.index.values))
            csp_test_final_index=np.column_stack((csp_test_final, X_test.index.values))
            
#            print('csp train final_index' ,csp_train_final_index, 'csp valid final_index', csp_valid_final_index)
            
            
            csp_train_f_index=csp_train_final_index[~np.any(np.isinf(csp_train_final_index), axis=1)] 
            csp_valid_f_index=csp_valid_final_index[~np.any(np.isinf(csp_valid_final_index), axis=1)]
 #           csp_test_f_index=csp_test_final_index[~np.any(np.isinf(csp_test_final_index), axis=1)]
            csp_test_final_index[np.isinf(csp_test_final_index)]=0
            csp_test_f_index=csp_test_final_index

#           replace_test_csp=np.isinf()
#           print('post_test_csp',replace_test_csp)

            
            
#            print('csp train f index' ,csp_train_f_index, 'csp valid f index', csp_valid_f_index)
            
            index_csp_train=csp_train_f_index[:,csp_train_f_index.shape[1]-1]
            index_csp_valid=csp_valid_f_index[:,csp_valid_f_index.shape[1]-1]
            index_csp_test=csp_test_f_index[:,csp_test_f_index.shape[1]-1]
            
#            print('index csp train' ,index_csp_train, 'index csp valid', index_csp_valid)

            y_train=csp_train_f_index[:,csp_train_f_index.shape[1]-2]
            y_valid=csp_valid_f_index[:,csp_valid_f_index.shape[1]-2]
          
#            print('index csp train' ,index_csp_train, 'index csp valid', index_csp_valid)
            
            
            csp_train_f=np.delete(csp_train_f_index,np.s_[csp_train_f_index.shape[1]-2,csp_train_f_index.shape[1]-1], 1)
            csp_valid_f=np.delete(csp_valid_f_index,np.s_[csp_train_f_index.shape[1]-2,csp_train_f_index.shape[1]-1], 1)
            csp_test_f=np.delete(csp_test_f_index,csp_test_f_index.shape[1]-1, 1)
            
            print('csp train f' ,csp_train_f.shape, 'csp valid f', csp_valid_f.shape, 'csp_test_f', csp_test_f.shape)
            
            X_train_f=pd.DataFrame(csp_train_f, index=np.int64(index_csp_train))
            X_valid_f=pd.DataFrame(csp_valid_f, index=np.int64(index_csp_valid))

            X_test_f=pd.DataFrame(csp_test_f, index=np.int64(index_csp_test))
            
            
            print('X_train_f',X_train_f.shape,'X_valid_f', X_valid_f.shape, 'X_test_f', X_test_f.shape )
            print('y_train',y_train.shape, 'y_valid', y_valid.shape)
            
            #print('X_train_f',X_train_f,'X_valid_f', X_valid_f, 'X_test_f', X_test_f )
            #print('y_train',y_train, 'y_valid', y_valid)
            

#       SMOTE oversampling
        
#        print('Original dataset shape {}'.format(Counter(y_train)))
#        print('Original dataset shape {}'.format(Counter(X_train_f)))
#        print(X_train_f)
#        print(y_train)

        if Oversampling:
        
            sm1 = SMOTETomek(random_state=42)
            X_res,y_res = sm1.fit_sample(X_train_f,y_train)
            X_train_f=pd.DataFrame(X_res, columns=X_train_f.columns)
            y_train=pd.Series(y_res)# Does it need the index of y_train?
        
#        print('Resampled dataset shape {}'.format(Counter(y_train)))
#        print(X_train_f)
#        print(y_train)
    
    
                        
        
        
        
#       Preparation for XGB training

        dtrain = xgb.DMatrix(X_train_f, y_train)
        dvalid = xgb.DMatrix(X_valid_f, y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]       
        
        gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=500)

        yhat = gbm.predict(xgb.DMatrix(X_valid_f), ntree_limit=gbm.best_iteration+1)

#       Each time store portion of precicted data in train predicted values

        for i in range(len(X_valid_f.index)):
            yfull_train[X_valid_f.index[i]] = yhat[i]
            
        print("Validating...")
        check = gbm.predict(xgb.DMatrix(X_valid_f), ntree_limit=gbm.best_iteration+1)
        score = roc_auc_score(y_valid.tolist(), check)
        print('Check error value: {:.6f}'.format(score))

        print("Predict test set...")
        test_prediction1 = gbm.predict(xgb.DMatrix(X_test_f), ntree_limit=gbm.best_iteration+1)
        print('test prediction', test_prediction1.shape, 'yfull_test', yfull_test.shape)
        
#        for item in replace_test_csp.tolist():
#            np.insert(test_prediction1, item, 0)
        print('test_prediction1 shape',test_prediction1.shape)
        yfull_test['kfold_' + str(num_fold)] = test_prediction1
        
              

    print('iteration finished')
    # Copy dict to list
    train_res = []
    
#    print('train.index',train.index, train.index[0], 'train shape', train.shape)
    print('yfull_train', len(yfull_train), list(yfull_train.keys())[0], list(yfull_train.keys())
         ,type(list(yfull_train.keys())[0]))
    
#    print('train_indexes',train.index.values.tolist())
    
    iterator_train=train.index.values.tolist()
    for i in iterator_train:
        if i in yfull_train:
            train_res.append(yfull_train[i])
            
        else:
            print('this index is missing! ', i)
            miss_index=i
            
            print('miss_index', miss_index)
            row_miss=train.loc[[i]]
#            print('row_miss', row_miss)
        
            missing_id=row_miss.index.values[0]

            
            print('missing_id', missing_id)
#            print('missing_result', missing_result)
            train=train[(train.index != missing_id)]
            
            print('train shape', train.shape)
            print('train_res shape', len(train_res))
            
#    print('test indexes', test.index.values.tolist())
    
    
#    print('test.index',test.index, test.index[0], 'test shape', test.shape)
#    print('X_test_f index', X_test_f.index.values, 'X test shape', X_test_f.shape)
    
    iterator_test=test.index.values.tolist()
    
    for j in iterator_test :
    
        
        if j in X_test_f.index.values.tolist():
            pass
        else:
            missing_test_index=j
            
            row_test_miss=test.loc[[j]]
            
            missing_test_id=row_miss['Id'].values[0]
            
            
            test=test[test['Id'] != missing_test_id]
            
            print('test shape', test.shape)
            

     
    
    print('train shape', train.shape)

    score = roc_auc_score(train[target], np.array(train_res))
    print('Check error value: {:.6f}'.format(score))

    # Find mean for KFolds on test
    merge = []
    for i in range(1, nfolds+1):
        merge.append('kfold_' + str(i))
    yfull_test['mean'] = yfull_test[merge].mean(axis=1)
    


    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))


    #Pred_per_patient currently in development. Is deactivated and should not be used.
    if pred_per_patient:
    
        total_results=yfull_test['mean'].values
        hist, bins = np.histogram(total_results, bins=50)
        width = 0.7 * (bins[1] - bins[0])
        center = (bins[:-1] + bins[1:]) / 2
        plt.bar(center, hist, align='center', width=width)
        plt.show()
    
    
        total_results_train=np.array(train_res)
        hist, bins = np.histogram(total_results_train, bins=50)
        width = 0.7 * (bins[1] - bins[0])
        center = (bins[:-1] + bins[1:]) / 2
        plt.bar(center, hist, align='center', width=width)
        plt.show()
        
    #Saving module and xgboost parameters to JSON file
    
    xgboost_params=params
    
    parameters=[]
    
    parameters.append(xgboost_params)
    parameters.append(function_params)
    
    now = datetime.datetime.now()
    
    parameter_file_name=str('parameter-file-'+'mode-'+str(mode)+'-'+now.strftime("%Y-%m-%d-%H-%M"))
    
    json.dump(parameters, open(parameter_file_name+".txt",'w'), indent=4)
#    read_params = json.load(open(parameter_file_name+".txt"), object_pairs_hook=OrderedDict)
    
#    print (read_params)
    
#    print('yfull_test shape', yfull_test.shape)


    
    return yfull_test['mean'].values, score, yfull_train, train_res, test

## Module for Training and Prediction

## Module for Grid Search

### This is not currently being used. Will probably be deleted in future versions.

In [None]:
def run_param_search(nfolds, train, test, features, target, random_state=2016,  PCAkey=False, SEQoriginal=False,
                     param_grid=None):
    """Grid-search XGBClassifier hyper-parameters using sequence-aware folds.

    Rows of ``train`` are assigned to folds by their ``sequence_id`` so that
    all rows sharing a sequence land on the same side of every split.

    Parameters
    ----------
    nfolds : int
        Number of folds requested from ``KFold`` (old ``sklearn.cross_validation`` API).
    train : pandas.DataFrame
        Training frame; must contain ``sequence_id``, ``features`` and ``target``.
    test : pandas.DataFrame
        Unused by the search; kept so existing callers keep working.
    features : list
        Column names used as model input.
    target : str
        Name of the label column.
    random_state : int
        Seed for the shuffled ``KFold``.
    PCAkey : bool
        When True, features are min-max scaled and projected with a
        polynomial ``KernelPCA`` before the search.
    SEQoriginal : bool
        When True, folds are built on ``sequence_id % 1000`` instead of the
        raw ``sequence_id``.
    param_grid : dict, optional
        Grid handed to ``GridSearchCV``. Defaults to the same
        ``max_depth`` / ``min_child_weight`` grid used by
        ``param_search_embedded``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    # The original body referenced `param_test1`, which was never assigned
    # (every candidate grid was commented out), so the call raised NameError.
    if param_grid is None:
        param_grid = {'max_depth': [1,2,3,4,5,6,7], 'min_child_weight': [1,2,3]}

    print('train sequences',train['sequence_id'])

    unique_sequences = np.array(train['sequence_id'].unique())

    if SEQoriginal:
        sequences_full=np.mod(train['sequence_id'].values,1000)
        unique_sequences_fold=pd.Series(sequences_full, index=train['sequence_id'].index)
        unique_sequences = np.unique(np.mod(unique_sequences,1000))
    else:
        unique_sequences_fold=pd.Series(train['sequence_id'], index=train['sequence_id'].index)

    kf = KFold(len(unique_sequences), n_folds=nfolds, shuffle=True, random_state=random_state)

    # One (train_rows, valid_rows) pair per fold actually produced by `kf`;
    # previously the list was built with a hard-coded range(0, 3).
    custom_cv = []
    for num_fold, (train_seq_index, valid_seq_index) in enumerate(kf, start=1):
        print('this is creation of Kfold iterator')
        print('Start fold {} from {}'.format(num_fold, nfolds))

        train_seq = unique_sequences[train_seq_index]
        valid_seq = unique_sequences[valid_seq_index]

        print(train_seq)
        print(valid_seq)

        # GridSearchCV indexes rows by position, so emit row positions rather
        # than index labels (identical when the index is 0..n-1).
        train_index = np.flatnonzero(unique_sequences_fold.isin(train_seq).values)
        test_index = np.flatnonzero(unique_sequences_fold.isin(valid_seq).values)

        print(train_index, type(train_index))
        print(test_index, type(test_index))

        custom_cv.append((train_index, test_index))

    print('train index group',[fold_train for fold_train, _ in custom_cv])
    print('custom cv', custom_cv)

    # Optional scaling + kernel PCA of the feature matrix
    if PCAkey:
        scaler1 = MinMaxScaler()

        train_features=train[features]
        train_target=train[target]

        train_scaled=pd.DataFrame(scaler1.fit_transform(train_features), columns=train_features.columns, index=train_features.index)

        pcatest=KernelPCA(kernel='poly')
        train_features_f=pd.DataFrame(pcatest.fit_transform(train_scaled), index=train_scaled.index)

        dmfeatures=train_features_f
        dmtarget=train_target
    else:
        dmfeatures=train[features]
        dmtarget=train[target]

    # Grid search around a fixed base classifier
    classifier1=XGBClassifier( learning_rate =0.2, n_estimators=1000, max_depth=1,
        min_child_weight=3, gamma=0, subsample=0.6, colsample_bytree=0.8,
        objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

    gsearch1 = GridSearchCV(estimator = classifier1, param_grid = param_grid, scoring='roc_auc',n_jobs=-1,iid=False, cv=custom_cv)

    gsearch1.fit(dmfeatures,dmtarget)

    print('best parameters, scores')
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

    return gsearch1.best_params_



## Embedded Grid Search

### Used by Module for Training and Prediction

In [None]:
def param_search_embedded(n_folds,features, target, kf, unique_sequences, unique_sequences_fold, train, 
                          PCAkey=False):
    """Grid-search XGBClassifier hyper-parameters on folds supplied by the caller.

    Parameters
    ----------
    n_folds : int
        Number of folds; only used in the progress message.
    features : list
        Column names used as model input.
    target : str
        Name of the label column.
    kf : iterable
        Yields ``(train_positions, valid_positions)`` into ``unique_sequences``.
    unique_sequences : numpy.ndarray
        Distinct sequence ids that ``kf`` was built on.
    unique_sequences_fold : pandas.Series
        Sequence id of every row of ``train``; used as a boolean-mask source,
        so it must be alignable with ``train``'s index.
    train : pandas.DataFrame
        Training frame containing ``features`` and ``target``.
    PCAkey : bool
        When True, features are min-max scaled and reduced to 20 principal
        components before the search.

    Returns
    -------
    dict
        ``best_params_`` of the fitted ``GridSearchCV``.
    """
    print(train)
    print(train.shape)
    print(train.columns.values)

    param_test1={}
    param_test1['max_depth']= [1,2,3,4,5,6,7]
    param_test1['min_child_weight']=[1,2,3]

    # GridSearchCV indexes rows by position, not by index label. Map every
    # row of `train` to its position so the folds stay correct even when the
    # index is not 0..n-1 (for a default index the values are unchanged).
    row_positions = pd.Series(np.arange(len(train)), index=train.index)

    # One split per fold that `kf` really yields, instead of assuming it
    # yields exactly `n_folds` of them.
    custom_cv = []
    for num_fold1, (train_seq_index1, test_seq_index1) in enumerate(kf, start=1):
        print('this is creation of Kfold iterator')
        print('Start fold {} from {}'.format(num_fold1, n_folds))

        train_seq1 = unique_sequences[train_seq_index1]
        valid_seq1 = unique_sequences[test_seq_index1]

        train_index = row_positions[unique_sequences_fold.isin(train_seq1)].values
        test_index = row_positions[unique_sequences_fold.isin(valid_seq1)].values

        custom_cv.append((train_index, test_index))

    # Report the layout without building a ragged numpy array out of
    # index arrays of different lengths.
    print('custom cv', (len(custom_cv), 2))

    # Optional scaling + PCA of the feature matrix
    if PCAkey:
        scaler1 = MinMaxScaler()

        train_features=train[features]
        train_target=train[target]

        train_scaled=pd.DataFrame(scaler1.fit_transform(train_features), columns=train_features.columns, index=train_features.index)

        pcatest=PCA(20)
        train_features_f=pd.DataFrame(pcatest.fit_transform(train_scaled), index=train_scaled.index)

        dmfeatures=train_features_f
        dmtarget=train_target
    else:
        dmfeatures=train[features]
        dmtarget=train[target]

    # Grid search around a fixed base classifier
    print('start grid search')

    classifier1=XGBClassifier( learning_rate =0.2, n_estimators=1000, max_depth=4,
        min_child_weight=2, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
        objective= 'binary:logistic', nthread=4, scale_pos_weight=2, seed=27, max_delta_step=0)

    gsearch1 = GridSearchCV(estimator = classifier1, param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=custom_cv)

    gsearch1.fit(dmfeatures,dmtarget)

    print('best parameters, scores')
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)

    return gsearch1.best_params_

## Running Grid Search

In [None]:
# Stand-alone grid search: 3 folds built on the sequence ids reduced modulo 1000
# (SEQoriginal=True), with no PCA preprocessing.
run_param_search(
    nfolds=3,
    train=train,
    test=test,
    features=features,
    target='result',
    PCAkey=False,
    SEQoriginal=True,
)

## Running Training, Prediction and Creating Submission

In [None]:
# Train and predict for a single patient using parameters stored in a JSON file.
# `mode` selects the patient (0 would mean global training over all patients).
mode=3
feature_model=2
short_size=False
num_features=6
new_test=True
# The original cell left these three assignments empty, which is a syntax error.
# The values below are the ones the per-patient analysis cell passes to
# read_test_train_per_patient.
# NOTE(review): confirm they are the intended filter settings for this feature model.
fini=4
fend=40
fovr=0

params_training='parameter-file-mode-3-2016-11-22-14-16 (copy).txt'

with open(params_training) as params_fh:
    read_params = json.load(params_fh, object_pairs_hook=OrderedDict)


# Name of the feature file that encodes the filter parameters.
# `short_dataset` was undefined in this cell; the flag defined above is `short_size`.
feature_file='_'+str(feature_model)+'_short_'+str(short_size)+'_new_test_'+ str(new_test)\
            +'_fini_'+str(fini)+'_fend_'+str(fend)+'_fovr_'+str(fovr)

# `patient_id` was undefined here; `mode` is the patient number when it is non-zero.
# NOTE(review): verify against the file names written by generating_files.
file_name_train="simple_train_" + str(mode) + feature_file + ".csv"
# NOTE(review): this repeats the *train* prefix and the variable is not used below;
# presumably a test-file name was intended.
file_name_test="simple_train_" + str(mode) + feature_file + ".csv"

import os

# Generate the feature files only when the training file is not there yet.
if not os.path.isfile('./'+file_name_train):
    generating_files(feature_model, short_size, num_features, new_test,fini, fend, fovr)


time.sleep(10)
# Read data from the generated csv files
all_data=read_test_train_per_patient(feature_model, short_size=False, new_test=True)

if mode != 0:

    train, test, features=all_data[mode-1][0],all_data[mode-1][1],all_data[mode-1][2]


params_model=read_params[0]
prm=read_params[1]

score_list=[]

for item in range(1):

    folds_iter=item+3

    # run_train_predict returns five values (prediction, score, out-of-fold dict,
    # out-of-fold list, filtered test frame); unpacking four raised ValueError.
    # NOTE(review): the other cells also pass num_features, channels and the csp_*
    # arguments; check whether this call needs them as well.
    prediction, score, train_result1, train_result2, test_out = run_train_predict(train, test, features, 'result', params_model,
                                                  nfolds=prm['nfolds'], random_state=prm['random_state'],
                                                  mode=mode, SEQoriginal=prm['SEQoriginal'], PCAkey=prm['PCAkey'], 
                                                  PCAgraph=prm['PCAgraph'], PCAkeyGS=prm['PCAkeyGS'],
                                                  Oversampling=prm['Oversampling'], GridSearch=True,
                                                  pred_per_patient=False)

    score_list.append(score)

print(score_list)

#create_submission(score, test, prediction, feature_model, short_size, new_test)


## Training and Analysis per patient

In [None]:
# Per-patient training with CSP features: for each half of the CSP window and
# each of the six sequence numbers, train, predict and collect the results.
mode=3
feature_model=5
short_size=False
num_features=9
new_test=True
channels=16
#fini=
#fend=
#fovr=

#params_training='parameter-file-mode-3-2016-11-22-14-16 (copy).txt'
params_training='parameter-file-mode-3-2016-11-27-16-29.txt'


with open(params_training) as params_fh:
    read_params = json.load(params_fh, object_pairs_hook=OrderedDict)

all_data=read_test_train_per_patient(feature_model, short_size=False, new_test=True, fini=4, fend=40, fovr=0)

if mode != 0:

    train, test, features=all_data[mode-1][0],all_data[mode-1][1],all_data[mode-1][2]


params_model=read_params[0]
prm=read_params[1]

score_list=[]
test_seq=[]
prediction_list=[]
train_result2_list=[]


for part_iter in range(2):

    # First pass uses the first half of the CSP window, second pass the second half.
    if part_iter==0:
        csp_init=0
        csp_end=0.5
    else:
        csp_init=0.5
        csp_end=1

    for item in range(6):

        folds_iter=4

        random_iter=2016

        seq_iter=item+1

        # Pass the window chosen above. The call used to hard-code
        # csp_init=0, csp_end=0.5, so both passes processed the same half.
        prediction, score, train_result1, train_result2, test_out = run_train_predict(train, test, features, 'result', 
                                               params_model,num_features, channels, csp_n=16,seq_number=seq_iter,
                                                csp_init=csp_init,csp_end=csp_end, 
                                                  nfolds=folds_iter, random_state=random_iter,
                                                  mode=mode, SEQoriginal=prm['SEQoriginal'], PCAkey=prm['PCAkey'], 
                                                  PCAgraph=prm['PCAgraph'], PCAkeyGS=prm['PCAkeyGS'],
                                                  Oversampling=prm['Oversampling'], GridSearch=False,
                                                  pred_per_patient=False, CSPkey=True, CSPkey1=False)
        prediction_list.append(prediction)
        test_seq.append(test_out)
        score_list.append(score)
        train_result2_list.append(train_result2)

print(score_list)
print(prediction.shape)

# Only the results of the last run are submitted here.
create_submission(score, test_out, prediction, feature_model, short_size, new_test)

In [None]:
# Average the test predictions and the out-of-fold train predictions over all
# runs collected in prediction_list / train_result2_list.
final_train_pred=np.zeros(len(train_result2_list[0]))
final_test_pred=np.zeros(len(prediction_list[0]))
for i in prediction_list:
    final_test_pred+=i
# Divide by the number of accumulated runs rather than a hard-coded 6.
final_test_pred=np.divide(final_test_pred,len(prediction_list))

for i in train_result2_list:
    final_train_pred+=i
# The train average must come from the train accumulator; it was previously
# overwritten with the (already averaged) test predictions.
final_train_pred=np.divide(final_train_pred,len(train_result2_list))
#print(final_array)
print(final_train_pred)
print(final_test_pred)


In [None]:
# Scratch cell: experiments with dropping inf rows and with column-major reshapes.
#x = np.random.rand(5,5)

#print (x)

#test11=x[:,x.shape[1]-1]

#print(test11,type(test11))

#test12=np.int64(test11)
#print(test12)
#z=x.shape[0]-1

#y=np.delete(x,x.shape[1]-1, 1)

#print(y)
#x[0] = -np.inf
#print(x)

#xf=x[~np.isinf(x)]  # 1D array with infs removed
#print(xf)
#xf=x[~np.any(np.isinf(x), axis=1)]  # 2D array with rows containing inf removed

#print (xf)

# Restored: `a` was printed below while its definition was commented out (NameError).
a = np.arange(6).reshape((3, 2), order='F')

print(a)

# Stored under its own name so the boolean `new_test` flag used by the other
# cells is not overwritten with an array.
first_test_row = test[features].iloc[[0]].drop(['file_size','patient_id'],axis=1).values

print(first_test_row.astype(np.int64))

# Column-major reshape of the flat feature row into a 9 x 16 table.
out1=np.reshape(first_test_row,(9, 16), order='F')


print(out1.shape)


## AUC analysis for 1 patient at a time

In [None]:
# Compare the distribution of out-of-fold train predictions for positive and
# negative samples with the distribution of the test predictions.
patient_train=all_data[mode-1][0]

train_id=patient_train['Id']

# Ids of the positive samples
train_true = patient_train[patient_train['result'] == 1]['Id']
print(train_true.shape)

# Ids of the negative samples
train_false = patient_train[patient_train['result'] == 0]['Id']
print(train_false.shape)

train_result2_final=np.array(train_result2)
print(train_result2_final.shape)

# One row per training sample: its Id and its predicted probability
roc=pd.DataFrame({'id':train_id,'prob':train_result2_final})

# Predicted probabilities split by true class
hist_true=roc.loc[roc.index.isin(train_true.index), 'prob'].values
hist_false=roc.loc[roc.index.isin(train_false.index), 'prob'].values

histogram=plt.figure()

bins = np.linspace(0, 1, 100)

# Blue: positives, green: negatives, red: test predictions (log-scaled counts)
for sample, colour in ((hist_true, 'b'), (hist_false, 'g'), (prediction, 'r')):
    plt.hist(sample, bins, color=colour, log=True, alpha=0.5)
plt.show()



## Running Grid Search, Training and Prediction per patient - All patients

In [None]:
feature_model=4
short_size=False
num_features=9
new_test=True
all_data=read_test_train_per_patient(feature_model, short_size=False, new_test=True)
rescale=False


params_training1='parameter-file-mode-1-2016-11-21-19-41.txt'
params_training2='parameter-file-mode-2-2016-11-21-22-47.txt'
params_training3='parameter-file-mode-3-2016-11-22-01-59.txt'

read_params1 = json.load(open(params_training1), object_pairs_hook=OrderedDict)

read_params2 = json.load(open(params_training2), object_pairs_hook=OrderedDict)

read_params3 = json.load(open(params_training3), object_pairs_hook=OrderedDict)

all_params=[]
all_params.append(read_params1)
all_params.append(read_params2)
all_params.append(read_params3)

print (all_params)


#print(len(all_data), len(all_data[0]), all_data[0][1].shape)

# Per-patient training and prediction. Each entry of all_data is indexed as
# item[0], item[1], item[2] and passed straight to run_train_predict; each entry of
# all_params is indexed as [0] (model parameters) and [1] (run options).
# The four returned values are collected in parallel lists, one element per patient.
train_result1=[]
train_result2=[]
prediction=[]
score=[]
for i, item in enumerate(all_data):
    
    # Parameters are matched to patients by position.
    read_params_patient=all_params[i]
    params_model=read_params_patient[0]
    prm=read_params_patient[1]
    
#    print(item[1])
    # mode indicates '0':Global training (all patients); '1' training and testing on patient one;
    #'2' training and testing on patient two...
    mode=i+1
    
    # `channels` is not defined in this cell; it must already exist in the notebook.
    # GridSearch is switched off and the CSP options are fixed here rather than read
    # from the parameter file.
    temp1, temp2, temp3, temp4 = run_train_predict(item[0], item[1], item[2], 'result',
                                            params_model,num_features, channels, csp_n=16,seq_number=1,csp_init=0,csp_end=1, 
                                            nfolds=prm['nfolds'], random_state=prm['random_state'],
                                            mode=mode, SEQoriginal=prm['SEQoriginal'], PCAkey=prm['PCAkey'], 
                                            PCAgraph=prm['PCAgraph'], PCAkeyGS=prm['PCAkeyGS'],
                                            Oversampling=prm['Oversampling'], GridSearch=False,
                                            pred_per_patient=False, CSPkey=False, CSPkey1=True)
    
   
    
    # temp1 -> prediction, temp2 -> score, temp3/temp4 -> training-set results.
    train_result1.append(temp3)
    train_result2.append(temp4)
    prediction.append(temp1)
    score.append(temp2)

# Optional rescaling of the test predictions, one patient at a time.
# Each patient's predictions are divided by that patient's own maximum.
# (An earlier variant centred them on the median instead:
#  prediction[i] = (prediction[i] - np.median(prediction[i])) * 0.5 + 0.5)

if rescale:
    # Rescale every patient. Previously this used the index left over from the
    # training loop, so only the last patient was touched.
    for patient_idx, patient_pred in enumerate(prediction):
        prediction[patient_idx]=np.divide(patient_pred,np.amax(patient_pred))

# Stack the per-patient test predictions in patient order, average the per-patient
# scores, and stack the matching per-patient test frames in the same order.

predict_total=np.concatenate((prediction[0],prediction[1],prediction[2]))
score_total=sum(score)/len(score)
test=pd.concat([all_data[0][1],all_data[1][1],all_data[2][1]])


create_submission(score_total, test, predict_total, feature_model, short_size, new_test)


In [None]:
# Prepare, for each of the three patients, the training-set probabilities split by
# class. Uses `all_data` and `train_result2` from the training cell above.
# all_data[k][0] is indexed by the 'Id' and 'result' columns for patient k+1.
train1_id,train2_id,train3_id=all_data[0][0]['Id'],all_data[1][0]['Id'],all_data[2][0]['Id']

# Ids of the rows labelled result == 1, per patient.
train1_true, train2_true, train3_true = all_data[0][0][all_data[0][0]['result'] == 1]['Id'],\
                                        all_data[1][0][all_data[1][0]['result'] == 1]['Id'],\
                                        all_data[2][0][all_data[2][0]['result'] == 1]['Id']
        
# Ids of the rows labelled result == 0, per patient.
train1_false, train2_false, train3_false = all_data[0][0][all_data[0][0]['result'] == 0]['Id'],\
                                        all_data[1][0][all_data[1][0]['result'] == 0]['Id'],\
                                        all_data[2][0][all_data[2][0]['result'] == 0]['Id']        
                
        
# Fourth return value of run_train_predict for each patient, as an array.
# The commented lines are an optional median-centred rescaling.
train_result2_1=np.array(train_result2[0])
#train_result2_1 = (train_result2_1 - np.median(train_result2_1)) * 0.5 + 0.5


train_result2_2=np.array(train_result2[1])
#train_result2_2 = (train_result2_2 - np.median(train_result2_2)) * 0.5 + 0.5


train_result2_3=np.array(train_result2[2])
#train_result2_3 = (train_result2_3 - np.median(train_result2_3)) * 0.5 + 0.5


        
#roc1=np.column_stack((train1_id,train_result2_1))
# One frame per patient; each takes its index from the corresponding Id Series,
# which is what the index-based selections below rely on.
roc1=pd.DataFrame({'id':train1_id,'prob':train_result2_1})
roc2=pd.DataFrame({'id':train2_id,'prob':train_result2_2})
roc3=pd.DataFrame({'id':train3_id,'prob':train_result2_3})
#print(type(train1_true),type(roc1))

#histograms

# Patient 1: probabilities of the result == 1 rows and of the result == 0 rows.
hist_true_1=roc1[roc1.index.isin(train1_true.index)]['prob'].values
hist_false_1=roc1[roc1.index.isin(train1_false.index)]['prob'].values

#scaling

#hist_true_1 = (hist_true_1 - np.median(hist_true_1)) * 0.5 + 0.5
#hist_false_1 = (hist_false_1 - np.median(hist_false_1)) * 0.5 + 0.5
#print(hist_true_1.shape)


#histograms

# Patient 2.
hist_true_2=roc2[roc2.index.isin(train2_true.index)]['prob'].values
hist_false_2=roc2[roc2.index.isin(train2_false.index)]['prob'].values

#scaling

#hist_true_2 = (hist_true_2 - np.median(hist_true_2)) * 0.5 + 0.5
#hist_false_2 = (hist_false_2 - np.median(hist_false_2)) * 0.5 + 0.5


#histograms

# Patient 3.
hist_true_3=roc3[roc3.index.isin(train3_true.index)]['prob'].values
hist_false_3=roc3[roc3.index.isin(train3_false.index)]['prob'].values


#scaling

#hist_true_3 = (hist_true_3 - np.median(hist_true_3)) * 0.5 + 0.5
#hist_false_3 = (hist_false_3 - np.median(hist_false_3)) * 0.5 + 0.5

# Leftover debugging prints.
#print(roc1.isin(np.array(train1_true))['id'].shape)
#print(prediction[0], prediction[0].shape)
#print(train1_id.shape)
#print(train_result2_1.shape, train1_id.shape)
#print(roc1)
#print(len(all_data))


In [None]:
# One figure per patient. In each figure: blue = training rows with result == 1,
# green = training rows with result == 0, red = that patient's test predictions.
# All histograms share 100 bin edges over [0, 1] and use a log count axis.
# Patient 1.
histogram=plt.figure()

bins = np.linspace(0, 1, 100)

plt.hist(hist_true_1, bins,color='b', log=True, alpha=0.5)
plt.hist(hist_false_1, bins, log=True,color='g', alpha=0.5)
plt.hist(prediction[0], bins, log=True,color='r', alpha=0.5)
plt.show()


# Patient 2.
histogram1=plt.figure()

bins = np.linspace(0, 1, 100)

plt.hist(hist_true_2, bins, color='b', log=True, alpha=0.5)
plt.hist(hist_false_2, bins,log=True,color='g', alpha=0.5)
plt.hist(prediction[1], bins, log=True,color='r', alpha=0.5)
plt.show()

# Patient 3.
# NOTE(review): the figure handle is spelled `histogra3`, unlike `histogram` and
# `histogram1` above; none of the handles is used again in this cell.
histogra3=plt.figure()

bins = np.linspace(0, 1, 100)

plt.hist(hist_true_3, bins,log=True,color='b', alpha=0.5)
plt.hist(hist_false_3, bins,log=True,color='g', alpha=0.5)
plt.hist(prediction[2], bins, log=True,color='r', alpha=0.5)
plt.show()



## Modification per patient with filter customization (in progress)

In [None]:
# Run configuration for the filter-customisation experiment below.
feature_model=3
short_size=False
num_features=6
new_test=True
# Pass the settings defined above instead of repeating their literal values, so the
# loaded data and the later create_submission() call cannot drift apart.
all_data=read_test_train_per_patient(feature_model=feature_model, short_size=short_size, new_test=new_test)
rescale=False


# One parameter file per patient, in patient order (1, 2, 3).
params_training1='parameter-file-mode-1-2016-11-22-14-16 (copy).txt'
params_training2='parameter-file-mode-2-2016-11-22-14-16 (copy).txt'
params_training3='parameter-file-mode-3-2016-11-22-14-16 (copy).txt'

# Each file is JSON; objects are loaded as OrderedDict so key order is preserved.
# The files are opened with a context manager so the handles are closed right away.
all_params=[]
for params_path in (params_training1, params_training2, params_training3):
    with open(params_path) as params_file:
        all_params.append(json.load(params_file, object_pairs_hook=OrderedDict))

# Keep the individual names available for any cell that still refers to them.
read_params1, read_params2, read_params3 = all_params

print (all_params)


#print(len(all_data), len(all_data[0]), all_data[0][1].shape)

# Regenerate feature CSVs in child processes when the requested filter setting has
# no results yet, then read the combined train/test data.
# NOTE(review): `param_filter` and `folder_results` are not defined in this cell;
# they must come from elsewhere in the notebook - confirm before running.
if param_filter not in folder_results:
    
    if __name__ == '__main__':
        print('XGBoost: {}'.format(xgb.__version__))
        if 1:
            # Do reading and processing of MAT files in parallel
            # Only the patient-3 test CSV job is currently enabled; the other five
            # jobs (train 1-3, test 1-2) are commented out.
            p = dict()
    #        p[1] = Process(target=create_simple_csv_train, args=(1,feature_model,num_features,short_size,new_test))
    #        p[1].start()
    #        p[2] = Process(target=create_simple_csv_train, args=(2,feature_model,num_features,short_size,new_test))
    #        p[2].start()
    #        p[3] = Process(target=create_simple_csv_train, args=(3,feature_model,num_features,short_size,new_test))
    #        p[3].start()
    #        p[4] = Process(target=create_simple_csv_test, args=(1,feature_model,num_features,short_size,new_test))
    #        p[4].start()
    #        p[5] = Process(target=create_simple_csv_test, args=(2,feature_model,num_features,short_size,new_test))
    #        p[5].start()
            # NOTE(review): five positional arguments are passed here. The
            # create_simple_csv_test defined further down in this file expects
            # fini, fend, fovr after num_features, so this call only matches an
            # older signature - confirm which definition is in scope.
            p[6] = Process(target=create_simple_csv_test, args=(3,feature_model,num_features,short_size,new_test))
            p[6].start()
    #        p[1].join()
    #        p[2].join()
    #        p[3].join()
    #        p[4].join()
    #        p[5].join()
            # Wait for the child process to finish.
            p[6].join()

        
# Fixed 5 second pause before the data is read.
time.sleep(5)

if __name__ == '__main__':
    print('XGBoost: {}'.format(xgb.__version__))
    
    # Combined data set and feature list, printed as a quick sanity check.
    train, test, features = read_test_train(feature_model, short_size, new_test)
    print('Length of train: ', len(train))
    print('Length of test: ', len(test))
    print('Features [{}]: {}'.format(len(features), sorted(features)))










# Per-patient training and prediction for the filter-customisation experiment.
# Each entry of all_data is indexed as item[0], item[1], item[2]; each entry of
# all_params as [0] (model parameters) and [1] (run options).
train_result1=[]
train_result2=[]
prediction=[]
score=[]
for i, item in enumerate(all_data):
    
    # Parameters are matched to patients by position.
    read_params_patient=all_params[i]
    params_model=read_params_patient[0]
    prm=read_params_patient[1]
    
    
    # mode indicates '0':Global training (all patients); '1' training and testing on patient one;
    #'2' training and testing on patient two...
    mode=i+1
    
    # NOTE(review): unlike the all-patients cell earlier in the notebook, this call
    # passes no num_features / channels / CSP arguments - confirm it matches the
    # run_train_predict signature currently in scope.
    temp1, temp2, temp3, temp4 = run_train_predict(item[0], item[1], item[2], 'result', params_model, nfolds=prm['nfolds'],
                                      random_state=prm['random_state'],
                                      mode=mode, SEQoriginal=prm['SEQoriginal'], PCAkey=prm['PCAkey'], 
                                      PCAgraph=prm['PCAgraph'], PCAkeyGS=prm['PCAkeyGS'],
                                      Oversampling=prm['Oversampling'], GridSearch=False,
                                      pred_per_patient=False)
    
    
    # temp1 -> prediction, temp2 -> score, temp3/temp4 -> training-set results.
    train_result1.append(temp3)
    train_result2.append(temp4)
    prediction.append(temp1)
    score.append(temp2)

# Optional median rescale: each patient's predictions are centred on their own
# median, halved, and shifted by 0.5.

if rescale:
    for i in range(len(all_data)):
        prediction[i] = (prediction[i] - np.median(prediction[i])) * 0.5 + 0.5


# Stack the three patients' predictions in patient order, average their scores,
# and stack the matching test frames in the same order.

per_patient_predictions = (prediction[0], prediction[1], prediction[2])
predict_total = np.concatenate(per_patient_predictions)

score_total = sum(score) / len(score)

per_patient_tests = [all_data[0][1], all_data[1][1], all_data[2][1]]
test = pd.concat(per_patient_tests)


create_submission(score_total, test, predict_total,
                  feature_model, short_size, new_test)


## Modules for PCA Graphic Analysis (in progress)

In [None]:
# Side-by-side scatter plots of a 2-D projection before and after resampling.
# `X_vis`, `y`, `X_res_vis` and `y_resampled` are not defined in this cell and must
# already exist in the notebook.
import seaborn as sns

sns.set()


almost_black = '#262626'
palette = sns.color_palette()

f, (ax1, ax2) = plt.subplots(1, 2)

# Left panel: first two columns of X_vis, coloured by class (y == 0 / y == 1).
ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

# Right panel: the same view of X_res_vis, coloured by y_resampled.
ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('SMOTE svm')


# NOTE(review): labels are set on every scatter but no legend() call is made in
# this cell, so they are not displayed.

plt.show()

In [None]:
# Scratch cell experimenting with MNE's CSP. `csp_data`, `y`, `train`, `features`
# and `X` are not defined here and must already exist in the notebook.
from mne.decoding import CSP

pcatest=CSP(n_components=4)
# NOTE(review): fit() presumably returns the fitted estimator (scikit-learn
# convention), so X_test would be the CSP object rather than transformed data -
# it is used as an estimator again below. Confirm against the MNE documentation.
X_test=pcatest.fit(csp_data,y)

# Labels and feature matrix taken from the tabular training frame.
ydata=np.array(train['result'].values)
#y_resampled=y

Xdata=np.array(train[features])

print(X.ndim)
        
#       PCA transformation 
print(X_test)

# NOTE(review): Xdata is built from a DataFrame column selection and is therefore
# 2-D; CSP presumably expects a 3-D (epochs, channels, samples) array - TODO confirm.
newX=X_test.fit_transform(Xdata,ydata)
#X_vis=X_test[:,0:2]
#X_res_vis=X_test[:,2:4]

#print(X_vis.shape,X_res_vis.shape, X_test.shape)

In [None]:
# Refit the object stored in X_test on csp_data / y and show the transformed output.
# `X_test`, `csp_data` and `y` come from earlier cells.
newX=X_test.fit_transform(csp_data,y)

print(newX)

In [None]:
import mne
from mne.decoding import CSP

# Build a 3-D array from the first 100 training files of patient 1 and run CSP on it.
# Only files flagged safe == 1 in train_and_test_data_labels_safe.csv are used; the
# label of each file is the third '_'-separated field of its name.

source_dir="./train_"
patient_id=1
files = sorted(glob.glob(source_dir + str(patient_id) + "/*.mat"), key=natural_key)
print ('train files'+ str(patient_id), len(files))

# Load the first file up front so `tables`, `sequence_from_mat` and `samp_freq`
# exist for later cells even if the loop below accepts nothing.
fl=files[0]
tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
data_csp=np.transpose(tables.values)

files1=files[0:100]

print(len(files1))


new_train = pd.read_csv('train_and_test_data_labels_safe'+'.csv')
new_data = new_train['image']

selection = new_train[new_train['safe'] == 1].drop('safe', axis=1)
# Build the lookup once; testing membership against the ndarray rescans it per file.
safe_images = set(selection['image'].values)

result_list=[]
epoch_matrices=[]   # one transposed `tables.values` matrix per accepted file
count=0
track1=0

for i, fl1 in enumerate(files1):
    # While no safe file has been accepted yet, track1 follows i. The first safe
    # file is the "seed" and is always accepted.
    is_seed = (i==track1)
    if is_seed:
        print('checking')

    if os.path.basename(fl1) not in safe_images:
        if is_seed:
            track1+=1
            print('still')
        continue

    id_str = os.path.basename(fl1)[:-4]
    arr = id_str.split("_")
    result = int(arr[2])

    if is_seed:
        # Load the file that was actually accepted. Previously files[0] was reloaded
        # here, which paired this file's label with another file's data whenever
        # files[0] itself was not safe.
        fl=fl1
        tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
        epoch_matrices.append(np.transpose(tables.values))
        result_list.append(result)
        print('done!')
        continue

    try:
        tables1, sequence_from_mat1, samp_freq1 = mat_to_pandas(fl1)
    except Exception:
        # Report the file that failed (fl1), not the seed file.
        print('Some error here {}...'.format(fl1))
        continue

    # After the seed, only files whose sequence value equals 1 are kept.
    if sequence_from_mat1==1:
        epoch_matrices.append(np.transpose(tables1.values))
        result_list.append(result)

if not epoch_matrices:
    raise ValueError('No safe training file found among the first {} files'.format(len(files1)))

# Stack once along a new third axis. This also yields a 3-D array when a single
# file was accepted, which the transpose below requires.
data_csp=np.dstack(epoch_matrices)

y=np.array(result_list)
print(y.shape)


#print(data_csp.shape)

# Move the file axis to the front.
csp_data=data_csp.transpose((2,0,1))

#print(csp_data.shape)

CSPtest=CSP(n_components=4)
#X_test=CSPtest.fit(csp_data,y)
newX=CSPtest.fit_transform(csp_data,y)

print(newX)


In [None]:
# Inspect the stacked array built in the previous cell and wrap the last loaded
# `tables` frame in an MNE RawArray. Uses `result_list`, `data_csp`, `tables` and
# `samp_freq` from earlier cells.
y=np.array(result_list)
print(y.shape)
print(data_csp[0][0][3])

print(data_csp.shape)    

# Move the third axis of data_csp to the front.
csp_data=data_csp.transpose((2,0,1))

print(csp_data.shape)



#csp_train = csp_data.copy().crop(tmin=1., tmax=600.)




# Channel names are the stringified column labels of `tables`; the signal matrix
# is its transpose.
ch_names=[str(i) for i in tables.columns.values]
eeg_matrix=np.transpose(tables.values)

print(type(ch_names))

print(type(eeg_matrix), eeg_matrix.shape)

# NOTE(review): the `montage` keyword and `ch_types=None` may not be accepted by
# newer MNE releases - confirm against the installed version.
info_eeg=mne.create_info(ch_names, samp_freq, ch_types=None, montage=None)


raw=mne.io.RawArray(eeg_matrix, info_eeg)

#events=mne.find_events(raw, stim_channel='STI 014', output='onset', consecutive='increasing', min_duration=0,
 #               shortest_event=2, mask=None, uint_cast=False, mask_type=None, verbose=None)

#tmin, tmax = -1., 4.
#event_id = dict(hands=2, feet=3)
#subject = 1
#runs = [6, 10, 14]  # motor imagery: hands vs feet

#raw_fnames = eegbci.load_data(subject, runs)
#raw_files = [read_raw_edf(f, preload=True) for f in raw_fnames]
#raw = concatenate_raws(raw_files)

## strip channel names of "." characters
#raw.rename_channels(lambda x: x.strip('.'))

## Apply band-pass filter
#raw.filter(7., 30., method='iir')

#events = find_events(raw, shortest_event=0, stim_channel='STI 014')

#picks = pick_types(raw.info, meg=False, eeg=True, stim=False, eog=False,
#                   exclude='bads')

## Read epochs (train will be done only between 1 and 2s)
## Testing will be done with a running classifier

#events = np.array([
#    [0, 1, 1],
#    [1, 1, 2],
#    [2, 1, 1],
#    [3, 1, 2],
#    [4, 1, 1],
#    [5, 1, 2],
#    [6, 1, 1],
#    [7, 1, 2],
#    [8, 1, 1],
#    [9, 1, 2],
#])

#event_id = dict(smiling=1, frowning=2)

#epochs = mne.Epochs(raw, events, event_id=None, tmin=-0.2, tmax=0.5, baseline=(None, 0),
#                    picks=None, name='Unknown', preload=False, reject=None, flat=None, proj=True,
#                    decim=1, reject_tmin=None, reject_tmax=None, detrend=None, add_eeg_ref=None,
#                    on_missing='error', reject_by_annotation=True, verbose=None)
#epochs_train = epochs.copy().crop(tmin=1., tmax=2.)
#labels = epochs.events[:, -1] - 2



#mne.decoding.CSP(n_components=4, reg=None, log=None, cov_est='concat', transform_into='average_power')

In [None]:
# Small example parameter grid. The keys look like scikit-learn pipeline
# "<step>__<parameter>" names - confirm against the pipeline they are used with.
params_test = {
    'reduce_dim__n_components': [2, 5, 10],
    'clf__C': [0.1, 10, 100],
}

print(params_test)

## Creating arrays for CSP

In [None]:
#Functions that build the train and test feature CSV files.
#short_dataset can be False or True. It decides whether to use the long or the short sample size.

def create_simple_csv_train(patient_id, feature_model, num_features, fini, fend, fovr,
                            short_dataset=False, new_test=False):
    """Write the training feature CSV for one patient.

    Reads every ``*.mat`` file in the patient's training directory that is flagged
    ``safe == 1`` in ``train_and_test_data_labels_safe.csv``, computes the features of
    each channel with ``feature_eng`` and writes one row per file to
    ``simple_train_<patient_id><suffix>.csv``.

    File names are parsed as ``<patient>_<segment>_<label>.mat``. Rows carry an
    ``Id`` (``patient*100000 + segment``), a ``sequence_id`` derived from running
    counts of label-1 (preictal) and label-0 (interictal) files in blocks of six,
    the patient number, the feature columns, the file size and the label.

    Args:
        patient_id: patient number; selects the source directory and output file.
        feature_model: feature set selector, forwarded to ``feature_eng``.
        num_features: number of ``band_<j>`` columns written per channel in the header.
        fini, fend, fovr: filter settings forwarded to ``feature_eng`` and embedded
            in the output file name.
        short_dataset: read from ``./data/train_<id>`` instead of ``./train_<id>``.
        new_test: only affects the output file name here.

    Files that ``mat_to_pandas`` cannot load are reported and skipped, as are files
    whose label is neither 0 nor 1.
    """
    feature_file='_'+str(feature_model)+'_short_'+str(short_dataset)+'_new_test_'+ str(new_test)\
            +'_fini_'+str(fini)+'_fend_'+str(fend)+'_fovr_'+str(fovr)

    source_dir = "./data/train_" if short_dataset else "./train_"

    # Header: 16 channels x num_features bands.
    columns = ''.join('ch_'+str(i)+'_'+"band_"+str(j)+","
                      for i in range(16) for j in range(num_features))

    # The context manager closes the output even if processing fails part-way.
    with open("simple_train_" + str(patient_id) + feature_file + ".csv", "w") as out:
        out.write("Id,sequence_id,patient_id,")
        out.write(columns+"file_size,result\n")

        # TRAIN (0)
        out_str = ''
        files = sorted(glob.glob(source_dir + str(patient_id) + "/*.mat"), key=natural_key)
        print ('train files'+ str(patient_id), len(files))
        pos1=0
        neg1=0
        sequence_id_pre = int(patient_id)*1000
        sequence_id_inter = int(patient_id)*1000
        total_pre = 0
        total_inter=0
        seq1=0

        new_train = pd.read_csv('train_and_test_data_labels_safe'+'.csv')
        selection = new_train[new_train['safe'] == 1].drop('safe', axis=1)
        # Build the lookup once instead of scanning the column for every file.
        safe_images = set(selection['image'].values)

        for fl in files:

            # print('Go for ' + fl)

            if os.path.basename(fl) not in safe_images:
                continue

            id_str = os.path.basename(fl)[:-4]
            arr = id_str.split("_")
            patient = int(arr[0])
            segment_id = int(arr[1])
            result = int(arr[2])

            # The counters advance before the file is loaded, so a file that later
            # fails to load still occupies a slot in its sequence.
            if result == 1:

                total_pre += 1
                sequence_id=int(patient_id)*1000+int((total_pre-1) // 6) + int((total_inter-1) // 6) + 1

            elif result == 0:

                total_inter += 1
                sequence_id=int(patient_id)*1000+int((total_pre) // 6) + int((total_inter-1) // 6)

            else:
                # Without this branch sequence_id would be undefined (first file)
                # or silently reused from the previous file.
                print('Unexpected label {} in {}, skipping...'.format(result, fl))
                continue

            new_id = int(patient*100000 + segment_id)
            try:
                tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
            except Exception:
                print('Some error here {}...'.format(fl))
                continue
            print(sequence_id)
            out_str += str(new_id) + "," + str(sequence_id) + "," + str(patient)

            sizesignal=int(tables.shape[0])

            # Channels are processed in sorted column order; feature_eng returns the
            # updated row text (presumably out_str with this channel's features
            # appended - see feature_eng).
            for f in sorted(list(tables.columns.values)):

                out_str=feature_eng(tables[f], out_str,feature_model, sizesignal, samp_freq,  fini, fend, fovr,)

            out_str += "," + str(os.path.getsize(fl)) + "," + str(result) + "\n"
            #print(sequence_from_mat)
            #print(type(sequence_from_mat))
            seq1=int(sequence_from_mat[0][0][0][0])
            print('total preictal: ', total_pre,' total interictal: ', total_inter,' sequence local: ', seq1)
            # Progress reporting only: none of the counters below reaches the CSV.
            if (total_pre % 6 == 0) and result == 1:
                    pos1 += 1
                    print('Positive ocurrence sequence finished', pos1)
                    if (seq1==6):
                        sequence_id_pre += 1
                        print ('sequence preictal next',sequence_id_pre)

            if (total_inter % 6 == 0) and result == 0:
                    neg1 += 1
                    print('Negative ocurrence sequence finished', neg1)
                    if (seq1==6):
                        sequence_id_inter += 1
                        print ('sequence interictal next',sequence_id_inter)

        out.write(out_str)

    print('Train CSV for patient {} has been completed...'.format(patient_id))


def create_simple_csv_test(patient_id, feature_model, num_features, fini, fend, fovr,
                           short_dataset=False, new_test=False):
    """Write the test feature CSV for one patient.

    Reads every ``*.mat`` file in the patient's test directory, computes the
    features of each channel with ``feature_eng`` and writes one row per file to
    ``simple_test_<patient_id><suffix>.csv``.

    File names are parsed as ``[new_]<patient>_<segment>.mat``. Rows carry an ``Id``
    (``patient*100000 + segment``), the patient number, the feature columns and the
    file size.

    Args:
        patient_id: patient number; selects the source directory and output file.
        feature_model: feature set selector, forwarded to ``feature_eng``.
        num_features: number of ``band_<j>`` columns written per channel in the header.
        fini, fend, fovr: filter settings forwarded to ``feature_eng`` and embedded
            in the output file name.
        short_dataset: read from ``./data/test_<id>`` instead of ``./test_<id>``.
        new_test: read from the directory with the ``_new`` suffix.

    Files that ``mat_to_pandas`` cannot load are reported and skipped.
    """
    feature_file='_'+str(feature_model)+'_short_'+str(short_dataset)+'_new_test_'+str(new_test)\
            +'_fini_'+str(fini)+'_fend_'+str(fend)+'_fovr_'+str(fovr)

    source_dir = "./data/test_" if short_dataset else "./test_"
    new_label = "_new" if new_test else ''

    # TEST
    out_str = ''
    files = sorted(glob.glob(source_dir + str(patient_id) + new_label + "/*.mat"), key=natural_key)
    print ('test files'+ str(patient_id), len(files))

    # Header: 16 channels x num_features bands.
    columns = ''.join('ch_'+str(i)+'_'+"band_"+str(j)+","
                      for i in range(16) for j in range(num_features))

    # The context manager closes the output even if processing fails part-way.
    with open("simple_test_" + str(patient_id) + feature_file + ".csv", "w") as out:
        out.write("Id,patient_id,")
        out.write(columns+"file_size\n")

        for fl in files:
            # print('Go for ' + fl)
            id_str = os.path.basename(fl)[:-4]
            # Drop the "new_" prefix only when it is present. The previous fixed
            # [4:-4] slice cut into the patient/segment numbers of unprefixed files.
            if id_str.startswith('new_'):
                id_str = id_str[4:]
            arr = id_str.split("_")
            patient = int(arr[0])
            segment_id = int(arr[1])
            new_id = int(patient*100000 + segment_id)
            try:
                tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
            except Exception:
                print('Some error here {}...'.format(fl))
                continue
            out_str += str(new_id) + "," + str(patient)

            sizesignal=int(tables.shape[0])

            # Channels are processed in sorted column order; feature_eng returns the
            # updated row text (presumably out_str with this channel's features
            # appended - see feature_eng).
            for f in sorted(list(tables.columns.values)):

                out_str=feature_eng(tables[f], out_str,feature_model, sizesignal, samp_freq, fini, fend, fovr,)

            out_str += "," + str(os.path.getsize(fl)) + "\n"
            # break

        out.write(out_str)

    print('Test CSV for patient {} has been completed...'.format(patient_id))
