## Initial Package Import and Simple Module definition

Modified from ZFTurbo

In [81]:
%matplotlib inline


import shutil
import json

import datetime
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from scipy.io import loadmat
from operator import itemgetter
import random
import os
import time
import glob
import re
from multiprocessing import Process
import copy



#Importing old and new Kfold
from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold as NewKF
from sklearn.model_selection import StratifiedKFold as StratKF

#Importing GroupKfold, only available since version 0.18
from sklearn.model_selection import GroupKFold


#Importing function for scaling data before PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale

#Importing PCA packages
from sklearn.decomposition import PCA, KernelPCA

#Importing FFT package
from scipy.fftpack import fft

#Importing crossvalidation metrics and Gridsearch
from sklearn import cross_validation, metrics
from sklearn.model_selection import GridSearchCV

#Importing wrapper to use XGB with Gridsearch

from xgboost.sklearn import XGBClassifier

#Importing plotting packages (optional)

import matplotlib.pylab as plt

from pandas.tools.plotting import scatter_matrix

from sklearn.model_selection import LeavePGroupsOut
from sklearn.model_selection import GroupShuffleSplit

#Oversampling

from imblearn.over_sampling import SMOTE

from collections import Counter, OrderedDict

from imblearn.combine import SMOTETomek

###Band Frequency filtering###
from scipy.signal import cheby2, butter, lfilter



#Defining general modules used in the classification

random.seed(2016)
np.random.seed(2016)


def natural_key(string_):
    return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_)]


def create_feature_map(features):
    outfile = open('xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def get_importance(gbm, features):
    create_feature_map(features)
    importance = gbm.get_fscore(fmap='xgb.fmap')
    importance = sorted(importance.items(), key=itemgetter(1), reverse=True)
    return importance


def intersect(a, b):
    return list(set(a) & set(b))


def print_features_importance(imp):
    for i in range(len(imp)):
        print("# " + str(imp[i][1]))
        print('output.remove(\'' + imp[i][0] + '\')')


def mat_to_pandas(path):
    mat = loadmat(path)
    names = mat['dataStruct'].dtype.names
    ndata = {n: mat['dataStruct'][n][0, 0] for n in names}
    samp_freq = ndata['iEEGsamplingRate'][0, 0]
    sequence = -1
    if 'sequence' in names:
        sequence = mat['dataStruct']['sequence']
    return pd.DataFrame(ndata['data'], columns=ndata['channelIndices'][0]), sequence, samp_freq

def create_submission(score, test, prediction, feature_model, short_size, new_test):
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + 'model_'+str(feature_model)+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'_'+ str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    f = open(sub_file, 'w')
    f.write('File,Class\n')
    total = 0
    for id in test['Id']:
        patient = id // 100000
        fid = id % 100000
        str1 = 'new_' + str(patient) + '_' + str(fid) + '.mat' + ',' + str(prediction[total])
        str1 += '\n'
        total += 1
        f.write(str1)
    f.close()


def get_features(train, test):
    trainval = list(train.columns.values)
    testval = list(test.columns.values)
    output = intersect(trainval, testval)
    output.remove('Id')
    # output.remove('file_size')
    return sorted(output)

    


## Module for Creating Features and Saving to CSV Files

#### One file per patient per test/training

In [2]:
#Modules to read train and test data.
#Short_dataset can be False or TRue. It decides whether to use the lon or short sample size.

def create_simple_csv_train(patient_id, feature_model, num_features, short_dataset=False, new_test=False):
    
    feature_file='_'+str(feature_model)+'_short_'+str(short_dataset)+'_new_test_'+ str(new_test)
    
    if short_dataset:
        
        source_dir="./data/train_"
    else:
        source_dir="./train_"
    
    new_label=''
    if new_test:
        
        new_label='_new'

    out = open("simple_train_" + str(patient_id) + feature_file + ".csv", "w")
    out.write("Id,sequence_id,patient_id,")
  
    columns=''
    for i in range(16):
        for j in range(num_features):
            columns+= 'ch_'+str(i)+'_'+"band_"+str(j)+","        

    out.write(columns+"file_size,result\n")

    # TRAIN (0)
    out_str = ''
    files = sorted(glob.glob(source_dir + str(patient_id) + "/*.mat"), key=natural_key)
    print ('train files'+ str(patient_id), len(files))    
    pos1=0
    neg1=0
    sequence_id_pre = int(patient_id)*1000
    sequence_id_inter = int(patient_id)*1000
    total_pre = 0
    total_inter=0
    seq1=0
    
    new_train = pd.read_csv('train_and_test_data_labels_safe'+'.csv')
    new_data = new_train['image']
    
    selection = new_train[new_train['safe'] == 1].drop('safe', axis=1)
    
    for fl in files:
        
        # print('Go for ' + fl)
               
        if os.path.basename(fl) not in selection['image'].values:
            continue
        
        id_str = os.path.basename(fl)[:-4]
        arr = id_str.split("_")
        patient = int(arr[0])
        id = int(arr[1])
        result = int(arr[2])
        
        if result == 1:
            
            total_pre += 1
            sequence_id=int(patient_id)*1000+int((total_pre-1) // 6) + int((total_inter-1) // 6) + 1

            
        elif result == 0:
            
            total_inter += 1            
            sequence_id=int(patient_id)*1000+int((total_pre) // 6) + int((total_inter-1) // 6)

        
        new_id = int(patient*100000 + id)
        try:
            tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
        except:
            print('Some error here {}...'.format(fl))
            continue
        print(sequence_id)
        out_str += str(new_id) + "," + str(sequence_id) + "," + str(patient)

        sizesignal=int(tables.shape[0])       
        
        for f in sorted(list(tables.columns.values)):
            
            out_str=feature_eng(tables[f], out_str,feature_model, sizesignal, samp_freq)
            
            
        out_str += "," + str(os.path.getsize(fl)) + "," + str(result) + "\n"
        #print(sequence_from_mat)
        #print(type(sequence_from_mat))
        seq1=int(sequence_from_mat[0][0][0][0])
        print('total preictal: ', total_pre,' total interictal: ', total_inter,' sequence local: ', seq1)
        if (total_pre % 6 == 0) and result == 1:
                pos1 += 1
                print('Positive ocurrence sequence finished', pos1)
                if (seq1==6):
                    sequence_id_pre += 1
                    print ('sequence preictal next',sequence_id_pre)
        
        if (total_inter % 6 == 0) and result == 0:                
                neg1 += 1
                print('Negative ocurrence sequence finished', neg1)
                if (seq1==6):
                    sequence_id_inter += 1
                    print ('sequence interictal next',sequence_id_inter)

    out.write(out_str)
    
    out.close()
    print('Train CSV for patient {} has been completed...'.format(patient_id))


def create_simple_csv_test(patient_id, feature_model, num_features, short_dataset=False, new_test=False):
    
    feature_file='_'+str(feature_model)+'_short_'+str(short_dataset)+'_new_test_'+str(new_test)
    
    if short_dataset:
        
        source_dir="./data/test_"
    else:
        source_dir="./test_"
    
    new_label=''
    
    if new_test:
        
        new_label="_new"

    # TEST
    out_str = ''
    files = sorted(glob.glob(source_dir + str(patient_id) + new_label + "/*.mat"), key=natural_key)
    print ('test files'+ str(patient_id), len(files))    
    out = open("simple_test_" + str(patient_id) + feature_file + ".csv", "w")
    out.write("Id,patient_id,")
    
    columns=''
    for i in range(16):
        for j in range(num_features):
            columns+= 'ch_'+str(i)+'_'+"band_"+str(j)+","        
    
    out.write(columns+"file_size\n")
    
        
    for fl in files:
        # print('Go for ' + fl)
        id_str = os.path.basename(fl)[4:-4]
        arr = id_str.split("_")
        patient = int(arr[0])
        id = int(arr[1])
        new_id = int(patient*100000 + id)
        try:
            tables, sequence_from_mat, samp_freq = mat_to_pandas(fl)
        except:
            print('Some error here {}...'.format(fl))
            continue
        out_str += str(new_id) + "," + str(patient)

        sizesignal=int(tables.shape[0])           
        
        
        
        
        for f in sorted(list(tables.columns.values)):
            
            out_str=feature_eng(tables[f], out_str,feature_model, sizesignal, samp_freq)
                        
        out_str += "," + str(os.path.getsize(fl)) + "\n"
        # break

    out.write(out_str)
    out.close()
    print('Test CSV for patient {} has been completed...'.format(patient_id))


In [152]:

def create_subset_train(feature_model):
    
   
    
#    filestotal = sorted(glob.glob(source_dir + str(patient_id) + "/*.mat"), key=natural_key)

    
    folder_list=['train_1','train_2','train_3']
    
    
    
    for folder_label in folder_list:
        
        short_size=False
        
        patient_id=int(list(filter(str.isdigit, folder_label))[0])
        
        new_test=True      
        if len(folder_label)>7:
  
            var=6
            
        else:
            var=7
        
        files = pd.read_csv('simple_'+ folder_label[0:var]+'_'+str(feature_model)+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')

        files_index=pd.Series(files['sequence_id'], index=files['sequence_id'].index)

        
        files_unique =files.drop_duplicates(subset=['sequence_id'])
        
#       print(files_unique)
        

        files_1=files_unique[files_unique['result'] == 1]['sequence_id']
        files_0=files_unique[files_unique['result'] == 0]['sequence_id']
        
        print(files_1.shape)
        print(files_0.shape)
    
        n_samples_1=int(len(files_1)/10)
        n_samples_0=int(len(files_0)/10)
        

    
        files_1_rand=np.random.choice(files_1, size=n_samples_1)
        files_0_rand=np.random.choice(files_0, size=n_samples_0)
        
        
        files_1_rand_seq=files[files_index.isin(files_1_rand)]['Id']
        files_0_rand_seq=files[files_index.isin(files_0_rand)]['Id']
        
        for file in files_1_rand_seq:
        
            id_file=str(file % 1000)
            source_file='./'+folder_label+'/'+str(patient_id)+'_'+id_file+'_1.mat'
            target_file='./data_random/' + folder_label+'/'+str(patient_id)+'_'+id_file+'_1.mat'
            print(target_file)
    
            shutil.copyfile(source_file, target_file) 

        for file in files_0_rand_seq:
        
            id_file=str(file % 1000)
            source_file='./'+folder_label+'/'+str(patient_id)+'_'+id_file+'_0.mat'
            target_file='./data_random/' + folder_label+'/'+str(patient_id)+'_'+id_file+'_0.mat'
            print(target_file)
    
            shutil.copyfile(source_file, target_file)   
    

In [None]:
feature_model=3
short_size=False
num_features=6
new_test=True


create_subset_train(feature_model)

In [None]:


def create_subset_test(feature_model):
    
   
    
#    filestotal = sorted(glob.glob(source_dir + str(patient_id) + "/*.mat"), key=natural_key)

    
    folder_list=['test_1_new','test_2_new','test_3_new']
    
    
    
    for folder_label in folder_list:
        
        short_size=False
        
        patient_id=int(list(filter(str.isdigit, folder_label))[0])
        
        new_test=True      
        if len(folder_label)>7:
  
            var=6
            
        else:
            var=7
        
        files = pd.read_csv('simple_'+ folder_label[0:var]+'_'+str(feature_model)+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')

             

        files_total=files['Id']

    
        n_samples=int(len(files_total)/10)

    
        files_rand=np.random.choice(files_total, size=n_samples)

             
        
        for file in files_rand:
        
            id_file=str(file % 1000)
            source_file='./'+folder_label+'/new_'+str(patient_id)+'_'+id_file+'.mat'
            target_file='./data_random/' + folder_label+'/new_'+str(patient_id)+'_'+id_file+'.mat'
            print(target_file)
    
            shutil.copyfile(source_file, target_file) 


In [None]:
feature_model=3
short_size=False
num_features=6
new_test=True


create_subset_test(feature_model)

## Feature Engineering

In [3]:
#eng_number is the feature_value that has values 0,1,2,3... infinite, given by the list below.

def feature_eng(data_sensor, out_str, eng_number, sizesignal, fs):

                
    yf1 = fft(data_sensor)
    fftpeak=2/sizesignal * np.abs(yf1[0:sizesignal/2])
 
    numberofbands=4

    sizeband=20/numberofbands

    if eng_number==3:
        
    
        

        ##Frequency parameters##
        #Start frequency#
        fini = 7
        #End frequency#
        fend = 30
        #Frequency band range#
        frng = 4
        #Frequency overlap#
        fovr = 0
    
        #Frequency band generator#
        fbands = [[f, f + frng] for f in range(fini, fend - fovr, frng - fovr)]
    
        #Filter order#
        order = 5
        #Filter bandstop attenuation (dB)#
        attenuation = 20.0
        #Nyquist frequency#
        fnyq = fs / 2.0
        

        for fb in fbands:
        
            #Create butterworth bandpass filter#
            #b, a = butter(order, fb  / fnyq, btype='band')
            b, a = cheby2(order, attenuation, fb  / fnyq, btype='band')
            
            #Apply filter#
            data_filter = lfilter(b, a, data_sensor)
            
            #Band pass 'power'#
            band_pwr = np.square(data_filter)
            
            avg_band_pwr = band_pwr.mean()
            
            out_str += "," + str(avg_band_pwr)
            

    elif eng_number==2:
        
        mean = data_sensor.mean()
        
        peak1=fftpeak[0:3].mean()            
        peak2=fftpeak[3:6].mean()          
        peak3=fftpeak[6:9].mean()
        peak4=fftpeak[9:12].mean()
        peak5=fftpeak[12:15].mean()            
        peak6=fftpeak[15:18].mean()          
        peak7=fftpeak[18:21].mean()
        peak8=fftpeak[21:24].mean()
        peak9=fftpeak[24:27].mean()            
        peak10=fftpeak[27:30].mean()          
        peak11=fftpeak[30:33].mean()
        peak12=fftpeak[33:36].mean()
            
        out_str += "," + str(mean)+ "," + str(peak1) + "," + str(peak2) + "," + str(peak3) +"," + str(peak4) \
                    +"," + str(peak5) + "," + str(peak6) + "," + str(peak7) +"," + str(peak8)+ "," + str(peak9) \
                    +"," + str(peak10) + "," + str(peak11) +"," + str(peak12)
    
    elif eng_number==1:
            
        mean = data_sensor.mean()   
        
        peak1=fftpeak[0:5].mean()            
        peak2=fftpeak[5:10].mean()          
        peak3=fftpeak[10:15].mean()
        peak4=fftpeak[15:20].mean()
        
        out_str += "," + str(mean)+ "," + str(peak1) + "," + str(peak2) + "," + str(peak3) +"," + str(peak4)
    
    elif eng_number==0:
            
        mean = data_sensor.mean()
    
        out_str += "," + str(mean)
    
    return out_str

## Module for Reading Test and Train Feature Files and Concatenating

In [4]:


def read_test_train(feature_model, short_size=False, new_test=False):
    feature_file='_'+str(feature_model)
    print("Load train.csv...")
    train1 = pd.read_csv('simple_train_1'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    train2 = pd.read_csv('simple_train_2'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    train3 = pd.read_csv('simple_train_3'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    train = pd.concat([train1, train2, train3])
    # Remove all zeroes files
    train = train[train['file_size'] > 55000].copy()
    # Shuffle rows since they are ordered
    train = train.iloc[np.random.permutation(len(train))]
    # Reset broken index
    train = train.reset_index()
    print("Load test.csv...")
    test1 = pd.read_csv('simple_test_1'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    test2 = pd.read_csv('simple_test_2'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    test3 = pd.read_csv('simple_test_3'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    test = pd.concat([test1, test2, test3])
    print("Process tables...")
    features = get_features(train, test)
    return train, test, features
    

 

In [18]:


def read_test_train_per_patient(feature_model, short_size=False, new_test=False):
    feature_file='_'+str(feature_model)
    print("Load train.csv...")
    train1 = pd.read_csv('simple_train_1'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    train2 = pd.read_csv('simple_train_2'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    train3 = pd.read_csv('simple_train_3'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    #train = pd.concat([train1, train2, train3])
    # Remove all zeroes files
    train = train1[train1['file_size'] > 55000].copy()
    train = train2[train2['file_size'] > 55000].copy()
    train = train3[train3['file_size'] > 55000].copy()
    # Shuffle rows since they are ordered
    train1 = train1.iloc[np.random.permutation(len(train1))]
    train2 = train2.iloc[np.random.permutation(len(train2))]
    train3 = train3.iloc[np.random.permutation(len(train3))]
    # Reset broken index
    train1 = train1.reset_index()
    train2 = train2.reset_index()
    train3 = train3.reset_index()
    print("Load test.csv...")
    test1 = pd.read_csv('simple_test_1'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    test2 = pd.read_csv('simple_test_2'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    test3 = pd.read_csv('simple_test_3'+feature_file+'_short_'+str(short_size)+'_new_test_'+str(new_test)+'.csv')
    #test = pd.concat([test1, test2, test3])
    print("Process tables...")
    features1 = get_features(train1, test1)
    features2 = get_features(train2, test2)
    features3 = get_features(train3, test3)
    return [[train1, test1, features1],[train2, test2, features2],[train3, test3, features3]]
    

 

## Running Creation of Feature Files (!!!)

In [7]:

feature_model=3
short_size=False
num_features=6
new_test=True


if __name__ == '__main__':
    print('XGBoost: {}'.format(xgb.__version__))
    if 1:
        # Do reading and processing of MAT files in parallel
        p = dict()
#        p[1] = Process(target=create_simple_csv_train, args=(1,feature_model,num_features,short_size,new_test))
#        p[1].start()
#        p[2] = Process(target=create_simple_csv_train, args=(2,feature_model,num_features,short_size,new_test))
#        p[2].start()
#       p[3] = Process(target=create_simple_csv_train, args=(3,feature_model,num_features,short_size,new_test))
#        p[3].start()
#       p[4] = Process(target=create_simple_csv_test, args=(1,feature_model,num_features,short_size,new_test))
#        p[4].start()
#        p[5] = Process(target=create_simple_csv_test, args=(2,feature_model,num_features,short_size,new_test))
#        p[5].start()
        p[6] = Process(target=create_simple_csv_test, args=(3,feature_model,num_features,short_size,new_test))
        p[6].start()
#        p[1].join()
#        p[2].join()
#        p[3].join()
#        p[4].join()
#        p[5].join()
        p[6].join()

#create_simple_csv_test(1,feature_model,num_features,short_size,new_test)

XGBoost: 0.6
test files3 690




Test CSV for patient 3 has been completed...


## Reading Feature Files

In [8]:
feature_model=3
short_size=False
num_features=6
new_test=True

if __name__ == '__main__':
    print('XGBoost: {}'.format(xgb.__version__))
    
    train, test, features = read_test_train(feature_model, short_size, new_test)
    print('Length of train: ', len(train))
    print('Length of test: ', len(test))
    print('Features [{}]: {}'.format(len(features), sorted(features)))
    
#   print ('train',train['sequence_id'])

XGBoost: 0.6
Load train.csv...
Load test.csv...
Process tables...
Length of train:  4698
Length of test:  1908
Features [98]: ['ch_0_band_0', 'ch_0_band_1', 'ch_0_band_2', 'ch_0_band_3', 'ch_0_band_4', 'ch_0_band_5', 'ch_10_band_0', 'ch_10_band_1', 'ch_10_band_2', 'ch_10_band_3', 'ch_10_band_4', 'ch_10_band_5', 'ch_11_band_0', 'ch_11_band_1', 'ch_11_band_2', 'ch_11_band_3', 'ch_11_band_4', 'ch_11_band_5', 'ch_12_band_0', 'ch_12_band_1', 'ch_12_band_2', 'ch_12_band_3', 'ch_12_band_4', 'ch_12_band_5', 'ch_13_band_0', 'ch_13_band_1', 'ch_13_band_2', 'ch_13_band_3', 'ch_13_band_4', 'ch_13_band_5', 'ch_14_band_0', 'ch_14_band_1', 'ch_14_band_2', 'ch_14_band_3', 'ch_14_band_4', 'ch_14_band_5', 'ch_15_band_0', 'ch_15_band_1', 'ch_15_band_2', 'ch_15_band_3', 'ch_15_band_4', 'ch_15_band_5', 'ch_1_band_0', 'ch_1_band_1', 'ch_1_band_2', 'ch_1_band_3', 'ch_1_band_4', 'ch_1_band_5', 'ch_2_band_0', 'ch_2_band_1', 'ch_2_band_2', 'ch_2_band_3', 'ch_2_band_4', 'ch_2_band_5', 'ch_3_band_0', 'ch_3_band_1

## Module for Training and Prediction

In [106]:
def run_train_predict(nfolds, train, test, features, target, random_state=2016, mode=0, PCAkey=False, PCAgraph=False,
                      PCAkeyGS=False, SEQoriginal=False,
                     Oversampling=False, GridSearch=False, pred_per_patient=False):
    
    function_params = OrderedDict()
    function_params["nfolds"]=nfolds
    function_params["random state"]= random_state
    function_params["PCAkey"] = PCAkey
    function_params["PCAgraph"]= PCAgraph
    function_params["PCAkeyGS"]= PCAkeyGS
    function_params["SEQoriginal"]= SEQoriginal
    function_params["Oversampling"]= Oversampling
    function_params["GridSearch"]= GridSearch

    
    train_index_group=[]
    test_index_group=[]
    
    
    unique_seq = train.drop_duplicates(subset=['sequence_id'])
    unique_seq_y = unique_seq['result'].values
    
    print('unique seq y', len(unique_seq_y) )
    
    n_samples=len(unique_seq_y)
    print('length',n_samples)
    unique_seq_X = np.zeros(n_samples)
    
    print('unique seq X', len(unique_seq_X)  )
    
    
    print('train pre', train.shape) 

    yfull_train = dict()
    yfull_test = copy.deepcopy(test[['Id']].astype(object))

    unique_sequences = np.array(train['sequence_id'].unique())
    print('unique sequences pre', unique_sequences.shape)

    groups1=np.fix(unique_sequences/1000)
    
    groups2=groups1.astype(int)
#    print('groups', groups2)
    

    
    if SEQoriginal:
        sequences_full=np.mod(train['sequence_id'].values,1000)
        print('sequences full', sequences_full.shape)
        unique_sequences2=np.mod(unique_sequences,1000)
        unique_sequences_fold=pd.Series(sequences_full, index=train['sequence_id'].index)
#        print('unique_sequences_fold', unique_sequences_fold)
    
        unique_sequences = np.unique(unique_sequences2)
        print('unique sequences pre', unique_sequences.shape)

    else:
        unique_sequences_fold=pd.Series(train['sequence_id'], index=train['sequence_id'].index)

    gkf = GroupKFold(n_splits=3)
    test1=gkf.split(unique_sequences, groups=groups2)
    test2=gkf.split(unique_sequences, groups=groups2)
    
    #random_state=random_state
    print('unique sequences', unique_sequences.shape)
#    splitKF = KFold(len(unique_sequences), n_folds=nfolds, shuffle=True, random_state=random_state)
#    kf = NewKF(n_splits=nfolds, shuffle=True, random_state=random_state)
    kf = StratKF(n_splits=nfolds, shuffle=True, random_state=random_state)
    
    num_fold = 0
    num_fold1=0
    
    
    num_boost_round = 1000
    early_stopping_rounds = 50
    
    
    eta = 0.1
    max_depth = 4
    subsample = 0.9
    colsample_bytree = 0.9
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    
    params = OrderedDict()
    params["objective"]= "binary:logistic"
    params["booster"]= "gbtree"
    params["eval_metric"]= "auc"
    params["eta"] = eta
    params["tree_method"]='exact'
    params["max_depth"]= max_depth
    params["subsample"] =subsample
    params["colsample_bytree"]= colsample_bytree
    params["silent"] =1
    params["seed"] =random_state
    params["gamma"] =0.1
    params["min_child_weight"] =2
    params["scale_pos_weight"]=2
    params["seed"]=27

    xgboost_to_xgb={
    
    "learning_rate" : "eta",
    "reg_alpha" : "alpha",
   
    "reg_lambda" : "lambda" }

    print('xgboost to xgb', xgboost_to_xgb)
    
#   Using best parameters to train model 

    if GridSearch:
    
        splitKF=kf.split(unique_seq_X, unique_seq_y)
    
        best_param=param_search_embedded(nfolds, features, target, splitKF, unique_sequences, 
                                         unique_sequences_fold, train, PCAkeyGS)
    
        print('after best_param', best_param)
        
        
        for key in best_param:
            if key in xgboost_to_xgb:   
                best_param[xgboost_to_xgb[key]]=best_param[key]
                del best_param[key]
                
        print ('substitution', best_param)
        params={key : best_param.get(key, value) for key, value in params.items()}
        
        print (params)
                        

    
    for train_seq_index, test_seq_index in kf.split(unique_seq_X, unique_seq_y):
        num_fold += 1
        print('Start fold {} from {}'.format(num_fold, nfolds))
        train_seq = unique_sequences[train_seq_index]
        valid_seq = unique_sequences[test_seq_index]
        print('Length of train people: {}'.format(len(train_seq)))
        print('Length of valid people: {}'.format(len(valid_seq)))
        
#        print('train_seq',train_seq)
#        print('valid_seq',valid_seq)

        X_train, X_valid = train[unique_sequences_fold.isin(train_seq)][features], train[unique_sequences_fold.isin(valid_seq)][features]
        y_train, y_valid = train[unique_sequences_fold.isin(train_seq)][target], train[unique_sequences_fold.isin(valid_seq)][target]
        X_test = test[features]
        
#        print('X_train',X_train)
#        print('y_train',y_train)

        print('Length train:', len(X_train))
        print('Length valid:', len(X_valid))
        
#       Scaling for PCA


        scaler = MinMaxScaler()   
        
        Xtrain_scaled=pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        Xvalid_scaled=pd.DataFrame(scaler.fit_transform(X_valid), columns=X_valid.columns, index=X_valid.index )

        Xtest_scaled=pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)


        if PCAgraph:
            
            pcatest=KernelPCA(n_components=20)
            pcatest.fit(Xtrain_scaled)
            var1=np.cumsum(np.round(pcatest.explained_variance_ratio_, decimals=4)*100)
            f1 = plt.figure()
            print(var1)
            plt.plot(var1)
            plt.show()

    
        if PCAkey:      
        
#       PCA transformation 
            pcatest=PCA(n_components=20)
            X_train_f=pd.DataFrame(pcatest.fit_transform(Xtrain_scaled), index=Xtrain_scaled.index)
            X_valid_f=pd.DataFrame(pcatest.fit_transform(Xvalid_scaled), index=Xvalid_scaled.index)

            X_test_f=pd.DataFrame(pcatest.fit_transform(Xtest_scaled), index=Xtest_scaled.index)

        else:
            X_train_f=X_train
            X_valid_f=X_valid
            X_test_f=X_test
                
        
        y_train_f=y_train
        y_valid_f=y_valid
    
#       SMOTE oversampling
        
#        print('Original dataset shape {}'.format(Counter(y_train)))
#        print('Original dataset shape {}'.format(Counter(X_train_f)))
#        print(X_train_f)
#        print(y_train)

        if Oversampling:
        
            sm1 = SMOTETomek(random_state=42)
            X_res,y_res = sm1.fit_sample(X_train_f,y_train)
            X_train_f=pd.DataFrame(X_res, columns=X_train_f.columns)
            y_train=pd.Series(y_res)
        
#        print('Resampled dataset shape {}'.format(Counter(y_train)))
#        print(X_train_f)
#        print(y_train)
    
    
        
#       Preparation for XGB training

        dtrain = xgb.DMatrix(X_train_f, y_train)
        dvalid = xgb.DMatrix(X_valid_f, y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]       
        
        gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=500)

        yhat = gbm.predict(xgb.DMatrix(X_valid_f), ntree_limit=gbm.best_iteration+1)

#       Each time store portion of precicted data in train predicted values

        for i in range(len(X_valid_f.index)):
            yfull_train[X_valid_f.index[i]] = yhat[i]
            
        print("Validating...")
        check = gbm.predict(xgb.DMatrix(X_valid_f), ntree_limit=gbm.best_iteration+1)
        score = roc_auc_score(y_valid.tolist(), check)
        print('Check error value: {:.6f}'.format(score))

        print("Predict test set...")
        test_prediction1 = gbm.predict(xgb.DMatrix(X_test_f), ntree_limit=gbm.best_iteration+1)
        yfull_test['kfold_' + str(num_fold)] = test_prediction1
        
              

    print('iteration finished')
    # Copy dict to list
    train_res = []
    
    for i in range(len(train.index)):
        train_res.append(yfull_train[i])

    score = roc_auc_score(train[target], np.array(train_res))
    print('Check error value: {:.6f}'.format(score))

    # Find mean for KFolds on test
    merge = []
    for i in range(1, nfolds+1):
        merge.append('kfold_' + str(i))
    yfull_test['mean'] = yfull_test[merge].mean(axis=1)
    
    print()

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))


    #Pred_per_patient currently in development. Is deactivated and should not be used.
    if pred_per_patient:
    
        total_results=yfull_test['mean'].values
        hist, bins = np.histogram(total_results, bins=50)
        width = 0.7 * (bins[1] - bins[0])
        center = (bins[:-1] + bins[1:]) / 2
        plt.bar(center, hist, align='center', width=width)
        plt.show()
    
    
        total_results_train=np.array(train_res)
        hist, bins = np.histogram(total_results_train, bins=50)
        width = 0.7 * (bins[1] - bins[0])
        center = (bins[:-1] + bins[1:]) / 2
        plt.bar(center, hist, align='center', width=width)
        plt.show()
        
    #Saving module and xgboost parameters to JSON file
    
    xgboost_params=params
    
    parameters=[]
    
    parameters.append(xgboost_params)
    parameters.append(function_params)
    
    now = datetime.datetime.now()
    
    parameter_file_name=str('parameter-file-'+'mode-'+str(mode)+'-'+now.strftime("%Y-%m-%d-%H-%M"))
    
    json.dump(parameters, open(parameter_file_name+".txt",'w'), indent=4)
#    read_params = json.load(open(parameter_file_name+".txt"), object_pairs_hook=OrderedDict)
    
#    print (read_params)
    
#    print('yfull_test shape', yfull_test.shape)
    
    return yfull_test['mean'].values, score
    
    

## Module for Grid Search

### This is not currently being used. Will probably be deleted in future versions.

In [None]:
def run_param_search(nfolds, train, test, features, target, random_state=2016,  PCAkey=False, SEQoriginal=False):

    num_boost_round = 1000
    early_stopping_rounds = 50
    
    train_index_group=[]
    test_index_group=[]
    

    
    yfull_train = dict()
    yfull_test = copy.deepcopy(test[['Id']].astype(object))
    print('train sequences',train['sequence_id'])
    
    
    unique_sequences = np.array(train['sequence_id'].unique())
#    print('unique sequences', unique_sequences, len(unique_sequences))
    
    groups1=np.fix(unique_sequences/1000)
    
    groups2=groups1.astype(int)
#    print('groups', groups2)
    
    gkf = GroupKFold(n_splits=3)
    test1=gkf.split(unique_sequences, groups=groups2)
    test2=gkf.split(unique_sequences, groups=groups2)
    
    
    if SEQoriginal:
        sequences_full=np.mod(train['sequence_id'].values,1000)
        unique_sequences2=np.mod(unique_sequences,1000)
        unique_sequences_fold=pd.Series(sequences_full, index=train['sequence_id'].index)
#        print('unique_sequences_fold', unique_sequences_fold)
    
        unique_sequences = np.unique(unique_sequences2)

    else:
        unique_sequences_fold=pd.Series(train['sequence_id'], index=train['sequence_id'].index)

    
    kf = KFold(len(unique_sequences), n_folds=nfolds, shuffle=True, random_state=random_state)
    num_fold = 0
    num_fold1=0
    
    
    
    
#   param_test1 = {'max_depth': [1,3,5]}
#   param_test1 = {'max_depth': [1,3,5,7,9], 'min_child_weight':[1,3,5,7]}
#   param_test1 = {'gamma':[i/10.0 for i in range(0,7)]}
#   param_test1 = { 'subsample':[i/10.0 for i in range(6,10)],'colsample_bytree':[i/10.0 for i in range(6,10)]}  
#   param_test1 = {'max_depth': [1,3], 'min_child_weight':[6,7,8,9,10]}
#    param_test1 = {'scale_pos_weight':[1,2,3,4,5], 'max_delta_step':[0,1,2,3,4,5]}

    for train_seq_index1, test_seq_index1 in kf:
        num_fold1 += 1
        print('this is creation of Kfold iterator')
        print('Start fold {} from {}'.format(num_fold1, nfolds))
    
        train_seq1 = unique_sequences[train_seq_index1]
        valid_seq1 = unique_sequences[test_seq_index1]
        
        print(train_seq1)
        print(valid_seq1)

        train_index = train[unique_sequences_fold.isin(train_seq1)].index.values
        test_index = train[unique_sequences_fold.isin(valid_seq1)].index.values

        print(train_index, type(train_index))
        print(test_index, type(test_index))
       
        train_index_group.append(train_index)
        test_index_group.append(test_index)
        
    
    print('train index group',train_index_group)

    custom_cv = [(train_index_group[i], test_index_group[i]) for i in range(0,3) ]
    
#    custom_cv=GroupShuffleSplit(n_splits=nfolds, test_size=0.5, random_state=0)

#    custom_cv = list(zip(train_index_group, test_index_group))

    print('custom cv', custom_cv)
                   

#    Scaling and PCA

    if PCAkey:

        scaler1 = MinMaxScaler() 
    
        train_features=train[features]
        train_target=train[target]
            
        train_scaled=pd.DataFrame(scaler1.fit_transform(train_features), columns=train_features.columns, index=train_features.index)


        pcatest=KernelPCA(kernel='poly')
        train_features_f=pd.DataFrame(pcatest.fit_transform(train_scaled), index=train_scaled.index)

        dmfeatures=train_features_f
        dmtarget=train_target        

    else:
    
        dmfeatures=train[features]
        dmtarget=train[target]
    
#   GridSearch
    
    classifier1=XGBClassifier( learning_rate =0.2, n_estimators=1000, max_depth=1,
        min_child_weight=3, gamma=0, subsample=0.6, colsample_bytree=0.8,
        objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

    gsearch1 = GridSearchCV(estimator = classifier1, param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=custom_cv)
  
    
#    gsearch1.fit(dmfeatures,dmtarget,groups=train['sequence_id'])
    gsearch1.fit(dmfeatures,dmtarget)

    print('best parameters, scores')    
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
#    classifier2 = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=500)



## Embedded Grid Search

### Used by Module for Training and Prediction

In [103]:
def param_search_embedded(n_folds,features, target, kf, unique_sequences, unique_sequences_fold, train, 
                          PCAkey=False):

    num_boost_round = 1000
    early_stopping_rounds = 50
    
    train_index_group=[]
    test_index_group=[]
    
    num_fold1=0
    

    param_test1={}
    param_test1['learning_rate']= [i/10.0 for i in range(1,5)]
    param_test1['max_depth']= [1,2,3,4,5,6,7]
    param_test1['min_child_weight']=[1,2,3,4,5]
    param_test1['gamma']=[i/10.0 for i in range(0,7)]
#    param_test1['subsample']=[i/10.0 for i in range(6,10)]
#    param_test1['colsample_bytree']=[i/10.0 for i in range(6,10)]
#    param_test1['scale_pos_weight']=[1,2,3,4,5]
#    param_test1['max_delta_step']=[0,1,2,3,4,5]

    for train_seq_index1, test_seq_index1 in kf:
        num_fold1 += 1
        print('this is creation of Kfold iterator')
        print('Start fold {} from {}'.format(num_fold1, n_folds))
    
        train_seq1 = unique_sequences[train_seq_index1]
        valid_seq1 = unique_sequences[test_seq_index1]
        
#        print(train_seq1)
#        print(valid_seq1)

        train_index = train[unique_sequences_fold.isin(train_seq1)].index.values
        test_index = train[unique_sequences_fold.isin(valid_seq1)].index.values

#        print(train_index, type(train_index))
#        print(test_index, type(test_index))
       
        train_index_group.append(train_index)
        test_index_group.append(test_index)
        
    
#    print('train index group',train_index_group)

    custom_cv = [(train_index_group[i], test_index_group[i]) for i in range(0,n_folds) ]
    
#    custom_cv=GroupShuffleSplit(n_splits=nfolds, test_size=0.5, random_state=0)

#    custom_cv = list(zip(train_index_group, test_index_group))

#    print('custom cv', custom_cv)
                   

#    Scaling and PCA

    if PCAkey:

        scaler1 = MinMaxScaler() 
    
        train_features=train[features]
        train_target=train[target]
            
        train_scaled=pd.DataFrame(scaler1.fit_transform(train_features), columns=train_features.columns, index=train_features.index)


        pcatest=PCA(20)
        train_features_f=pd.DataFrame(pcatest.fit_transform(train_scaled), index=train_scaled.index)

        dmfeatures=train_features_f
        dmtarget=train_target        

    else:
    
        dmfeatures=train[features]
        dmtarget=train[target]
    
#   GridSearch

    print('start grid search')
    
    classifier1=XGBClassifier( learning_rate =0.2, n_estimators=1000, max_depth=4,
        min_child_weight=2, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
        objective= 'binary:logistic', nthread=4, scale_pos_weight=2, seed=27, max_delta_step=0)

    gsearch1 = GridSearchCV(estimator = classifier1, param_grid = param_test1, scoring='roc_auc',n_jobs=-1,iid=False, cv=custom_cv)
  
    
#    gsearch1.fit(dmfeatures,dmtarget,groups=train['sequence_id'])
    gsearch1.fit(dmfeatures,dmtarget)

    print('best parameters, scores')    
    print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
    
#    classifier2 = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=500)

    return gsearch1.best_params_

## Running Grid Search

In [None]:
run_param_search(3, train, test, features, 'result', SEQoriginal=True, PCAkey=False)

## Running Training, Prediction and Creating Submission

In [95]:
prediction, score = run_train_predict(4, train, test, features, 'result', mode=0, SEQoriginal=False, PCAkey=False, 
                                      PCAgraph=False, PCAkeyGS=False,
                                      Oversampling=False, GridSearch=True, pred_per_patient=False)

create_submission(score, test, prediction, feature_model, short_size, new_test)


unique seq y 120
length 120
unique seq X 120
train pre (719, 102)
unique sequences pre (120,)
unique sequences (120,)
XGBoost params. ETA: 0.1, MAX_DEPTH: 4, SUBSAMPLE: 0.9, COLSAMPLE_BY_TREE: 0.9
xgboost to xgb {'reg_alpha': 'alpha', 'reg_lambda': 'lambda', 'learning_rate': 'eta'}
this is creation of Kfold iterator
Start fold 1 from 4
this is creation of Kfold iterator
Start fold 2 from 4
this is creation of Kfold iterator
Start fold 3 from 4
this is creation of Kfold iterator
Start fold 4 from 4
start grid search
best parameters, scores
[mean: 0.71107, std: 0.04499, params: {'max_depth': 1}, mean: 0.70418, std: 0.04200, params: {'max_depth': 2}, mean: 0.72873, std: 0.04064, params: {'max_depth': 3}, mean: 0.74120, std: 0.04037, params: {'max_depth': 4}] {'max_depth': 4} 0.741200588528
after best_param {'max_depth': 4}
substitution {'max_depth': 4}
{'max_depth': 4, 'subsample': 0.9, 'tree_method': 'exact', 'eval_metric': 'auc', 'objective': 'binary:logistic', 'min_child_weight': 2, 's



Stopping. Best iteration:
[7]	train-auc:0.981118	eval-auc:0.78869

Validating...
Check error value: 0.788690
Predict test set...
Start fold 3 from 4
Length of train people: 90
Length of valid people: 30
Length train: 539
Length valid: 180
[0]	train-auc:0.907796	eval-auc:0.735243
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[0]	train-auc:0.907796	eval-auc:0.735243

Validating...
Check error value: 0.735243
Predict test set...
Start fold 4 from 4
Length of train people: 91
Length of valid people: 29
Length train: 545
Length valid: 174
[0]	train-auc:0.902143	eval-auc:0.732085
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[88]	train-auc:0.99914	eval-auc:0.816224

Validating...
Check error value: 0.816224
Predict test set...
iteration finished
Check er

## Running Grid Search, Training and Prediction per patient

In [104]:
feature_model=3
short_size=False
num_features=6
new_test=True
all_data=read_test_train_per_patient(feature_model=3, short_size=False, new_test=True)

#print(len(all_data), len(all_data[0]), all_data[0][1].shape)

prediction=[]
score=[]
for i, item in enumerate(all_data):
    
    # mode indicates '0':Global training (all patients); '1' training and testing on patient one;
    #'2' training and testing on patient two...
    mode=i+1
    
    temp1, temp2 = run_train_predict(4, item[0], item[1], item[2], 'result', mode=mode, SEQoriginal=False, 
                                        PCAkey=False, 
                                        PCAgraph=False, PCAkeyGS=False,
                                        Oversampling=False, GridSearch=True, pred_per_patient=False)
    prediction.append(temp1)
    score.append(temp2)

# Concatenation of AUC scores per patient

predict_total=np.concatenate((prediction[0],prediction[1],prediction[2]))
score_total=sum(score)/len(score)
test=pd.concat([all_data[0][1],all_data[1][1],all_data[2][1]])
    
    
create_submission(score_total, test, predict_total, feature_model, short_size, new_test)


Load train.csv...
Load test.csv...
Process tables...
unique seq y 120
length 120
unique seq X 120
train pre (719, 102)
unique sequences pre (120,)
unique sequences (120,)
XGBoost params. ETA: 0.1, MAX_DEPTH: 4, SUBSAMPLE: 0.9, COLSAMPLE_BY_TREE: 0.9
xgboost to xgb {'reg_alpha': 'alpha', 'reg_lambda': 'lambda', 'learning_rate': 'eta'}
this is creation of Kfold iterator
Start fold 1 from 4
this is creation of Kfold iterator
Start fold 2 from 4
this is creation of Kfold iterator
Start fold 3 from 4
this is creation of Kfold iterator
Start fold 4 from 4
start grid search
best parameters, scores
[mean: 0.73703, std: 0.03936, params: {'min_child_weight': 1, 'max_depth': 1, 'gamma': 0.0, 'learning_rate': 0.1}, mean: 0.73677, std: 0.03633, params: {'min_child_weight': 2, 'max_depth': 1, 'gamma': 0.0, 'learning_rate': 0.1}, mean: 0.73888, std: 0.03478, params: {'min_child_weight': 3, 'max_depth': 1, 'gamma': 0.0, 'learning_rate': 0.1}, mean: 0.73061, std: 0.03641, params: {'min_child_weight': 4



Stopping. Best iteration:
[7]	train-auc:0.998311	eval-auc:0.815394

Validating...
Check error value: 0.815394
Predict test set...
Start fold 2 from 4
Length of train people: 90
Length of valid people: 30
Length train: 539
Length valid: 180
[0]	train-auc:0.968684	eval-auc:0.679205
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[1]	train-auc:0.991389	eval-auc:0.770255

Validating...
Check error value: 0.770255
Predict test set...
Start fold 3 from 4
Length of train people: 90
Length of valid people: 30
Length train: 539
Length valid: 180
[0]	train-auc:0.970387	eval-auc:0.706597
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[37]	train-auc:0.998691	eval-auc:0.778164

Validating...
Check error value: 0.778164
Predict test set...
Start fold 4 from 4
Lengt



Stopping. Best iteration:
[41]	train-auc:0.999771	eval-auc:0.789219

Validating...
Check error value: 0.789219
Predict test set...
Start fold 2 from 4
Length of train people: 248
Length of valid people: 83
Length train: 1488
Length valid: 498
[0]	train-auc:0.948218	eval-auc:0.667869
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[5]	train-auc:0.992026	eval-auc:0.706109

Validating...
Check error value: 0.706109
Predict test set...
Start fold 3 from 4
Length of train people: 249
Length of valid people: 82
Length train: 1494
Length valid: 492
[0]	train-auc:0.92301	eval-auc:0.702211
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[86]	train-auc:1	eval-auc:0.74933

Validating...
Check error value: 0.749330
Predict test set...
Start fold 4 from 4
Length of



Stopping. Best iteration:
[366]	train-auc:0.999987	eval-auc:0.875942

Validating...
Check error value: 0.875942
Predict test set...
Start fold 2 from 4
Length of train people: 257
Length of valid people: 86
Length train: 1542
Length valid: 516
[0]	train-auc:0.945218	eval-auc:0.668345
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[201]	train-auc:0.999988	eval-auc:0.850174

Validating...
Check error value: 0.850174
Predict test set...
Start fold 3 from 4
Length of train people: 258
Length of valid people: 85
Length train: 1548
Length valid: 510
[0]	train-auc:0.954501	eval-auc:0.700539
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[125]	train-auc:1	eval-auc:0.857859

Validating...
Check error value: 0.857859
Predict test set...
Start fold 4 from 4
Len

In [105]:
#train, test, features = all_data[0][0],all_data[0][1],all_data[0][2]

print(score)

[0.74280584010361483, 0.73675744371822804, 0.7845230607966458]


## Modules for PCA Graphic Analysis (in progress)

In [None]:
import seaborn as sns

sns.set()


almost_black = '#262626'
palette = sns.color_palette()

f, (ax1, ax2) = plt.subplots(1, 2)

ax1.scatter(X_vis[y == 0, 0], X_vis[y == 0, 1], label="Class #0", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[0], linewidth=0.15)
ax1.scatter(X_vis[y == 1, 0], X_vis[y == 1, 1], label="Class #1", alpha=0.5,
            edgecolor=almost_black, facecolor=palette[2], linewidth=0.15)
ax1.set_title('Original set')

ax2.scatter(X_res_vis[y_resampled == 0, 0], X_res_vis[y_resampled == 0, 1],
            label="Class #0", alpha=.5, edgecolor=almost_black,
            facecolor=palette[0], linewidth=0.15)
ax2.scatter(X_res_vis[y_resampled == 1, 0], X_res_vis[y_resampled == 1, 1],
            label="Class #1", alpha=.5, edgecolor=almost_black,
            facecolor=palette[2], linewidth=0.15)
ax2.set_title('SMOTE svm')



plt.show()

In [None]:
y=train['result'].values
y_resampled=y
        
#       PCA transformation 
pcatest=PCA(n_components=4)
X_test=pcatest.fit_transform(train[features])
X_vis=X_test[:,0:2]
X_res_vis=X_test[:,2:4]

print(X_vis.shape,X_res_vis.shape, X_test.shape)