In [3]:
%matplotlib inline
import datetime
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.cross_validation import KFold

from sklearn.metrics import roc_auc_score
from scipy.io import loadmat
from operator import itemgetter
import random
import os
import time
import glob
import re
from multiprocessing import Process
import copy
from scipy.fftpack import fft

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
#%matplotlib inline
from matplotlib.pylab import rcParams

from pandas.tools.plotting import scatter_matrix

from sklearn.preprocessing import scale

from sklearn.decomposition import PCA




# Default (width, height) applied to every matplotlib figure in this notebook.
rcParams['figure.figsize'] = 12, 4


# Seed Python's and NumPy's global random generators with a fixed value so runs
# are repeatable; NumPy's generator is what read_test_train() uses to shuffle
# the training rows.
random.seed(2016)
np.random.seed(2016)


def natural_key(string_):
    """Return a sort key that orders embedded numbers numerically.

    The string is split on runs of digits; each digit run becomes an ``int``
    and every other piece stays a ``str``, so ``"a2"`` sorts before ``"a10"``.
    """
    key = []
    for piece in re.split(r'(\d+)', string_):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece)
    return key


def create_feature_map(features):
    """Write an XGBoost feature-map file named ``xgb.fmap`` in the working directory.

    One line is written per feature, in the form ``<index>\\t<name>\\tq``,
    where ``index`` is the feature's position in ``features``.

    Args:
        features: Iterable of feature names, in model column order.
    """
    # The context manager guarantees the handle is closed even if a write fails.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def get_importance(gbm, features):
    """Return ``(feature, score)`` pairs sorted by descending score.

    The feature map file ``xgb.fmap`` is (re)written from ``features`` first,
    then the scores are read from the booster via ``gbm.get_fscore``.
    """
    create_feature_map(features)
    scores = gbm.get_fscore(fmap='xgb.fmap')
    ranked = list(scores.items())
    ranked.sort(key=itemgetter(1), reverse=True)
    return ranked


def intersect(a, b):
    """Return the distinct elements present in both ``a`` and ``b`` as a list.

    The order of the returned list is whatever the underlying set yields.
    """
    left, right = set(a), set(b)
    common = left & right
    return list(common)


def print_features_importance(imp):
    """Print each importance entry as a score comment plus an ``output.remove`` line.

    ``imp`` is a sequence of entries whose item 0 is the feature name and
    item 1 its score; the printed text can be pasted into code to drop features.
    """
    for entry in imp:
        score_line = "# " + str(entry[1])
        remove_line = 'output.remove(\'' + entry[0] + '\')'
        print(score_line)
        print(remove_line)


def mat_to_pandas(path):
    """Load a ``.mat`` file and return its ``dataStruct`` samples as a DataFrame.

    Returns:
        A ``(frame, sequence)`` tuple. ``frame`` holds the struct's ``data``
        field with columns labelled by the first row of ``channelIndices``.
        ``sequence`` is the struct's ``sequence`` field exactly as loaded
        (still nested in arrays), or ``-1`` when the struct has no such field.
    """
    struct = loadmat(path)['dataStruct']
    field_names = struct.dtype.names

    # Unwrap every field from the 1x1 struct array.
    fields = {}
    for name in field_names:
        fields[name] = struct[name][0, 0]

    sequence = struct['sequence'] if 'sequence' in field_names else -1
    frame = pd.DataFrame(fields['data'], columns=fields['channelIndices'][0])
    return frame, sequence
    



def create_simple_csv_train(patient_id):
    """Build ``simple_train_<patient_id>.csv`` from one patient's training recordings.

    Every ``./data/train_<patient_id>/*.mat`` file is visited in natural
    filename order and contributes one CSV row: id, current sequence id,
    patient, then for each channel column the signal mean and the mean FFT
    amplitude of bins 0-4, 5-9, 10-14 and 15-19, then the file size in bytes
    and the label. Patient, file id and label are parsed from the file name
    (``<patient>_<id>_<result>.mat``). Files that cannot be loaded are
    reported and skipped.

    The CSV header is written for 16 channels.

    Args:
        patient_id: Patient number used to build the input directory and the
            output file name.
    """
    # The context manager closes the CSV even if processing fails part-way.
    with open("simple_train_" + str(patient_id) + ".csv", "w") as out:
        out.write("Id,sequence_id,patient_id")
        for i in range(16):
            out.write(",avg_" + str(i) + ",peak1_" + str(i) + ",peak2_" + str(i) + ",peak3_" + str(i) +",peak4_" + str(i))
        out.write(",file_size,result\n")

        # TRAIN (0)
        files = sorted(glob.glob("./data/train_" + str(patient_id) + "/*.mat"), key=natural_key)
        print ('train files'+ str(patient_id), len(files))
        rows = []  # joined once at the end instead of quadratic string concatenation
        pos1 = 0
        neg1 = 0
        sequence_id = 0
        total = 0
        for fl in files:
            total += 1
            # print('Go for ' + fl)
            id_str = os.path.basename(fl)[:-4]
            arr = id_str.split("_")
            patient = int(arr[0])
            file_id = int(arr[1])
            result = int(arr[2])
            new_id = patient*100000 + file_id
            try:
                tables, sequence_from_mat = mat_to_pandas(fl)
            except Exception:
                # Unreadable recording: report and skip it. Catching Exception
                # (not a bare except) lets Ctrl-C / SystemExit still propagate.
                print('Some error here {}...'.format(fl))
                continue
            row = [str(new_id), str(sequence_id), str(patient)]

            sizesignal = int(tables.shape[0])
            # Integer division: a float slice bound is rejected by NumPy on Python 3.
            half = sizesignal // 2

            for f in sorted(list(tables.columns.values)):
                mean = tables[f].mean()

                yf1 = fft(tables[f])
                # Amplitude spectrum of the first half of the FFT output.
                fftpeak = 2.0/sizesignal * np.abs(yf1[0:half])

                # Mean amplitude over four consecutive 5-bin bands.
                peak1 = fftpeak[0:5].mean()
                peak2 = fftpeak[5:10].mean()
                peak3 = fftpeak[10:15].mean()
                peak4 = fftpeak[15:20].mean()

                row.extend([str(mean), str(peak1), str(peak2), str(peak3), str(peak4)])
            row.append(str(os.path.getsize(fl)))
            row.append(str(result))
            rows.append(",".join(row) + "\n")

            seq1 = int(sequence_from_mat[0][0][0][0])
            print(total, seq1)
            # A sequence is closed when the 6th segment arrives on a multiple-of-6 file count.
            if (total % 6 == 0) and (seq1==6):
                if result != 0:
                    pos1 += 1
                    print('Positive ocurrence sequence finished', pos1)
                else:
                    neg1 += 1
                    print('Negative ocurrence sequence finished', neg1)

                sequence_id += 1
                print ('sequence',sequence_id)

        out.write("".join(rows))

    print('Train CSV for patient {} has been completed...'.format(patient_id))


def create_simple_csv_test(patient_id):
    """Build ``simple_test_<patient_id>.csv`` from one patient's test recordings.

    Every ``./data/test_<patient_id>/*.mat`` file is visited in natural
    filename order and contributes one CSV row: id, patient, then for each
    channel column the signal mean and the mean FFT amplitude of bins 0-4,
    5-9, 10-14 and 15-19, then the file size in bytes. Patient and file id
    are parsed from the file name (``<patient>_<id>.mat``). Files that cannot
    be loaded are reported and skipped.

    The CSV header is written for 16 channels.

    Args:
        patient_id: Patient number used to build the input directory and the
            output file name.
    """
    # TEST
    files = sorted(glob.glob("./data/test_" + str(patient_id) + "/*.mat"), key=natural_key)
    print ('test files'+ str(patient_id), len(files))
    # The context manager closes the CSV even if processing fails part-way.
    with open("simple_test_" + str(patient_id) + ".csv", "w") as out:
        out.write("Id,patient_id")
        for i in range(16):
            out.write(",avg_" + str(i) + ",peak1_" + str(i) + ",peak2_" + str(i) + ",peak3_" + str(i) +",peak4_" + str(i))
        out.write(",file_size\n")

        rows = []  # joined once at the end instead of quadratic string concatenation
        for fl in files:
            # print('Go for ' + fl)
            id_str = os.path.basename(fl)[:-4]
            arr = id_str.split("_")
            patient = int(arr[0])
            file_id = int(arr[1])
            new_id = patient*100000 + file_id
            try:
                tables, sequence_from_mat = mat_to_pandas(fl)
            except Exception:
                # Unreadable recording: report and skip it. Catching Exception
                # (not a bare except) lets Ctrl-C / SystemExit still propagate.
                print('Some error here {}...'.format(fl))
                continue
            row = [str(new_id), str(patient)]

            sizesignal = int(tables.shape[0])
            # Integer division: a float slice bound is rejected by NumPy on Python 3.
            half = sizesignal // 2

            for f in sorted(list(tables.columns.values)):
                mean = tables[f].mean()

                yf1 = fft(tables[f])
                # Amplitude spectrum of the first half of the FFT output.
                fftpeak = 2.0/sizesignal * np.abs(yf1[0:half])

                # Mean amplitude over four consecutive 5-bin bands.
                peak1 = fftpeak[0:5].mean()
                peak2 = fftpeak[5:10].mean()
                peak3 = fftpeak[10:15].mean()
                peak4 = fftpeak[15:20].mean()

                row.extend([str(mean), str(peak1), str(peak2), str(peak3), str(peak4)])

            row.append(str(os.path.getsize(fl)))
            rows.append(",".join(row) + "\n")

        out.write("".join(rows))

    print('Test CSV for patient {} has been completed...'.format(patient_id))


def run_kfold(nfolds, train, test, features, target, random_state=2016):
    """Train one XGBoost model per fold and average their test predictions.

    Folds are drawn over the unique values of ``train['sequence_id']``, so all
    rows sharing a sequence id fall on the same side of every split. Each fold
    trains with early stopping on its validation part, stores the validation
    predictions as out-of-fold predictions, and predicts the test set.

    Args:
        nfolds: Number of cross-validation folds.
        train: Training frame containing ``sequence_id``, ``target`` and every
            column in ``features``. Its index labels must be unique because
            they key the out-of-fold predictions.
        test: Test frame containing ``Id`` and every column in ``features``.
        features: List of feature column names.
        target: Name of the label column in ``train``.
        random_state: Seed for the fold shuffle and for XGBoost.

    Returns:
        ``(prediction, score)``: the per-row mean of the fold test predictions
        as an array, and the ROC AUC of the out-of-fold predictions over the
        whole of ``train``.
    """
    eta = 0.2
    max_depth = 3
    subsample = 0.9
    colsample_bytree = 0.9
    start_time = time.time()

    print('XGBoost params. ETA: {}, MAX_DEPTH: {}, SUBSAMPLE: {}, COLSAMPLE_BY_TREE: {}'.format(eta, max_depth, subsample, colsample_bytree))
    params = {
        "objective": "binary:logistic",
        "booster" : "gbtree",
        "eval_metric": "auc",
        "eta": eta,
        "tree_method": 'exact',
        "max_depth": max_depth,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "silent": 1,
        "seed": random_state,
    }
    num_boost_round = 1000
    early_stopping_rounds = 50

    # Out-of-fold predictions keyed by the train index label of each row.
    yfull_train = dict()
    yfull_test = copy.deepcopy(test[['Id']].astype(object))

    unique_sequences = np.array(train['sequence_id'].unique())

    # The test matrix is identical for every fold, so build it once.
    X_test = test[features]
    dtest = xgb.DMatrix(X_test)

    kf = KFold(len(unique_sequences), n_folds=nfolds, shuffle=True, random_state=random_state)
    num_fold = 0
    for train_seq_index, test_seq_index in kf:
        num_fold += 1
        print('Start fold {} from {}'.format(num_fold, nfolds))
        train_seq = unique_sequences[train_seq_index]
        valid_seq = unique_sequences[test_seq_index]
        print('Length of train people: {}'.format(len(train_seq)))
        print('Length of valid people: {}'.format(len(valid_seq)))

        # Compute each row mask once and reuse it for features and labels.
        train_mask = train['sequence_id'].isin(train_seq)
        valid_mask = train['sequence_id'].isin(valid_seq)
        X_train, X_valid = train[train_mask][features], train[valid_mask][features]
        y_train, y_valid = train[train_mask][target], train[valid_mask][target]

        print('Length train:', len(X_train))
        print('Length valid:', len(X_valid))

        dtrain = xgb.DMatrix(X_train, y_train)
        dvalid = xgb.DMatrix(X_valid, y_valid)

        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
        gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, early_stopping_rounds=early_stopping_rounds, verbose_eval=1000)

        yhat = gbm.predict(xgb.DMatrix(X_valid), ntree_limit=gbm.best_iteration+1)

        # Each time store portion of predicted data in train predicted values
        yfull_train.update(zip(X_valid.index, yhat))

        print("Validating...")
        # The validation predictions were just computed; reuse them for the fold score.
        score = roc_auc_score(y_valid.tolist(), yhat)
        print('Check error value: {:.6f}'.format(score))

        imp = get_importance(gbm, features)
        print('Importance array: ', imp)

        print("Predict test set...")
        test_prediction1 = gbm.predict(dtest, ntree_limit=gbm.best_iteration+1)
        yfull_test['kfold_' + str(num_fold)] = test_prediction1

        print(test_prediction1)

    # Re-order the out-of-fold predictions to match the rows of ``train``.
    # Looking them up by index label (not by position 0..n-1) keeps this
    # correct when ``train`` does not carry a fresh 0..n-1 index.
    train_res = [yfull_train[label] for label in train.index]

    score = roc_auc_score(train[target], np.array(train_res))
    print('Check error value: {:.6f}'.format(score))

    # Find mean for KFolds on test
    merge = ['kfold_' + str(i) for i in range(1, nfolds+1)]
    yfull_test['mean'] = yfull_test[merge].mean(axis=1)

    print(yfull_test)

    print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2)))
    return yfull_test['mean'].values, score




def create_submission(score, test, prediction):
    """Write a submission CSV named after the score and the current time.

    The file ``submission_<score>_<YYYY-mm-dd-HH-MM>.csv`` is created in the
    working directory with a ``File,Class`` header and one
    ``<patient>_<file id>.mat,<prediction>`` line per entry of ``test['Id']``.
    Patient and file id are recovered from the combined id
    (``patient * 100000 + file id``).

    Args:
        score: Value embedded in the file name.
        test: Frame with an ``Id`` column.
        prediction: Indexable sequence of predictions, positionally aligned
            with ``test['Id']``.
    """
    # Make Submission
    now = datetime.datetime.now()
    sub_file = 'submission_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
    print('Writing submission: ', sub_file)
    # The context manager closes the file even if a row fails to format.
    with open(sub_file, 'w') as f:
        f.write('File,Class\n')
        for row, sample_id in enumerate(test['Id']):
            patient = sample_id // 100000
            fid = sample_id % 100000
            f.write(str(patient) + '_' + str(fid) + '.mat' + ',' + str(prediction[row]) + '\n')


def get_features(train, test):
    """Return the sorted column names shared by ``train`` and ``test``, minus ``Id``.

    Raises ``ValueError`` if ``Id`` is not a column of both frames.
    """
    train_columns = list(train.columns.values)
    test_columns = list(test.columns.values)
    shared = intersect(train_columns, test_columns)
    shared.remove('Id')
    # 'file_size' is deliberately left in as a feature.
    shared.sort()
    return shared


def read_test_train():
    """Load the per-patient CSVs and return ``(train, test, features)``.

    The three train CSVs are concatenated, rows whose ``file_size`` is not
    above 55000 are dropped, the remaining rows are shuffled with NumPy's
    global generator and the index is rebuilt (the old index is kept as an
    ``index`` column). The three test CSVs are concatenated as-is.
    ``features`` comes from ``get_features``.
    """
    print("Load train.csv...")
    train_parts = [
        pd.read_csv(name)
        for name in ("simple_train_1.csv", "simple_train_2.csv", "simple_train_3.csv")
    ]
    train = pd.concat(train_parts)
    # Remove all zeroes files
    big_enough = train['file_size'] > 55000
    train = train[big_enough].copy()
    # Shuffle rows since they are ordered
    shuffled_positions = np.random.permutation(len(train))
    train = train.iloc[shuffled_positions]
    # Reset broken index
    train = train.reset_index()

    print("Load test.csv...")
    test_parts = [
        pd.read_csv(name)
        for name in ("simple_test_1.csv", "simple_test_2.csv", "simple_test_3.csv")
    ]
    test = pd.concat(test_parts)

    print("Process tables...")
    return train, test, get_features(train, test)
    

 

In [4]:
#Creating files and reading them, ready for prediction.


if __name__ == '__main__':
    print('XGBoost: {}'.format(xgb.__version__))
    # Set to 0 to skip the (slow) MAT -> CSV extraction and reuse existing CSVs.
    if 1:
        # Do reading and processing of MAT files in parallel: one worker
        # process per (builder, patient) pair. Keys 1-3 are the train
        # builders, keys 4-6 the test builders.
        jobs = [
            (create_simple_csv_train, 1),
            (create_simple_csv_train, 2),
            (create_simple_csv_train, 3),
            (create_simple_csv_test, 1),
            (create_simple_csv_test, 2),
            (create_simple_csv_test, 3),
        ]
        p = dict()
        for key, (builder, patient_id) in enumerate(jobs, 1):
            p[key] = Process(target=builder, args=(patient_id,))
            p[key].start()
        for key in sorted(p):
            p[key].join()
            # A worker that died leaves a missing or incomplete CSV behind;
            # say so instead of silently loading it below.
            if p[key].exitcode != 0:
                print('WARNING: worker {} exited with code {}'.format(key, p[key].exitcode))
    train, test, features = read_test_train()
    print('Length of train: ', len(train))
    print('Length of test: ', len(test))
    print('Features [{}]: {}'.format(len(features), sorted(features)))
       


    

XGBoost: 0.6
test files3 2286
test files2 2256
test files1 1584
train files1 1302
train files2 2346
train files3 2394




1 1




1 1




1 1
2 1
2 1
2 1
3 2
3 2
4 2
4 2
5 3
3 2
6 3
5 3
7 4
4 2
6 3
5 3
8 4
6 3
7 4
8 4
9 5
7 4
9 5
10 5
8 4
10 5
11 6
9 5
11 6
12 6
Positive ocurrence sequence finished 1
sequence 1
13 1
14 1
12 6
Positive ocurrence sequence finished 1
sequence 1
10 5
15 2
13 1
16 2
11 6
14 1
17 3
18 3
15 2
12 6
Positive ocurrence sequence finished 1
sequence 1
19 4
16 2
20 4
13 1
17 3
21 5
22 5
23 6
14 1
18 3
24 6
Positive ocurrence sequence finished 2
sequence 2
25 1
19 4
26 1
15 2
20 4
27 2
28 2
16 2
21 5
29 3
30 3
22 5
17 3
31 4
23 6
18 3
32 4
24 6
Positive ocurrence sequence finished 2
sequence 2
25 1
19 4
33 5
26 1
34 5
20 4
27 2
35 6
36 6
Positive ocurrence sequence finished 3
sequence 3
28 2
21 5
29 3
22 5
37 1
30 3
38 1
23 6
31 4
39 2
32 4
40 2
24 6
Positive ocurrence sequence finished 2
sequence 2
33 5
41 3
34 5
25 1
26 1
35 6
42 3
36 6
Positive ocurrence sequence finished 3
sequence 3
27 2
37 1
43 4
38 1
44 4
28 2
39 2
45 5
46 5
40 2
29 3
47 6
41 3
30 3
42 3
48 6
Positive ocurrence sequence finishe

In [5]:
#prediction, score= modelfit(xgb1, newtrain, newtest, train, predictors,'result',test, useTrainCV=True)

# Cross-validate with 6 folds on the 'result' label. `train`, `test` and
# `features` come from the previous cell. Returns the fold-averaged test
# predictions and the ROC AUC of the out-of-fold train predictions.
prediction, score = run_kfold(6, train, test, features, 'result')

# Write the predictions to a submission CSV whose name embeds the score and a timestamp.
create_submission(score, test, prediction)
print('version 3acb')

XGBoost params. ETA: 0.2, MAX_DEPTH: 3, SUBSAMPLE: 0.9, COLSAMPLE_BY_TREE: 0.9
Start fold 1 from 6
Length of train people: 311
Length of valid people: 63
Length train: 4984
Length valid: 986
[0]	train-auc:0.586464	eval-auc:0.511574
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
Stopping. Best iteration:
[1]	train-auc:0.682976	eval-auc:0.592603

Validating...
Check error value: 0.592603
Importance array:  [('avg_3', 16), ('avg_4', 15), ('peak4_15', 12), ('peak1_12', 10), ('peak4_2', 9), ('peak1_0', 8), ('peak3_0', 8), ('peak1_13', 8), ('peak2_7', 8), ('peak1_15', 8), ('peak1_5', 8), ('avg_12', 8), ('peak1_10', 6), ('avg_1', 6), ('peak2_11', 6), ('peak3_2', 6), ('peak2_8', 6), ('peak4_7', 6), ('avg_8', 6), ('peak4_4', 6), ('peak3_4', 5), ('avg_11', 5), ('peak4_3', 5), ('peak2_3', 5), ('peak2_4', 5), ('peak3_6', 5), ('peak3_15', 5), ('avg_2', 5), ('avg_14', 5), ('peak4_14', 5), ('peak1_9', 5), ('