In [1]:
import settings
import numpy as np
import pandas as pd

In [2]:
def summary(data):
    """Print the five-number summary (min, Q1, median, Q3, max) of ``data``.

    Parameters
    ----------
    data : array-like of numbers
        Any non-empty numeric sequence (list, tuple, NumPy array, pandas
        Series, ...). It is converted with ``np.asarray`` first, so inputs
        without ``.min()`` / ``.max()`` methods, such as plain lists, work too.

    Raises
    ------
    ValueError
        If ``data`` contains no values.
    """
    values = np.asarray(data, dtype=float)
    if values.size == 0:
        raise ValueError("summary() requires at least one value")

    # quartiles: 25th, 50th (median) and 75th percentiles
    q1, median, q3 = np.percentile(values, [25, 50, 75])

    # print 5-number summary
    print('Min: %.3f' % values.min())
    print('Q1: %.3f' % q1)
    print('Median: %.3f' % median)
    print('Q3: %.3f' % q3)
    print('Max: %.3f' % values.max())

In [3]:
# generate a reproducible data sample: 1000 uniform draws from [0, 1).
# A fixed seed is used so the printed summary is the same on every run.
data = np.random.default_rng(42).random(1000)
summary(data)

Min: 0.001
Q1: 0.252
Median: 0.506
Q3: 0.753
Max: 0.997


In [4]:
# Load the training labels (columns: Id, Class) from the training data directory
labels_train = pd.read_csv(f'{settings.TRAIN_DATA_PATH}/train_labels.csv')
print(labels_train)

# Class encoding: 0 = Healthy, 1 = Schizophrenic

        Id  Class
0   120873      1
1   135376      0
2   139149      0
3   146791      0
4   153870      1
..     ...    ...
81  934330      0
82  950671      0
83  963924      1
84  993348      0
85  993946      1

[86 rows x 2 columns]


In [5]:
# Load the training FNC features (an Id column followed by FNC* feature columns)
# and preview the first rows
fnc_train = pd.read_csv(f'{settings.TRAIN_DATA_PATH}/train_FNC.csv')
fnc_train.head()

Unnamed: 0,Id,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
0,120873,0.34312,0.045761,-0.13112,0.15034,0.18082,0.28916,0.069545,-0.052489,0.124,...,0.18743,0.16377,0.17686,0.074728,0.1486,0.34323,0.17565,0.52035,0.030613,0.30774
1,135376,0.2879,0.10257,-0.32343,-0.22776,0.12328,0.36702,-0.08404,0.038793,-0.006287,...,0.59834,-0.47788,0.26634,0.58294,-0.041272,0.63005,0.70314,0.36383,-0.20788,0.64138
2,139149,0.24585,0.21662,-0.12468,-0.3538,0.1615,-0.002032,-0.13302,-0.035222,0.25904,...,0.22866,-0.000816,0.2586,0.28045,-0.64464,0.33244,0.87074,-0.10973,-0.67916,0.32316
3,146791,0.4209,0.33138,0.24453,0.17167,0.59223,0.43105,0.28029,0.28962,0.3568,...,0.42881,0.71157,0.56593,0.24856,0.15386,0.093171,0.62556,0.60391,-0.058301,0.10779
4,153870,-0.14621,-0.46863,-0.5288,-0.50381,-0.51052,-0.029113,-0.015192,0.36017,0.005944,...,0.41688,-0.4356,0.32104,0.39296,-0.84243,0.42508,0.86509,-0.31281,-0.82964,0.24307


In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the FNC training table
fnc_train.describe()

Unnamed: 0,Id,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,...,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,523359.174419,0.211394,0.04903,-0.091031,-0.102662,0.158572,0.042185,-0.103091,0.02482,0.12922,...,0.164315,0.197671,0.278935,0.129059,-0.176137,0.200409,0.553819,0.251976,-0.286359,0.156969
std,261225.75659,0.258132,0.270017,0.324109,0.320256,0.279851,0.289825,0.279123,0.24971,0.256124,...,0.367621,0.332431,0.247594,0.329516,0.436361,0.312319,0.256369,0.346178,0.358429,0.284371
min,120873.0,-0.39135,-0.67389,-0.73401,-0.77518,-0.51052,-0.58264,-0.63769,-0.52762,-0.46077,...,-0.87806,-0.49831,-0.41809,-0.74414,-0.95714,-0.62866,-0.002533,-0.81408,-0.92865,-0.50749
25%,280986.75,0.061147,-0.129993,-0.344335,-0.35183,-0.049274,-0.162627,-0.312198,-0.11587,-0.047829,...,-0.05913,-0.028957,0.116248,-0.089562,-0.491205,0.00286,0.31872,0.078512,-0.559255,-0.042158
50%,487524.0,0.238885,0.100597,-0.088119,-0.082341,0.194575,0.0521,-0.096169,0.024141,0.155995,...,0.24545,0.20814,0.31755,0.20668,-0.11127,0.21399,0.58069,0.30884,-0.2611,0.19686
75%,772918.5,0.401767,0.256295,0.107702,0.135475,0.37721,0.231305,0.083581,0.216883,0.300102,...,0.42461,0.421727,0.45631,0.36763,0.152545,0.43318,0.757125,0.499785,-0.020905,0.365418
max,993946.0,0.82024,0.7666,0.60102,0.73157,0.76395,0.60715,0.61237,0.61053,0.7292,...,0.90287,0.86536,0.83374,0.71621,0.65738,0.77866,0.95279,0.82871,0.59069,0.90597


In [7]:
# Load the training SBM features (an Id column followed by SBM_map* feature columns)
# and preview the first rows
sbm_train = pd.read_csv(f'{settings.TRAIN_DATA_PATH}/train_SBM.csv')
sbm_train.head()

Unnamed: 0,Id,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,120873,0.725065,-0.639254,0.353069,-0.981707,-1.419971,-0.441321,-0.264192,0.711579,0.641798,...,-0.07822,-0.982331,1.070363,0.220316,0.776855,-2.022404,1.203256,1.083516,0.564201,-0.002006
1,135376,-1.328855,0.50297,0.013232,1.128496,-0.070738,0.398476,-0.466051,-0.435455,0.342847,...,0.989571,-0.057543,0.371701,-0.513081,-0.45755,1.476216,0.381052,0.163436,0.403782,-0.295125
2,139149,0.732268,-1.241554,0.654942,-0.289216,0.158316,0.029165,1.439242,-0.832816,1.285468,...,-0.257114,0.597229,1.220756,-0.059213,-0.435494,-0.092971,1.09091,-0.448562,-0.508497,0.350434
3,146791,-0.343917,-1.052519,-1.150521,0.765989,0.923129,0.674052,-0.492673,1.282388,-1.914583,...,-0.29511,0.829697,-0.450726,-0.791032,-1.115821,-0.022296,-0.245844,-0.705539,0.061344,0.448966
4,153870,-0.208685,-0.562697,-0.362164,1.025571,0.15169,-0.13041,-1.105922,0.502029,-0.097735,...,1.342273,-0.978412,0.158492,0.889753,0.795368,0.738788,0.475415,2.340384,2.516038,-0.55144


In [8]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the SBM training table
sbm_train.describe()

Unnamed: 0,Id,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
count,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,...,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0,86.0
mean,523359.174419,0.134995,-0.094195,-0.076866,0.012471,0.073704,-0.004953,0.118802,0.106247,0.064476,...,0.151999,0.073282,0.047773,0.057342,0.0034,0.126982,-0.043296,-0.013557,0.041178,0.117573
std,261225.75659,0.883114,0.937444,0.882526,0.937842,0.94151,0.99426,0.926693,0.964946,1.017846,...,1.027941,0.971546,0.905877,0.913446,1.070912,1.003204,1.012973,1.019173,1.104815,0.925451
min,120873.0,-1.945161,-2.183338,-2.24121,-2.005569,-2.503497,-3.09961,-1.964108,-2.305639,-2.88043,...,-3.424498,-2.316457,-2.396483,-2.207213,-2.279469,-3.015051,-2.852879,-2.014501,-2.310918,-2.6771
25%,280986.75,-0.400377,-0.632557,-0.641487,-0.726047,-0.459417,-0.600037,-0.547326,-0.471092,-0.56742,...,-0.434299,-0.539833,-0.566438,-0.558446,-0.843822,-0.324,-0.776162,-0.854733,-0.529189,-0.381665
50%,487524.0,0.087173,-0.118148,-0.141126,-0.03435,0.128777,0.028739,0.107167,0.172475,0.178142,...,0.162752,0.010816,-0.101885,-0.106795,0.146057,0.210996,-0.02336,-0.108328,0.046759,0.048913
75%,772918.5,0.730467,0.575077,0.653878,0.587966,0.573641,0.613419,0.752733,0.739824,0.620836,...,0.852634,0.824713,0.812166,0.722758,0.802573,0.748577,0.650146,0.570348,0.581718,0.692555
max,993946.0,2.419532,1.778067,1.784392,2.633859,2.296887,2.895577,2.70758,2.930673,2.89711,...,2.679952,1.854449,2.009513,3.6623,2.11606,2.766767,2.118759,3.25922,4.769478,2.771659


In [9]:
# Re-encode the class labels from {0, 1} to {-1, +1}:
# -1 is control (healthy), +1 is schizophrenic
y = labels_train["Class"] * 2 - 1
print(y)

0     1
1    -1
2    -1
3    -1
4     1
     ..
81   -1
82   -1
83    1
84   -1
85    1
Name: Class, Length: 86, dtype: int64


In [10]:
# Standardize sbm_train and fnc_train column-wise to z-scores:
# subtract each column's mean, then divide by its sample standard deviation.
# NOTE: every column is scaled, including the Id identifier column.
normalized_sbm = sbm_train.sub(sbm_train.mean()).div(sbm_train.std())
normalized_fnc = fnc_train.sub(fnc_train.mean()).div(fnc_train.std())

In [11]:
# Preview the first rows of the normalized SBM table (Id is still a column at this point)
normalized_sbm.head()

Unnamed: 0,Id,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,-1.54076,0.66817,-0.581431,0.487164,-1.06007,-1.586467,-0.438888,-0.413291,0.627322,0.5672,...,-0.223961,-1.086529,1.128839,0.178417,0.72224,-2.142522,1.230587,1.076434,0.473403,-0.129212
1,-1.485241,-1.6576,0.637014,0.10209,1.189992,-0.153415,0.405758,-0.631118,-0.561381,0.27349,...,0.814806,-0.134656,0.357584,-0.624473,-0.430428,1.344925,0.418913,0.173664,0.328203,-0.445942
2,-1.470797,0.676326,-1.223923,0.829219,-0.321682,0.089868,0.034315,1.424894,-0.973177,1.199584,...,-0.397992,0.539292,1.294858,-0.127599,-0.409832,-0.21925,1.11968,-0.426821,-0.497526,0.251619
3,-1.441543,-0.542299,-1.022274,-1.216572,0.80346,0.902194,0.682925,-0.659846,1.218867,-1.94436,...,-0.434956,0.778568,-0.550294,-0.928762,-1.04511,-0.148801,-0.199955,-0.678964,0.018253,0.358088
4,-1.414444,-0.389168,-0.499765,-0.323274,1.080246,0.08283,-0.126181,-1.321606,0.41016,-0.159367,...,1.157921,-1.082495,0.122222,0.911286,0.739527,0.609852,0.512068,2.309657,2.240067,-0.722904


In [12]:
# Preview the first rows of the normalized FNC table (Id is still a column at this point)
normalized_fnc.head()

Unnamed: 0,Id,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
0,-1.54076,0.510305,-0.012107,-0.123691,0.790001,0.079499,0.852154,0.618494,-0.309594,-0.020382,...,0.062877,-0.10198,-0.412268,-0.164882,0.744194,0.457293,-1.475097,0.77525,0.884337,0.530193
1,-1.485241,0.296383,0.198284,-0.717041,-0.390618,-0.12611,1.120799,0.068254,0.055957,-0.529069,...,1.18063,-2.032156,-0.05087,1.377414,0.309068,1.375649,0.582444,0.323112,0.218953,1.703449
2,-1.470797,0.133482,0.620665,-0.103821,-0.784178,0.010462,-0.152563,-0.107224,-0.240446,0.506862,...,0.175031,-0.597079,-0.082131,0.459432,-1.07366,0.422745,1.236188,-1.044856,-1.095897,0.584417
3,-1.441543,0.811623,1.045676,1.035332,0.856604,1.549601,1.341726,1.373519,1.060429,0.888552,...,0.719477,1.545882,1.159133,0.362654,0.756248,-0.343359,0.279833,1.016629,0.636271,-0.172939
4,-1.414444,-1.385351,-1.917141,-1.350685,-1.252585,-2.390884,-0.246003,0.314912,1.342956,-0.481313,...,0.687025,-1.904972,0.170056,0.800872,-1.526932,0.719365,1.21415,-1.631491,-1.515729,0.302778


In [13]:
# Rescale sbm_train and fnc_train column-wise by the sample standard deviation only
# (no mean-centering). NOTE: the Id identifier column is rescaled as well.
normalized_sbm = sbm_train.div(sbm_train.std())
normalized_fnc = fnc_train.div(fnc_train.std())

In [14]:
# Preview the first rows of the std-scaled SBM table (Id is still a column at this point)
normalized_sbm.head()

Unnamed: 0,Id,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,0.462715,0.821032,-0.681912,0.400067,-1.046773,-1.508184,-0.443869,-0.285091,0.73743,0.630546,...,-0.076094,-1.0111,1.181576,0.241192,0.725414,-2.015945,1.187846,1.063132,0.510675,-0.002168
1,0.518234,-1.504738,0.536533,0.014993,1.20329,-0.075132,0.400776,-0.502918,-0.451274,0.336836,...,0.962673,-0.059228,0.410321,-0.561698,-0.427253,1.471501,0.376171,0.160362,0.365475,-0.318898
2,0.532677,0.829188,-1.324404,0.742122,-0.308384,0.168151,0.029334,1.553094,-0.86307,1.26293,...,-0.250125,0.614721,1.347595,-0.064823,-0.406657,-0.092674,1.076939,-0.440123,-0.460255,0.378662
3,0.561932,-0.389437,-1.122755,-1.303669,0.816757,0.980477,0.677943,-0.531646,1.328974,-1.881014,...,-0.287089,0.853996,-0.497557,-0.865987,-1.041936,-0.022225,-0.242696,-0.692266,0.055524,0.485132
4,0.589031,-0.236306,-0.600246,-0.410372,1.093543,0.161113,-0.131163,-1.193406,0.520267,-0.096021,...,1.305788,-1.007066,0.174959,0.974061,0.742702,0.736428,0.469327,2.296355,2.277339,-0.595861


In [15]:
# Preview the first rows of the std-scaled FNC table (Id is still a column at this point)
normalized_fnc.head()

Unnamed: 0,Id,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
0,0.462715,1.32924,0.169475,-0.404555,0.469437,0.646129,0.997707,0.249155,-0.2102,0.48414,...,0.509845,0.492644,0.714313,0.226781,0.340544,1.098972,0.685145,1.50313,0.085409,1.082178
1,0.518234,1.115319,0.379865,-0.997905,-0.711182,0.44052,1.266352,-0.301085,0.155352,-0.024547,...,1.627598,-1.437532,1.075711,1.769077,-0.094582,2.017328,2.742685,1.050992,-0.579976,2.255435
2,0.532677,0.952418,0.802247,-0.384685,-1.104742,0.577092,-0.00701,-0.476563,-0.141051,1.011384,...,0.621999,-0.002455,1.04445,0.851096,-1.47731,1.064424,3.39643,-0.316976,-1.894825,1.136403
3,0.561932,1.630559,1.227257,0.754468,0.53604,2.11623,1.487279,1.004179,1.159824,1.393073,...,1.166444,2.140506,2.285713,0.754317,0.352598,0.29832,2.440075,1.744508,-0.162657,0.379047
4,0.589031,-0.566415,-1.735559,-1.631549,-1.573149,-1.824254,-0.10045,-0.054428,1.442351,0.023209,...,1.133993,-1.310348,1.296636,1.192535,-1.930581,1.361044,3.374391,-0.903611,-2.314657,0.854764


In [16]:
# Select every column except the first one (Id), leaving only the FNC feature columns
fnc_train.iloc[:, 1:]

Unnamed: 0,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,FNC10,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
0,0.343120,0.045761,-0.131120,0.15034,0.180820,0.289160,0.069545,-0.052489,0.124000,0.477620,...,0.187430,0.163770,0.17686,0.074728,0.148600,0.343230,0.17565,0.520350,0.030613,0.307740
1,0.287900,0.102570,-0.323430,-0.22776,0.123280,0.367020,-0.084040,0.038793,-0.006287,0.101690,...,0.598340,-0.477880,0.26634,0.582940,-0.041272,0.630050,0.70314,0.363830,-0.207880,0.641380
2,0.245850,0.216620,-0.124680,-0.35380,0.161500,-0.002032,-0.133020,-0.035222,0.259040,-0.045302,...,0.228660,-0.000816,0.25860,0.280450,-0.644640,0.332440,0.87074,-0.109730,-0.679160,0.323160
3,0.420900,0.331380,0.244530,0.17167,0.592230,0.431050,0.280290,0.289620,0.356800,0.286340,...,0.428810,0.711570,0.56593,0.248560,0.153860,0.093171,0.62556,0.603910,-0.058301,0.107790
4,-0.146210,-0.468630,-0.528800,-0.50381,-0.510520,-0.029113,-0.015192,0.360170,0.005944,0.024801,...,0.416880,-0.435600,0.32104,0.392960,-0.842430,0.425080,0.86509,-0.312810,-0.829640,0.243070
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,-0.148470,-0.401520,-0.474630,-0.53253,0.293510,-0.111720,-0.544720,0.240320,0.156540,0.194410,...,-0.231000,0.238880,-0.17473,0.095453,-0.708160,-0.026044,0.51899,0.265310,-0.446470,0.130690
82,0.074459,-0.240680,-0.367120,-0.34688,-0.052563,0.252690,-0.382160,0.144010,0.310350,0.174190,...,0.460420,-0.286550,0.11393,0.412760,-0.813060,0.415860,0.81388,-0.248860,-0.546160,0.366960
83,0.435160,0.225050,0.057172,-0.35348,0.447420,0.183180,0.122420,0.024561,0.404830,0.006682,...,-0.025409,0.269110,0.51604,0.053609,-0.484620,-0.437720,0.38378,0.556660,-0.476940,0.026923
84,0.071453,-0.202410,-0.328480,-0.60958,-0.137080,-0.434230,-0.637690,-0.377230,-0.295430,-0.164490,...,0.583920,-0.303370,0.22673,0.482180,-0.679010,0.444760,0.88410,0.073527,-0.676770,0.421160


In [17]:
# Rescale the feature columns of sbm_train and fnc_train by their sample standard
# deviation (no mean-centering). The first column (Id) is sliced off beforehand,
# so only real features end up in the normalized frames.
normalized_sbm = sbm_train.iloc[:, 1:].pipe(lambda features: features / features.std())
normalized_fnc = fnc_train.iloc[:, 1:].pipe(lambda features: features / features.std())

In [18]:
# Preview the first rows of the std-scaled SBM features
normalized_sbm.head()

Unnamed: 0,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,SBM_map13,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,0.821032,-0.681912,0.400067,-1.046773,-1.508184,-0.443869,-0.285091,0.73743,0.630546,0.31392,...,-0.076094,-1.0111,1.181576,0.241192,0.725414,-2.015945,1.187846,1.063132,0.510675,-0.002168
1,-1.504738,0.536533,0.014993,1.20329,-0.075132,0.400776,-0.502918,-0.451274,0.336836,-0.621313,...,0.962673,-0.059228,0.410321,-0.561698,-0.427253,1.471501,0.376171,0.160362,0.365475,-0.318898
2,0.829188,-1.324404,0.742122,-0.308384,0.168151,0.029334,1.553094,-0.86307,1.26293,-1.971278,...,-0.250125,0.614721,1.347595,-0.064823,-0.406657,-0.092674,1.076939,-0.440123,-0.460255,0.378662
3,-0.389437,-1.122755,-1.303669,0.816757,0.980477,0.677943,-0.531646,1.328974,-1.881014,-1.173626,...,-0.287089,0.853996,-0.497557,-0.865987,-1.041936,-0.022225,-0.242696,-0.692266,0.055524,0.485132
4,-0.236306,-0.600246,-0.410372,1.093543,0.161113,-0.131163,-1.193406,0.520267,-0.096021,1.443483,...,1.305788,-1.007066,0.174959,0.974061,0.742702,0.736428,0.469327,2.296355,2.277339,-0.595861


In [19]:
# Preview the first rows of the std-scaled FNC features
normalized_fnc.head()

Unnamed: 0,FNC1,FNC2,FNC3,FNC4,FNC5,FNC6,FNC7,FNC8,FNC9,FNC10,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
0,1.32924,0.169475,-0.404555,0.469437,0.646129,0.997707,0.249155,-0.2102,0.48414,2.075783,...,0.509845,0.492644,0.714313,0.226781,0.340544,1.098972,0.685145,1.50313,0.085409,1.082178
1,1.115319,0.379865,-0.997905,-0.711182,0.44052,1.266352,-0.301085,0.155352,-0.024547,0.441955,...,1.627598,-1.437532,1.075711,1.769077,-0.094582,2.017328,2.742685,1.050992,-0.579976,2.255435
2,0.952418,0.802247,-0.384685,-1.104742,0.577092,-0.00701,-0.476563,-0.141051,1.011384,-0.196887,...,0.621999,-0.002455,1.04445,0.851096,-1.47731,1.064424,3.39643,-0.316976,-1.894825,1.136403
3,1.630559,1.227257,0.754468,0.53604,2.11623,1.487279,1.004179,1.159824,1.393073,1.244462,...,1.166444,2.140506,2.285713,0.754317,0.352598,0.29832,2.440075,1.744508,-0.162657,0.379047
4,-0.566415,-1.735559,-1.631549,-1.573149,-1.824254,-0.10045,-0.054428,1.442351,0.023209,0.107788,...,1.133993,-1.310348,1.296636,1.192535,-1.930581,1.361044,3.374391,-0.903611,-2.314657,0.854764


In [20]:
# Build the full feature matrix: SBM features followed by FNC features, side by side
x = pd.concat((normalized_sbm, normalized_fnc), axis="columns")

In [21]:
# Preview the first rows of the combined feature matrix
x.head()

Unnamed: 0,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,SBM_map13,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
0,0.821032,-0.681912,0.400067,-1.046773,-1.508184,-0.443869,-0.285091,0.73743,0.630546,0.31392,...,0.509845,0.492644,0.714313,0.226781,0.340544,1.098972,0.685145,1.50313,0.085409,1.082178
1,-1.504738,0.536533,0.014993,1.20329,-0.075132,0.400776,-0.502918,-0.451274,0.336836,-0.621313,...,1.627598,-1.437532,1.075711,1.769077,-0.094582,2.017328,2.742685,1.050992,-0.579976,2.255435
2,0.829188,-1.324404,0.742122,-0.308384,0.168151,0.029334,1.553094,-0.86307,1.26293,-1.971278,...,0.621999,-0.002455,1.04445,0.851096,-1.47731,1.064424,3.39643,-0.316976,-1.894825,1.136403
3,-0.389437,-1.122755,-1.303669,0.816757,0.980477,0.677943,-0.531646,1.328974,-1.881014,-1.173626,...,1.166444,2.140506,2.285713,0.754317,0.352598,0.29832,2.440075,1.744508,-0.162657,0.379047
4,-0.236306,-0.600246,-0.410372,1.093543,0.161113,-0.131163,-1.193406,0.520267,-0.096021,1.443483,...,1.133993,-1.310348,1.296636,1.192535,-1.930581,1.361044,3.374391,-0.903611,-2.314657,0.854764


In [22]:
# Hold out the last two training rows (labels and features) as a tiny test set
y_test = y.tail(2)
print(y_test.shape)

x_test = x.tail(2)
print(x_test.shape)
print(x.head())

(2,)
(2, 410)
   SBM_map1  SBM_map2  SBM_map3  SBM_map4  SBM_map5  SBM_map6  SBM_map7  \
0  0.821032 -0.681912  0.400067 -1.046773 -1.508184 -0.443869 -0.285091   
1 -1.504738  0.536533  0.014993  1.203290 -0.075132  0.400776 -0.502918   
2  0.829188 -1.324404  0.742122 -0.308384  0.168151  0.029334  1.553094   
3 -0.389437 -1.122755 -1.303669  0.816757  0.980477  0.677943 -0.531646   
4 -0.236306 -0.600246 -0.410372  1.093543  0.161113 -0.131163 -1.193406   

   SBM_map8  SBM_map10  SBM_map13  ...    FNC369    FNC370    FNC371  \
0  0.737430   0.630546   0.313920  ...  0.509845  0.492644  0.714313   
1 -0.451274   0.336836  -0.621313  ...  1.627598 -1.437532  1.075711   
2 -0.863070   1.262930  -1.971278  ...  0.621999 -0.002455  1.044450   
3  1.328974  -1.881014  -1.173626  ...  1.166444  2.140506  2.285713   
4  0.520267  -0.096021   1.443483  ...  1.133993 -1.310348  1.296636   

     FNC372    FNC373    FNC374    FNC375    FNC376    FNC377    FNC378  
0  0.226781  0.340544  1.098

In [28]:
# Show the held-out feature rows
x_test.head()

Unnamed: 0,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,SBM_map13,...,FNC369,FNC370,FNC371,FNC372,FNC373,FNC374,FNC375,FNC376,FNC377,FNC378
84,1.481832,-0.429526,0.956604,0.411758,-0.355914,-0.07277,0.49929,0.626141,0.450378,0.108217,...,1.588373,-0.912581,0.915731,1.463296,-1.556075,1.424056,3.448542,0.212397,-1.888157,1.481024
85,1.267102,0.686532,1.616131,1.247407,0.49772,-2.064088,1.642604,0.590851,-0.586362,0.205224,...,0.110799,0.868782,2.996432,0.366052,0.011213,1.054434,2.774085,-0.182247,0.302292,1.254805


In [29]:
# Show the held-out labels
y_test.head()

84   -1
85    1
Name: Class, dtype: int64

In [23]:
# Remove the held-out test cases from the training data.
# Rows are dropped by index label (the labels carried by y_test / x_test) instead
# of by position, so re-running this cell cannot trim two more rows each time;
# errors="ignore" makes a repeated run a no-op once the rows are gone.
print("before")
print(y.shape)
print(x.shape)

y = y.drop(index=y_test.index, errors="ignore")
x = x.drop(index=x_test.index, errors="ignore")

print("after")
print(y.shape)
print(x.shape)

before
(86,)
(86, 410)
after
(84,)
(84, 410)


In [24]:
# Sanity check: print the shapes of the feature matrix and the label vector side by side
print(x.shape, y.shape)

(84, 410) (84,)


In [25]:
# Load test data

# Load Testing FNC features
fnc_test = pd.read_csv(f'{settings.TEST_DATA_PATH}/test_FNC.csv')

# Load Testing SBM features into their own variable; previously they were
# assigned to fnc_test as well, which silently discarded the FNC test table.
sbm_test = pd.read_csv(f'{settings.TEST_DATA_PATH}/test_SBM.csv')

# Preview the SBM test features (last expression, so it is the cell's output)
sbm_test.head()


Unnamed: 0,Id,SBM_map1,SBM_map2,SBM_map3,SBM_map4,SBM_map5,SBM_map6,SBM_map7,SBM_map8,SBM_map10,...,SBM_map55,SBM_map61,SBM_map64,SBM_map67,SBM_map69,SBM_map71,SBM_map72,SBM_map73,SBM_map74,SBM_map75
0,100004,1.154881,3.627246,0.146076,-0.677521,0.951169,0.482531,-2.40413,-0.715182,-0.475299,...,-0.451994,1.12377,2.083006,1.14544,-0.067608,1.202529,0.851587,0.451583,-0.159739,0.192076
1,100015,-1.608977,-0.125656,2.232224,1.013575,1.227375,0.296077,-0.612468,-1.610222,-0.136698,...,0.696987,1.397832,1.046136,-0.191733,-2.192023,-0.369276,0.822225,-0.109342,-0.580476,0.17416
2,100026,1.611464,1.859449,-0.959714,-0.713587,0.826513,-0.405989,-0.752907,-0.48418,-1.212576,...,0.160145,1.906989,-2.661633,-0.193911,0.440873,0.641739,0.918397,-0.758046,0.154701,-0.476647
3,100030,-0.283111,-0.41801,1.962834,-0.34213,-0.939108,0.929705,1.041755,-1.073416,-0.815344,...,0.974828,-1.997087,-2.083782,1.154107,-0.643947,2.332424,0.659124,-0.809445,0.55896,2.790871
4,100047,-0.688717,1.951002,1.512576,-0.854993,1.059652,-0.179238,-1.775324,0.649346,1.616116,...,-0.789153,1.578984,1.402592,-1.23044,0.296686,2.806314,0.427184,-0.240682,-0.196948,-1.544345


In [37]:
# Estimate the accuracy of a Gaussian process classifier on the MRI features
# with repeated stratified k-fold cross-validation.
from numpy import mean, std
from sklearn.datasets import make_classification
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

# dataset: the x / y training data prepared in the earlier cells
# (shapes are printed as a quick sanity check)
print(x.shape, y.shape, sep='\n')

# classifier with a constant-scaled RBF kernel
model = GaussianProcessClassifier(kernel=1 * RBF(length_scale=1.0))

# 10 stratified folds, repeated 3 times with a fixed seed
cv = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1)

# one accuracy score per fold, computed in parallel on all cores
scores = cross_val_score(
    model,
    x,
    y,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)

# report mean and standard deviation of the fold accuracies
print('Mean Accuracy: %.3f (%.3f)' % (mean(scores), std(scores)))

(84, 410)
(84,)
Mean Accuracy: 0.620 (0.184)


In [33]:
# make a prediction with a gaussian process classifier model on the dataset
from sklearn.datasets import make_classification
from sklearn.gaussian_process import GaussianProcessClassifier

# define model (default kernel)
model = GaussianProcessClassifier()
# fit model on the training rows
model.fit(x, y)

# the two held-out samples and their true labels
row = x_test.iloc[0,:]
answer = y_test.iloc[0]

row1 = x_test.iloc[1,:]
answer1 = y_test.iloc[1]

# make a prediction
# Predict on one-row DataFrame slices rather than on a list of Series: this keeps
# the column names the model was fitted with, so recent scikit-learn versions do
# not warn about missing feature names.
yhat = model.predict(x_test.iloc[[0]])
yhat1 = model.predict(x_test.iloc[[1]])

# summarize prediction
# predict() returns a 1-element array; index it explicitly because formatting an
# array with %d relies on an array-to-scalar conversion that NumPy has deprecated.
print('Sample 1 Predicted Class: %d' % yhat[0])
print('Sample 1 True Class: %d' % answer)

# summarize prediction
print('Sample 2 Predicted Class: %d' % yhat1[0])
print('Sample 2 True Class: %d' % answer1)

Sample 1 Predicted Class: -1
Sample 1 True Class: -1
Sample 2 Predicted Class: 1
Sample 2True Class: 1




In [None]:
# Placeholder cell: the three steps below are listed but have no code yet.

# Save Model

# Load Test Data

# Predict Test Inputs

# NOTE(review): scratch arithmetic (evaluates to 453). Presumably a feature-count
# check, but it does not match the 410 columns printed for x elsewhere in this
# notebook; confirm the intent or remove.
378+75

In [45]:
# grid search for best kernel on GPC

from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel
from sklearn.gaussian_process.kernels import ConstantKernel

# define model
model = GaussianProcessClassifier()
# define model evaluation method: 10 stratified folds, repeated 3 times, fixed seed
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid: candidate kernels (single kernels plus sums/products of them)
grid = dict()
grid['kernel'] = [1*RBF(), 
                  1*DotProduct(), 
                  1*DotProduct()+1,
                  1*DotProduct()+Matern(length_scale=0.01)+ConstantKernel(), 
                  1*DotProduct()*Matern(),
                  1*RationalQuadratic()*DotProduct()*Matern(),
                  1*Matern(), 
                  1*Matern(length_scale=0.01), 
                  1*RationalQuadratic(), 
                  1*RationalQuadratic()*Matern(),
                  1*RationalQuadratic()*Matern(length_scale=0.1),
                  1*RationalQuadratic(length_scale=0.1),
                  1*WhiteKernel()]
# define search: every kernel is scored by cross-validated accuracy, in parallel
search = GridSearchCV(model, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(x, y)
# summarize best
print('Best Mean Accuracy: %.3f' % results.best_score_)
print('Best Config: %s' % results.best_params_)
# summarize all
means = results.cv_results_['mean_test_score']
params = results.cv_results_['params']
# The loop variable is deliberately not called `mean`: that name is bound to
# numpy.mean at notebook level by the cross-validation cell, and rebinding it to
# a float here would break any later call to mean(...).
for mean_score, param in zip(means, params):
    print(">%.3f with: %r" % (mean_score, param))



Best Mean Accuracy: 0.681
Best Config: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.620 with: {'kernel': 1**2 * RBF(length_scale=1)}
>0.669 with: {'kernel': 1**2 * DotProduct(sigma_0=1)}
>0.669 with: {'kernel': 1**2 * DotProduct(sigma_0=1) + 1**2}
>0.669 with: {'kernel': 1**2 * DotProduct(sigma_0=1) + Matern(length_scale=0.01, nu=1.5) + 1**2}
>0.659 with: {'kernel': 1**2 * DotProduct(sigma_0=1) * Matern(length_scale=1, nu=1.5)}
>0.648 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1) * DotProduct(sigma_0=1) * Matern(length_scale=1, nu=1.5)}
>0.623 with: {'kernel': 1**2 * Matern(length_scale=1, nu=1.5)}
>0.535 with: {'kernel': 1**2 * Matern(length_scale=0.01, nu=1.5)}
>0.681 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1)}
>0.623 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1) * Matern(length_scale=1, nu=1.5)}
>0.620 with: {'kernel': 1**2 * RationalQuadratic(alpha=1, length_scale=1) * Matern(length_scale=0.1, nu=

