# Parkinson Data

Website: https://zenodo.org/record/2867216#.X5n430dxfD5


## Notebook for Reshuffling Training and Testing Datasets




In [2]:
import numpy as np
import os as os
from pathlib import Path
import matplotlib.pyplot as plt
import time
import random
import pandas as pd
import pickle
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import math

### Load the data

In [4]:
X_train_hc_m = np.load ( 'C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\Data_extraction_&_preliminaries\\Results\\OLD_Training_Testing\\X_train_hc_m.npy',)
X_train_pd_m = np.load ('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\Data_extraction_&_preliminaries\\Results\\OLD_Training_Testing\\X_train_pd_m.npy')

X_train_hc_f = np.load ( 'C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\Data_extraction_&_preliminaries\\Results\\OLD_Training_Testing\\X_train_hc_f.npy')
X_train_pd_f = np.load ('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\Data_extraction_&_preliminaries\\Results\\OLD_Training_Testing\\X_train_pd_f.npy')


In [5]:
len(X_train_hc_m)

1732

In [6]:
len(X_train_pd_m)

7994

In [7]:
len(X_train_hc_f)

15479

In [8]:
len(X_train_pd_f)

2752

In [9]:
#THESE ARE NOT THE FINAL SPLIT - USED THEM IN THE RTSNE (FOR NOW ONLY MALE) TO PROVIDE EVIDENCE OF THE REQUIRED SPLITTING
#WE ARE GOING TO SPLIT THE GROUPS WRT BOTH PD STAGE AND PATIENT NUMBER TO LEAVE OUT ONE PATIENT FOR THE TESTING
X_train_pd_f_split = np.load("C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\Data_extraction_&_preliminaries\\Results\\OLD_Training_Testing\\X_train_f_pd_split.npy", allow_pickle=True)
X_train_pd_m_split = np.load("C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\Data_extraction_&_preliminaries\\Results\\OLD_Training_Testing\\X_train_m_pd_split.npy", allow_pickle=True)


### 1) Separate the segments wrt PD Stage and Patient Number

In [10]:
len_Xf_tr = len(X_train_pd_f_split)
len_Xm_tr = len(X_train_pd_m_split)

In [11]:
n_seg_Xf_tr = [len(X_train_pd_f_split[i]) for i in range(0, len_Xf_tr)]
n_seg_Xf_tr

[1543, 1209]

In [12]:
n_seg_Xm_tr = [len(X_train_pd_m_split[i]) for i in range(0, len_Xm_tr)]
n_seg_Xm_tr

[3910, 2495, 1050, 539]

In [13]:
################
#Index Patients#
################

########
#FEMALE#
########
final_index_pt_f_hc_train = np.load('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\System_M1\\Code\\final_index_pt_f_hc_train.npy')
final_index_pt_f_pd_train = np.load('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\System_M1\\Code\\final_index_pt_f_pd_train.npy')

######
#MALE#
######

final_index_pt_m_hc_train = np.load('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\System_M1\\Code\\final_index_pt_m_hc_train.npy')
final_index_pt_m_pd_train = np.load('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\System_M1\\Code\\final_index_pt_m_pd_train.npy')


In [14]:
len(final_index_pt_f_hc_train) == len(X_train_hc_f)

True

In [15]:
len(final_index_pt_f_pd_train) == sum(n_seg_Xf_tr)

True

In [16]:
len(final_index_pt_m_hc_train) == len(X_train_hc_m)

True

In [17]:
len(final_index_pt_m_pd_train) == sum(n_seg_Xm_tr)

True

In [18]:
#######################
#Index Parkinson Stage#
#######################

########
#FEMALE#
########

final_index_pd_f = np.load('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\System_M1\\Code\\final_index_pd_f.npy')

######
#MALE#
######

final_index_pd_m = np.load('C:\\Users\\Marta\\Desktop\\Parkinson_data\\Python_code\\System_M1\\Code\\final_index_pd_m.npy')


In [19]:
len(final_index_pd_m) == len(final_index_pt_m_pd_train)

True

In [20]:
len(final_index_pd_f) == len(final_index_pt_f_pd_train)

True

In [21]:
df_seg_descr_m = pd.DataFrame([final_index_pd_m, final_index_pt_m_pd_train] ).T
df_seg_descr_m.columns = ['PD_stage','Patient_N']
df_seg_descr_m

Unnamed: 0,PD_stage,Patient_N
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
7989,2,11
7990,2,11
7991,2,11
7992,2,11


In [22]:
df_seg_descr_f = pd.DataFrame([final_index_pd_f, final_index_pt_f_pd_train] ).T
df_seg_descr_f.columns = ['PD_stage','Patient_N']
df_seg_descr_f


Unnamed: 0,PD_stage,Patient_N
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
2747,0,3
2748,0,3
2749,0,3
2750,0,3


In [23]:
pd_df_f_seg = df_seg_descr_f.groupby(['PD_stage','Patient_N'])['PD_stage'].count()
pd_df_f_seg = pd.DataFrame(pd_df_f_seg)
pd_df_f_seg

Unnamed: 0_level_0,Unnamed: 1_level_0,PD_stage
PD_stage,Patient_N,Unnamed: 2_level_1
0,0,864
0,3,679
1,1,449
1,2,760


In [24]:
np.sum(pd_df_f_seg)

PD_stage    2752
dtype: int64

In [24]:
# HENCE: Within both groups I can always leave one patient out and do the training with the segments from only one
#        patient. What I do now is slecting the minimum number of segments across the 4 groups --> 449. Then one patient
#        will provide the training set and the other the testing. I will extract 80% for the training and 20% for the
#        testing. UPDATE: the minimum number for one of the patient in the healthy class is 442 and therefore we keep 
#        that as a minimum number

In [25]:
pd_df_m_seg = df_seg_descr_m.groupby(['PD_stage','Patient_N',])['PD_stage'].count()
pd_df_m_seg = pd.DataFrame(pd_df_m_seg)
pd_df_m_seg

Unnamed: 0_level_0,Unnamed: 1_level_0,PD_stage
PD_stage,Patient_N,Unnamed: 2_level_1
0,0,999
0,1,530
0,3,832
0,6,771
0,7,778
1,2,976
1,8,407
1,9,740
1,10,372
2,4,401


In [26]:
np.sum(pd_df_m_seg)

PD_stage    7994
dtype: int64

In [None]:
#HENCE:  Within both groups I can always leave one patient out and do the training with the segments from only one
#        patient. What I do now is slecting the minimum number of segments across As for the female case, I select the 
#        minimum number of segments across the 12  groups --> 372. Then, in each group, one patient randomly selected 
#        will provide the testing set and from the rest I will extract 80% for the training set. 
#        I will extract 80% for the training and 20% for the testing.
#       

In [None]:
# THE SET THAT I BUILT FOR THE SPLIT GROUPS ONLY TAKE INTO ACCOUNT THE DIFFERENTIATION W.R.T THE PD-STAGE. GIVEN MY
# NEW DESIGN FOR THE TRAINING AND TESTING PROCEDURE I BUILD A NEW SET OF SPLITS WHICH WILL PROVIDE THE REQUIREMENTS
# ABOVE INTRODUCED. THEREFORE I CONSTRUCT HERE AGAIN THE SPLIT GROUPS 

In [27]:
#MALE CASE
X_train_pd_m_df = pd.DataFrame(X_train_pd_m)
final_index_pd_m_df = pd.DataFrame(final_index_pd_m)
final_index_pt_m_pd_train_df = pd.DataFrame(final_index_pt_m_pd_train)

X_train_m_pd_ind_df = pd.concat([X_train_pd_m_df,
                                 final_index_pd_m_df,
                                 final_index_pt_m_pd_train_df] , ignore_index= True, axis = 1)

col_list = ['col' + str(x) for x in range(0,X_train_m_pd_ind_df.shape[1])]
X_train_m_pd_ind_df.columns = col_list

In [28]:
#I split them with respect to the patients and then they have the associated PD_stage in the last col
X_train_m_pd_split_0 = X_train_m_pd_ind_df.groupby('col5001')
X_train_m_pd_split = [X_train_m_pd_split_0.get_group(x) for x in X_train_m_pd_split_0.groups]


In [29]:
n_seg_pd_m_split =  [len(X_train_m_pd_split[i]) for i in range(0, len(X_train_m_pd_split))]
n_seg_pd_m_split

[999, 530, 976, 832, 401, 539, 771, 778, 407, 740, 372, 649]

In [30]:
#FEMALE CASE
X_train_pd_f_df = pd.DataFrame(X_train_pd_f)
final_index_pd_f_df = pd.DataFrame(final_index_pd_f)
final_index_pt_f_pd_train_df = pd.DataFrame(final_index_pt_f_pd_train)

X_train_f_pd_ind_df = pd.concat([X_train_pd_f_df,
                                 final_index_pd_f_df,
                                 final_index_pt_f_pd_train_df] , ignore_index= True, axis = 1)

col_list = ['col' + str(x) for x in range(0,X_train_f_pd_ind_df.shape[1])]
X_train_f_pd_ind_df.columns = col_list

In [31]:
#I split them with respect to the patients and then they have the associated PD_stage in the last col
X_train_f_pd_split_0 = X_train_f_pd_ind_df.groupby('col5001')
X_train_f_pd_split = [X_train_f_pd_split_0.get_group(x) for x in X_train_f_pd_split_0.groups]


In [32]:
n_seg_pd_f_split = [len(X_train_f_pd_split[i]) for i in range(0, len(X_train_f_pd_split))]
n_seg_pd_f_split

[864, 449, 760, 679]

### 2) Compute the minimum of the number of segments in each group and denote that N_m and N_f

In [8]:
#n_seg_pd_f_split = [864, 449, 760, 679]

In [9]:
#n_seg_pd_m_split = [999, 530, 976, 832, 401, 539, 771, 778, 407, 740, 372, 649]

In [34]:
min_m = min(n_seg_pd_m_split)
min_f = min(n_seg_pd_f_split)
print(min_m,min_f)

372 449


In [None]:
#HOWEVER - SINCE THE MINIMUM NUMBER FOR ONE OF THE HEALTHY PATIENTS IS HEALTHY THEN 442 THEN WE KEEP min_f = 442

In [35]:
min_f = 442
print(min_m,min_f)

372 442


In [36]:
N_f = min_f
N_m = min_m

In [29]:
#N_f = 442
#N_m = 372

### 3) Randomly select from each group (divided by both pd_stage and patient number) 442/372 number of segments for the female/male case respectively

In [None]:
#Now --> Randomly select from each group the minimum number from the groups created for each speaker

In [37]:
random.seed(1)
print(random.random()) #0.13436424411240122

0.13436424411240122


In [38]:
ind_pd_f = [range(0,n_seg_pd_f_split[i]) for i in range(0, len(n_seg_pd_f_split))]
ind_pd_m = [range(0,n_seg_pd_m_split[i]) for i in range(0, len(n_seg_pd_m_split))]

In [39]:
ind_pd_m

[range(0, 999),
 range(0, 530),
 range(0, 976),
 range(0, 832),
 range(0, 401),
 range(0, 539),
 range(0, 771),
 range(0, 778),
 range(0, 407),
 range(0, 740),
 range(0, 372),
 range(0, 649)]

In [40]:
random.seed(1)

rand_ind_pd_m = [random.sample( ind_pd_m[i],  k = min_m) for i in range(0, len(ind_pd_m))]
rand_ind_pd_f = [random.sample( ind_pd_f[i],  k = min_f) for i in range(0, len(ind_pd_f))]

In [41]:
#check
[len(rand_ind_pd_m[i]) for i in range(0, len(rand_ind_pd_m))]

[372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372]

In [42]:
[len(rand_ind_pd_f[i]) for i in range(0, len(rand_ind_pd_f))]

[442, 442, 442, 442]

In [43]:
rand_ind_pd_sorted_f = [sorted(rand_ind_pd_f[i]) for i in range(0,len(rand_ind_pd_f))]
rand_ind_pd_sorted_m = [sorted(rand_ind_pd_m[i]) for i in range(0,len(rand_ind_pd_m))]

In [44]:
#check
[len(rand_ind_pd_sorted_f[i]) for i in range(0, len(rand_ind_pd_sorted_f))]

[442, 442, 442, 442]

In [45]:
#check
[len(rand_ind_pd_sorted_m[i]) for i in range(0, len(rand_ind_pd_sorted_m))]

[372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372]

In [46]:
X_pd_f_split = [X_train_f_pd_split[i].iloc[rand_ind_pd_sorted_f[i], :].reset_index() for i in range(0, len(X_train_f_pd_split))]
X_pd_m_split = [X_train_m_pd_split[i].iloc[rand_ind_pd_sorted_m[i], :].reset_index() for i in range(0, len(X_train_m_pd_split))]


In [47]:
#check
[len(X_pd_f_split[i]) for i in range(0, len(X_pd_f_split))]

[442, 442, 442, 442]

In [48]:
#check
[len(X_pd_m_split[i]) for i in range(0, len(X_pd_m_split))]

[372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372, 372]

### 4)Randomly select one group for one speaker to leave out for the testing set

In [None]:
##########
#  MALE  #
##########

In [49]:
pd_df_m = pd.DataFrame( np.array([[ 0,  0 ], 
                        [ 0,  1 ], 
                        [ 0 , 3 ],
                        [ 0,  6 ], 
                        [ 0,  7 ],
                        [ 1,  2 ],
                        [ 1 , 8 ], 
                        [ 1,  9 ],
                        [ 1 , 10],
                        [ 2,  4 ],
                        [ 2,  11], 
                        [ 3 , 5 ] ]),                      
         columns = ["PD_stage", "Patient_N"])

pd_df_m

Unnamed: 0,PD_stage,Patient_N
0,0,0
1,0,1
2,0,3
3,0,6
4,0,7
5,1,2
6,1,8
7,1,9
8,1,10
9,2,4


In [84]:
np.random.seed(15)   #random seed: 2,12 for pd_stage 2 --- random seed 15 for pd_stage 3

test_patients_m = pd_df_m.sample(frac = 1.0).head(1)
test_patients_m

Unnamed: 0,PD_stage,Patient_N
11,3,5


In [85]:
new_X_pd_m_split_train_d1 = list(X_pd_m_split)
new_X_pd_m_split_train_d1 = [i for j, i in enumerate(new_X_pd_m_split_train_d1) if j not in np.array(test_patients_m["Patient_N"])]

In [86]:
len(new_X_pd_m_split_train_d1)

11

In [88]:
#chekc that the left patients are 0,1,2,3,4, 6, 7, 8, 9,10,11
[set(new_X_pd_m_split_train_d1[i]['col5001']) for i in range(0,len(new_X_pd_m_split_train_d1))]

[{0}, {1}, {2}, {3}, {4}, {6}, {7}, {8}, {9}, {10}, {11}]

In [89]:
new_X_pd_m_split_test_d1 = list(X_pd_m_split)
new_X_pd_m_split_test_d1 = [i for j, i in enumerate(new_X_pd_m_split_test_d1) if j in np.array(test_patients_m["Patient_N"])]

In [90]:
len(new_X_pd_m_split_test_d1)

1

In [91]:
#chekc that the left patient is 8
set(new_X_pd_m_split_test_d1[0]['col5001'] )

{5}

In [None]:
############
#  FEMALE  #
############

In [92]:
pd_df_f = pd.DataFrame( np.array([[ 0,  0 ], 
                        [ 0,  3 ], 
                        [ 1 , 1 ],
                        [ 1,  2 ] ]),                      
         columns = ["PD_stage", "Patient_N"])

pd_df_f

Unnamed: 0,PD_stage,Patient_N
0,0,0
1,0,3
2,1,1
3,1,2


In [94]:
np.random.seed(1) #change here and try with another patient

test_patients_f = pd_df_f.sample(frac = 1.0).head(1)
test_patients_f

Unnamed: 0,PD_stage,Patient_N
3,1,2


In [95]:
new_X_pd_f_split_train_d1 = list(X_pd_f_split)
new_X_pd_f_split_train_d1 = [i for j, i in enumerate(new_X_pd_f_split_train_d1) if j not in np.array(test_patients_f["Patient_N"])]

In [96]:
len(new_X_pd_f_split_train_d1)

3

In [98]:
#chekc that the left patients are 0,1,3
[set(new_X_pd_f_split_train_d1[i]['col5001']) for i in range(0,len(new_X_pd_f_split_train_d1))]

[{0}, {1}, {3}]

In [99]:
new_X_pd_f_split_test_d1 = list(X_pd_f_split)
new_X_pd_f_split_test_d1 = [i for j, i in enumerate(new_X_pd_f_split_test_d1) if j in np.array(test_patients_f["Patient_N"])]

In [100]:
len(new_X_pd_f_split_test_d1)

1

In [101]:
#chekc that the left patient is 1
set(new_X_pd_f_split_test_d1[0]['col5001'] )

{2}

### 5) Extract 80% of the data for the traning set and 20% of the data for the testing

In [75]:
################
#     MALE     #
################

In [78]:
new_X_pd_m_split_train_d1[2] # I merge all of these and then extract 80% of 372 --> 298

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000,col5001
0,1532,-0.000061,0.002075,0.000397,-0.001465,0.000092,-0.001007,-0.001465,0.000031,-0.000824,...,0.000305,0.001160,0.001984,-0.000336,-0.000336,0.000488,-0.001312,-0.001373,1,2
1,1534,-0.001343,-0.001587,-0.002808,-0.001373,-0.000397,-0.002106,-0.000214,-0.000153,-0.002075,...,-0.000549,-0.001923,-0.000549,0.000214,-0.000641,0.001404,0.002136,0.001068,1,2
2,1539,-0.000641,0.000854,0.002136,0.003571,0.005432,0.007935,0.010284,0.012390,0.013489,...,0.015106,0.012207,0.016632,0.024048,0.028564,0.031219,0.030487,0.028687,1,2
3,1543,-0.072205,-0.075226,-0.078613,-0.081482,-0.082428,-0.082245,-0.082550,-0.084534,-0.087921,...,0.036011,0.032837,0.026093,0.020813,0.014832,0.008606,0.003998,0.000336,1,2
4,1544,0.001556,0.001831,0.002991,0.003662,0.003601,0.003754,0.000824,0.000641,-0.000946,...,-0.034515,-0.025482,-0.019196,-0.018219,-0.013214,0.007111,0.010651,0.008392,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,2493,0.024567,0.022583,0.021057,0.019989,0.019592,0.019684,0.020386,0.021423,0.021667,...,0.006165,0.005707,0.006165,0.006500,0.006348,0.006042,0.006470,0.005829,1,2
368,2495,0.024628,0.026215,0.026062,0.027588,0.030884,0.031403,0.031036,0.036407,0.038483,...,0.029144,0.029114,0.027008,0.027710,0.029694,0.031738,0.032227,0.032928,1,2
369,2499,-0.002899,-0.002380,-0.002441,-0.002289,-0.001953,-0.001678,-0.001648,-0.001282,-0.001801,...,0.019073,0.020050,0.021179,0.021942,0.023132,0.023682,0.023560,0.023895,1,2
370,2501,0.015594,0.016418,0.017242,0.018005,0.018768,0.019196,0.019623,0.020111,0.020325,...,0.008209,0.008331,0.008789,0.009186,0.009155,0.009064,0.009033,0.008484,1,2


In [79]:
new_X_pd_m_split_test_d1[0] # from this one I extract 20% of 372 --> 75

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000,col5001
0,5826,0.000885,0.000793,0.000824,0.000824,0.000641,0.000671,0.000793,0.000671,0.000732,...,-0.000488,-0.000549,-0.000366,-0.000397,-0.000549,-0.000244,-0.000214,-0.000488,1,8
1,5827,0.000061,-0.000061,0.000061,0.000061,-0.000122,-0.000153,-0.000031,-0.000275,-0.000305,...,-0.001587,-0.001862,-0.001862,-0.001923,-0.002075,-0.002106,-0.002167,-0.002167,1,8
2,5828,-0.002075,-0.002075,-0.002136,-0.002106,-0.001923,-0.001709,-0.001709,-0.001648,-0.001373,...,0.000183,-0.000366,0.000977,0.001129,0.000427,0.000793,0.001923,0.001099,1,8
3,5829,0.000671,0.000366,0.000519,-0.000336,-0.001038,-0.000488,-0.000641,-0.001556,-0.001953,...,-0.003357,-0.003235,-0.003296,-0.003143,-0.003174,-0.002991,-0.003052,-0.003082,1,8
4,5830,-0.003296,-0.003540,-0.003296,-0.003235,-0.003113,-0.002869,-0.002716,-0.002808,-0.002808,...,-0.000916,-0.000885,-0.001160,-0.001343,-0.001282,-0.001099,-0.001007,-0.000824,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,6227,-0.000122,-0.000031,0.000153,0.000092,0.000122,0.000183,0.000427,0.000580,0.000641,...,-0.002655,-0.002655,-0.002502,-0.002472,-0.002502,-0.002258,-0.001892,-0.001831,1,8
368,6228,-0.001831,-0.001740,-0.001740,-0.001984,-0.001587,-0.001984,-0.002197,-0.001984,-0.002014,...,0.030060,0.029907,0.029724,0.028015,0.027832,0.027039,0.025696,0.024323,1,8
369,6229,0.022583,0.020813,0.019440,0.019043,0.017181,0.017029,0.016144,0.014801,0.012146,...,-0.001526,-0.009796,-0.010254,-0.004608,0.005646,0.007477,0.012115,0.016907,1,8
370,6230,0.018738,0.012604,-0.003052,-0.009430,-0.019379,-0.015930,-0.004913,0.010071,0.010925,...,-0.006989,-0.005432,-0.003357,-0.001007,0.000610,0.002441,0.002563,0.003723,1,8


In [102]:
X_train_m_pd_concat_d1 = pd.concat(new_X_pd_m_split_train_d1, axis = 0)
X_train_m_pd_concat_d1 = X_train_m_pd_concat_d1.reset_index()

In [81]:
X_train_m_pd_concat_d1

Unnamed: 0,level_0,index,col0,col1,col2,col3,col4,col5,col6,col7,...,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000,col5001
0,0,1,0.000702,0.000641,0.000610,0.000641,0.000427,0.000610,0.000702,0.000519,...,-0.000397,-0.000641,-0.000580,-0.000732,-0.000580,-0.000549,-0.000610,-0.000885,0,0
1,1,2,-0.000580,-0.000763,-0.000610,-0.000732,-0.000793,-0.000610,-0.000793,-0.000946,...,-0.001190,-0.000916,-0.001038,-0.001282,-0.001007,-0.001190,-0.001434,-0.000732,0,0
2,2,5,-0.000031,0.000122,-0.000122,-0.000214,-0.000092,-0.000153,-0.000214,-0.000305,...,-0.000031,-0.000061,-0.000153,0.000122,0.000000,-0.000183,0.000092,0.000031,0,0
3,3,8,-0.000427,-0.000458,-0.000641,-0.000549,-0.000122,-0.000366,-0.000427,-0.000061,...,-0.001007,-0.001068,-0.000793,-0.000977,-0.000763,-0.001099,-0.000977,-0.001404,0,0
4,4,9,-0.001038,-0.000946,-0.000763,-0.000610,-0.000793,-0.000854,-0.000702,-0.000702,...,-0.001770,-0.001801,-0.001404,-0.001953,-0.001984,-0.001709,-0.002167,-0.002075,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4087,367,7985,-0.000519,-0.000702,-0.000702,-0.000732,-0.000549,-0.000671,-0.000763,-0.000427,...,-0.000031,-0.000153,0.000092,0.000275,-0.000061,0.000122,0.000092,0.000092,2,11
4088,368,7986,0.000031,0.000244,0.000366,0.000000,0.000092,0.000336,0.000244,0.000031,...,-0.000366,0.000488,0.000397,-0.000122,-0.002686,-0.003326,-0.002563,-0.001770,2,11
4089,369,7988,-0.003265,-0.003967,-0.004364,-0.004669,-0.005249,-0.005280,-0.005188,-0.005249,...,0.109406,0.111084,0.113647,0.113190,0.112640,0.115204,0.114624,0.108978,2,11
4090,370,7989,0.104889,0.102356,0.097290,0.090454,0.081726,0.077118,0.071655,0.063660,...,-0.004730,-0.000732,-0.000763,-0.001953,0.000763,-0.001343,-0.003357,0.000458,2,11


In [103]:
4092/11

372.0

In [104]:
X_train_m_pd_concat_d1_0 = X_train_m_pd_concat_d1.drop(['level_0', 'index', 'col5000', 'col5001'], axis=1) 

In [105]:
# NOTE: To extract the trainiing set, even if the sets are of bigger dimensions, we still set the 80% equal to N_m
#       across the groups and ALSO the desings. TODISCUSS

per_train_pd_m = math.ceil(N_m * 0.8)
per_test_pd_m = math.ceil(N_m * 0.2)
print(per_train_pd_m, per_test_pd_m)


298 75


In [106]:
#Set see to reproduce the results
random.seed(1)

range_pd_m = range(0, X_train_m_pd_concat_d1_0.shape[0] )
range_pd_m_test = range(0, N_m)

rand_s_train_pd_m = random.sample(range_pd_m, k = per_train_pd_m)
rand_s_test_pd_m = random.sample(range_pd_m_test, k = per_test_pd_m)

print(random.random()) #0.2665354510728647


0.2665354510728647


In [107]:
len(X_train_m_pd_concat_d1_0) == len(range_pd_m)

True

In [109]:
ind_train_pd_m = sorted(rand_s_train_pd_m) 
ind_test_pd_m = sorted(rand_s_test_pd_m) 

In [110]:
X_train_m_pd_d1 = X_train_m_pd_concat_d1_0.reset_index().loc[ind_train_pd_m] 

In [111]:
X_train_m_pd_d1.shape

(298, 5001)

In [112]:
X_train_m_pd_d1 = X_train_m_pd_d1.drop(['index'], axis=1)

In [113]:
X_train_m_pd_d1.shape

(298, 5000)

In [114]:
X_train_m_pd_d1 = X_train_m_pd_d1.values

In [115]:
len(X_train_m_pd_d1)

298

In [116]:
X_test_pd_m_d1 = new_X_pd_m_split_test_d1[0].reset_index().loc[ind_test_pd_m]

In [117]:
X_test_pd_m_d1.shape

(75, 5004)

In [118]:
X_test_pd_m_d1 = X_test_pd_m_d1.drop(['level_0', 'index', 'col5000', 'col5001'], axis=1)

In [119]:
X_test_pd_m_d1.shape

(75, 5000)

In [120]:
X_test_pd_m_d1 = X_test_pd_m_d1.values 

In [121]:
len(X_test_pd_m_d1)

75

In [128]:
np.save('X_train_m_pd_d1', X_train_m_pd_d1)

In [122]:
np.save('X_test_pd_m_d1_new_pat', X_test_pd_m_d1)

In [None]:
###############
#   FEMALE    #
###############

In [134]:
new_X_pd_f_split_train_d1[2] # I merge all of these and then extract 80% of 442 --> 354

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000,col5001
0,2073,0.003815,0.003967,0.003754,0.004089,0.004272,0.004150,0.003906,0.004303,0.003876,...,-0.000183,0.000061,-0.000122,-0.000305,-0.000061,-0.000214,-0.000244,-0.000031,0,3
1,2074,-0.000122,-0.000092,0.000000,-0.000366,-0.000275,0.000000,-0.000122,-0.000244,-0.000031,...,0.000519,0.000488,0.001038,0.000916,0.000885,0.001251,0.001251,0.001221,0,3
2,2075,0.001221,0.000977,0.001038,0.001007,0.000702,0.000488,0.000488,0.000427,-0.000031,...,0.000336,0.000244,-0.000031,-0.000153,-0.000275,-0.000519,-0.000305,-0.000305,0,3
3,2076,-0.000275,-0.000488,-0.000854,-0.001160,-0.001404,-0.001251,-0.001404,-0.001282,-0.001038,...,0.001343,0.001099,0.001068,0.001007,0.000854,0.000458,0.000275,-0.000061,0,3
4,2081,0.026367,0.027191,0.028168,0.029449,0.029999,0.030029,0.029449,0.029877,0.029633,...,0.007935,0.009430,0.010651,0.012085,0.013550,0.014557,0.016296,0.017303,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,2745,0.006897,0.006836,0.006592,0.006348,0.006287,0.006317,0.006439,0.006317,0.005646,...,0.002502,0.002747,0.003113,0.003296,0.003418,0.003265,0.002716,0.001862,0,3
438,2746,0.002014,0.002502,0.002350,0.002167,0.002075,0.001587,0.001190,0.001160,0.001312,...,0.006409,0.007233,0.008575,0.009460,0.010498,0.011078,0.010223,0.009888,0,3
439,2747,0.009583,0.008636,0.009125,0.008545,0.008118,0.009705,0.010101,0.011261,0.012787,...,0.002869,0.002594,0.003265,0.003235,0.002350,0.001770,0.001831,0.002167,0,3
440,2749,-0.001556,-0.000519,-0.000061,0.000122,0.000458,0.000427,0.000336,0.000031,-0.000854,...,-0.000519,-0.010773,-0.018707,-0.015442,0.002258,0.011597,0.010956,0.014099,0,3


In [137]:
new_X_pd_f_split_test_d1[0] # from this one I extract 20% of 442 --> 89

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000,col5001
0,864,0.002869,0.002869,0.002869,0.002594,0.002808,0.002777,0.002502,0.002625,0.002319,...,0.000458,0.000854,0.001740,0.002228,0.001862,0.002380,0.002899,0.002777,1,1
1,865,0.003113,0.003510,0.003540,0.003113,0.003265,0.004059,0.004486,0.004395,0.004333,...,-0.002777,-0.002808,-0.002563,-0.002686,-0.002472,-0.002197,-0.002197,-0.001953,1,1
2,866,0.000763,0.000763,0.000916,0.000793,0.000732,0.000671,0.000763,0.000885,0.000519,...,-0.003265,-0.003418,-0.003326,-0.003235,-0.003204,-0.003479,-0.003296,-0.003296,1,1
3,867,-0.003387,-0.003143,-0.003326,-0.003418,-0.003387,-0.003174,-0.003204,-0.003113,-0.003174,...,0.001251,0.000641,0.000580,0.000946,0.000946,0.001740,0.000671,-0.000275,1,1
4,868,-0.002533,-0.002594,-0.002625,-0.002563,-0.002655,-0.002594,-0.002686,-0.002747,-0.002625,...,0.002258,0.002380,0.002106,0.002228,0.002197,0.002106,0.001953,0.002075,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,1308,-0.006805,0.002686,0.000885,-0.004395,-0.011627,-0.008850,-0.005737,-0.005890,-0.002228,...,0.000549,0.001556,0.000977,0.000366,0.000702,0.000732,0.000580,0.000854,1,1
438,1309,0.000885,0.000427,0.000946,0.000580,0.000610,0.000732,0.000336,0.000793,0.000793,...,0.043427,0.044464,0.044983,0.045227,0.044952,0.044678,0.044342,0.043762,1,1
439,1310,-0.004700,-0.004364,-0.003937,-0.003479,-0.002686,-0.002319,-0.002167,-0.001617,-0.001373,...,0.001617,0.001434,0.000214,-0.001556,-0.002563,-0.003601,-0.004730,-0.004150,1,1
440,1311,-0.004303,-0.004547,-0.004364,-0.003967,-0.003632,-0.003448,-0.002838,-0.002625,-0.001312,...,-0.005280,-0.005249,-0.005371,-0.005402,-0.005463,-0.005768,-0.005829,-0.005859,1,1


In [123]:
X_train_f_pd_concat_d1 = pd.concat(new_X_pd_f_split_train_d1, axis = 0)
X_train_f_pd_concat_d1 = X_train_f_pd_concat_d1.reset_index()

In [139]:
X_train_f_pd_concat_d1

Unnamed: 0,level_0,index,col0,col1,col2,col3,col4,col5,col6,col7,...,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000,col5001
0,0,2,-0.000061,0.000092,0.000031,0.000122,0.000031,0.000122,0.000214,0.000122,...,-0.001221,-0.001251,-0.001221,-0.001129,-0.001282,-0.001038,-0.001099,-0.001038,0,0
1,1,3,-0.000397,-0.000549,-0.000580,-0.000458,-0.000488,-0.000397,-0.000397,-0.000427,...,0.000671,0.000580,0.000793,0.000610,0.000641,0.000671,0.000580,0.000519,0,0
2,2,4,0.000427,0.000671,0.000732,0.000427,0.000580,0.000580,0.000549,0.000549,...,-0.054230,-0.052185,-0.051971,-0.048431,-0.047241,-0.047729,-0.045288,-0.042969,0,0
3,3,5,-0.043243,-0.041687,-0.038422,-0.037537,-0.034393,-0.031311,-0.030090,-0.027771,...,0.094238,0.090637,0.088562,0.085785,0.081940,0.076416,0.071228,0.067017,0,0
4,4,8,-0.041534,-0.043304,-0.044891,-0.045990,-0.047607,-0.048615,-0.049835,-0.050995,...,-0.041412,-0.041718,-0.041809,-0.041931,-0.041840,-0.041534,-0.040680,-0.040375,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1321,437,2745,0.006897,0.006836,0.006592,0.006348,0.006287,0.006317,0.006439,0.006317,...,0.002502,0.002747,0.003113,0.003296,0.003418,0.003265,0.002716,0.001862,0,3
1322,438,2746,0.002014,0.002502,0.002350,0.002167,0.002075,0.001587,0.001190,0.001160,...,0.006409,0.007233,0.008575,0.009460,0.010498,0.011078,0.010223,0.009888,0,3
1323,439,2747,0.009583,0.008636,0.009125,0.008545,0.008118,0.009705,0.010101,0.011261,...,0.002869,0.002594,0.003265,0.003235,0.002350,0.001770,0.001831,0.002167,0,3
1324,440,2749,-0.001556,-0.000519,-0.000061,0.000122,0.000458,0.000427,0.000336,0.000031,...,-0.000519,-0.010773,-0.018707,-0.015442,0.002258,0.011597,0.010956,0.014099,0,3


In [124]:
1326 /3

442.0

In [125]:
X_train_f_pd_concat_d1_0 = X_train_f_pd_concat_d1.drop(['level_0', 'index', 'col5000', 'col5001'], axis=1) 

In [143]:
# NOTE: To extract the trainiing set, even if the sets are of bigger dimensions, we still set the 80% equal to N_f
#       across the groups and ALSO the desings. TODISCUSS

per_train_pd_f = math.ceil(N_f * 0.8)
per_test_pd_f = math.ceil(N_f * 0.2)
print(per_train_pd_f, per_test_pd_f)


354 89


In [144]:
#Set see to reproduce the results
random.seed(1)

range_pd_f = range(0, X_train_f_pd_concat_d1_0.shape[0] )
range_pd_f_test = range(0, N_f)

rand_s_train_pd_f = random.sample(range_pd_f, k = per_train_pd_f)
rand_s_test_pd_f = random.sample(range_pd_f_test, k = per_test_pd_f)

print(random.random()) #0.1781548656443236


0.1781548656443236


In [145]:
len(X_train_f_pd_concat_d1_0) == len(range_pd_f)

True

In [146]:
ind_train_pd_f = sorted(rand_s_train_pd_f) 
ind_test_pd_f = sorted(rand_s_test_pd_f) 

In [147]:
X_train_f_pd_d1 = X_train_f_pd_concat_d1_0.reset_index().loc[ind_train_pd_f] 

In [148]:
X_train_f_pd_d1.shape

(354, 5001)

In [149]:
X_train_f_pd_d1 = X_train_f_pd_d1.drop(['index'], axis=1)

In [150]:
X_train_f_pd_d1.shape

(354, 5000)

In [151]:
X_train_f_pd_d1 = X_train_f_pd_d1.values

In [152]:
len(X_train_f_pd_d1)

354

In [153]:
X_test_pd_f_d1 = new_X_pd_f_split_test_d1[0].reset_index().loc[ind_test_pd_f]

In [154]:
X_test_pd_f_d1.shape

(89, 5004)

In [138]:
X_test_pd_f_d1 = X_test_pd_f_d1.drop(['level_0', 'index', 'col5000', 'col5001'], axis=1)

In [139]:
X_test_pd_f_d1.shape

(89, 5000)

In [140]:
X_test_pd_f_d1 = X_test_pd_f_d1.values 

In [141]:
len(X_test_pd_f_d1)

89

In [174]:
np.save('X_train_f_pd_d1', X_train_f_pd_d1)

In [142]:
np.save('X_test_pd_f_d1_new_pat', X_test_pd_f_d1)

# APPLY THE SAME PROCEDURE THE HEALTHY PATIENTS

In [20]:
##################################################
#APPLY THE SAME PROCEDURE TO THE HEALTHY DATASETS#
##################################################

#################
#    FEMALE     #
#################

df_seg_descr_hc_f = pd.DataFrame([ final_index_pt_f_hc_train] ).T
df_seg_descr_hc_f.columns = ['Patient_N']
df_seg_descr_hc_f



Unnamed: 0,Patient_N
0,0
1,0
2,0
3,0
4,0
...,...
15474,18
15475,18
15476,18
15477,18


In [21]:
hc_df_f_seg = df_seg_descr_hc_f.groupby(['Patient_N']).size()
hc_df_f_seg = pd.DataFrame(hc_df_f_seg)
hc_df_f_seg

Unnamed: 0_level_0,0
Patient_N,Unnamed: 1_level_1
0,895
1,908
2,578
3,854
4,641
5,695
6,777
7,682
8,884
9,791


In [22]:
###############
#    MALE     #
###############

df_seg_descr_hc_m = pd.DataFrame([ final_index_pt_m_hc_train] ).T
df_seg_descr_hc_m.columns = ['Patient_N']
df_seg_descr_hc_m

Unnamed: 0,Patient_N
0,0
1,0
2,0
3,0
4,0
...,...
1727,1
1728,1
1729,1
1730,1


In [23]:
hc_df_m_seg = df_seg_descr_hc_m.groupby(['Patient_N']).size()
hc_df_m_seg = pd.DataFrame(hc_df_m_seg)
hc_df_m_seg

Unnamed: 0_level_0,0
Patient_N,Unnamed: 1_level_1
0,784
1,948


### 3) Randomly select from each group (divided by patient number) 442/372 number of segments for the female/male case respectively 

--> NOTE THE MINIMUM NUMBER OF SEGMENTS FOR ONE OF THE PATIENTS IS 442 AND SO WE REDUCE TO THIS NUMBER ALSO THE SICK PATIENTS GROUPS

In [24]:
#################
#    FEMALE     #
#################

X_train_hc_f_df = pd.DataFrame(X_train_hc_f)
final_index_pt_f_hc_train_df = pd.DataFrame(final_index_pt_f_hc_train)

X_train_f_hc_ind_df = pd.concat([X_train_hc_f_df,
                                 final_index_pt_f_hc_train_df] , ignore_index= True, axis = 1)

col_list = ['col' + str(x) for x in range(0,X_train_f_hc_ind_df.shape[1])]
X_train_f_hc_ind_df.columns = col_list

#I split them with respect to the patients 
X_train_f_hc_split_0 = X_train_f_hc_ind_df.groupby('col5000')
X_train_f_hc_split = [X_train_f_hc_split_0.get_group(x) for x in X_train_f_hc_split_0.groups]


n_seg_hc_f_split = [len(X_train_f_hc_split[i]) for i in range(0, len(X_train_f_hc_split))]
n_seg_hc_f_split

[895,
 908,
 578,
 854,
 641,
 695,
 777,
 682,
 884,
 791,
 636,
 951,
 1222,
 1026,
 897,
 809,
 1071,
 442,
 720]

In [25]:
###############
#    MALE     #
###############

X_train_hc_m_df = pd.DataFrame(X_train_hc_m)
final_index_pt_m_hc_train_df = pd.DataFrame(final_index_pt_m_hc_train)

X_train_m_hc_ind_df = pd.concat([X_train_hc_m_df,
                                 final_index_pt_m_hc_train_df] , ignore_index= True, axis = 1)

col_list = ['col' + str(x) for x in range(0,X_train_m_hc_ind_df.shape[1])]
X_train_m_hc_ind_df.columns = col_list

#I split them with respect to the patients 
X_train_m_hc_split_0 = X_train_m_hc_ind_df.groupby('col5000')
X_train_m_hc_split = [X_train_m_hc_split_0.get_group(x) for x in X_train_m_hc_split_0.groups]


n_seg_hc_m_split = [len(X_train_m_hc_split[i]) for i in range(0, len(X_train_m_hc_split))]
n_seg_hc_m_split

[784, 948]

In [26]:
ind_hc_f = [range(0,n_seg_hc_f_split[i]) for i in range(0, len(n_seg_hc_f_split))]
ind_hc_m = [range(0,n_seg_hc_m_split[i]) for i in range(0, len(n_seg_hc_m_split))]

ind_hc_m

[range(0, 784), range(0, 948)]

In [30]:
#NOTE: I KEPT min_n and new_min_f --> TODISCUSS
random.seed(1)

rand_ind_hc_m = [random.sample( ind_hc_m[i],  k = N_m) for i in range(0, len(ind_hc_m))]
rand_ind_hc_f = [random.sample( ind_hc_f[i],  k = N_f) for i in range(0, len(ind_hc_f))]

print(random.random())#0.9970099255930638

0.9970099255930638


In [31]:
#check
[len(rand_ind_hc_m[i]) for i in range(0, len(rand_ind_hc_m))]

[372, 372]

In [32]:
#check
[len(rand_ind_hc_f[i]) for i in range(0, len(rand_ind_hc_f))]

[442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442]

In [33]:
len(rand_ind_hc_f)

19

In [34]:
rand_ind_hc_sorted_f = [sorted(rand_ind_hc_f[i]) for i in range(0,len(rand_ind_hc_f))]
rand_ind_hc_sorted_m = [sorted(rand_ind_hc_m[i]) for i in range(0,len(rand_ind_hc_m))]

In [35]:
X_hc_f_split = [X_train_f_hc_split[i].iloc[rand_ind_hc_sorted_f[i], :].reset_index() for i in range(0, len(X_train_f_hc_split))]
X_hc_m_split = [X_train_m_hc_split[i].iloc[rand_ind_hc_sorted_m[i], :].reset_index() for i in range(0, len(X_train_m_hc_split))]

In [36]:
#check
[len(X_hc_f_split[i]) for i in range(0, len(X_hc_f_split))]

[442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442,
 442]

In [37]:
#check
[len(X_hc_m_split[i]) for i in range(0, len(X_hc_m_split))]

[372, 372]

### 4) Randomly select one one speaker to leave out for the testing set


In [None]:
######################################################################
#NOW RANDOMLY SELECT ONE SPEAKER FOR THE LEAVE ONE OUT IN EACH HC SET#
######################################################################


In [38]:
hc_df_m = pd.DataFrame( np.array([[ 0 ], 
                        [ 1 ] ]),                      
         columns =  ["Patient_N"])

hc_df_m

Unnamed: 0,Patient_N
0,0
1,1


In [39]:
hc_df_f = pd.DataFrame( np.array([[ 0 ], 
                        [ 1 ], 
                        [ 2 ],
                        [ 3 ], 
                        [ 4 ],
                        [ 5 ],
                        [ 6 ], 
                        [ 7 ],
                        [ 8 ],
                        [ 9 ],
                        [ 10 ], 
                        [ 11 ],
                        [ 12 ],
                        [ 13 ],
                        [ 14 ],
                        [ 15 ],
                        [ 16 ],
                        [ 17 ],
                        [ 18 ]   ]),                      
         columns = [ "Patient_N"])

hc_df_f

Unnamed: 0,Patient_N
0,0
1,1
2,2
3,3
4,4
5,5
6,6
7,7
8,8
9,9


In [40]:
np.random.seed(0)

test_patients_hc_m = hc_df_m.sample(frac = 1.0).head(1)
test_patients_hc_m

Unnamed: 0,Patient_N
1,1


In [41]:
np.random.seed(0)

test_patients_hc_f = hc_df_f.sample(frac = 1.0).head(1)
test_patients_hc_f

Unnamed: 0,Patient_N
10,10


In [42]:
new_X_hc_f_split_train = list(X_hc_f_split)
new_X_hc_f_split_train = [i for j, i in enumerate(new_X_hc_f_split_train) if j not in np.array(test_patients_hc_f["Patient_N"])]

In [43]:
len(new_X_hc_f_split_train)

18

In [44]:
#chekc that the left patients are 0,1,2,3,4,5,6,7,8,9,11,12,13,14,15,16,17,18
[set(new_X_hc_f_split_train[i]['col5000']) for i in range(0,len(new_X_hc_f_split_train))]

[{0},
 {1},
 {2},
 {3},
 {4},
 {5},
 {6},
 {7},
 {8},
 {9},
 {11},
 {12},
 {13},
 {14},
 {15},
 {16},
 {17},
 {18}]

In [46]:
new_X_hc_m_split_train = list(X_hc_m_split)
new_X_hc_m_split_train = [i for j, i in enumerate(new_X_hc_m_split_train) if j not in np.array(test_patients_hc_m["Patient_N"])]

In [47]:
len(new_X_hc_m_split_train)

1

In [48]:
#chekc that the left patients is number 0
set(new_X_hc_m_split_train[0]['col5000'] )

{0}

In [49]:
new_X_hc_f_split_test = list(X_hc_f_split)
new_X_hc_f_split_test = [i for j, i in enumerate(new_X_hc_f_split_test) if j in np.array(test_patients_hc_f["Patient_N"])]

In [50]:
len(new_X_hc_f_split_test)

1

In [51]:
set(new_X_hc_f_split_test[0]['col5000']) #chekc that the left patient is number 10

{10}

In [52]:
new_X_hc_m_split_test = list(X_hc_m_split)
new_X_hc_m_split_test = [i for j, i in enumerate(new_X_hc_m_split_test) if j in np.array(test_patients_hc_m["Patient_N"])]

In [53]:
len(new_X_hc_m_split_test)

1

In [54]:
set(new_X_hc_m_split_test[0]['col5000'])#chekc that the left patient is 1

{1}

### 5) Extract 80% of the data for the traning set and 20% of the data for the testing

In [84]:
################
#     MALE     #
################

In [85]:
new_X_hc_m_split_train[0] # I extract 80% of 372 --> 298

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4991,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000
0,1,-0.000885,-0.001099,-0.001373,-0.000763,-0.000946,-0.001465,-0.001190,-0.000977,-0.001312,...,-0.004181,-0.003784,-0.004272,-0.004181,-0.004059,-0.004059,-0.003906,-0.004272,-0.004150,0
1,2,-0.004211,-0.003998,-0.004242,-0.004120,-0.003876,-0.004303,-0.004608,-0.004333,-0.004242,...,-0.003357,-0.003143,-0.003021,-0.003632,-0.003418,-0.003052,-0.003571,-0.003601,-0.003143,0
2,5,0.022278,0.023499,0.025421,0.027161,0.028564,0.029236,0.029846,0.031952,0.035065,...,0.002930,0.000427,-0.000671,0.002106,0.004608,0.005859,-0.002533,-0.004974,0.000214,0
3,6,-0.001007,-0.000305,0.003601,0.005798,0.000549,-0.007721,-0.002899,0.003174,0.001953,...,0.003876,0.007019,0.007965,0.005463,0.004303,0.003845,0.003479,0.003601,0.004242,0
4,7,0.005829,0.006287,0.006165,0.006195,0.003876,0.000427,0.000092,0.002747,0.006989,...,-0.001373,-0.001404,-0.001434,-0.001831,-0.002014,-0.001831,-0.001831,-0.001831,-0.001862,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,779,0.003998,0.004333,0.004822,0.005310,0.006073,0.006256,0.006104,0.005798,0.005249,...,0.003448,0.003296,0.003143,0.003204,0.003265,0.003265,0.003235,0.003693,0.004028,0
368,780,0.004608,0.005066,0.005432,0.005463,0.005371,0.005829,0.005981,0.005890,0.005371,...,0.001129,0.001221,0.000275,-0.000244,0.000031,0.000580,0.000763,0.000061,0.000977,0
369,781,0.001038,0.000458,0.001038,0.001343,0.001282,0.000275,0.000824,0.000519,0.001404,...,-0.000183,0.000427,0.000916,0.000977,0.000671,0.000519,0.000458,0.000671,0.000732,0
370,782,0.001038,0.000946,0.000977,0.000641,0.000793,0.000916,0.001038,0.001190,0.000305,...,-0.000092,-0.000061,-0.000031,-0.000031,-0.000031,0.000122,0.000153,0.000183,0.000061,0


In [86]:
new_X_hc_m_split_test[0] # from this one I extract 20% of 372 --> 75

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4991,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000
0,784,0.001068,0.001282,0.001068,0.001190,0.001160,0.001099,0.001068,0.001038,0.001282,...,0.000275,0.000122,0.000092,0.000397,0.000061,0.000397,0.000153,0.000183,0.000214,1
1,787,-0.005188,-0.005646,-0.005829,-0.006042,-0.006592,-0.006592,-0.006775,-0.007080,-0.007843,...,0.005249,0.005554,0.006012,0.005920,0.006653,0.006775,0.006683,0.007568,0.007721,1
2,788,0.007355,0.008057,0.008392,0.008179,0.008850,0.008972,0.009308,0.008972,0.008789,...,0.001404,0.000793,0.004486,0.007996,0.003662,0.002716,0.003632,0.002838,0.004791,1
3,791,0.061493,-0.017151,-0.068817,-0.036499,0.052460,0.087250,0.068726,-0.011841,-0.068787,...,-0.000183,0.000549,0.001648,-0.000092,-0.001526,-0.000519,-0.000427,0.000488,0.001160,1
4,794,0.053864,0.053925,0.056458,0.052460,0.051361,0.056854,0.057709,0.058716,0.061035,...,-0.012573,-0.012482,-0.012756,-0.011414,-0.010651,-0.008820,-0.008942,-0.009094,-0.008392,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
367,1721,-0.000549,-0.000458,-0.000427,-0.000183,0.000000,0.000244,0.000519,-0.000061,0.000275,...,0.001160,0.000671,0.001129,0.000793,0.001038,0.000793,0.000916,0.000977,0.000885,1
368,1723,0.001221,0.000397,0.000763,0.000977,0.000305,0.000793,0.000244,0.000519,0.000732,...,0.000153,-0.000916,-0.000793,-0.000305,-0.000702,-0.000641,-0.000458,-0.000397,-0.000641,1
369,1725,0.000916,0.001984,0.003326,0.004669,0.005707,0.006531,0.007080,0.007355,0.007385,...,-0.000458,-0.000061,-0.000031,0.000183,-0.000427,0.000061,0.000092,-0.000336,0.000977,1
370,1729,0.000244,-0.000519,0.000427,-0.000427,0.000580,-0.000977,0.000519,-0.000336,0.000183,...,0.001251,0.000549,0.001892,0.000580,0.001190,0.001129,0.000336,0.001465,0.000580,1


In [87]:
# NOTE: To extract the trainiing set, even if the sets are of bigger dimensions, we still set the 80% equal to N_m
#       across the groups and ALSO the desings. TODISCUSS

per_train_pd_m = math.ceil(N_m * 0.8)
per_test_pd_m = math.ceil(N_m * 0.2)
print(per_train_pd_m, per_test_pd_m)


298 75


In [88]:
#Set see to reproduce the results
random.seed(1)

range_pd_m = range(0, N_m )
range_pd_m_test = range(0, N_m)

rand_s_train_pd_m = random.sample(range_pd_m, k = per_train_pd_m)
rand_s_test_pd_m = random.sample(range_pd_m_test, k = per_test_pd_m)

print(random.random()) #0.25006282489890796


0.25006282489890796


In [91]:
ind_train_pd_m = sorted(rand_s_train_pd_m) 
ind_test_pd_m = sorted(rand_s_test_pd_m) 

In [92]:
X_train_m_hc = new_X_hc_m_split_train[0].reset_index().loc[ind_train_pd_m] 

In [93]:
X_train_m_hc.shape

(298, 5003)

In [94]:
X_train_m_hc = X_train_m_hc.drop(['index', 'level_0', 'col5000'], axis=1)

In [95]:
X_train_m_hc.shape

(298, 5000)

In [96]:
X_train_m_hc = X_train_m_hc.values

In [97]:
len(X_train_m_hc)

298

In [98]:
X_test_hc_m  = new_X_hc_m_split_test[0].reset_index().loc[ind_test_pd_m]

In [99]:
X_test_hc_m.shape

(75, 5003)

In [100]:
X_test_hc_m = X_test_hc_m.drop(['level_0', 'index', 'col5000'], axis=1)

In [101]:
X_test_hc_m.shape

(75, 5000)

In [102]:
X_test_hc_m = X_test_hc_m.values 

In [104]:
len(X_test_hc_m)

75

In [587]:
np.save('X_train_m_hc', X_train_m_hc)

In [586]:
np.save('X_test_hc_m', X_test_hc_m)

In [None]:
##################
#     FEMALE     #
##################

In [55]:
new_X_hc_f_split_train[17] # I merge all of these and then extract 80% of 442 --> 354

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4991,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000
0,14760,0.000732,0.000977,0.000671,0.000122,0.000702,0.000488,0.000671,0.000610,0.000153,...,0.002167,0.002472,0.001404,0.001434,0.002502,0.001617,0.001190,0.001862,0.001740,18
1,14763,-0.003693,-0.004059,-0.003754,-0.003448,-0.003571,-0.003448,-0.003204,-0.003540,-0.003052,...,-0.052094,-0.054291,-0.057892,-0.059601,-0.058411,-0.059448,-0.060791,-0.061340,-0.060822,18
2,14764,-0.060120,-0.059753,-0.061066,-0.061371,-0.059601,-0.056335,-0.056641,-0.058685,-0.060272,...,0.013306,0.013641,0.015442,0.004761,-0.020599,-0.004333,0.011261,0.003174,0.006866,18
3,14765,0.007233,0.008087,-0.003662,-0.001526,0.008759,0.015839,-0.003448,-0.010162,0.001526,...,-0.005432,-0.005341,-0.001923,0.000153,0.000977,0.000610,-0.000183,-0.000793,0.000275,18
4,14766,-0.000092,-0.001465,-0.000122,-0.000641,-0.000763,0.000275,-0.000031,0.000885,0.003204,...,0.007568,0.008545,0.008911,0.008514,0.008728,0.009338,0.009216,0.009491,0.009979,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,15468,-0.000702,-0.000763,-0.000671,-0.000488,-0.000580,-0.000488,-0.000305,-0.000458,-0.000366,...,-0.001068,-0.000885,-0.000977,-0.001007,-0.001099,-0.000854,-0.001129,-0.001007,-0.000946,18
438,15469,-0.001038,-0.000916,-0.001099,-0.000854,-0.000885,-0.000824,-0.000824,-0.000977,-0.000702,...,-0.008087,-0.007263,-0.006134,-0.005432,-0.005676,-0.005249,-0.004150,-0.003174,-0.002411,18
439,15473,-0.000244,-0.000183,0.000000,-0.000092,-0.000153,0.000061,-0.000275,0.000092,-0.000031,...,-0.000183,-0.000031,0.000092,0.000061,0.000061,0.000183,-0.000122,-0.000214,-0.000092,18
440,15474,0.000122,0.000000,-0.000183,-0.000488,-0.000580,-0.000763,-0.000854,-0.000549,-0.000366,...,0.000031,0.000092,0.000244,0.000153,0.000305,0.000427,0.000366,0.000214,0.000336,18


In [56]:
new_X_hc_f_split_test[0] # from this one I extract 20% of 442 --> 89

Unnamed: 0,index,col0,col1,col2,col3,col4,col5,col6,col7,col8,...,col4991,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000
0,7705,-0.000366,0.000061,0.000183,-0.000153,0.000061,-0.000092,-0.000092,0.000244,0.000183,...,-0.004364,-0.004425,-0.004211,-0.004303,-0.004364,-0.004364,-0.004303,-0.004211,-0.004242,10
1,7706,-0.004242,-0.004425,-0.004211,-0.004303,-0.004242,-0.004303,-0.004150,-0.004120,-0.004150,...,0.002991,0.001709,0.001556,0.001984,0.002716,0.001892,0.002167,0.002350,0.001312,10
2,7708,-0.001495,-0.001617,-0.001709,-0.001740,-0.002441,-0.001312,-0.001831,-0.001953,-0.002228,...,0.000610,0.001007,0.001312,0.001434,0.001801,0.001801,0.001617,0.001556,0.001251,10
3,7710,-0.001740,-0.001465,-0.001404,-0.001251,-0.001373,-0.001038,-0.000427,-0.000519,-0.000641,...,0.000092,0.000793,0.000183,0.000854,0.000610,0.000702,0.000824,0.000305,-0.000244,10
4,7712,-0.000305,-0.000153,-0.000061,-0.000336,-0.000397,-0.000244,-0.000305,-0.000519,-0.000061,...,0.000244,0.000336,0.000153,0.000244,0.000366,0.000122,0.000244,0.000305,0.000244,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,8334,0.000122,-0.000122,-0.000214,-0.000183,-0.000092,-0.000183,-0.000183,-0.000092,-0.000214,...,0.000488,0.000519,0.000549,0.000366,0.000336,0.000610,0.000244,0.000397,0.000427,10
438,8335,0.000214,0.000122,0.000397,0.000397,0.000580,0.000427,0.000214,0.000458,0.000397,...,-0.000580,-0.000092,0.000000,-0.000275,-0.000275,0.000153,-0.000214,-0.000549,-0.000824,10
439,8336,-0.000641,-0.000366,-0.000061,0.000183,-0.000153,-0.000305,-0.000183,0.000092,-0.000061,...,-0.000366,0.000641,0.000458,0.000488,-0.000397,-0.000275,0.000732,-0.000214,0.000183,10
440,8338,-0.001038,-0.001251,-0.001190,-0.001038,-0.001129,-0.001129,-0.000702,-0.001190,-0.001373,...,0.000488,0.000397,0.000397,0.000549,0.000305,0.000183,0.000458,0.000458,0.000458,10


In [57]:
X_train_f_hc_concat = pd.concat(new_X_hc_f_split_train, axis = 0)
X_train_f_hc_concat = X_train_f_hc_concat.reset_index()

X_train_f_hc_concat

Unnamed: 0,level_0,index,col0,col1,col2,col3,col4,col5,col6,col7,...,col4991,col4992,col4993,col4994,col4995,col4996,col4997,col4998,col4999,col5000
0,0,5,-0.000580,-0.000519,-0.000427,-0.000458,-0.000671,-0.000641,-0.000732,-0.000671,...,0.000519,0.000458,0.000275,0.000488,0.000458,0.000305,0.000397,0.000397,0.000183,0
1,1,8,0.027344,0.024506,0.021759,0.019226,0.015594,0.013641,0.010406,0.008728,...,0.000732,0.000702,0.000732,0.000763,0.001068,0.000610,0.000580,0.000488,0.000763,0
2,2,10,0.000946,0.001038,0.001221,0.001007,0.000916,0.000946,0.000946,0.000732,...,0.030457,0.033081,0.037323,0.038086,0.036957,0.039246,0.042419,0.042206,0.039001,0
3,3,11,-0.014557,-0.015533,-0.016449,-0.017365,-0.018768,-0.019989,-0.021240,-0.022278,...,-0.012695,-0.014465,-0.015625,-0.016571,-0.016815,-0.017181,-0.016449,-0.015808,-0.014954,0
4,4,14,0.000916,0.000854,0.001007,0.000916,0.000763,0.000763,0.000366,0.000183,...,-0.013916,-0.012115,-0.009949,-0.007599,-0.004730,-0.002869,-0.001190,0.000977,0.004181,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7951,437,15468,-0.000702,-0.000763,-0.000671,-0.000488,-0.000580,-0.000488,-0.000305,-0.000458,...,-0.001068,-0.000885,-0.000977,-0.001007,-0.001099,-0.000854,-0.001129,-0.001007,-0.000946,18
7952,438,15469,-0.001038,-0.000916,-0.001099,-0.000854,-0.000885,-0.000824,-0.000824,-0.000977,...,-0.008087,-0.007263,-0.006134,-0.005432,-0.005676,-0.005249,-0.004150,-0.003174,-0.002411,18
7953,439,15473,-0.000244,-0.000183,0.000000,-0.000092,-0.000153,0.000061,-0.000275,0.000092,...,-0.000183,-0.000031,0.000092,0.000061,0.000061,0.000183,-0.000122,-0.000214,-0.000092,18
7954,440,15474,0.000122,0.000000,-0.000183,-0.000488,-0.000580,-0.000763,-0.000854,-0.000549,...,0.000031,0.000092,0.000244,0.000153,0.000305,0.000427,0.000366,0.000214,0.000336,18


In [58]:
7956/442

18.0

In [59]:
X_train_f_hc_concat_0 = X_train_f_hc_concat.drop(['level_0', 'index', 'col5000'], axis=1) 

In [60]:
# NOTE: To extract the trainiing set, even if the sets are of bigger dimensions, we still set the 80% equal to N_m
#       across the groups and ALSO the desings. TODISCUSS

per_train_hc_f = math.ceil(N_f * 0.8)
per_test_hc_f = math.ceil(N_f * 0.2)
print(per_train_hc_f, per_test_hc_f)

354 89


In [61]:
#Set see to reproduce the results
random.seed(1)

range_hc_f = range(0, X_train_f_hc_concat_0.shape[0] )
range_hc_f_test = range(0, N_m)

rand_s_train_hc_f = random.sample(range_hc_f, k = per_train_hc_f)
rand_s_test_hc_f = random.sample(range_hc_f_test, k = per_test_hc_f)

print(random.random()) #0.4577692590412453

0.4577692590412453


In [62]:
len(X_train_f_hc_concat) == len(range_hc_f)

True

In [63]:
ind_train_hc_f = sorted(rand_s_train_hc_f) 
ind_test_hc_f = sorted(rand_s_test_hc_f) 

In [64]:
X_train_f_hc_concat_0 = X_train_f_hc_concat.drop(['level_0', 'index', 'col5000'], axis  = 1)

In [65]:
X_train_f_hc = X_train_f_hc_concat_0.reset_index().loc[ind_train_hc_f] 

In [66]:
X_train_f_hc = X_train_f_hc.drop(['index'], axis = 1)

In [67]:
X_train_f_hc.shape

(354, 5000)

In [68]:
X_train_f_hc = X_train_f_hc.values

In [69]:
len(X_train_f_hc)

354

In [70]:
X_test_hc_f = new_X_hc_f_split_test[0].reset_index().loc[ind_test_hc_f]

In [71]:
X_test_hc_f.shape

(89, 5003)

In [72]:
X_test_hc_f = X_test_hc_f.drop(['level_0', 'index', 'col5000'], axis=1)

In [73]:
X_test_hc_f.shape

(89, 5000)

In [74]:
X_test_hc_f = X_test_hc_f.values 

In [75]:
len(X_test_hc_f)

89

In [129]:
np.save('X_train_f_hc', X_train_f_hc)

In [76]:
np.save('X_test_hc_f', X_test_hc_f)