#### 2020.07.02

### Splitting the whole dataset in proportion 70%-15%-15%

Some examples of splitting data into train, validation, and test sets ([link](https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test)).

In [25]:
import os
from glob import glob
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

In [26]:
def split_stratified_into_train_val_test(df_input, stratify_colname='y',
                                         frac_train=0.6, frac_val=0.15, frac_test=0.25,
                                         random_state=None):
    '''
    Split a Pandas dataframe into stratified train, val, and test subsets.

    Each subset keeps the same relative frequency of the values found in
    `stratify_colname`. The split is done by calling train_test_split()
    twice: first train vs. the rest, then the rest into val vs. test.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (checked with a small floating-point tolerance).
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls.

    Returns
    -------
    df_train, df_val, df_test :
        Dataframes containing the three splits.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0, or if `stratify_colname` is
        not a column of `df_input`.

    link
    https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
    '''
    # Imported locally so the function does not depend on a module-level import.
    import math

    # Exact float equality rejects valid inputs such as 0.6 + 0.3 + 0.1
    # (which evaluates to 0.9999999999999999), so compare with a tolerance.
    if not math.isclose(frac_train + frac_val + frac_test, 1.0,
                        rel_tol=1e-9, abs_tol=1e-9):
        raise ValueError('fractions %f, %f, %f do not add up to 1.0' %
                         (frac_train, frac_val, frac_test))

    if stratify_colname not in df_input.columns:
        raise ValueError('%s is not a column in the dataframe' % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[[stratify_colname]]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, _, y_temp = train_test_split(X,
                                                    y,
                                                    stratify=y,
                                                    test_size=(1.0 - frac_train),
                                                    random_state=random_state)

    # Split the temp dataframe into val and test dataframes. The test share
    # is re-expressed relative to the temp set, not the original dataframe.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, _, _ = train_test_split(df_temp,
                                             y_temp,
                                             stratify=y_temp,
                                             test_size=relative_frac_test,
                                             random_state=random_state)

    # Sanity check: every input row ended up in exactly one subset.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)

    return df_train, df_val, df_test

In [31]:
# Load the processed IXI spreadsheet; its first column is used as the index.
xls_path = Path('xls/ixi_processed.xls')
df = pd.read_excel(xls_path, index_col=0)

In [33]:
# Preview the first five rows of the loaded dataframe.
df.head(n=5)

Unnamed: 0_level_0,brain_mask_path,subject_id,gender,age_at_scan,above_60_years
t1_biascorr_path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
/data-10tb/marek/IXI/train_data/FSL_outputs/IXI002-Guys-0828-T1.anat/T1_biascorr.nii.gz,/data-10tb/marek/IXI/train_data/FSL_outputs/IX...,IXI002,F,35.800137,False
/data-10tb/marek/IXI/train_data/FSL_outputs/IXI012-HH-1211-T1.anat/T1_biascorr.nii.gz,/data-10tb/marek/IXI/train_data/FSL_outputs/IX...,IXI012,M,38.781656,False
/data-10tb/marek/IXI/train_data/FSL_outputs/IXI013-HH-1212-T1.anat/T1_biascorr.nii.gz,/data-10tb/marek/IXI/train_data/FSL_outputs/IX...,IXI013,M,46.710472,False
/data-10tb/marek/IXI/train_data/FSL_outputs/IXI014-HH-1236-T1.anat/T1_biascorr.nii.gz,/data-10tb/marek/IXI/train_data/FSL_outputs/IX...,IXI014,F,34.236824,False
/data-10tb/marek/IXI/train_data/FSL_outputs/IXI015-HH-1258-T1.anat/T1_biascorr.nii.gz,/data-10tb/marek/IXI/train_data/FSL_outputs/IX...,IXI015,M,24.284736,False


### Splitting

In [38]:
# 70% / 15% / 15% split, stratified on the `above_60_years` flag.
# The fixed seed keeps the split reproducible.
df_train, df_val, df_test = split_stratified_into_train_val_test(
    df,
    stratify_colname='above_60_years',
    frac_train=0.7,
    frac_val=0.15,
    frac_test=0.15,
    random_state=42,
)

In [39]:
# print(df.shape)
# print(df_train.shape)
# print(df_val.shape)
# print(df_test.shape)

In [40]:
# Write each subset to its own sheet of one workbook.
# The dataframe was loaded with index_col=0, so the index holds the
# `t1_biascorr_path` values (see the df.head() output). The index is
# therefore written too; with index=False that column was silently dropped
# from every sheet.
with pd.ExcelWriter('xls/ixi_split.xls') as writer:
    df_train.to_excel(writer, sheet_name='ixi_train')
    df_val.to_excel(writer, sheet_name='ixi_val')
    df_test.to_excel(writer, sheet_name='ixi_test')