In [5]:
import pandas as pd
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.datasets import make_multilabel_classification
from skmultilearn.model_selection import IterativeStratification
import random
import csv

Load and Manipulate

In [6]:
# Two header-less ACC pattern tables.
acc0 = pd.read_csv(
    "/neurospin/dico/data/deep_folding/current/datasets/ACCpatterns/ACCpatterns_0_without_headers.csv",
    header=None,
)
acc1 = pd.read_csv(
    "/neurospin/dico/data/deep_folding/current/datasets/ACCpatterns/ACCpatterns_1_without_headers.csv",
    header=None,
)
# Subject labels: discard rows that have no `long_name`, then keep only the
# identifier and the columns used for stratification.
acc = (
    pd.read_csv("/neurospin/dico/data/deep_folding/current/datasets/ACCpatterns/subjects_labels.csv")
    .dropna(subset=["long_name"])
    .loc[:, ["long_name", "DATABASE", "SEX", "Right_PCS", "Left_PCS"]]
)

In [21]:
# `long_name` becomes `Subject`; the remaining columns keep their names.
acc = acc.rename(columns={"long_name": "Subject"})
acc

Unnamed: 0,Subject,DATABASE,SEX,Right_PCS,Left_PCS
0,nih_chp_04701_t1,NIMH_COS,M,absent,present
1,nih_chp_01534_t1,NIMH_COS,M,present,present
2,nih_chp_04623_t1,NIMH_COS,M,present,present
3,nih_chp_01503_t1,NIMH_COS,M,absent,absent
4,nih_chp_00404_t1,NIMH_COS,M,present,present
...,...,...,...,...,...
376,a0005_t1_s03,TISSIER_2018,M,present,absent
377,a0006_t1_s03,TISSIER_2018,M,absent,present
378,a0001_t1_s03,TISSIER_2018,M,present,present
379,a0016_t1_s03,TISSIER_2018,F,present,absent


In [22]:
def print_results(parent, folds, col, verbose=True):
    """Print a stratification report comparing `folds` against `parent`.

    For every combination of the unique values found in the `col` columns of
    `parent`, the matching rows are counted in `parent` and in each fold. A
    fold whose count differs from the ideal share (count in `parent` divided
    by the number of folds) by 2 or more is counted as a stratification
    error. Fold sizes are checked the same way against
    `len(parent) / len(folds)`.

    Missing values (NaN/None) are matched as a value of their own, so rows
    with a missing label are counted instead of always being reported as 0.

    Args:
        parent: DataFrame the folds were drawn from.
        folds: Sequence of DataFrames (one per fold) with the `col` columns.
        col: Names of the stratification columns; any number is accepted.
        verbose: When True, also print one line per label combination.

    Returns:
        None. The report is written to standard output.
    """
    from itertools import product

    def _count_rows(frame, values):
        # Number of rows of `frame` whose `col` columns equal `values`.
        selected = None
        for name, value in zip(col, values):
            column = frame[name]
            # `NaN == NaN` is False, so a missing value needs an explicit test.
            hit = column.isna() if pd.isna(value) else column == value
            # Plain boolean array: avoids index alignment and treats an
            # undecidable comparison (nullable dtypes) as "no match".
            hit = hit.to_numpy(dtype=bool, na_value=False)
            selected = hit if selected is None else selected & hit
        return len(frame) if selected is None else int(selected.sum())

    # For each combination of labels, prints the number of rows for each fold
    # having this combination
    total_errors = 0
    n_splits = len(folds)
    if verbose:
        print("query   : #rows      : #rows per fold\n")

    # Same iteration order as nested loops over the columns, first column
    # outermost.
    unique_values = [parent[name].unique() for name in col]
    for values in product(*unique_values):
        len_query = _count_rows(parent, values)
        if verbose:
            label = ", ".join(f"{value}" for value in values)
            print(f"{label} : total = {len_query} : per fold =", end=' ')
        for fold in folds:
            len_query_fold = _count_rows(fold, values)
            # Tolerate a deviation of less than 2 rows from the ideal share.
            if abs(len_query_fold - len_query / n_splits) >= 2:
                total_errors += 1
            if verbose:
                print(f"{len_query_fold} -", end=' ')
        if verbose:
            print("")

    # Prints the statistics and the number of stratification errors
    expected_total_length = len(parent)
    total_length = 0
    total_mismatches = 0
    print("\nlengths of folds : ", end=' ')
    for fold in folds:
        len_fold = len(fold)
        print(len_fold, end=' ')
        total_length += len_fold
        if abs(len_fold - expected_total_length / n_splits) >= 2:
            total_mismatches += 1
    print(f"\nExpected total_length = {expected_total_length}")
    print(f"Effective total_length = {total_length}")

    print(f"total number of stratification errors: {total_errors}")
    print(f"total number of mismatched fold sizes : {total_mismatches}")

In [23]:
def iterative_split_through_sorting_shuffle(df, n_splits, stratify_columns, random_state):
    """Split `df` into `n_splits` folds balanced over `stratify_columns`.

    The rows are shuffled, then sorted by `stratify_columns` so that rows
    sharing the same labels become contiguous, and finally dealt out
    round-robin: fold `i` receives every `n_splits`-th row starting at
    position `i`. The rows inside each fold and the order of the folds are
    shuffled afterwards.

    Args:
        df: DataFrame to split.
        n_splits: Number of folds; must be at least 1.
        stratify_columns: Column name(s) handed to `DataFrame.sort_values`.
        random_state: Seed passed to `DataFrame.sample` and `random.Random`.

    Returns:
        A list of `n_splits` DataFrames that together contain every row of
        `df` exactly once.

    Raises:
        ValueError: If `n_splits` is smaller than 1 (the split would otherwise
            silently return no fold at all).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    # Random row shuffle + sorting according to stratify_columns.
    # (Named `ordered` so the builtin `sorted` is not shadowed.)
    ordered = df.sample(frac=1, random_state=random_state).sort_values(stratify_columns)
    # For each fold, take one row every n_splits rows.
    folds = [ordered.iloc[i::n_splits, :] for i in range(n_splits)]
    # Further shuffling: rows within each fold, then the folds themselves.
    folds = [fold.sample(frac=1, random_state=random_state) for fold in folds]
    random.Random(random_state).shuffle(folds)
    return folds

# Step 1 : split into 5 folds, keep the last one as the test set

In [33]:
# Selects the `{side}_PCS` label column used for stratification and the
# sub-directory the splits are written to. The loaded labels provide
# `Left_PCS` and `Right_PCS`.
side = 'Left'

In [34]:
# Output directory for the split files of the selected side
# (nothing is written in this cell; the CSV exports below use this path).
save_path = f"/neurospin/dico/data/deep_folding/current/datasets/ACCpatterns/splits/{side}"

In [35]:
# 5 folds over the full label table, stratified on database, sex and PCS label.
results = iterative_split_through_sorting_shuffle(
    df=acc,
    n_splits=5,
    stratify_columns=["DATABASE", "SEX", f"{side}_PCS"],
    random_state=1,
)

In [36]:
# Check how evenly each label combination is spread over the 5 folds.
print_results(
    parent=acc,
    folds=results,
    col=["DATABASE", "SEX", f"{side}_PCS"],
    verbose=True,
)

query   : #rows      : #rows per fold

NIMH_COS, M, present : total = 34 : per fold = 6 - 7 - 7 - 7 - 7 - 
NIMH_COS, M, absent : total = 13 : per fold = 3 - 2 - 2 - 3 - 3 - 
NIMH_COS, F, present : total = 15 : per fold = 3 - 3 - 3 - 3 - 3 - 
NIMH_COS, F, absent : total = 10 : per fold = 2 - 2 - 2 - 2 - 2 - 
NIMH_COS, nan, present : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_COS, nan, absent : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_COSSIB, M, present : total = 20 : per fold = 4 - 4 - 4 - 4 - 4 - 
NIMH_COSSIB, M, absent : total = 5 : per fold = 1 - 1 - 1 - 1 - 1 - 
NIMH_COSSIB, F, present : total = 11 : per fold = 2 - 2 - 2 - 3 - 2 - 
NIMH_COSSIB, F, absent : total = 13 : per fold = 3 - 3 - 3 - 2 - 2 - 
NIMH_COSSIB, nan, present : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_COSSIB, nan, absent : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_NV, M, present : total = 37 : per fold = 7 - 7 - 8 - 8 - 7 - 
NIMH_NV, M, absent : total = 12 : per fold = 3 - 3 - 2 - 2 - 2 

In [37]:
# The last fold is held out as the test set: write its subject names,
# one quoted name per line, without header or index column.
results[-1]["Subject"].to_csv(
    f"{save_path}/test_split.csv",
    index=False,
    header=False,
    quoting=csv.QUOTE_ALL,
)

# Step 2 : split the remaining 80% into 5 folds for cross-validation and train evaluation

In [38]:
# Everything except the held-out test fold (rows are removed by index label).
acc_train_val = acc.drop(index=results[-1].index)

In [39]:
# 5 folds over the train/validation rows, same stratification and seed as before.
results = iterative_split_through_sorting_shuffle(
    df=acc_train_val,
    n_splits=5,
    stratify_columns=["DATABASE", "SEX", f"{side}_PCS"],
    random_state=1,
)

In [40]:
# Check how evenly each label combination is spread over the train/val folds.
print_results(
    parent=acc_train_val,
    folds=results,
    col=["DATABASE", "SEX", f"{side}_PCS"],
    verbose=True,
)

query   : #rows      : #rows per fold

NIMH_COS, M, present : total = 27 : per fold = 6 - 6 - 5 - 5 - 5 - 
NIMH_COS, M, absent : total = 10 : per fold = 2 - 2 - 2 - 2 - 2 - 
NIMH_COS, F, present : total = 12 : per fold = 2 - 2 - 2 - 3 - 3 - 
NIMH_COS, F, absent : total = 8 : per fold = 2 - 2 - 2 - 1 - 1 - 
NIMH_COS, nan, present : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_COS, nan, absent : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_COSSIB, M, present : total = 16 : per fold = 3 - 4 - 3 - 3 - 3 - 
NIMH_COSSIB, M, absent : total = 4 : per fold = 1 - 0 - 1 - 1 - 1 - 
NIMH_COSSIB, F, present : total = 9 : per fold = 2 - 2 - 1 - 2 - 2 - 
NIMH_COSSIB, F, absent : total = 11 : per fold = 2 - 2 - 3 - 2 - 2 - 
NIMH_COSSIB, nan, present : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_COSSIB, nan, absent : total = 0 : per fold = 0 - 0 - 0 - 0 - 0 - 
NIMH_NV, M, present : total = 30 : per fold = 6 - 6 - 6 - 6 - 6 - 
NIMH_NV, M, absent : total = 10 : per fold = 2 - 2 - 2 - 2 - 2 - 

In [41]:
# Write the subject names of each train/val fold to its own CSV file
# (one quoted name per line, no header, no index column).
for i, fold in enumerate(results):
    fold["Subject"].to_csv(
        f"{save_path}/train_val_split_{i}.csv",
        index=False,
        header=False,
        quoting=csv.QUOTE_ALL,
    )