In [11]:
import pandas as pd
import numpy as np
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.datasets import make_multilabel_classification
from skmultilearn.model_selection import IterativeStratification
from sklearn.model_selection import StratifiedGroupKFold
import random
import csv
import glob
import sklearn
import os

In [12]:
from collections import defaultdict

from sklearn.utils import (
    _safe_indexing,
    check_random_state,
    indexable,
    metadata_routing,
)

from sklearn.utils.validation import _num_samples, check_array, column_or_1d
from sklearn.utils.multiclass import type_of_target

In [15]:
# FIP
fip = pd.read_excel('/neurospin/dico/data/bv_databases/human/partially_labeled/FIP_patterns/IPS_labels_390.xlsx')
fip = fip.dropna()
fip.columns = ['Subject', 'Sex', 'Left', 'Right']
fip = fip.set_index('Subject')

In [16]:
def print_frequencies(df):
    for right in df['Right'].unique():
        for left in df['Left'].unique():
            for sex in df['Sex'].unique():
                freq = df.query("Right==@right and Left==@left and Sex==@sex")
                print(f"{right}, {left}, {sex}: {len(freq)}")

In [17]:
print_frequencies(fip)

1, 0, M: 44
1, 0, F: 74
1, 1, M: 96
1, 1, F: 56
0, 0, M: 21
0, 0, F: 66
0, 1, M: 12
0, 1, F: 21


In [18]:
participants_file = "/neurospin/dico/data/bv_databases/human/not_labeled/hcp/participants.csv"
participants_unrestricted = pd.read_csv(participants_file)
participants_unrestricted = participants_unrestricted[['Subject', "Gender"]]
print(len(participants_unrestricted))
participants_unrestricted.head()

1206


Unnamed: 0,Subject,Gender
0,100004,M
1,100206,M
2,100307,F
3,100408,M
4,100610,M


In [19]:
participants_file = "/neurospin/dico/jchavas/RESTRICTED_jchavas_1_18_2022_3_17_51.csv"
participants = pd.read_csv(participants_file)
participants = participants[['Subject', 'ZygosityGT', 'Family_ID']]
print(len(participants))
participants.head()

1206


Unnamed: 0,Subject,ZygosityGT,Family_ID
0,100004,,52259_82122
1,100206,,56037_85858
2,100307,MZ,51488_81352
3,100408,MZ,51730_81594
4,100610,DZ,52813_82634


In [20]:
participants = pd.merge(participants_unrestricted, participants)
participants.loc[(participants['ZygosityGT']== " "), 'ZygosityGT'] = 'NotTwin'  
participants['Subject'] = participants['Subject'].astype('string')
print(len(participants))
participants.head()

1206


Unnamed: 0,Subject,Gender,ZygosityGT,Family_ID
0,100004,M,NotTwin,52259_82122
1,100206,M,NotTwin,56037_85858
2,100307,F,MZ,51488_81352
3,100408,M,MZ,51730_81594
4,100610,M,DZ,52813_82634


In [21]:
treated_subjects = glob.glob("/neurospin/dico/data/bv_databases/human/not_labeled/hcp/hcp/*[!.minf]")
treated_subjects = [x.split('/')[-1] for x in treated_subjects]
treated_subjects = [x for x in treated_subjects if 'database' not in x]
print(treated_subjects[:5])
len(treated_subjects)
participants = participants[participants['Subject'].isin(treated_subjects)]
print(len(participants))
set(treated_subjects) - set(participants['Subject'])

['210112', '579665', '922854', '517239', '329440']
1113


{'142626'}

In [22]:
print(len(fip))
print(fip.dtypes)
fip.index = fip.index.astype(str)

390
Sex      object
Left      int64
Right     int64
dtype: object


In [23]:
set(fip.index) - set(treated_subjects)

set()

In [24]:
fip.index.isin(set(treated_subjects) - set(participants['Subject'])).sum()

0

In [25]:
fip = pd.merge(participants, fip, left_on='Subject', right_index=True).set_index('Subject')

In [26]:
fip

Unnamed: 0_level_0,Gender,ZygosityGT,Family_ID,Sex,Left,Right
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100206,M,NotTwin,56037_85858,M,0,1
100307,F,MZ,51488_81352,F,1,1
100610,M,DZ,52813_82634,M,0,1
101309,M,NotTwin,52385_82248,M,1,1
101410,M,NotTwin,52198_82061,M,0,0
...,...,...,...,...,...,...
983773,M,NotTwin,52801_82622,M,1,1
984472,F,DZ,51455_81320,F,1,1
991267,M,NotTwin,51639_81503,M,1,1
993675,F,NotTwin,55800_85621,F,1,0


In [41]:
def print_results(parent, folds, col, verbose=True):

    # For each conbination of labels, prints the number of rows for each fold
    # having this combination
    total_errors = 0
    n_splits = len(folds)
    if verbose:
        print("query   : #rows      : #rows per fold\n")

    for col0 in parent[col[0]].unique():
        for col1 in parent[col[1]].unique():
            #for col2 in parent[col[2]].unique():
            #df = parent.query(f"{col[0]}==@col0 and {col[1]}==@col1 and {col[2]}==@col2")
            df = parent.query(f"{col[0]}==@col0 and {col[1]}==@col1")
            len_query = len(df)
            if verbose:
                #print(f"{col0}, {col1}, {col2} : total = {len_query} : per fold =", end = ' ')
                print(f"{col0}, {col1} : total = {len_query} : per fold =", end = ' ')
            for fold in folds:
                #df0 = fold.query(f"{col[0]}==@col0 and {col[1]}==@col1 and {col[2]}==@col2")
                df0 = fold.query(f"{col[0]}==@col0 and {col[1]}==@col1")
                len_query_fold = len(df0)
                if abs(len_query_fold-len_query/n_splits) >= 2:
                    total_errors += 1
                if verbose:
                    print(f"{len_query_fold} -", end= ' ')
            if verbose:
                print("")

    # Prints the statistics and the number of stratification errors
    expected_total_length = len(parent)
    total_length = 0
    total_mismatches = 0
    print("\nlengths of folds : ", end = ' ')
    for fold in folds:
        len_fold = len(fold)
        print(len_fold, end=' ')
        total_length += len_fold
        if abs(len_fold-expected_total_length/n_splits) >= 2:
            total_mismatches += 1
    print(f"\nExpected total_length = {expected_total_length}")
    print(f"Effective total_length = {total_length}")

    print(f"total number of stratification errors: {total_errors}")
    print(f"total number of mismatched fold sizes : {total_mismatches}")

In [31]:
def how_many_common_families(total, folds):
    nb_common = 0
    for i, source in enumerate(folds):
        for j, target in enumerate(folds):
            if j > i:
                nb_common += len(set(target.Family_ID).intersection(set(source.Family_ID)))
    print(f"number of common families = {nb_common} over {len(set(total.Family_ID))} total families")

# Step 1 : split in 5 folds, keep last for test

In [34]:
side = 'Right'

In [36]:
# Save Results
save_path = f"/neurospin/dico/data/deep_folding/current/datasets/hcp/FIP/{side}"

In [46]:
n_splits = 5
df = fip[[side, 'Gender', 'Family_ID']]

# group based on family
groups = df['Family_ID'].to_numpy()

# labels are combinations of Sex and actual label
labels = df[[side, 'Gender']]
labels['label'] = labels[labels.columns[:]].apply(
    lambda x: ''.join(x.dropna().astype(str)),
    axis=1
)
labels = labels['label'].to_numpy()

In [47]:
stratifier = StratifiedGroupKFold(
    n_splits=n_splits, shuffle=False)
results = []
for indices in stratifier.split(df.to_numpy(), labels, groups):
    results.append(df.iloc[indices[1]])

In [48]:
print_results(fip, results, [side, 'Gender'], True)
how_many_common_families(fip, results)

query   : #rows      : #rows per fold

1, M : total = 140 : per fold = 28 - 28 - 28 - 28 - 28 - 
1, F : total = 130 : per fold = 26 - 26 - 26 - 26 - 26 - 
0, M : total = 33 : per fold = 7 - 7 - 7 - 6 - 6 - 
0, F : total = 87 : per fold = 17 - 17 - 18 - 18 - 17 - 

lengths of folds :  78 78 79 78 77 
Expected total_length = 390
Effective total_length = 390
total number of stratification errors: 0
total number of mismatched fold sizes : 0
number of common families = 0 over 258 total families


In [49]:
results[-1].reset_index()['Subject'].to_csv(
    f"{save_path}/test_split.csv",
    header=False,
    index=False,
    quoting=csv.QUOTE_ALL)

# Step 2 : split remaining 80% in 5 for cross val and train eval

In [50]:
fip_train_val = fip.drop(index=results[-1].index.to_list())

In [51]:
fip_train_val

Unnamed: 0_level_0,Gender,ZygosityGT,Family_ID,Sex,Left,Right
Subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100206,M,NotTwin,56037_85858,M,0,1
100307,F,MZ,51488_81352,F,1,1
100610,M,DZ,52813_82634,M,0,1
101410,M,NotTwin,52198_82061,M,0,0
102311,F,MZ,51679_81543,F,0,1
...,...,...,...,...,...,...
971160,M,MZ,52796_82617,M,0,1
984472,F,DZ,51455_81320,F,1,1
991267,M,NotTwin,51639_81503,M,1,1
993675,F,NotTwin,55800_85621,F,1,0


In [52]:
# repeat the process
n_splits = 5
df = fip_train_val[[side, 'Gender', 'Family_ID']]

# group based on family
groups = df['Family_ID'].to_numpy()

# labels are combinations of Sex and actual label
labels = df[[side, 'Gender']]
labels['label'] = labels[labels.columns[:]].apply(
    lambda x: ''.join(x.dropna().astype(str)),
    axis=1
)
labels = labels['label'].to_numpy()

In [53]:
stratifier = StratifiedGroupKFold(
    n_splits=n_splits, shuffle=False)
results = []
for indices in stratifier.split(df.to_numpy(), labels, groups):
    results.append(df.iloc[indices[1]])

In [55]:
print_results(fip_train_val, results, [side, 'Gender'], True)
how_many_common_families(fip_train_val, results)

query   : #rows      : #rows per fold

1, M : total = 112 : per fold = 22 - 23 - 23 - 22 - 22 - 
1, F : total = 104 : per fold = 21 - 20 - 21 - 21 - 21 - 
0, M : total = 27 : per fold = 5 - 6 - 5 - 6 - 5 - 
0, F : total = 70 : per fold = 14 - 14 - 14 - 14 - 14 - 

lengths of folds :  62 63 63 63 62 
Expected total_length = 313
Effective total_length = 313
total number of stratification errors: 0
total number of mismatched fold sizes : 0
number of common families = 0 over 207 total families


In [56]:
# save folds
for i in range(len(results)):
    results[i].reset_index()['Subject'].to_csv(
        f"{save_path}/train_val_split_{i}.csv",
        header=False,
        index=False,
        quoting=csv.QUOTE_ALL)