Test notebook to transition from pickle datasets to numpy.

In [1]:
import os
import pandas as pd
import numpy as np


from contrastive.utils.config import process_config
from contrastive.data.datasets import create_sets_pure_contrastive
from contrastive.data.datamodule import DataModule_PureContrastive

### Configs

In [38]:
pickle_all = "/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rskeleton.pkl"

numpy_all = "/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rskeleton.npy"
subjects_all = "/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rskeleton_subject.csv"
train_val_csv_file = "/neurospin/dico/data/deep_folding/papers/midl2022/HCP_half_1bis.csv"
nb_subjects = -1

## Import the sets

In [3]:
# import pickle data (done only once as it is very long)

pickle_data = pd.read_pickle(pickle_all)
pickle_data.iloc[0,:10]

/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rcrops/173536    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rcrops/683256    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rcrops/161832    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rcrops/395958    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rcrops/289555    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rcrops/208630    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
/neurospin/dico/data/deep_folding/current/datasets/hcp/crops/2mm/CINGULATE/mask/Rcrops/757764    [[[[0], [0], [0], [0]

In [4]:
# rename columns

new_names = []

for name in pickle_data.columns:
    new_names.append(name[-6:])

pickle_data.columns = new_names

pickle_data.columns

Index(['173536', '683256', '161832', '395958', '289555', '208630', '757764',
       '492754', '125222', '570243',
       ...
       '179245', '158843', '194847', '150423', '885975', '700634', '213017',
       '138332', '146836', '123925'],
      dtype='object', length=1114)

### Check if the two initial datasets are the same

In [44]:
def check_same_initial_data(numpy_all, subjects_all, pickle_data, security=100):
    # import the data
    normal_data = np.load(numpy_all, mmap_mode='r')
    normal_subjects = pd.read_csv(subjects_all)

    # check the dimensions
    dims_np = normal_data.shape
    if dims_np[0] != pickle_data.shape[1]:
        raise ValueError("Not the same sample size")
    subj = pickle_data.iloc[0,0]
    if subj.shape != dims_np[1:]:
        raise ValueError("Not the same data dimensions")
    
    # check if same subjects
    common_subjects = normal_subjects[normal_subjects.Subject.isin(list(pickle_data))]
    if len(common_subjects) != len(normal_subjects):
        raise ValueError(f"Difference in the chosen subjects: \
            {len(normal_subjects) - len(common_subjects)} subjects are not the same.")
    
    # check if they are in the same order
    diff = pickle_data.columns.astype(int) != normal_subjects.Subject
    if diff.sum() > 0:
        raise ValueError(f"Subjects are not in the same order (found {diff.sum()}\
    differences).")
    
    # content check
    # only check a random element for time purposes
    for i in range(security):
        rand_idx = np.random.randint(0, len(pickle_data.columns))
        if (pickle_data.iloc[0, rand_idx] != normal_data[rand_idx]).any():
            raise ValueError(f"Content of the two datasets don't match (issue for subject\
        {pickle_data.columns[rand_idx]}).")
    
    return normal_data, normal_subjects

In [45]:
normal_data, normal_subjects = check_same_initial_data(numpy_all, subjects_all, pickle_data)

## Create test and train sets

### Create test and train sets (numpy)

Creates the train and test sets just like in the function create_sets_pure_contrastive

In [54]:
# Gets train_val subjects from csv file
train_val_subjects = pd.read_csv(train_val_csv_file, names=['ID'])
print(f"train_val_subjects = {train_val_subjects}")

# Determines test dataframe
test_subjects = normal_subjects[~normal_subjects.Subject.isin(
    train_val_subjects.ID)].index
len_test = len(test_subjects)
print(f"length of test = {len_test}")
print(f"test_subjects = {test_subjects[:5]}")

# /!\ copy the data to construct test_data
test_data = normal_data[test_subjects]
print(f'test set size: {test_data.shape}')

# Cuts train_val set to requested number
if nb_subjects == -1:
    len_train_val = len(train_val_subjects)
else:
    len_train_val = min(nb_subjects,
                        len(train_val_subjects))
    train_val_subjects = train_val_subjects[:len_train_val]

print(f"length of train/val dataframe: {len_train_val}")

 # Determines train/val dataframe
train_val_subjects_index = normal_subjects[normal_subjects.Subject.isin(
                            train_val_subjects.ID)].index
# /!\ copy the data to construct train_val_data
train_val_data = normal_data[train_val_subjects_index]

train_val_subjects =          ID
0    129634
1    136833
2    206222
3    138837
4    987983
..      ...
545  618952
546  433839
547  188448
548  166640
549  541943

[550 rows x 1 columns]
length of test = 564
test_subjects = Int64Index([4, 6, 7, 9, 10], dtype='int64')
test set size: (564, 17, 44, 42, 1)
length of train/val dataframe: 550


### Create test and train sets (pickle)

In [51]:
pickle_normal_subjects = pickle_data.columns.tolist()

# Gets train_val subjects from csv file
train_val_subjects = pd.read_csv(train_val_csv_file, names=['ID']).T
train_val_subjects = train_val_subjects.values[0].tolist()
train_val_subjects = list(map(str, train_val_subjects))
print(f"nb train_val_subjects = {len(train_val_subjects)}")

# Determines test dataframe
pickle_test_subjects = list(set(pickle_normal_subjects).difference(train_val_subjects))
len_test = len(pickle_test_subjects)
print(f"pickle_test_subjects = {len_test}")


pickle_test_data = pickle_data[pickle_data.columns.intersection(pickle_test_subjects)]

# Cuts train_val set to requested number
if nb_subjects == -1:
    len_train_val = len(train_val_subjects)
else:
    len_train_val = min(nb_subjects,
                        len(train_val_subjects))
    train_val_subjects = train_val_subjects[:len_train_val]

print(f"length of train/val dataframe: {len_train_val}")

pickle_train_val_data = pickle_data[pickle_data.columns.intersection(
            train_val_subjects)]

nb train_val_subjects = 550
pickle_test_subjects = 564
length of train/val dataframe: 550


In [53]:
pickle_test_data.iloc[0,:10]

289555    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
757764    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
492754    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
570243    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
581349    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
729557    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
742549    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
536647    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
368753    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
386250    [[[[0], [0], [0], [0], [0], [0], [0], [0], [0]...
Name: 0, dtype: object

## Check if they are the same

### test_subjects

In [60]:
print(test_subjects)
print(pickle_test_subjects)

Int64Index([   4,    6,    7,    9,   10,   11,   13,   16,   19,   20,
            ...
            1090, 1092, 1094, 1098, 1099, 1102, 1105, 1108, 1110, 1111],
           dtype='int64', length=564)
['139435', '506234', '473952', '843151', '617748', '180836', '852455', '569965', '955465', '120717', '175035', '112112', '127226', '585862', '902242', '103818', '148133', '210112', '561444', '485757', '120212', '385450', '885975', '516742', '760551', '295146', '126426', '742549', '481042', '134223', '283543', '130821', '142828', '623137', '117324', '735148', '165032', '300719', '308331', '689470', '180230', '248238', '859671', '125525', '672756', '567961', '166438', '559053', '368753', '601127', '108828', '397760', '788674', '200109', '158540', '334635', '147030', '304020', '453542', '898176', '109325', '727553', '156637', '497865', '149842', '205119', '125424', '214019', '211619', '581349', '579665', '191942', '615744', '996782', '685058', '957974', '169444', '172938', '401422', '570243', 

In [63]:
pickle_data.columns[test_subjects]

Index(['289555', '757764', '492754', '570243', '581349', '729557', '742549',
       '536647', '368753', '386250',
       ...
       '181636', '104012', '150726', '176037', '310621', '144428', '158843',
       '885975', '213017', '138332'],
      dtype='object', length=564)

In [66]:
print(pickle_data.columns[test_subjects].to_list() == pickle_test_subjects)
print(set(pickle_data.columns[test_subjects].to_list()) == set(pickle_test_subjects))

False
True


Same elements, not inth same order. -> No problem normally.

### test_data

In [68]:
print(len(test_data))
print(len(pickle_test_data.columns))

564
564


In [70]:
# exhaustive check

for i in range(len(test_data)):
    print(i, (test_data[i] == pickle_test_data.iloc[0,i]).all())

# only true => ok

0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 True
22 True
23 True
24 True
25 True
26 True
27 True
28 True
29 True
30 True
31 True
32 True
33 True
34 True
35 True
36 True
37 True
38 True
39 True
40 True
41 True
42 True
43 True
44 True
45 True
46 True
47 True
48 True
49 True
50 True
51 True
52 True
53 True
54 True
55 True
56 True
57 True
58 True
59 True
60 True
61 True
62 True
63 True
64 True
65 True
66 True
67 True
68 True
69 True
70 True
71 True
72 True
73 True
74 True
75 True
76 True
77 True
78 True
79 True
80 True
81 True
82 True
83 True
84 True
85 True
86 True
87 True
88 True
89 True
90 True
91 True
92 True
93 True
94 True
95 True
96 True
97 True
98 True
99 True
100 True
101 True
102 True
103 True
104 True
105 True
106 True
107 True
108 True
109 True
110 True
111 True
112 True
113 True
114 True
115 True
116 True
117 True
118 True
119 True
120 True
121 True
122 True
123

### train_val_data

In [72]:
print(len(train_val_data))
print(len(pickle_train_val_data.columns))

550
550


In [73]:
# exhaustive check

for i in range(len(train_val_data)):
    print(i, (train_val_data[i] == pickle_train_val_data.iloc[0,i]).all())

# only true => ok

0 True
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 True
22 True
23 True
24 True
25 True
26 True
27 True
28 True
29 True
30 True
31 True
32 True
33 True
34 True
35 True
36 True
37 True
38 True
39 True
40 True
41 True
42 True
43 True
44 True
45 True
46 True
47 True
48 True
49 True
50 True
51 True
52 True
53 True
54 True
55 True
56 True
57 True
58 True
59 True
60 True
61 True
62 True
63 True
64 True
65 True
66 True
67 True
68 True
69 True
70 True
71 True
72 True
73 True
74 True
75 True
76 True
77 True
78 True
79 True
80 True
81 True
82 True
83 True
84 True
85 True
86 True
87 True
88 True
89 True
90 True
91 True
92 True
93 True
94 True
95 True
96 True
97 True
98 True
99 True
100 True
101 True
102 True
103 True
104 True
105 True
106 True
107 True
108 True
109 True
110 True
111 True
112 True
113 True
114 True
115 True
116 True
117 True
118 True
119 True
120 True
121 True
122 True
123

In [74]:
# to be sure it works properly
train_val_data[0,0,0,0,0] += 1

for i in range(len(train_val_data)):
    print(i, (train_val_data[i] == pickle_train_val_data.iloc[0,i]).all())

0 False
1 True
2 True
3 True
4 True
5 True
6 True
7 True
8 True
9 True
10 True
11 True
12 True
13 True
14 True
15 True
16 True
17 True
18 True
19 True
20 True
21 True
22 True
23 True
24 True
25 True
26 True
27 True
28 True
29 True
30 True
31 True
32 True
33 True
34 True
35 True
36 True
37 True
38 True
39 True
40 True
41 True
42 True
43 True
44 True
45 True
46 True
47 True
48 True
49 True
50 True
51 True
52 True
53 True
54 True
55 True
56 True
57 True
58 True
59 True
60 True
61 True
62 True
63 True
64 True
65 True
66 True
67 True
68 True
69 True
70 True
71 True
72 True
73 True
74 True
75 True
76 True
77 True
78 True
79 True
80 True
81 True
82 True
83 True
84 True
85 True
86 True
87 True
88 True
89 True
90 True
91 True
92 True
93 True
94 True
95 True
96 True
97 True
98 True
99 True
100 True
101 True
102 True
103 True
104 True
105 True
106 True
107 True
108 True
109 True
110 True
111 True
112 True
113 True
114 True
115 True
116 True
117 True
118 True
119 True
120 True
121 True
122 True
12