In [1]:
import os
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Loading files from the previous Teh (2025) paper

Here, we merge the train and test data from Teh (2025) into a single data structure. We do this because the train, validation and test sets are drawn anew for each bootstrap sample, so an observation that falls in the test set of one sample may fall in the training set of another. Since each bootstrap sample is fitted independently, this does not constitute data leakage.

In [2]:
# Tabular data for both splits. The first CSV column is dropped from each
# (presumably a saved row index — confirm against the files).
df_train = pd.read_csv("trainSet.csv").iloc[:, 1:]
df_test = pd.read_csv("testSet.csv").iloc[:, 1:]

df = pd.concat([df_train, df_test])

print("Dimensão dos dados de treino: {}".format(df_train.shape))
print("Dimensão dos dados de teste: {}".format(df_test.shape))
print("Dimensão dos dados completos: {}".format(df.shape))

# Image arrays; each split is scaled by its own maximum value.
mri = np.load("mri.npz")

# Position 230 is removed from the training images: that observation had
# missing specification data.
mriTrain = np.delete(mri["train_images"] / mri["train_images"].max(), 230, axis=0)

mriTest = mri["test_images"] / mri["test_images"].max()

# Shape of a single image, taken from the first training image.
image_dim = mriTrain[0].shape
print("Dimensão das imagens: {}".format(image_dim))

Dimensão dos dados de treino: (280, 23)
Dimensão dos dados de teste: (72, 23)
Dimensão dos dados completos: (352, 23)
Dimensão das imagens: (160, 200, 1)


In [3]:
# Stack the train and test images (and rows) into single structures; the
# train/val/test membership of each bootstrap sample is decided later.
mriData = np.concatenate((mriTrain, mriTest), axis=0)
np.savez("mri_train_test.npz", images=mriData)

# ignore_index gives the merged frame a fresh 0..n-1 index.
df = pd.concat([df_train, df_test], ignore_index=True)
df.to_csv("train_test_data.csv")

# Generate bootstrap samples

In [4]:
# Report the number of observations and what a 15% share of them amounts to.
print("Dataset size: {}".format(len(df)))
print("Size for the val and test sets: {}".format(0.15 * len(df)))

Dataset size: 352
Size for the val and test sets: 52.8


In [5]:
# Split sizes: validation and test each take about 15% of the 352
# observations (52.8, rounded to 53); the remaining 246 are used for training.
n_val = n_test = 53
n_train = 246

In [6]:
def sample_single_bootstrap(df, n_train, n_val, n_test):
    """Draw one bootstrap sample split into train, validation and test parts.

    Rows are drawn with replacement in three stages: validation first, then
    test, then train. After the validation and test draws, every distinct row
    that was picked is removed from the pool, so no original observation can
    appear in more than one part of the same sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset. It is not modified by this function.
    n_train, n_val, n_test : int
        Number of rows to draw for each part.

    Returns
    -------
    pandas.DataFrame
        The train, validation and test rows stacked in that order, with two
        extra columns: ``image_index`` (row position of the observation in
        ``df``) and ``set`` (``"train"``, ``"val"`` or ``"test"``).

    Notes
    -----
    Draws use the global ``np.random`` state, so call ``np.random.seed``
    beforehand for reproducible samples.
    """
    # assign() returns a new frame, so the caller's DataFrame does not gain
    # an "image_index" column as a side effect.
    pool = df.assign(image_index=np.arange(df.shape[0]))

    def _draw(frame, size):
        # Row positions of `frame`, sampled with replacement.
        return np.random.choice(np.arange(frame.shape[0]), size=size, replace=True)

    def _drop(frame, positions):
        # `frame` without the rows at `positions` (duplicates are harmless).
        keep = ~np.isin(np.arange(frame.shape[0]), positions)
        return frame.iloc[keep, :].reset_index(drop=True)

    # Validation set: draw, then remove the drawn rows from the pool.
    val_positions = _draw(pool, n_val)
    df_val = pool.iloc[val_positions, :].reset_index(drop=True)
    pool = _drop(pool, val_positions)

    # Test set: same procedure on what is left.
    test_positions = _draw(pool, n_test)
    df_test = pool.iloc[test_positions, :].reset_index(drop=True)
    pool = _drop(pool, test_positions)

    # Train set: drawn from the rows used by neither validation nor test.
    train_positions = _draw(pool, n_train)
    df_train = pool.iloc[train_positions, :].reset_index(drop=True)

    df_sample = pd.concat(
        [
            df_train.assign(set="train"),
            df_val.assign(set="val"),
            df_test.assign(set="test"),
        ],
        ignore_index=True,
    )

    return df_sample

In [8]:
# Draw a single bootstrap sample and preview its first five rows.
df_sample = sample_single_bootstrap(df, n_train=n_train, n_val=n_val, n_test=n_test)
df_sample.head()

Unnamed: 0,time,delta,image_index,set
0,2.180822,0,298,train
1,11.838356,0,262,train
2,4.191781,1,10,train
3,4.6,1,197,train
4,4.830137,0,33,train


In [9]:
# Seed the global NumPy generator so the exported samples are reproducible.
np.random.seed(1)

# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (needed on a fresh checkout).
os.makedirs("bootstrap_sample", exist_ok=True)

# Generate 100 bootstrap samples, one CSV file per sample (numbered from 1).
for i in range(1, 101):
    df_sample = sample_single_bootstrap(df, n_train, n_val, n_test)
    df_sample.to_csv("bootstrap_sample/sample_{}.csv".format(i), index = False)