# Train and test split + Copying images into train and test folders

## 1. Reload images dataset

In [1]:
import pandas as pd

# Path to the CSV that holds the images dataset.
images_dataset_path = 'data/datasets/images.csv'

# Read the whole table into memory; the cells below add columns to it and sample from it.
df_images = pd.read_csv(filepath_or_buffer=images_dataset_path)

#### Create 'tag' field from multiple tags based on tag priority

In [2]:
from modules.octa_video_util import _assign_tag

# Collapse each row's `tags` value into a single `tag` label.
# `_assign_tag` receives the tags, the priority-ordered class list and the
# fallback label (presumably the highest-priority tag present wins, and the
# fallback is used when none matches — confirm in modules.octa_video_util).
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'poça', 'transbordo']
default_tag = 'normal'

df_images['tag'] = df_images['tags'].apply(_assign_tag, args=(tags_priority_list, default_tag))

# Report the sum of the `seen` column against the total row count,
# followed by the number of rows per resulting tag.
seen_total = df_images['seen'].sum()
print('Imagens assistidas (de videos baixados):', seen_total, '/', len(df_images))
print()
display(df_images['tag'].value_counts())

Imagens assistidas (de videos baixados): 29356 / 29356



tag
poça          19877
normal         4889
lâmina         3661
alagamento      863
bolsão           66
Name: count, dtype: int64

---
## 2. Example train and test split

In [3]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from modules.octa_video_util import filter_by_query

#### Custom sampling of images (Example usage)

In [32]:
# Sampling / split configuration used by the cells below.

# Fixed seed so the sampling and the train/test split are reproducible on
# "Restart Kernel -> Run All". Set to None to get a fresh random draw each run.
random_state = 42
# Number of rows drawn for the final sample.
max_samples = 100
# Column holding the categorical class label.
class_col = 'tag'
# Name of the binary target column.
target_col = 'flood'
# Column used as the grouping key for the group-aware split.
groups_col = 'code'
# Classes mapped to the positive target value (1); every other class maps to 0.
target_classes = ['lâmina', 'bolsão', 'alagamento']
# Query passed to `filter_by_query` to pre-filter the images dataset.
query_params = {'seen': True}
# Example of a richer query (lists presumably mean "any of these values" — confirm in filter_by_query):
# query_params = {'code': [101, 102, 103], 'tag': ['lâmina', 'bolsão', 'alagamento'], 'seen': [True, False]}

#### Filter images

In [14]:
# Keep only the rows selected by `query_params`, as an independent copy with a
# fresh 0..n-1 index (the old index is discarded).
df_presample = (
    filter_by_query(df_images, query_params)
    .copy()
    .reset_index(drop=True)
)

#### Binarize categorical variable

In [15]:
# Binarize the categorical class column: 1 when the row's class is one of
# `target_classes`, 0 otherwise. The result is stored under `target_col`
# rather than a hard-coded name, so it stays in sync with the later lookup
# `df_presample[target_col]` if the configured target name is changed.
df_presample[target_col] = df_presample[class_col].isin(target_classes).astype(int)

#### Get x and y (features and target variable)

In [16]:
# Split the filtered frame into its modelling pieces:
#   y      -> the binary target column
#   x      -> every remaining column
#   groups -> the grouping key, used by the group-aware split
y = df_presample[target_col]
x = df_presample.drop(columns=[target_col])
groups = df_presample[groups_col]

#### Under sampling

In [17]:
# Custom under sampling: balance the two classes, then draw the working sample.

# Size of the smaller class. Using the minimum of both counts (instead of
# assuming the positive class is the smaller one) keeps the non-replacement
# draws below valid whichever class happens to be larger.
minority_samples = int(min((y == 1).sum(), (y == 0).sum()))

# NOTE: `y_minority_sample` keeps its original name for compatibility, but it
# holds the *negative* class downsampled to `minority_samples` rows.
y_minority_sample = y[y == 0].sample(n=minority_samples, replace=False, random_state=random_state)
y_positive_sample = y[y == 1].sample(n=minority_samples, replace=False, random_state=random_state)
y_balanced = pd.concat([y_minority_sample, y_positive_sample], axis=0)

# Draw the final sample WITHOUT replacement. Sampling with replacement can pick
# the same row several times, and those duplicates could then end up in both the
# train and the test split (leakage). The size is capped at the number of
# balanced rows available.
n_samples = min(max_samples, len(y_balanced))
y_res = y_balanced.sample(n=n_samples, replace=False, random_state=random_state)

# Align the features and the groups with the sampled target by index label.
x_res = x.loc[y_res.index]
groups_res = groups.loc[y_res.index]

#### Train test split

In [47]:
from sklearn.model_selection import train_test_split

# Random 50/50 hold-out split of the resampled features and target.
split_parts = train_test_split(x_res, y_res, test_size=0.5, random_state=random_state)
X_train, X_test, Y_train, Y_test = split_parts

# Keep the index labels of each half; they identify the selected rows downstream.
train_index = Y_train.index
test_index = Y_test.index

#### (Alternative) Stratified group KFold split (Get train and test index from sample)

In [34]:
# Stratified group KFold split: use the first of the two folds as a single
# train/test partition of the resampled data.
sgkf = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=random_state)

# `split` yields *positional* offsets into x_res / y_res, not index labels.
train_positions, test_positions = next(sgkf.split(x_res, y_res, groups_res))

X_train = x_res.iloc[train_positions]
X_test = x_res.iloc[test_positions]

Y_train = y_res.iloc[train_positions]
Y_test = y_res.iloc[test_positions]

# Expose index labels, matching what the `train_test_split` cell produces.
# The raw positional offsets only make sense relative to x_res; using them
# against `df_presample` would select the wrong rows.
train_index, test_index = Y_train.index, Y_test.index

#### Report train and test samples

In [39]:
# Size of each split.
split_sizes = [('Train samples:', train_index), ('Test samples:', test_index)]
for caption, split_index in split_sizes:
    print(caption, len(split_index))

# Target-class counts per split, one single-column table each.
for column_name, split_target in (('train', Y_train), ('test', Y_test)):
    display(split_target.value_counts().to_frame(column_name))

Train samples: 50
Test samples: 50


Unnamed: 0_level_0,train
flood,Unnamed: 1_level_1
1,27
0,23


Unnamed: 0_level_0,test
flood,Unnamed: 1_level_1
0,26
1,24


---
## 3. Copy images into train and test folders

#### Copy images with `train_index` and `test_index` into structured 'train' and 'test' folders

In [49]:
from modules.octa_video_util import copy_images_to_folders

# Arguments for `copy_images_to_folders`.
# Presumably `base_directory` is where the source images live and
# `target_directory` is where the train/test folders are created — confirm in
# modules.octa_video_util.
base_directory = 'data/images'
target_directory = 'data/samples/1'
# Work on a copy so the helper cannot modify `df_presample`.
dataset = df_presample.copy()
file_path_field = 'file_path'
# Reuse the configured target column instead of repeating its name here.
label_field = target_col
# NOTE(review): the indexes are passed together with `df_presample`, so they must
# identify rows of that frame — confirm how copy_images_to_folders looks them up.
train_indexes = list(train_index)
test_indexes = list(test_index)

copy_images_to_folders(
    base_directory,
    target_directory,
    dataset,
    train_indexes,
    test_indexes,
    file_path_field=file_path_field,
    tag_field=label_field,
)

Copying images to train folders:
Processed 50/50 files (100.00%) - Found: 50/50
Copying images to test folders:
Processed 50/50 files (100.00%) - Found: 50/50