# Train and test split + Copying images into train and test folders

## 1. Reload images dataset

In [1]:
import pandas as pd

# Location of the images dataset (CSV), relative to the working directory
images_dataset_path = 'data/datasets/images.csv'

# Read the CSV into the DataFrame that the preprocessing cells work on
df_images = pd.read_csv(filepath_or_buffer=images_dataset_path)

## 2. Preprocess the dataset of images

In [2]:
from modules.octa_video_util import _assign_tag
from modules.octa_video_util import filter_by_query

#### Create categorical field from tag priority list

In [3]:
# Collapse each row's `tags` value into a single `tag`: `_assign_tag` is
# called with the row's tags, the priority list and the fallback tag.
default_tag = 'normal'
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'poça', 'transbordo']

df_images['tag'] = df_images['tags'].apply(
    _assign_tag, args=(tags_priority_list, default_tag)
)

# Show how many images ended up with each tag
display(df_images['tag'].value_counts())

tag
normal        100031
poça           53218
lâmina          7979
bolsão          3612
transbordo      2237
alagamento      1417
Name: count, dtype: int64

#### Binarize categorical variable

In [4]:
# Tags treated as the positive ("flood") class
target_classes = ['bolsão', 'alagamento']

# `flood` is 1 when the row's tag is one of the target classes, 0 otherwise
is_target = df_images['tag'].isin(target_classes)
df_images['flood'] = is_target.astype(int)

#### Filter images

In [5]:
# Criteria forwarded to `filter_by_query`
query_params = {'seen': True}

# Apply the query, detach the result from `df_images` and renumber rows 0..n-1
df_sample = (
    filter_by_query(df_images, query_params)
    .copy()
    .reset_index(drop=True)
)

---
## 2. Example train and test split

In [6]:
import numpy as np

#### Custom sampling of images (Example usage)

In [237]:
# Column holding the binary label and column identifying each sample's group
target_variable, groups_variable = 'flood', 'code'

# Upper bound used by the optional "Sampling based on `max_samples`" cell
max_samples = 1000

# Seed passed to every sampling / splitting call below.
# None leaves those calls unseeded, so results are not expected to repeat
# between runs.
random_state = None

#### Shuffle Samples

In [239]:
# Shuffle all rows (full-size sample without replacement), then renumber 0..n-1
data = (
    df_sample
    .sample(frac=1, replace=False, random_state=random_state)
    .reset_index(drop=True)
)

#### Get x and y (features and target variable)

In [240]:
# Features: every column of `data` except the target
X = data.drop(columns=[target_variable])

# Target labels and group keys, copied so they are independent of `data`
Y = data[target_variable].copy()
groups = data[groups_variable].copy()

# Class balance before resampling
Y.value_counts()

flood
0    163465
1      5029
Name: count, dtype: int64

#### Under sampling

In [241]:
# Under sampling: keep every positive (flood == 1) row and draw the same
# number of negative (flood == 0) rows at random, without replacement.
# NOTE: this assumes class 1 is the smaller class; `sample` raises a
# ValueError if there are fewer negatives than positives.
minority_samples = (Y == 1).sum()
y_majority_sample = Y[Y == 0].sample(n=minority_samples, replace=False, random_state=random_state)

# Sampled negatives first, then all positives. The index labels still refer
# to rows of `data`, so features and groups are looked up by label.
y_res = pd.concat([y_majority_sample, Y[Y == 1]], axis=0)
x_res = X.loc[y_res.index].copy()
groups_res = groups.loc[y_res.index].copy()

print('Total samples:', len(y_res))
y_res.value_counts()

Total samples: 10058


flood
0    5029
1    5029
Name: count, dtype: int64

In [267]:
# Sanity check: look the resampled labels up in `data` and confirm the balance
data.loc[y_res.index, 'flood'].value_counts()

flood
0    5029
1    5029
Name: count, dtype: int64

#### Sampling based on `max_samples` · Optional

In [29]:
# Dropping samples: cap the resampled set at `max_samples` rows.
# Draw WITHOUT replacement. Sampling with replacement repeats index labels,
# so the same image could appear several times and end up on both sides of
# a later train/test split.
# The size is clamped because `sample` cannot return more rows than exist
# when replace=False.
n_samples = min(max_samples, len(y_res))
y_res = y_res.sample(n=n_samples, replace=False, random_state=random_state)

# Re-align features and groups with the kept labels
x_res = X.loc[y_res.index].copy()
groups_res = groups.loc[y_res.index].copy()

y_res.value_counts()

flood
1    54
0    46
Name: count, dtype: int64

#### Train test split  · Optional

In [36]:
from sklearn.model_selection import train_test_split

# Random 70/30 split, stratified on the label (groups are not passed here)
X_train, X_test, Y_train, Y_test = train_test_split(
    x_res,
    y_res,
    test_size=0.3,
    shuffle=True,
    stratify=y_res,
    random_state=random_state,
)

# Index labels of the rows assigned to each partition
train_index = Y_train.index
test_index = Y_test.index

#### (Alternative) Stratified group KFold split · Optional

Note: this split strategy gave the best result.

In [243]:
from sklearn.model_selection import StratifiedGroupKFold

# Stratified group KFold split: 5 folds built from the labels in `y_res`
# and the group keys in `groups_res`.
# scikit-learn raises a ValueError when random_state is set while
# shuffle=False, so shuffling is enabled exactly when a seed is configured.
# With random_state=None this is the same unshuffled split as before.
sgkf = StratifiedGroupKFold(
    n_splits=5,
    shuffle=random_state is not None,
    random_state=random_state,
)

# Use the first fold only; `split` yields positional indices into x_res / y_res
train_index, test_index = next(sgkf.split(x_res, y_res, groups_res))

X_train = x_res.iloc[train_index]
X_test = x_res.iloc[test_index]

Y_train = y_res.iloc[train_index]
Y_test = y_res.iloc[test_index]

#### (Alternative) Group shuffle split · Optional

In [210]:
# NOTE: This split is not deterministic even using random_state!=None
# NOTE: This split is not stratified
from sklearn.model_selection import GroupShuffleSplit

# Group shuffle split: one split with test_size=0.2, built from the group keys
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=random_state)

# Only one split is generated; take it (positional indices into x_res / y_res)
train_index, test_index = next(gss.split(x_res, y_res, groups_res))

X_train, X_test = x_res.iloc[train_index], x_res.iloc[test_index]
Y_train, Y_test = y_res.iloc[train_index], y_res.iloc[test_index]

#### Report group counts and train/test samples for each class (TODO: per-group class reporting is still missing)

In [273]:
# Distinct groups present in each partition
groups_train = groups_res.loc[Y_train.index].unique()
groups_test = groups_res.loc[Y_test.index].unique()

# Per-class sample counts: overall, train and test
class_count = y_res.value_counts()
train_class_count = Y_train.value_counts()
test_class_count = Y_test.value_counts()

# Fraction of each class (0 first, then 1) that landed in each partition
train_class_prct = [train_class_count[label] / class_count[label] for label in (0, 1)]
test_class_prct = [test_class_count[label] / class_count[label] for label in (0, 1)]

# A group-aware split should leave no group on both sides
shared_groups = set(groups_train).intersection(groups_test)
print('Groups intersecting train and test sets:', len(shared_groups))
print()

# One [train, test] pair per metric, in the same order as `report_index`
report_rows = [
    [len(groups_train), len(groups_test)],
    [len(Y_train), len(Y_test)],
    [train_class_count[0], test_class_count[0]],
    [train_class_count[1], test_class_count[1]],
    [round(train_class_prct[0] * 100, 2), round(test_class_prct[0] * 100, 2)],
    [round(train_class_prct[1] * 100, 2), round(test_class_prct[1] * 100, 2)],
]
report_index = ['groups', 'total', '0', '1', '0 (%)', '1 (%)']

# Transposed so each partition is a row and each metric a column
pd.DataFrame(report_rows, columns=['train', 'test'], index=report_index).T

Groups intersecting train and test sets: 0



Unnamed: 0,groups,total,0,1,0 (%),1 (%)
train,138.0,8044.0,4027.0,4017.0,80.08,79.88
test,31.0,2014.0,1002.0,1012.0,19.92,20.12


---
## 3. Copy images into train and test folders

#### Copy images with `train_index` and `test_index` into structured 'train' and 'test' folders

In [271]:
from modules.octa_video_util import copy_images_to_folders

# Where the images are read from and where the structured split is written
base_directory = 'data/images'
target_directory = 'data/splits/under-10058-StratifiedGroupKFold-5'

# Dataset columns holding each image's path and its class label
file_path_field = 'file_path'
label_field = 'flood'

# Pass a copy of `data`; the index labels below identify rows of that frame
dataset = data.copy()
train_indexes = list(Y_train.index)
test_indexes = list(Y_test.index)

copy_images_to_folders(
    base_directory,
    target_directory,
    dataset,
    train_indexes,
    test_indexes,
    file_path_field=file_path_field,
    tag_field=label_field,
)

Copying images to train folders:
Processed 8044/8044 files (100.00%) - Found: 8044/8044
Copying images to test folders:
Processed 2014/2014 files (100.00%) - Found: 2014/2014

#### Count saved images

In [274]:
import os

# Count the files under <target_directory>/<split>/<label> for labels 0 and 1.
# `target_directory` comes from the copy cell above.
# NOTE(review): in the recorded run these counts (8036 train, 2011 test) are
# lower than the number of files reported as copied (8044 / 2014). The cause
# is not visible here - possibly colliding file names; verify.
for split_name in ('train', 'test'):
    n_files = [
        len(os.listdir(f'{target_directory}/{split_name}/{label}'))
        for label in ('0', '1')
    ]
    print(f'{split_name}:', *n_files)

train: 4019 4017
test: 999 1012
