#### Reload images and videos datasets

In [1]:
import pandas as pd
import json

# Read the image-level and video-level tables from their CSV exports.
df_images = pd.read_csv('data/datasets/images.csv')
df_videos = pd.read_csv('data/datasets/videos.csv')

# Decode the JSON-encoded 'tags' column of the videos table.
# NOTE(review): the images table keeps 'tags' exactly as read from the CSV
# (its json.loads step was disabled) - confirm the downstream helpers accept
# that representation.
df_videos['tags'] = df_videos['tags'].map(json.loads)

# Show the available video columns and the first index labels.
print(df_videos.columns, end='\n\n')
display(df_videos.index[:10])

Index(['_id', 'blob_name', 'blob_size', 'bucket_name', 'file_name', 'code',
       'n_folders', 'timestamp', 'folder_structure', 'folder', 'tags', 'url',
       'api_url', 'bucket', 'seen'],
      dtype='object')



RangeIndex(start=0, stop=10, step=1)

#### Create tag field

In [2]:
from modules.octa_video_util import _assign_tag

# Derive a single 'tag' per row from its 'tags' entry by calling _assign_tag
# with the same priority list and default tag for both tables.
default_tag = 'normal'
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'poça', 'transbordo']

for frame in (df_videos, df_images):
    frame['tag'] = frame['tags'].apply(
        lambda tags_list: _assign_tag(tags_list, tags_priority_list, default_tag)
    )

# Class distribution of the new column in each table.
display(df_videos['tag'].value_counts().rename('Videos'))
print()
display(df_images['tag'].value_counts().rename('Images'))

tag
normal        60241
poça           1348
lâmina          214
bolsão           96
transbordo       62
alagamento       56
Name: Videos, dtype: int64




tag
normal        100031
poça           53218
lâmina          7979
bolsão          3612
transbordo      2237
alagamento      1417
Name: Images, dtype: int64

#### Binarize 'tag' variable for images dataset

In [3]:
# Tags treated as the positive ("flood") class.
target_classes = ['lâmina', 'bolsão', 'alagamento']

# 'flood' is 1 when the row's tag is one of the target classes, else 0.
is_target_class = df_images['tag'].isin(target_classes)
df_images['flood'] = is_target_class.astype(int)

flood_counts = df_images['flood'].value_counts()
display(flood_counts)
print()
display(df_images.index[:10])

flood
0    155486
1     13008
Name: count, dtype: int64




RangeIndex(start=0, stop=10, step=1)

#### Filter videos

In [4]:
from modules.octa_video_util import filter_by_query

# Criteria handed to the project helper filter_by_query.
# NOTE(review): judging by the cell output it keeps rows whose 'seen' is True
# and whose 'tag' is in the list - confirm against the helper's implementation.
query_params = dict(
    seen=True,
    tag=['normal', 'poça', 'lâmina', 'bolsão', 'alagamento'],
)

# Filter the videos table by the query and detach the result from df_videos.
videos_matching_query = filter_by_query(df_videos, query_params)
df_videos_filtered = videos_matching_query.copy()

display(df_videos_filtered['tag'].value_counts())
print()
display(df_videos_filtered.index[:10])

tag
normal        2669
poça          1348
lâmina         213
bolsão          94
alagamento      36
Name: count, dtype: int64




Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')

#### Reload sample images dataset

In [5]:
# Directory of the data split whose image table is reloaded.
target_directory = 'data/splits/sgkf-8-1-1'

# The first CSV column is used as the index of the frame.
sample_csv_path = f'{target_directory}/images.csv'
df_sample = pd.read_csv(sample_csv_path, index_col=0)

print(df_sample.columns)
display(df_sample.index[:10])

Index(['id_video', 'code', 'folder', 'file_name', 'file_path', 'frame_index',
       'timestamp', 'initial_timestamp', 'seen', 'tags', 'tag', 'flood',
       'set'],
      dtype='object')


Index([152562, 28728, 102704, 5237, 112104, 19950, 8079, 37094, 127490, 53874], dtype='int64')

#### Unique cameras in each data set

In [6]:
# Partition the sample according to its 'set' column.
set_of_row = df_sample['set']
train = df_sample[set_of_row == 'train']
test = df_sample[set_of_row == 'test']
val = df_sample[set_of_row == 'val']

# Distinct camera codes appearing in each partition.
train_codes = train['code'].unique()
test_codes = test['code'].unique()
val_codes = val['code'].unique()

# Camera codes of the filtered videos that never appear in the training set.
filtered_video_codes = set(df_videos_filtered['code'].unique())
out_train_codes = filtered_video_codes.difference(train_codes)

for caption, codes in (
    ('Train codes:', train_codes),
    ('Test codes:', test_codes),
    ('Val codes:', val_codes),
    ('Out-train codes:', out_train_codes),
):
    print(caption, len(codes))


Train codes: 114
Test codes: 15
Val codes: 9
Out-train codes: 58


#### Test videos from cameras outside the training set

In [7]:
# Keep only the videos recorded by cameras that are absent from the training set.
camera_in_train = df_videos_filtered['code'].isin(train_codes)
df_videos_test = df_videos_filtered[~camera_in_train]

display(df_videos_test['tag'].value_counts())
print()
display(df_videos_test.index[:10])
print()
print('Shape of sample for testing:', df_videos_test.shape)

tag
normal        800
poça          336
lâmina         34
bolsão         20
alagamento      7
Name: count, dtype: int64




Index([56, 57, 58, 59, 60, 61, 62, 63, 64, 65], dtype='int64')


Shape of sample for testing: (1197, 16)


#### Undersample videos based on flood event type

In [8]:
n_majority_videos = 20
random_state = 0

from imblearn.under_sampling import RandomUnderSampler

# Videos tagged with one of the target classes are all kept.
target_classes = ['lâmina', 'bolsão', 'alagamento']
has_target_tag = df_videos_test['tag'].isin(target_classes)
test_minority = df_videos_test[has_target_tag]
test_majority = df_videos_test.drop(test_minority.index)

# The remaining videos are resampled by tag with RandomUnderSampler and then
# reduced to a fixed number of rows; both steps share the same seed.
x = test_majority.copy()
y = test_majority['tag'].copy()

rus = RandomUnderSampler(
    sampling_strategy='auto',
    random_state=random_state,
    replacement=False,
)

test_majority_filtered, tags_test_majority_filtered = rus.fit_resample(x, y)
test_majority_filtered = test_majority_filtered.sample(
    n=n_majority_videos,
    replace=False,
    random_state=random_state,
)

# Final selection: every target-class video plus the drawn remainder.
df_videos_test_filtered = pd.concat([test_minority, test_majority_filtered])

print('Videos selected:', df_videos_test_filtered.shape, end='\n\n')
display(df_videos_test_filtered['tag'].value_counts())
print()
display(df_videos_test_filtered['code'].value_counts().to_frame().T)
print()
display(df_videos_test_filtered.index[:10])


Videos selected: (81, 16)



tag
lâmina        34
bolsão        20
poça          12
normal         8
alagamento     7
Name: count, dtype: int64




code,278.0,1460.0,3290.0,482.0,384.0,1547.0,2166.0,299.0,1431.0,1393.0,...,1601.0,2156.0,226.0,1487.0,1507.0,1525.0,230.0,313.0,1881.0,112.0
count,26,23,7,3,3,2,2,2,1,1,...,1,1,1,1,1,1,1,1,1,1





Index([486, 571, 572, 573, 1029, 1030, 18477, 18478, 18569, 18671], dtype='int64')

#### Extract corresponding rows from images dataset

In [9]:
# Select the image rows whose 'id_video' matches the '_id' of a selected video.
from_selected_video = df_images['id_video'].isin(df_videos_test_filtered['_id'])
df_images_sample = df_images[from_selected_video]

print('Images selected:', df_images_sample.shape, end='\n\n')
display(df_images_sample['tag'].value_counts())
print()
display(df_images_sample['code'].value_counts().to_frame().T)
print()
# Number of distinct videos that contributed at least one image.
print('Videos found:', len(df_images_sample['id_video'].unique()))

Images selected: (3089, 12)



tag
lâmina        1214
bolsão         900
poça           461
alagamento     291
normal         223
Name: count, dtype: int64




code,1460.0,278.0,3290.0,384.0,482.0,2166.0,1547.0,299.0,1601.0,2156.0,...,1881.0,226.0,1393.0,112.0,1403.0,1431.0,230.0,313.0,1525.0,1487.0
count,1035,884,291,135,120,90,77,61,45,45,...,45,45,30,30,30,28,19,15,11,8



Videos found: 81


#### Check existence of images

In [10]:
import os

def get_nested_files(folder_path):
    """Return the path of every file found under ``folder_path``, recursively.

    Each path is the walked directory joined with the file name, with any
    backslash replaced by a forward slash. Directories themselves are not
    listed.
    """
    return [
        os.path.join(current_dir, file_name).replace('\\', '/')
        for current_dir, _subdirs, file_names in os.walk(folder_path)
        for file_name in file_names
    ]

# Example usage
# base_path = 'data/images'
# images_paths_found = get_nested_files(base_path)


base_path = 'data/images'

# Every file currently on disk under the base folder.
images_paths_found = get_nested_files(base_path)

# Expected location of each sampled image, using forward slashes so the
# strings are comparable with the ones returned by get_nested_files.
images_paths_sample = df_images_sample['file_path'].apply(
    lambda relative_path: f'{base_path}/{relative_path}'.replace('\\', '/')
)

# Fraction of the sampled images that were found on disk.
files_exist_prct = images_paths_sample.isin(images_paths_found).mean()

percent_found = round(files_exist_prct * 100, 2)
str(percent_found) + ' %'

'100.0 %'

#### Load model with Yolo

In [11]:
from ultralytics import YOLO

# Folder holding the results of the training run to evaluate.
folder_path = 'YoloV8/train-results/sgkf-8-1-1-epochs23'

# Load the weights saved as 'weights/best.pt' inside that folder.
weights_path = f'{folder_path}/weights/best.pt'
model = YOLO(weights_path)

#### Run predictions with YOLO in batches

In [12]:
import cv2
from IPython.display import clear_output as co
import time

base_path = 'data/images'
batch = 8

# Evaluate the sampled images ordered by video and by frame position.
eval_imgs_df = df_images_sample.sort_values(['id_video', 'frame_index'])
img_path_list = f'{base_path}/' + eval_imgs_df['file_path']
n_imgs = len(img_path_list)

preds = []
avg_time = 0.0
s_time = time.time()
for i in range(0, n_imgs, batch):
    # Progress report: elapsed time plus an estimate extrapolated from the
    # average time per image processed so far.
    e_time = time.time() - s_time
    avg_time = e_time / max(1, i)
    minutes_elapsed = round(e_time / 60, 2)
    minutes_left = round((n_imgs - i) * avg_time / 60, 2)
    minutes_total = round(n_imgs * avg_time / 60, 2)

    co(True)
    print(f'{i}/{n_imgs} | {minutes_elapsed} min / {minutes_total} min | time-left: {minutes_left} min')

    # Predict the next slice of paths (positional slice of the ordered list).
    img_path_list_batch = img_path_list.iloc[i: i + batch].tolist()
    batch_results = model.predict(img_path_list_batch, imgsz=640)

    # Keep the top-1 class index and the score stored at index 1.
    # NOTE(review): index 1 is presumably the 'flood' class - confirm against
    # the model's class mapping.
    for result in batch_results:
        preds.append([result.probs.top1, result.probs.data[1].item()])

# One [pred, prob] pair per row, in the same order as eval_imgs_df.
eval_imgs_df[['pred', 'prob']] = preds

3088/3089 | 30.55 min / 30.56 min | time-left: 0.01 min

0: 640x640 0 0.99, 1 0.01, 612.9ms
Speed: 30.0ms preprocess, 612.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


#### Save results

In [15]:
# Persist the evaluated images (with their 'pred' and 'prob' columns) next to
# the split files, keeping the index as the first CSV column.
target_directory = 'data/splits/sgkf-8-1-1'
predictions_csv_path = f'{target_directory}/images-out-cameras.csv'
eval_imgs_df.to_csv(predictions_csv_path, index=True)

#### Evaluate results

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

# Ground truth comes from the 'flood' column; predicted classes are the first
# element of each [class, probability] pair collected during prediction.
labels = eval_imgs_df['flood'].tolist()
pred_labels = [predicted_class for predicted_class, _probability in preds]

for heading, metric in (
    ('\n * Confusion_matrix:', confusion_matrix),
    ('\n * Classification_report:', classification_report),
):
    print(heading)
    print(metric(labels, pred_labels))



 * Confusion_matrix:
[[ 418  266]
 [ 694 1711]]

 * Classification_report:
              precision    recall  f1-score   support

           0       0.38      0.61      0.47       684
           1       0.87      0.71      0.78      2405

    accuracy                           0.69      3089
   macro avg       0.62      0.66      0.62      3089
weighted avg       0.76      0.69      0.71      3089



#### Function to write video results

In [18]:
import os
import cv2

def create_annotated_video(image_paths, true_labels, predicted_labels, predicted_probs, output_video_path, report=True, fps=3.0, frame_size=(854, 480)):
    """Write a video whose frames are the given images annotated with labels.

    Each image is read with OpenCV, overlaid with three text lines (true
    label, predicted label, and the predicted probability as a percentage),
    resized to ``frame_size`` and appended to an ``mp4v``-encoded video. The
    text is green when the predicted label equals the true label and red
    otherwise.

    Args:
        image_paths: Image file paths, in the order the frames are written.
        true_labels: Ground-truth label of each image.
        predicted_labels: Predicted label of each image (shown cast to ``int``).
        predicted_probs: Value shown as "Probability" for each image; it is
            multiplied by 100 for display.
        output_video_path: Path of the video file to create.
        report: When True, print a progress counter that overwrites itself.
        fps: Frame rate of the output video.
        frame_size: ``(width, height)`` of the output frames.

    Raises:
        ValueError: If the four input sequences do not all have the same length.
        OSError: If the video writer cannot be opened for ``output_video_path``.
    """
    import warnings

    # Validate all four sequences up front; zip() would otherwise silently
    # truncate to the shortest one.
    n_imgs = len(image_paths)
    if not (n_imgs == len(true_labels) == len(predicted_labels) == len(predicted_probs)):
        raise ValueError("Number of paths, true labels, predicted labels, and predicted probabilities must be the same.")

    # Create a VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)

    # A writer that failed to open ignores every write() and produces no file,
    # so fail loudly instead.
    if not video_writer.isOpened():
        video_writer.release()
        raise OSError(f'Could not open video writer for: {output_video_path}')

    try:
        frames = zip(image_paths, true_labels, predicted_labels, predicted_probs)
        for i, (image_path, true_label, predicted_label, predicted_prob) in enumerate(frames):
            # imread returns None (it does not raise) when the file is missing
            # or cannot be decoded; skip that frame with an explicit warning.
            image = cv2.imread(image_path)
            if image is None:
                warnings.warn(f'Skipping unreadable image: {image_path}')
                continue

            # Green for correct predictions, red for incorrect ones (BGR order)
            color = (0, 255, 0) if true_label == predicted_label else (0, 0, 255)

            # Annotate near the top-right corner; x offsets are relative to the
            # width of the image as read, i.e. before it is resized.
            true_label_text = f'True: {true_label}'
            predicted_label_text = f'Predicted: {int(predicted_label)}'
            probability_text = f'Probability: {round(predicted_prob * 100, 1)} %'
            width = image.shape[1]
            cv2.putText(image, true_label_text, (width - 130, 40), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA)
            cv2.putText(image, predicted_label_text, (width - 210, 75), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA)
            cv2.putText(image, probability_text, (width - 315, 110), cv2.FONT_HERSHEY_SIMPLEX, 1, color, 2, cv2.LINE_AA)

            # Frames must match the size the writer was created with
            image = cv2.resize(image, frame_size)

            # Write the annotated image to the video
            video_writer.write(image)

            # Report progress
            if report:
                print(f'Images processed: {i + 1}/{n_imgs} ', end='\r')
    finally:
        # Always release the writer so the file is finalized even on error
        video_writer.release()

# Example usage:

# base_path = 'data/images'

# unique_video_ids = eval_imgs_df['id_video'].unique()
# video_imgs = eval_imgs_df[eval_imgs_df['id_video']==unique_video_ids[0]]

# image_paths = (f'{base_path}/' + video_imgs['file_path']).tolist()
# true_labels = video_imgs['flood'].tolist()
# predicted_labels = video_imgs['pred'].tolist()
# predicted_probs = video_imgs['prob'].tolist()

# output_video_path = 'output_video.mp4'

# create_annotated_video(image_paths, true_labels, predicted_labels, predicted_probs, output_video_path)


#### Write video results

In [19]:
import os

base_path = 'data/images'
base_output_path = 'data/eval-videos'

unique_camera_codes = eval_imgs_df['code'].unique()
n_cameras = len(unique_camera_codes)

# One video per (camera, true label) pair, saved as
# <base_output_path>/<true label>/<camera code>.mp4 with frames in timestamp order.
for i, code in enumerate(unique_camera_codes):
    from_this_camera = eval_imgs_df['code'] == code

    for true_label in (0, 1):
        has_this_label = eval_imgs_df['flood'] == true_label
        camera_imgs = eval_imgs_df[from_this_camera & has_this_label].sort_values('timestamp')

        # Nothing to write when the camera has no image with this label.
        if camera_imgs.empty:
            continue

        image_paths = (f'{base_path}/' + camera_imgs['file_path']).tolist()
        true_labels = camera_imgs['flood'].tolist()
        predicted_labels = camera_imgs['pred'].tolist()
        predicted_probs = camera_imgs['prob'].tolist()
        output_video_path = f'{base_output_path}/{true_label}/{int(code)}.mp4'

        # Make sure the destination folder exists before writing the video.
        output_video_dir = os.path.dirname(output_video_path)
        os.makedirs(output_video_dir, exist_ok=True)

        create_annotated_video(
            image_paths,
            true_labels,
            predicted_labels,
            predicted_probs,
            output_video_path,
            report=False,
        )
        print(f'Cameras processed: {i + 1}/{n_cameras} ', end='\r')
        

Cameras processed: 21/21 

#### End

---

### Extra:

#### Load model with TensorFlow

In [None]:
# Import the names this cell uses so it runs on a fresh kernel instead of
# relying on another cell having been executed first.
from tensorflow.keras import models
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Load the best model from the saved checkpoint file
checkpoint_filepath = 'results/best_model_cnn.h5'
model1 = models.load_model(checkpoint_filepath)

# Only 'data' and 'batch' are read in this cell.
params = {
    'data': 'data/images',
    'epochs': 25,
    'imgsz': 640,
    'batch': 32,
    'device': [0, 1],
    'learning_rate': 0.0001,
}

# NOTE(review): img_height / img_width are not used below; the generator uses a
# fixed 224x224 target size - confirm which input size the model expects.
img_height, img_width = 640, 640 # 480, 854

# Scale pixel values to [0, 1]
test_datagen = ImageDataGenerator(rescale=1./255)

# Read the test images in directory order (shuffle disabled) with binary labels
test_generator = test_datagen.flow_from_directory(
    params['data'] + '/test',
    target_size=(224, 224),
    batch_size=params['batch'],
    class_mode='binary',
    shuffle=False,
)


#### Copy images into test folder

In [12]:
from modules.octa_video_util import copy_images_to_folders

# Source folder of the images and destination folder of the copied split.
base_directory = 'data/images'
target_directory = 'data/splits/sgkf-8-1-1-test-videos'

# Work on a detached copy of the sampled images.
dataset = df_images_sample.copy()
file_path_field = 'file_path'
label_field = 'flood'

# Every sampled image goes to the test split; no train or val indexes are given.
train_indexes = val_indexes = None
test_indexes = df_images_sample.index.tolist()

copy_images_to_folders(
    base_directory,
    target_directory,
    dataset,
    train_indexes,
    test_indexes,
    val_indexes,
    file_path_field=file_path_field,
    tag_field=label_field,
)


Copying images to test folders:
Processed 1070/1070 files (100.00%) - Found: 1070/1070

#### Save dataframe of sample images

In [59]:
# NOTE(review): this directory differs from the one the image-copy cell writes
# to ('data/splits/sgkf-8-1-1-test-videos') - confirm which one is intended.
target_directory = 'data/splits/sgkf-8-1-1-videos'

dataset = df_images_sample.copy()

# Only a test partition is built here (no train / val rows): take the rows
# listed in test_indexes and label them with set='test'.
data_test = dataset.loc[test_indexes].assign(set='test')

data_split_df = data_test.copy()

split_csv_path = f'{target_directory}/images.csv'
data_split_df.to_csv(split_csv_path)
print(f'split dataframe saved with shape: {data_split_df.shape}')

split dataframe saved with shape: (1070, 13)


#### Count saved images

In [61]:
import os

target_directory = 'data/splits/sgkf-8-1-1-videos'

# Count the files in the two label folders ('0' and '1') of the test split;
# train and val folders are not counted.
test_counts = [len(os.listdir(f'{target_directory}/test/{label}')) for label in (0, 1)]
print('test:', *test_counts)

test: 247 823


#### Set up TensorFlow

In [64]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import layers, models
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint

# Look for GPUs; when any is present, enable memory growth on each of them.
physical_devices = tf.config.list_physical_devices('GPU')
if not physical_devices:
    print("GPU not found. Using CPU.")
else:
    print("GPU is available.")
    for device in physical_devices:
        tf.config.experimental.set_memory_growth(device, True)

# Distribution strategy object created for later use.
strategy = tf.distribute.MirroredStrategy()

GPU not found. Using CPU.
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
