#### Download mongo `Videos` and `Videos Localizados` collections as pandas dataframe

In [29]:
from modules.mongo import MongoDB
import pandas as pd
import json

conn_str = "mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority"
mongo = MongoDB(conn_str)

# Get entire MongoDB collection as list of objects
data = mongo.get('Waterbag', 'Videos')
data_label = mongo.get('Waterbag', 'Videos Localizados')

# Convert to pandas dataframe
df = pd.DataFrame(data)
df_label = pd.DataFrame(data_label)

# Convert json fields to string
df['tags'] = df['tags'].apply(json.dumps)
df_label['tags'] = df_label['tags'].apply(json.dumps)

# Save as pandas dataframe
df.to_csv('data/datasets/videos.csv', index=False)
df_label.to_csv('data/datasets/videos-label.csv', index=False)

print('Videos Originais: ', df.shape)
print('Videos Rotulados:', df_label.shape)

Videos Originais:  (68900, 14)
Videos Rotulados: (62017, 14)


#### Reload data

In [4]:
import pandas as pd

df = pd.read_csv('data/datasets/videos.csv')
df_label = pd.read_csv('data/datasets/videos-label.csv')

  df = pd.read_csv('data/datasets/videos.csv')


#### Preprocessing

In [5]:
import json

df['tags'] = df['tags'].apply(json.loads)
df_label['tags'] = df_label['tags'].apply(json.loads)

df_custom = df_label.copy()
df_custom['blob_name'] = df_custom['blob_name'].str.replace('.webm', '.mp4')
df_custom['bucket_name'] = 'flood-video-collection'

tagged = ~df_label['tags'].isin([[]])
cameras_with_labels = df_label[tagged]['code'].unique()
videos_from_cameras_with_labels = df_label[df_label['code'].isin(cameras_with_labels)]

print('Vídeos rotulados:', tagged.sum())
print('Câmeras com rótulos:', len(cameras_with_labels))
print('Vídeos de câmeras com rótulos:', len(videos_from_cameras_with_labels))

print('\nMissing values:')
print(df_custom[df_custom['timestamp'].isna()])

df_custom.dropna(subset=['timestamp'], inplace=True)

Vídeos rotulados: 120
Câmeras com rótulos: 15
Vídeos de câmeras com rótulos: 4225

Missing values:
                            _id  \
18462  6504ff5b874b309c35491888   
18463  6505012a874b309c35491889   
18464  65050136874b309c3549188a   

                                               blob_name  blob_size  \
18462           comando/CODE2017 2023-04-14 17-41-36.mp4        NaN   
18463  comando/lâmina/101084/CODE2205 2023-08-20 12-0...        NaN   
18464  comando/lâmina/101084/CODE2206 2023-08-20 12-1...        NaN   

                  bucket_name file_name  code  n_folders timestamp  \
18462  flood-video-collection       NaN   NaN        NaN       NaN   
18463  flood-video-collection       NaN   NaN        NaN       NaN   
18464  flood-video-collection       NaN   NaN        NaN       NaN   

      folder_structure folder                  tags  url api_url  \
18462              NaN    NaN  [alagamento, bolsão]  NaN     NaN   
18463              NaN    NaN              [lâmina]  NaN  

#### Utility functions

In [6]:
from modules.octa_video_util import _filter_by_query, _assign_tag
from modules.octa_video_util import VideoDownloader, VideoFrameExtractor
from modules.octa_video_util import buildImageDataset, buildImageDatasetThreads
from modules.octa_video_util import copy_images_to_folders

#### Download videos based on attributes

In [10]:
# from modules.octa_video_util import VideoDownloader

target_directory = 'data/videos/rotulados'
bucket_name = 'flood-video-collection'
query_params = {'code': list(cameras_with_labels)}
overwrite = False
credentials_path = '../../Flask APIs/cams-rio-api/auth/octacity-iduff.json'
max_threads = 100

downloader = VideoDownloader(df_custom, target_directory, credentials_path, max_threads)
downloader.download_videos(query_params, overwrite)


DONE! 4222/4222 files downloaded.4222/4222 files (100.00%) - data/videos/rotulados\waze/flood/f5000084-5a91-485a-97b9-abc35f3ef70d/1671/CODE1671 2023-04-14 17-28-48.mp4


#### Breakdown videos' frames

In [None]:
# from modules.octa_video_util import VideoFrameExtractor
# from modules.octa_video_util import _filter_by_query

base_directory = 'data/videos/rotulados'
target_directory = 'data/imgs'
query_params = {'code': list(cameras_with_labels)}
# query_params = {'code': cameras_with_labels[0]}
df_filtered = df_custom[_filter_by_query(df_custom, query_params)]
overwrite = False
MAX_THREADS = 10  # WATCH OUT

frame_extractor = VideoFrameExtractor(df_filtered, base_directory, target_directory, MAX_THREADS)
frame_extractor.extract_frames(overwrite)


Processed 92/4222 rows (2.18%)ta/videos/rotulados\comando/bolsão/101445/3211/CODE3211 2023-08-26 04-20-02.mp4mp4Extracted 45/45 frames from data/videos/rotulados\comando/alagamento/93901/1027/CODE1027 2023-03-30 20-50-14.mp4

#### Build images dataset

In [None]:
# from modules.octa_video_util import buildImageDataset
# from modules.octa_video_util import _assign_tag

dataset = df_custom.dropna(subset=['timestamp']).copy()
base_directory = 'data/videos/rotulados'
target_directory = 'data/imgs'
fps = 3

df_images =  buildImageDataset(dataset, base_directory, target_directory, fps=3)

# Create unique tag column
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'transbordo', 'poça']
df_images['tag'] = df_images['tags'].apply(lambda tags_list: _assign_tag(tags_list, tags_priority_list))

# Save images dataset
df_images.to_csv('data/datasets/images.csv', index=False)

# Print results
print('Image dataset shape:', df_images.shape)
print('Unique tags', df_images.tag.value_counts())

#### Build images dataset (Faster version with threads)

In [7]:
# from modules.octa_video_util import buildImageDatasetThreads
# from modules.octa_video_util import _assign_tag

dataset = df_custom.dropna(subset=['timestamp']).copy()
base_directory = 'data/videos/rotulados'
target_directory = 'data/imgs'
images_dataframe_path = 'data/datasets/images.csv'
fps = 3
print_each = 50
max_threads = 10
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'transbordo', 'poça']

df_images = buildImageDatasetThreads(dataset, base_directory, target_directory, fps, print_each, max_threads)

# Create unique tag column
df_images['tag'] = df_images['tags'].apply(lambda tags_list: _assign_tag(tags_list, tags_priority_list))

# Save images dataset
df_images.to_csv(images_dataframe_path, index=False)

# Print results
print('\nImage dataset shape:', df_images.shape)
print('Unique tags', df_images.tag.value_counts())

Processed videos: 4222/4222 (100.0) %

Image dataset shape: (177098, 10)
Unique tags tag
normal        173178
poça            2728
alagamento       803
lâmina           225
bolsão           164
Name: count, dtype: int64


#### Reload images dataframe

In [1]:
import pandas as pd

df_images = pd.read_csv('data/datasets/images.csv')

#### Custom sampling of images (Example usage)

In [2]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from modules.octa_video_util import _filter_by_query

query_params = {'code': list(range(1, 900))}
df_presample = df_images[_filter_by_query(df_images, query_params)].copy()

df_presample['flood'] = df_presample['tag'].isin(['lâmina', 'bolsão', 'alagamento', 'transbordo'])

df_presample.reset_index(drop=True, inplace=True)
X = df_presample.drop('flood', axis=1)
y = df_presample['flood']
groups = df_presample['code']

sgkf = StratifiedGroupKFold(n_splits=2, shuffle=True, random_state=None)

for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)):
    break

X_train = X.loc[train_index]
X_test = X.loc[test_index]

Y_train = y.loc[train_index]
Y_test = y.loc[test_index]

print('Train samples:',len(train_index))
print('Test samples:', len(test_index))

display(Y_train.value_counts().to_frame('train'))
display(Y_test.value_counts().to_frame('test'))

Train samples: 3817
Test samples: 69


Unnamed: 0_level_0,train
flood,Unnamed: 1_level_1
False,3592
True,225


Unnamed: 0_level_0,test
flood,Unnamed: 1_level_1
False,69


#### Customized Image Dataset Splitting and Copying Procedure

In [3]:
from modules.octa_video_util import copy_images_to_folders

base_directory = 'data/imgs'
target_directory = 'data/sample/1'
dataset = df_images
train_indexes = list(train_index)
test_indexes = list(test_index)

file_path_field = 'file_path'
tag_field = 'tag'

copy_images_to_folders(base_directory, target_directory, dataset, train_indexes, test_indexes, file_path_field=file_path_field, tag_field=tag_field)


Copying images to train folders:
Processed 3817/3817 files (100.00%) - Found: 2597/3817
Copying images to test folders:
Processed 69/69 files (100.00%) - Found: 69/3817