## Download videos, images and datasets from Octa City's collection of flood videos

#### Download the MongoDB `Videos Localizados` collection as a pandas DataFrame

In [46]:
from modules.mongo import MongoDB
import pandas as pd
import json

import os

# Read the connection string from the MONGO_CONNECTION_STRING environment variable so that
# real credentials never have to be written into the notebook. The original placeholder is
# kept as the fallback value, so behavior is unchanged when the variable is not set.
conn_str = os.environ.get('MONGO_CONNECTION_STRING', 'your_mongo_connection_string')
mongo = MongoDB(conn_str)

# Fetch the 'Videos Localizados' collection from the 'Waterbag' database
data = mongo.get('Waterbag', 'Videos Localizados')

# Convert to pandas dataframe
df = pd.DataFrame(data)

# Serialize the 'tags' field to a JSON string so it can be parsed back after the CSV round trip
df['tags'] = df['tags'].apply(json.dumps)

# Save as CSV. The output folder is created first because `to_csv` does not create
# missing directories.
os.makedirs('data/datasets', exist_ok=True)
df.to_csv('data/datasets/videos.csv', index=False)

print('Arquivos de vídeo do dataset: ', df.shape)

Arquivos de vídeo do dataset:  (62017, 15)


#### Reload data

In [4]:
import pandas as pd

# Load the videos dataset that was previously exported to CSV
videos_csv_path = 'data/datasets/videos.csv'
df = pd.read_csv(videos_csv_path)

#### Preprocessing

In [5]:
import json

# Parse the JSON-encoded 'tags' column back into Python objects.
# Only string values are parsed, so re-running this cell on an already-parsed frame
# is a no-op instead of raising TypeError (json.loads rejects list input).
df['tags'] = df['tags'].apply(lambda tags: json.loads(tags) if isinstance(tags, str) else tags)

df_custom = df.copy()
df_custom['bucket_name'] = 'flood-video-collection'
# Reproduce the .mp4 collection using the .webm collection.
# regex=False makes '.webm' a literal match; the default of `regex` differs between
# pandas versions, and as a pattern the leading '.' would match any character.
df_custom['blob_name'] = df_custom['blob_name'].str.replace('.webm', '.mp4', regex=False)


## == Report preprocessed dataset ===============

seen = df_custom['seen'].sum()
# A video counts as tagged when its tags value is anything other than an empty list.
# Element-wise comparison avoids depending on how `isin` treats unhashable list values.
tagged = df['tags'].map(lambda tags: tags != []).astype(bool)
cameras_with_labels = df[tagged]['code'].unique()
videos_from_cameras_with_labels = df[df['code'].isin(cameras_with_labels)]

print('Vídeos assistidos:', seen)
print('Vídeos rotulados:', tagged.sum())
print('Câmeras com rótulos:', len(cameras_with_labels))
print('Vídeos de câmeras com rótulos:', len(videos_from_cameras_with_labels))

# Show the rows about to be dropped (only 'timestamp' is checked for missing values)
print('\nROWS WITH MISSING VALUES:')
display(df_custom[df_custom['timestamp'].isna()])


## == DROP ROWS WITH MISSING TIMESTAMPS ===========

# Reassign instead of mutating in place, so the cell reads as one explicit transformation
df_custom = df_custom.dropna(subset=['timestamp'])


Vídeos assistidos: 0
Vídeos rotulados: 139
Câmeras com rótulos: 19
Vídeos de câmeras com rótulos: 4761

ROWS WITH MISSING VALUES:


Unnamed: 0,_id,blob_name,blob_size,bucket_name,file_name,code,n_folders,timestamp,folder_structure,folder,tags,url,api_url,bucket,seen
18462,6504ff5b874b309c35491888,comando/CODE2017 2023-04-14 17-41-36.mp4,,flood-video-collection,,,,,,,"[alagamento, bolsão]",,,flood-videos-stamped,False
18463,6505012a874b309c35491889,comando/lâmina/101084/CODE2205 2023-08-20 12-0...,,flood-video-collection,,,,,,,[lâmina],,,flood-videos-stamped,False
18464,65050136874b309c3549188a,comando/lâmina/101084/CODE2206 2023-08-20 12-1...,,flood-video-collection,,,,,,,[lâmina],,,flood-videos-stamped,False


#### Utility functions

In [6]:
from modules.octa_video_util import filter_by_query, _assign_tag
from modules.octa_video_util import VideoDownloader, VideoFrameExtractor
from modules.octa_video_util import buildImageDataset, buildImageDatasetThreads
from modules.octa_video_util import copy_images_to_folders

#### Download videos based on attributes

In [7]:
# Download the videos selected by `query_params` into a local folder.
# VideoDownloader is imported in the 'Utility functions' cell; the line below is kept as a reminder.
# from modules.octa_video_util import VideoDownloader

# Local folder passed to VideoDownloader as the download target
target_directory = 'data/videos'
# NOTE(review): `bucket_name` is not referenced again in this cell; the downloader presumably
# reads the bucket from the 'bucket_name' column of `df_custom` — confirm
bucket_name = 'flood-video-collection'

# Passed to `download_videos`; presumably selects rows whose 'code' and 'seen' values are in
# the listed values — TODO confirm against the module
query_params = {'code': [101, 102, 103], 'seen': [True, False]} 

# Passed to `download_videos`; presumably False skips files that already exist locally — confirm
overwrite = False
# Path to the credentials JSON file, relative to the notebook's working directory
credentials_path = '../../Flask APIs/cams-rio-api/auth/octacity-iduff.json'
# Thread count passed to VideoDownloader
max_threads = 30

downloader = VideoDownloader(df_custom, target_directory, credentials_path, max_threads)
downloader.download_videos(query_params, overwrite)


DONE! 83/83 files downloaded.0%)


#### Break down videos into frames

In [8]:
# Extract frames from the downloaded videos selected by `query_params`.
# VideoFrameExtractor and filter_by_query are imported in the 'Utility functions' cell;
# the lines below are kept as a reminder.
# from modules.octa_video_util import VideoFrameExtractor
# from modules.octa_video_util import filter_by_query

# Folder holding the downloaded videos (same path the download cell writes to)
base_directory = 'data/videos'
# Folder passed to VideoFrameExtractor as the destination for the extracted frames
target_directory = 'data/images'
# Same selection as the download cell, so only videos that were downloaded are processed
query_params = {'code': [101, 102, 103], 'seen': [True, False]} 

# Copy the filtered frame so the extractor works on its own dataframe rather than on `df_custom`
df_filtered = filter_by_query(df_custom, query_params).copy()
# Passed to `extract_frames`; presumably False skips videos whose frames already exist — confirm
overwrite = False
# NOTE(review): the original author flags this value; presumably a high thread count
# strains the machine during frame extraction — confirm before raising it
MAX_THREADS = 10  # WATCH OUT

frame_extractor = VideoFrameExtractor(df_filtered, base_directory, target_directory, MAX_THREADS)
frame_extractor.extract_frames(overwrite)


Extracted 45/45 frames from data/videos\waze/flood/fd1952e4-631d-4d83-908f-512e99203c46/103/CODE103 2023-04-08 10-33-33.mp4Extracted 45/45 frames from data/videos\waze/flood/38cf318e-7927-4266-a98a-74de426c617f/103/CODE103 2023-08-28 09-50-06.mp4

#### Build images dataset

Create the dataset of images for all videos in `base_directory` folder

Note: To update to the latest tags, rebuild the 'videos' dataset and pass it to `buildImageDataset`.

In [9]:
# from modules.octa_video_util import buildImageDataset
# from modules.octa_video_util import _assign_tag

# Inputs and settings for the image dataset build
dataset = df_custom.copy()
base_directory = 'data/videos'
images_dataset_path = 'data/datasets/images.csv'
fps = 3
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'transbordo', 'poça']

# Build images dataset. The configured `fps` is passed through (the call previously
# hard-coded 3, so changing the setting above had no effect).
df_images = buildImageDataset(dataset, base_directory, fps=fps)

# Create unique tag column based on class priority list
df_images['tag'] = df_images['tags'].apply(lambda tags_list: _assign_tag(tags_list, tags_priority_list))

# Save images dataset
df_images.to_csv(images_dataset_path, index=False)

# Print results
print('Image dataset shape:', df_images.shape)
print('Unique tags', df_images.tag.value_counts())

Processed videos: 83/83 (100.0) %

Image dataset shape: (2716, 11)
Unique tags tag
normal    2716
Name: count, dtype: int64


#### Build images dataset (Faster version with threads)

Create the dataset of images for all videos in `base_directory` folder

Note: To update to the latest tags, rebuild the 'videos' dataset and pass it to `buildImageDatasetThreads`.

In [10]:
# from modules.octa_video_util import buildImageDatasetThreads
# from modules.octa_video_util import _assign_tag

# Inputs for the threaded build: a private copy of the videos frame and the videos folder
dataset = df_custom.copy()
base_directory = 'data/videos'
images_dataset_path = 'data/datasets/images.csv'

# Build settings
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'transbordo', 'poça']
fps = 3
print_each = 50
max_threads = 10

df_images = buildImageDatasetThreads(dataset, base_directory, fps, print_each, max_threads)

# Derive the single 'tag' column from each row's tag list and the class priority list
df_images['tag'] = df_images['tags'].apply(_assign_tag, args=(tags_priority_list,))

# Persist the images dataset
df_images.to_csv(images_dataset_path, index=False)

# Report the dataset shape and the per-tag counts
tag_counts = df_images['tag'].value_counts()
print('\nImage dataset shape:', df_images.shape)
print('\nUnique tags:', tag_counts)

Processed videos: 83/83 (100.0) %

Image dataset shape: (2716, 11)

Unique tags: tag
normal    2716
Name: count, dtype: int64


#### Reload images dataframe

In [11]:
import pandas as pd

# Read the images dataset back from disk
images_csv_path = 'data/datasets/images.csv'
df_images = pd.read_csv(images_csv_path)

# Report how many image rows are flagged as seen, out of the total
seen_images = df_images['seen'].sum()
total_images = len(df_images)
print('Imagens assistidas dos videos baixados:', seen_images, '/', total_images)

Imagens assistidas dos videos baixados: 0 / 2716


---
## Example of train and test split + Copying images into train and test folders

#### Custom sampling of images (Example usage)

In [12]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from modules.octa_video_util import _filter_by_query

query_params = {'code': [101, 102, 103], 'seen': [True, False]}
df_presample = filter_by_query(df_images, query_params).copy()

# Build target variable for binary classification: 1 when the image tag is one of the flood classes
flood_tags = ['lâmina', 'bolsão', 'alagamento', 'transbordo']
df_presample['flood'] = df_presample['tag'].isin(flood_tags).astype(int)

# Renumber rows 0..n-1 so that the positional indices produced by the splitter
# also identify rows of `df_presample` by label
df_presample = df_presample.reset_index(drop=True)
X = df_presample.drop('flood', axis=1)
y = df_presample['flood']
# Group by camera code so that images from one camera never appear in both train and test
groups = df_presample['code']

sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=1)

# Only the first of the 5 folds is used in this example
train_index, test_index = next(sgkf.split(X, y, groups))

# `split` yields positional indices, so select rows by position
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]

Y_train = y.iloc[train_index]
Y_test = y.iloc[test_index]

print('Train samples:',len(train_index))
print('Test samples:', len(test_index))

display(Y_train.value_counts().to_frame('train'))
display(Y_test.value_counts().to_frame('test'))

Train samples: 2652
Test samples: 64


Unnamed: 0_level_0,train
flood,Unnamed: 1_level_1
0,2652


Unnamed: 0_level_0,test
flood,Unnamed: 1_level_1
0,64


#### Copy images from `train_index` and `test_index` into structured 'train' and 'test' folders

In [13]:
from modules.octa_video_util import copy_images_to_folders

# Folder holding the extracted frames, and the folder that receives the train/test copies
base_directory = 'data/images'
target_directory = 'data/samples/1'

# `train_index` / `test_index` come from splitting `df_presample` (after its index was reset),
# so they identify rows of `df_presample`. Passing `df_images` here would select the wrong
# rows whenever the query filter dropped any row. `df_presample` is a filtered copy of
# `df_images`, so it is expected to carry the same 'file_path' and 'tag' columns
# (presumably `filter_by_query` only removes rows — confirm).
dataset = df_presample
train_indexes = list(train_index)
test_indexes = list(test_index)

# Column names that `copy_images_to_folders` reads the file path and the label from
file_path_field = 'file_path'
label_field = 'tag'

copy_images_to_folders(base_directory, target_directory, dataset, train_indexes, test_indexes, file_path_field=file_path_field, tag_field=label_field)


Copying images to train folders:
Processed 2652/2652 files (100.00%) - Found: 927/2652
Copying images to test folders:
Processed 64/64 files (100.00%) - Found: 41/2652