# Download videos, images and datasets from Octa City's collection of flood videos

#### Set credentials

In [2]:
# mongo_connection_string = 'your_mongo_connection_string'
# google_credentials_path = 'path/to/your/google_service_account_credential.json'

## 1. Download videos dataset

Download `Videos Localizados` mongo collections as pandas dataframe

In [60]:
from modules.mongo import MongoDB
import pandas as pd
import numpy
import json
from time import time

videos_dataset_path = 'data/datasets/videos.csv'

# Get MongoDB collection as list of objects
s1 = time()
mongo = MongoDB(mongo_connection_string)
data = mongo.get('Waterbag', 'Videos Localizados')

# Convert to pandas dataframe
s2 = time()
df = pd.DataFrame(data)

# Convert 'tags' json field to string
df['tags'] = df['tags'].apply(json.dumps)

# Save as pandas dataframe
s3 = time()
df.to_csv(videos_dataset_path, index=False)

s4 = time()
total_mega_bytes = df['blob_size'].replace('', 0).astype('int').sum() / 1e9
print('Video files in dataset:', df.shape[0])
print('Total mega bytes (MBs):', round(total_mega_bytes, 3))
print('Time to download:', round(s2 - s1, 1))
print('Time to save dataframe:', round(s4 - s3, 1))
print('Time Total:', round(s4 - s1, 1), 's')

Video files in dataset: 62017
Total mega bytes (MBs): 78.405
Time to download: 16.6
Time to save dataframe: 2.5
Time Total: 19.7 s


#### Report videos dataset

In [12]:
seen = df['seen'].sum()
tagged = ~df['tags'].isin([[]])
cameras_with_labels = df[tagged]['code'].unique()
videos_from_cameras_with_labels = df[df['code'].isin(cameras_with_labels)]
rows_with_missin_values = df[df['timestamp'].isna()]

print('\nVídeos assistidos:', seen)
print('Vídeos rotulados:', tagged.sum())
print('Câmeras com rótulos:', len(cameras_with_labels))
print('Vídeos de câmeras com rótulos:', len(videos_from_cameras_with_labels))

print('\nROWS WITH MISSING VALUES:')
display(rows_with_missin_values)


Vídeos assistidos: 708
Vídeos rotulados: 608
Câmeras com rótulos: 47
Vídeos de câmeras com rótulos: 9159

ROWS WITH MISSING VALUES:


Unnamed: 0,_id,blob_name,blob_size,bucket_name,file_name,code,n_folders,timestamp,folder_structure,folder,tags,url,api_url,bucket,seen
18462,6504ff5b874b309c35491888,comando/CODE2017 2023-04-14 17-41-36.webm,,,,,,,,,"[alagamento, bolsão]",,,flood-videos-stamped,False
18463,6505012a874b309c35491889,comando/lâmina/101084/CODE2205 2023-08-20 12-0...,,,,,,,,,[lâmina],,,flood-videos-stamped,False
18464,65050136874b309c3549188a,comando/lâmina/101084/CODE2206 2023-08-20 12-1...,,,,,,,,,[lâmina],,,flood-videos-stamped,False


---
## 2. Reload videos dataset

In [7]:
import pandas as pd
import json

videos_dataset_path = 'data/datasets/videos.csv'
df = pd.read_csv(videos_dataset_path)

# Preprocessing
df['tags'] = df['tags'].apply(json.loads)

#### Preprocessing of videos dataset

In [8]:
import json

df_custom = df.copy()
df_custom['bucket_name'] = 'flood-video-collection'
df_custom['blob_name'] = df_custom['blob_name'].str.replace('.webm', '.mp4') # reproduce the .mp4 collection using the .webm collection
df_custom.dropna(subset=['timestamp'], inplace=True) # Drop rows with missing values for `timestmaps`field

---
## 3. Download video files

#### Import utility functions

In [1]:
from modules.octa_video_util import filter_by_query, _assign_tag
from modules.octa_video_util import VideoDownloader, VideoFrameExtractor
from modules.octa_video_util import buildImageDataset, buildImageDatasetThreads
from modules.octa_video_util import copy_images_to_folders

#### Download videos with optional query

In [85]:
# from modules.octa_video_util import VideoDownloader

target_directory = 'data/videos'
bucket_name = 'flood-video-collection'

# query_params = {'code': [101, 102, 103], 'seen': [True, False]} 
query_params = {'seen': [True]} 

overwrite = False
max_threads = 30

downloader = VideoDownloader(df_custom, target_directory, google_credentials_path, max_threads)
downloader.download_videos(query_params, overwrite)

DONE! 708/708 files downloaded.0%)


---
## 4. Extract and save image files

#### Extract images from video files (with optional query)

In [6]:
# from modules.octa_video_util import VideoFrameExtractor
# from modules.octa_video_util import filter_by_query

base_directory = 'data/videos'
target_directory = 'data/images'

# query_params = {'code': [101, 102, 103], 'seen': [True, False]} 
query_params = {'seen': [True]} 

df_filtered = filter_by_query(df_custom, query_params).copy()
overwrite = False
MAX_THREADS = 10  # Be careful

frame_extractor = VideoFrameExtractor(df_filtered, base_directory, target_directory, MAX_THREADS)
frame_extractor.extract_frames(overwrite)

Processed 709/708 rows (100.14%)/videos\polygons/manual/72/77/CODE77 2023-02-08 16-20-00.mp4p4-00-15.mp4mp4Extracted 45/45 frames from data/videos\polygons/comando/alagamento/0/1639/CODE1639 2023-08-28 23-35-03.mp4

---
## 5. Build images dataset

Create the dataset of images from the video files in `base_directory` folder.

Obs: To update to the latest 'tags' and 'seen' values, re-download the 'videos' dataset and pass it down to `buildImageDataset` or `buildImageDatasetThreads` below.

#### Build images dataset from video files

In [9]:
# from modules.octa_video_util import buildImageDataset
# from modules.octa_video_util import _assign_tag

dataset = df_custom.copy()
base_directory = 'data/videos'
images_dataset_path = 'data/datasets/images.csv'
fps = 3

# Build images dataset
df_images =  buildImageDataset(dataset, base_directory, fps=fps)

# Save images dataset
df_images.to_csv(images_dataset_path, index=False)

# Print results
print('Image dataset shape:', df_images.shape)

Processed videos: 708/708 (100.0) %

Image dataset shape: (29356, 10)


#### Build images dataset from video files with threads (Faster version)

In [7]:
# from modules.octa_video_util import buildImageDatasetThreads
# from modules.octa_video_util import _assign_tag

dataset = df_custom.copy()
base_directory = 'data/videos'
images_dataset_path = 'data/datasets/images.csv'
fps = 3
print_each = 50
max_threads = 10

df_images = buildImageDatasetThreads(dataset, base_directory, fps, print_each, max_threads)

# Save images dataset
df_images.to_csv(images_dataset_path, index=False)

# Print results
print('\nImage dataset shape:', df_images.shape)

Processed videos: 708/708 (100.0) %

Image dataset shape: (29356, 10)


---
## 6. Reload images dataset

In [2]:
import pandas as pd

images_dataset_path = 'data/datasets/images.csv'
df_images = pd.read_csv(images_dataset_path)

#### Create 'tag' field from multiple tags based on tag priority

In [3]:
# Create unique tag column based on class priority list
default_tag = 'normal'
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'poça', 'transbordo']

df_images['tag'] = df_images['tags'].apply(lambda tags_list: _assign_tag(tags_list, tags_priority_list, default_tag))

print('Imagens assistidas (de videos baixados):', df_images['seen'].sum(), '/', len(df_images))
print()
display(df_images.tag.value_counts())

Imagens assistidas (de videos baixados): 29356 / 29356



tag
poça          19877
normal         4889
lâmina         3661
alagamento      863
bolsão           66
Name: count, dtype: int64

---
## 7. Example: Train and test split + Copying images into train and test folders

#### Custom sampling of images (Example usage)

In [4]:
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold
from modules.octa_video_util import _filter_by_query

random_state = 0
max_samples = 300
minority_classes = ['lâmina', 'bolsão', 'alagamento']
class_col = 'tag'
target_col = 'flood'
groups_col = 'code'

# Custom pre-sample of images dataset
# query_params = {'code': [101, 102, 103], 'seen': [True, False]}
query_params = {}
df_presample = filter_by_query(df_images, query_params).copy()
df_presample.reset_index(drop=True, inplace=True)

# Create target variable for binary classification
df_presample[target_col] = df_presample[class_col].isin(minority_classes).astype(int)

# Get x and y
x = df_presample.drop(target_col, axis=1)
y = df_presample[target_col]
groups = df_presample[groups_col]

# Custom under sampling
minority_samples = (y == 1).sum()
y_minority_sample = y[y == 0].sample(n=minority_samples, replace=False, random_state=random_state)
y_res = pd.concat([y_minority_sample, y[y == 1]], axis=0).sample(max_samples, replace=True, random_state=random_state)
x_res = x.loc[y_res.index]
groups_res = groups.loc[y_res.index]

# Stratified group KFold split
sgkf = StratifiedGroupKFold(n_splits=3, shuffle=True, random_state=random_state)
for i, (train_index, test_index) in enumerate(sgkf.split(x_res, y_res, groups_res)):
    break

X_train = x_res.iloc[train_index]
X_test = x_res.iloc[test_index]

Y_train = y_res.iloc[train_index]
Y_test = y_res.iloc[test_index]

print('Train samples:',len(train_index))
print('Test samples:', len(test_index))

display(Y_train.value_counts().to_frame('train'))
display(Y_test.value_counts().to_frame('test'))

Train samples: 216
Test samples: 84


Unnamed: 0_level_0,train
flood,Unnamed: 1_level_1
1,127
0,89


Unnamed: 0_level_0,test
flood,Unnamed: 1_level_1
0,54
1,30


#### Copy images from `train_index`and `test_index` into structured 'train' and 'test' folders

In [10]:
from modules.octa_video_util import copy_images_to_folders

base_directory = 'data/images'
target_directory = 'data/samples/1'
dataset = df_presample.loc[y_res.index].reset_index(drop=True).copy()
train_indexes = list(train_index)
test_indexes = list(test_index)

file_path_field = 'file_path'
label_field = 'flood'

copy_images_to_folders(base_directory, target_directory, dataset, train_indexes, test_indexes, file_path_field=file_path_field, tag_field=label_field)

Copying images to train folders:
Processed 216/216 files (100.00%) - Found: 56/216
Copying images to test folders:
Processed 84/84 files (100.00%) - Found: 66/216

In [8]:
df_presample.loc[train_index]['tag'].value_counts()

tag
poça    216
Name: count, dtype: int64