# Download videos, images and datasets from Octa City's collection of flood videos

#### Set credentials

In [None]:
import os


def read_credential_from_env(var_name):
    """Return the value of environment variable `var_name`, or None if it is unset/empty.

    Prints a warning when the variable is missing so the problem is visible in
    this cell rather than as a NameError in a later one.
    """
    value = os.environ.get(var_name)
    if not value:
        print(f'WARNING: environment variable {var_name} is not set')
        return None
    return value


# Credentials are read from the environment so that secrets are never saved in the notebook.
# Export both variables before launching Jupyter, or assign the values by hand:
# mongo_connection_string = 'your_mongo_connection_string'
# google_credentials_path = 'path/to/your/google_service_account_credential.json'
mongo_connection_string = read_credential_from_env('MONGO_CONNECTION_STRING')
google_credentials_path = read_credential_from_env('GOOGLE_APPLICATION_CREDENTIALS')

## 1. Download videos dataset

Download the `Videos Localizados` MongoDB collection as a pandas DataFrame and save it as a CSV file.

In [None]:
from modules.mongo import MongoDB
import pandas as pd
import numpy
import json
from time import time

videos_dataset_path = 'data/datasets/videos.csv'

# Get MongoDB collection as list of objects
s1 = time()
mongo = MongoDB(mongo_connection_string)
data = mongo.get('Waterbag', 'Videos Localizados')

# Convert to pandas dataframe
s2 = time()
df = pd.DataFrame(data)

# Serialize the 'tags' field to a JSON string so it survives the CSV round trip
# (the reload cell decodes it again with json.loads)
df['tags'] = df['tags'].apply(json.dumps)

# Save the dataframe as a CSV file
s3 = time()
df.to_csv(videos_dataset_path, index=False)

s4 = time()
# Empty 'blob_size' cells count as 0. Dividing by 1e6 matches the "MBs" label
# printed below (the previous 1e9 divisor produced gigabytes).
# NOTE(review): assumes 'blob_size' is expressed in bytes -- confirm.
total_mega_bytes = df['blob_size'].replace('', 0).astype('int').sum() / 1e6
print('Video files in dataset:', df.shape[0])
print('Total mega bytes (MBs):', round(total_mega_bytes, 3))
print('Time to download:', round(s2 - s1, 1))
print('Time to save dataframe:', round(s4 - s3, 1))
print('Time Total:', round(s4 - s1, 1), 's')

#### Report videos dataset

In [None]:
def _has_tags(tags):
    """Return True when a 'tags' cell holds at least one tag.

    The download cell stores 'tags' as JSON text (an empty list becomes the
    string '[]'), so comparing against an empty Python list never identifies
    untagged rows. Both the JSON-string form and the raw list form are handled.
    """
    if isinstance(tags, str):
        return tags.strip() not in ('', '[]', 'null', 'NaN')
    if tags is None or isinstance(tags, float):  # None / NaN mean "no tags"
        return False
    return len(tags) > 0


seen = df['seen'].sum()
# Boolean mask of videos that carry at least one tag
tagged = df['tags'].apply(_has_tags).astype(bool)
cameras_with_labels = df[tagged]['code'].unique()
videos_from_cameras_with_labels = df[df['code'].isin(cameras_with_labels)]
# Rows without a timestamp (these are dropped later, in the preprocessing cell)
rows_with_missin_values = df[df['timestamp'].isna()]

print('\nVídeos assistidos:', seen)
print('Vídeos rotulados:', tagged.sum())
print('Câmeras com rótulos:', len(cameras_with_labels))
print('Vídeos de câmeras com rótulos:', len(videos_from_cameras_with_labels))

print('\nROWS WITH MISSING VALUES:')
display(rows_with_missin_values)

---
## 2. Reload videos dataset

In [1]:
import pandas as pd
import json

videos_dataset_path = 'data/datasets/videos.csv'

# Read the saved CSV and, as preprocessing, decode the JSON-encoded 'tags'
# column back into Python objects.
df = pd.read_csv(videos_dataset_path).assign(
    tags=lambda frame: frame['tags'].map(json.loads)
)

#### Preprocessing of videos dataset

In [2]:
import json

# Build the customised dataset without mutating `df`, so this cell can be re-run safely.
df_custom = (
    df
    .assign(
        bucket_name='flood-video-collection',
        # Reproduce the .mp4 collection from the .webm collection. The pattern is an
        # explicit, anchored regex: only a trailing '.webm' extension is rewritten and
        # the dot is literal, regardless of the pandas version's `regex` default.
        blob_name=df['blob_name'].str.replace(r'\.webm$', '.mp4', regex=True),
    )
    # Drop rows with missing values in the `timestamp` field
    .dropna(subset=['timestamp'])
)

---
## 3. Download video files

#### Import utility functions

In [3]:
from modules.octa_video_util import filter_by_query, _assign_tag
from modules.octa_video_util import VideoDownloader, VideoFrameExtractor
from modules.octa_video_util import buildImageDataset, buildImageDatasetThreads
from modules.octa_video_util import copy_images_to_folders

#### Download videos with optional query

In [None]:
# Needs VideoDownloader:
# from modules.octa_video_util import VideoDownloader

# Source bucket and local destination folder
bucket_name = 'flood-video-collection'
target_directory = 'data/videos'

# Download policy
max_threads = 30
overwrite = False

# Row filter passed to the downloader; an example with several fields:
# query_params = {'code': [101, 102, 103], 'seen': [True, False]} 
query_params = {'seen': [True]}

downloader = VideoDownloader(
    df_custom,
    target_directory,
    google_credentials_path,
    max_threads,
)
downloader.download_videos(query_params, overwrite)

---
## 4. Extract and save image files

#### Extract images from video files (with optional query)

In [4]:
# Needs VideoFrameExtractor and filter_by_query:
# from modules.octa_video_util import VideoFrameExtractor
# from modules.octa_video_util import filter_by_query

# Folder holding the video files, and folder that receives the extracted frames
base_directory = 'data/videos'
target_directory = 'data/images'

# Extraction policy
MAX_THREADS = 10  # Be careful
overwrite = False

# Row filter; an example with several fields:
# query_params = {'code': [101, 102, 103], 'seen': [True, False]} 
query_params = {'seen': [True]} 
df_filtered = filter_by_query(df_custom, query_params).copy()

frame_extractor = VideoFrameExtractor(
    df_filtered,
    base_directory,
    target_directory,
    MAX_THREADS,
)
frame_extractor.extract_frames(overwrite)

Processed 708/708 rows (100.00%)

FINISHED.

Frames found: 29356
Frames written to disk: 29356
Frames folder exists: 0
Videos not found: 0


#### Count saved images

In [5]:
import os

def count_files_with_extension(folder_path, ext=""):
    """Count the files below `folder_path` (recursively) whose names end with `ext`.

    With the default empty `ext`, every file is counted.
    """
    return sum(
        1
        for _root, _dirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith(ext)
    )

# Example usage: an empty extension counts every file under the folder
extension = ""
folder_path = "data/images"
file_count = count_files_with_extension(folder_path, ext=extension)
print(f"Number of '{extension}' files in '{folder_path}': {file_count}")


Number of '' files in 'data/images': 29356


---
## 5. Build images dataset

Create the dataset of images from the video files in `base_directory` folder.

Note: to pick up the latest `tags` and `seen` values, re-download the videos dataset (section 1) and pass it to `buildImageDataset` or `buildImageDatasetThreads` below.

#### Build images dataset from video files

In [34]:
# Needs buildImageDataset (and _assign_tag):
# from modules.octa_video_util import buildImageDataset
# from modules.octa_video_util import _assign_tag

# Inputs and output location
images_dataset_path = 'data/datasets/images.csv'
base_directory = 'data/videos'
fps = 3
dataset = df_custom.copy()

# Build the images dataset, then persist it as CSV
df_images = buildImageDataset(dataset, base_directory, fps=fps)
df_images.to_csv(images_dataset_path, index=False)

# Report the resulting shape
print('Image dataset shape:', df_images.shape)

Processed videos: 708/708 (100.0) %

Image dataset shape: (29356, 10)


#### Build images dataset from video files with threads (Faster version)

##### Obs: Not working

In [6]:
# Needs buildImageDatasetThreads (and _assign_tag):
# from modules.octa_video_util import buildImageDatasetThreads
# from modules.octa_video_util import _assign_tag

# Inputs and output location
images_dataset_path = 'data/datasets/images.csv'
base_directory = 'data/videos'
dataset = df_custom.copy()

# Settings forwarded positionally to buildImageDatasetThreads
fps = 3
print_each = 50
max_threads = 10

# Build the images dataset, then persist it as CSV
df_images = buildImageDatasetThreads(
    dataset,
    base_directory,
    fps,
    print_each,
    max_threads,
)
df_images.to_csv(images_dataset_path, index=False)

# Report the resulting shape
print('\nImage dataset shape:', df_images.shape)

Processed videos: 708/708 (100.0) %

Image dataset shape: (29356, 10)


---
## 6. Reload images dataset

In [1]:
import pandas as pd

# Location of the images dataset CSV written by the "Build images dataset" cells
images_dataset_path = 'data/datasets/images.csv'
# Load it back into a dataframe; columns keep whatever dtypes read_csv infers
df_images = pd.read_csv(images_dataset_path)

#### Create 'tag' field from multiple tags based on tag priority

In [2]:
from modules.octa_video_util import _assign_tag

# Collapse each row's tags into a single 'tag' column using a class priority
# list; `default_tag` is handed to _assign_tag as its third argument.
tags_priority_list = ['alagamento', 'bolsão', 'lâmina', 'poça', 'transbordo']
default_tag = 'normal'

df_images['tag'] = df_images['tags'].apply(
    _assign_tag,
    args=(tags_priority_list, default_tag),
)

print('Imagens assistidas (de videos baixados):', df_images['seen'].sum(), '/', len(df_images))
print()
display(df_images['tag'].value_counts())

Imagens assistidas (de videos baixados): 29356 / 29356



tag
poça          19877
normal         4889
lâmina         3661
alagamento      863
bolsão           66
Name: count, dtype: int64