# Download videos, images and datasets from Octa City's collection of flood videos

#### Set credentials

In [6]:
# mongo_connection_string = 'your_mongo_connection_string' 
# google_credentials_path = 'path/to/your/service_account_credential.json'

mongo_connection_string = 'mongodb+srv://luisresende13:Gaia0333@pluvia-cluster.ea8fb4s.mongodb.net/?retryWrites=true&w=majority' 
google_credentials_path = '../../Flask APIs/cams-rio/auth/octacity-iduff.json'

## 1. Download videos dataset

Download `Videos Localizados` mongo collections as pandas dataframe

In [2]:
from modules.mongo import MongoDB
import pandas as pd
import numpy
import json
from time import time

db = 'Waterbag'
coll = 'Videos Localizados'
videos_dataset_path = 'data/datasets/videos.csv'

# Get MongoDB collection as list of objects
s1 = time()
mongo = MongoDB(mongo_connection_string)
data = mongo.get(db, coll)

# Convert to pandas dataframe
s2 = time()
df = pd.DataFrame(data)

# Convert 'tags' json field to string
df['tags'] = df['tags'].apply(json.dumps)

# Save as pandas dataframe
s3 = time()
df.to_csv(videos_dataset_path, index=False)

s4 = time()
total_mega_bytes = df['blob_size'].replace('', 0).astype('int').sum() / 1e9
print('Video files in dataset:', df.shape[0])
print('Total mega bytes (MBs):', round(total_mega_bytes, 3))
print('Time to download:', round(s2 - s1, 1))
print('Time to save dataframe:', round(s4 - s3, 1))
print('Time Total:', round(s4 - s1, 1), 's')

Video files in dataset: 62017
Total mega bytes (MBs): 78.405
Time to download: 7.8
Time to save dataframe: 1.9
Time Total: 10.3 s


#### Report videos dataset

In [3]:
total = len(df)
seen = df['seen'].sum()
tagged = df['tags'].apply(json.loads).apply(len) != 0
cameras_with_labels = df[tagged]['code'].unique()
videos_from_cameras_with_labels = df[df['code'].isin(cameras_with_labels)]
rows_with_missin_values = df[df['timestamp'].isna()]

print('Videos:', total)
print('Vídeos assistidos:', seen)
print('Vídeos rotulados:', tagged.sum())
print('Câmeras com rótulos:', len(cameras_with_labels))
print('Vídeos de câmeras com rótulos:', len(videos_from_cameras_with_labels))

from modules.octa_video_util import _assign_tag
default_tag = 'normal'
tags_priority_list = ['alagamento', 'bolsão', 'lâmina']

video_tags = df[df['seen']]['tags'].apply(lambda tags_list: _assign_tag(tags_list, tags_priority_list, default_tag))
print()
display(video_tags.rename('TAGS IN VIDEOS SEEN').value_counts())

print('\nROWS WITH MISSING VALUES:')
display(rows_with_missin_values)

Videos: 62017
Vídeos assistidos: 14680
Vídeos rotulados: 5592
Câmeras com rótulos: 406
Vídeos de câmeras com rótulos: 38575



TAGS IN VIDEOS SEEN
normal        14142
alagamento      538
Name: count, dtype: int64


ROWS WITH MISSING VALUES:


Unnamed: 0,_id,blob_name,blob_size,bucket_name,file_name,code,n_folders,timestamp,folder_structure,folder,tags,url,api_url,bucket,seen


---
## 2. Reload videos dataset

In [2]:
import pandas as pd
import json

videos_dataset_path = 'data/datasets/videos.csv'
df = pd.read_csv(videos_dataset_path)

#### Preprocessing of videos dataset

In [3]:
# Preprocessing
from modules.octa_video_util import _assign_tag

default_tag = 'normal'
tags_priority_list = ['alagamento', 'bolsão', 'lâmina']

df['tags'] = df['tags'].apply(json.loads)
df['tag'] = df['tags'].apply(lambda tags_list: _assign_tag(tags_list, tags_priority_list, default_tag))
df['flood'] = df['tag'].isin(['lâmina', 'bolsão', 'alagamento']).astype(int)

import json

df_custom = df.copy()
df_custom['bucket_name'] = 'flood-video-collection'
df_custom['blob_name'] = df_custom['blob_name'].str.replace('.webm', '.mp4') # reproduce the .mp4 collection using the .webm collection
df_custom.dropna(subset=['timestamp'], inplace=True) # Drop rows with missing values for `timestmaps` field

#### Report videos dataset

In [4]:
df_seen = df[df['seen']]

codes = df_seen['code'].unique()
codes_1 = df_seen[df_seen['flood'] == 1]['code'].unique()
codes_0 = df_seen[df_seen['flood'] == 0]['code'].unique()

display(df['tag'].value_counts().rename('All Videos'))
print()
display(df_seen['tag'].value_counts().rename('Seen Videos Only'))
print()
display(df['flood'].value_counts().rename('Flood'))

print()
print('Videos seen:', len(df_seen))
print()
print('Cameras seen with positive samples:', len(codes_1))
print('Cameras seen with negative samples:', len(codes_0))
print('Cameras seen total:', len(codes))

tag
normal        59295
bolsão         1174
lâmina         1009
alagamento      539
Name: All Videos, dtype: int64




tag
normal        11965
bolsão         1174
lâmina         1003
alagamento      538
Name: Seen Videos Only, dtype: int64




flood
0    59295
1     2722
Name: Flood, dtype: int64


Videos seen: 14680

Cameras seen with positive samples: 144
Cameras seen with negative samples: 473
Cameras seen total: 473


---
## 3. Download video files

#### Import utility functions

In [4]:
from modules.octa_video_util import filter_by_query, _assign_tag
from modules.octa_video_util import VideoDownloader, VideoFrameExtractor
from modules.octa_video_util import buildImageDataset, buildImageDatasetThreads
from modules.octa_video_util import copy_images_to_folders

#### Download videos with optional query

In [7]:
# from modules.octa_video_util import VideoDownloader

target_directory = 'data/videos'
bucket_name = 'flood-video-collection'

# query_params = {'code': [101, 102, 103], 'seen': [True, False]} 
query_params = {'seen': [True]}

overwrite = False
max_threads = 12

# Exclude missing video manually
df_custom_2 = df_custom[~df_custom['blob_name'].str.contains('parallel')]

downloader = VideoDownloader(df_custom_2, target_directory, google_credentials_path, max_threads)
downloader.download_videos(query_params, overwrite)

DONE! 14679/14679 files downloaded.0%)


---
## 4. Build images dataset

Create the dataset of images from the video files in `base_directory` folder.

Obs: To update to the latest 'tags' and 'seen' values, re-download the 'videos' dataset and pass it down to `buildImageDataset` or `buildImageDatasetThreads` below.

#### Build images dataset from video files

In [8]:
from modules.octa_video_util import buildImageDataset
from modules.octa_video_util import _assign_tag

dataset = df_custom_2.copy()
base_directory = 'data/videos'
images_dataset_path = 'data/datasets/images.csv'
fps = 3

# Build images dataset
df_images =  buildImageDataset(dataset, base_directory, fps=fps)

# Save images dataset
df_images.to_csv(images_dataset_path, index=False)

# Print results
print('Image dataset shape:', df_images.shape)

Processed videos: 14679/14679 (100.0) %

Image dataset shape: (591919, 10)


---
## 5. Extract and save image files

#### Extract images from video files (with optional query)

In [9]:
# from modules.octa_video_util import VideoFrameExtractor
# from modules.octa_video_util import filter_by_query

base_directory = 'data/videos'
target_directory = 'data/images'
query_params = { 'seen': [True] }
MAX_THREADS = 10  # BE CAREFUL WITH `max_threads`
overwrite = False
fps = 3
delete_on_success = True 

df_custom_2 = df_custom[~df_custom['blob_name'].str.contains('parallel')]

df_filtered = filter_by_query(df_custom_2, query_params).copy()
frame_extractor = VideoFrameExtractor(df_filtered, base_directory, target_directory, MAX_THREADS)
frame_extractor.extract_frames(overwrite, fps, delete_on_success)

Processed 14679/14679 rows (100.00%)

FINISHED.

Frames found: 591919
Frames written to disk: 591919
Frames folder exists: 0
Videos not found: 0


#### Count saved images

In [10]:
import os
def count_files_with_extension(folder_path, ext=""):
    count = 0
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith(ext):
                count += 1
    return count

# Example usage:
folder_path = "data/images"
extension = ".jpg"
file_count = count_files_with_extension(folder_path, extension)
print(f"Number of '{extension}' files in '{folder_path}': {file_count}")

Number of '.jpg' files in 'data/images': 591919


---
## 6. Reload images dataset

In [11]:
import pandas as pd

images_dataset_path = 'data/datasets/images.csv'
df_images = pd.read_csv(images_dataset_path)

---
## 7. Preprocess the images dataset

Add useful and convenient information as new fields of the dataset

In [12]:
from modules.octa_video_util import _assign_tag
from modules.octa_video_util import filter_by_query

df_images_clean = df_images.copy()

#### Create field `tag` based on tag priority list

In [13]:
default_tag = 'normal'
tags_priority_list = ['alagamento', 'bolsão', 'lâmina']

# Create unique tag column based on class priority list
df_images_clean['tag'] = df_images_clean['tags'].apply(lambda tags_list: _assign_tag(tags_list, tags_priority_list, default_tag))

display(df_images_clean.tag.value_counts())

tag
normal        461519
bolsão         49619
alagamento     42560
lâmina         38221
Name: count, dtype: int64

#### Create `flood` field (binarize tag field)

In [14]:
target_classes = ['lâmina', 'bolsão', 'alagamento']

# Binarize categorical variable from list of target classes
df_images_clean['flood'] = df_images_clean['tag'].isin(target_classes).astype(int)

display(df_images_clean['flood'].value_counts())
print()
display(df_images_clean.index[:10])

flood
0    461519
1    130400
Name: count, dtype: int64




RangeIndex(start=0, stop=10, step=1)

#### Save processed dataset

In [15]:
from datetime import datetime

df_images_clean.to_csv('data/datasets/images_clean.csv', index=False)

print('Clean Images Dataset Saved at:', str(datetime.now()))

Clean Images Dataset Saved at: 2024-03-08 02:01:40.863189
