# Build dataset from collection of flood videos

#### Set working directory and import modules

In [1]:
cd ../../../../../Apps/APIs/cams-rio

C:\Users\luisr\Desktop\Repositories\Apps\APIs\cams-rio


In [2]:
import pandas as pd
from datetime import datetime as dt
from IPython.display import clear_output as co

#### Import Google Cloud Storage wrapper module and set storage instance

In [3]:
from modules.googlecloudstorage import GCS

# Arguments handed to the storage wrapper (meanings inferred from the names — confirm
# against `modules.googlecloudstorage.GCS`).
default_bucket_name = 'flood-video-collection'
user_project = None
sa_json = 'auth/octacity-iduff.json'  # alternative key file: 'auth/pluvia-sa.json'

# Storage instance reused by the following cells.
gcs = GCS(sa_json, user_project, default_bucket_name)

#### List collection blobs with .mp4 extension

In [4]:
# Listing parameters.
bucket_name = 'flood-video-collection'
prefix = ''
delimiter = None
ext = '.mp4'  # only blobs whose name ends with this suffix are kept

drop_first = False  # when True, discard the first matching blob after listing
print_each = 1000   # progress is reported every `print_each` blobs

blobs = gcs.list_blobs(prefix, delimiter, bucket_name)

# Collect [name, size] for every matching blob.
names = []
n_searched = 0  # pre-initialised so the summary below still works when the listing is empty
for n_searched, blob in enumerate(blobs, start=1):
    if blob.name.endswith(ext):
        names.append([blob.name, blob.size])
    if n_searched % print_each == 0:
        print(f'\n- Blobs Searched: {n_searched}')
        co(True)  # clear_output(wait=True): the line is replaced by the next print

print(f'\n- Blobs Searched: {n_searched}')
# Guard on `names` so `drop_first` cannot raise IndexError when nothing matched.
if drop_first and names:
    print(f'\n · First item excluded: {names[0]}')
    del names[0]
print(f'\n · Results: {len(names)}')


- Blobs Searched: 44036

 · Results: 44036


#### Build video blobs control dataset

In [5]:
def _parse_blob_name(blob_name):
    """Split a blob path into the per-blob fields of the control dataset.

    Parameters
    ----------
    blob_name : str
        Blob path using '/' as folder separator.

    Returns
    -------
    list
        [file_name, code, n_folders, timestamp] where `file_name` is the last path
        component, `code` is the immediate parent folder (None when the blob has no
        parent folder), `n_folders` is the number of folders in the path and
        `timestamp` is parsed from the file name.

    Raises
    ------
    ValueError
        If the text after the first space of the file name matches neither
        '%Y-%m-%d %H:%M:%S.mp4' nor '%Y-%m-%d %H-%M-%S.mp4'.
    """
    parts = blob_name.split('/')
    file_name = parts[-1]
    n_folders = len(parts) - 1
    # A blob at the bucket root has no parent folder; avoid IndexError on parts[-2].
    code = parts[-2] if n_folders > 0 else None
    # Drop the first space-separated token of the file name; the rest is the time stamp.
    stamp = ' '.join(file_name.split(' ')[1:])
    try:
        timestamp = dt.strptime(stamp, '%Y-%m-%d %H:%M:%S.mp4')
    except ValueError:
        # Fall back to the variant that separates the time fields with '-'.
        timestamp = dt.strptime(stamp, '%Y-%m-%d %H-%M-%S.mp4')
    return [file_name, code, n_folders, timestamp]


control = pd.DataFrame(names, columns=['blob_name', 'blob_size'])
control['bucket_name'] = bucket_name

# Parse every blob name once and attach the parsed fields as typed columns.
blob_info = [_parse_blob_name(name) for name in control['blob_name']]
info_columns = ['file_name', 'code', 'n_folders', 'timestamp']
control = control.join(pd.DataFrame(blob_info, columns=info_columns, index=control.index))

# build `folder_structure` column based on `n_folders`
# (depths missing from this mapping become NaN unless one of the overrides below applies)

folder_structure = {
    2: '{type}/{code}', # rain/1234/example.mp4
    3: 'polygons/{type}/{code}', # polygons/normal/1234/example.mp4
    4: '{source}/{type}/{event}/{code}', # comando/bolsão/id-abcd/1234/example.mp4
    5: 'polygons/{source}/{type}/{polygon}/{code}' # polygons/comando/bolsão/0/1234/example.mp4
}
# Two layouts share a depth with an entry above and are therefore set by the overrides below:
#   '{type}/{subtype}/{code}'           e.g. rivers/manual/123/example.mp4
#   'polygons/{type}/{polygon}/{code}'  e.g. polygons/flood/0/1234/example.mp4

control['folder_structure'] = control['n_folders'].map(folder_structure)

# fix 'rivers/manual' folder structure
# NOTE(review): this mask ignores `n_folders`, so any path containing 'rivers', whatever
# its depth, receives this three-level template — confirm that is intended.

filter_msk = control['blob_name'].str.contains('rivers')
control.loc[filter_msk, 'folder_structure'] = '{type}/{subtype}/{code}'   # rivers/manual/123/example.mp4

# fix 'polygons' folder structure (applied last, so it wins over the 'rivers' override)

filter_msk = (control['n_folders'] == 4) & control['blob_name'].str.contains('polygons')
control.loc[filter_msk, 'folder_structure'] = 'polygons/{type}/{polygon}/{code}'   # polygons/flood/0/1234/example.mp4

#### Checking: Folder structure count

In [6]:
# Number of blobs per (n_folders, folder_structure) pair, ordered by the pair.
control.value_counts(subset=['n_folders', 'folder_structure']).sort_index()

n_folders  folder_structure                         
2          {type}/{code}                                21182
3          polygons/{type}/{code}                          69
           {type}/{subtype}/{code}                        478
4          polygons/{type}/{polygon}/{code}             11393
           {source}/{type}/{event}/{code}               10192
           {type}/{subtype}/{code}                         89
5          polygons/{source}/{type}/{polygon}/{code}      545
7          {type}/{subtype}/{code}                         88
dtype: int64

#### Save blobs control dataset

In [7]:
# Output location, relative to the current working directory.
data_path = '../../../Data Science Projects/Hackaton COR IV - Centro de Operações do RJ/INCUBAÇÃO/Cameras/Dados'
control_path = f'{data_path}/Controle de vídeos/videos_control_2023-04-29.csv'

# Write the control dataset as CSV without the row index.
control.to_csv(path_or_buf=control_path, index=False)