# Build dataset from collection of flood videos

#### Import modules

In [1]:
cd ../../../../../Apps/APIs/cams-rio

C:\Users\luisr\Desktop\Repositories\Apps\APIs\cams-rio


In [2]:
import pandas as pd
from datetime import datetime as dt
from IPython.display import clear_output as co

  from pandas.core.computation.check import NUMEXPR_INSTALLED


#### Import Google Cloud Storage wrapper module and set storage instance

In [3]:
from modules.googlecloudstorage import GCS

# Relative path handed to GCS as its first argument; it is resolved against the
# working directory set by the 'cd' cell at the top of the notebook.
# NOTE(review): presumably a service-account JSON key file — confirm against modules.googlecloudstorage.
sa_json = 'auth/octacity-iduff.json' # 'auth/pluvia-sa.json'
# Second GCS argument, left unset here.
# NOTE(review): looks like a billing/user project override — confirm against the wrapper.
user_project = None
# Third GCS argument: same bucket name that the listing cell below passes explicitly.
default_bucket_name = 'flood-video-collection'

# Storage wrapper instance reused by the following cells.
gcs = GCS(sa_json, user_project, default_bucket_name)

#### List collection blobs with .mp4 extension

In [4]:
# Listing parameters
bucket_name = 'flood-video-collection'
prefix = ''        # passed to gcs.list_blobs as the name prefix
delimiter = None   # passed through to gcs.list_blobs unchanged
ext = '.mp4'       # only blob names ending with this suffix are kept

drop_first = False # when True, the first collected entry is discarded after the scan
print_each = 1000  # progress is reported every `print_each` scanned blobs

blobs = gcs.list_blobs(prefix, delimiter, bucket_name)

# Collect [blob name, blob size] for every blob whose name ends with `ext`.
names = []
i = -1 # keeps the final 'Blobs Searched' count at 0 (instead of a NameError) when the listing is empty
for i, blob in enumerate(blobs):
    if blob.name.endswith(ext):
        names.append([blob.name, blob.size])
    # co(True) defers the clearing until the next output arrives, so the counter is overwritten in place
    if (i + 1) % print_each == 0: print(f'\n- Blobs Searched: {i + 1}'); co(True)

print(f'\n- Blobs Searched: {i + 1}')
# Guard on `names` so that an empty result set does not raise IndexError when drop_first is enabled
if drop_first and names: print(f'\n · First item excluded: {names[0]}'); del names[0]
print(f'\n · Results: {len(names)}')


- Blobs Searched: 43290

 · Results: 43290


#### Build blobs control dataset

In [32]:
# Control table: one row per listed blob, enriched with the fields that can be
# recovered from the blob path itself.
control = pd.DataFrame(names, columns=['blob_name', 'blob_size'])

control['bucket_name'] = bucket_name
blob_info = []
for blob_name in control['blob_name']:
    info = blob_name.split('/')
    file_name = info[-1]
    code = info[-2] # name of the folder that directly contains the file
    # Everything after the first space-separated token of the file name is parsed as the timestamp
    stamp = ' '.join(file_name.split(' ')[1:])
    # Two time layouts are accepted ('HH:MM:SS' and 'HH-MM-SS'). Only a format mismatch
    # (ValueError) triggers the fallback; any other error, or a stamp matching neither
    # layout, still propagates.
    try: timestamp = dt.strptime(stamp, '%Y-%m-%d %H:%M:%S.mp4')
    except ValueError: timestamp = dt.strptime(stamp, '%Y-%m-%d %H-%M-%S.mp4')
    # len(info) counts every '/'-separated path component, file name included
    blob_info.append([file_name, code, len(info), timestamp])

control[['file_name', 'code', 'n_folders', 'timestamp']] = blob_info

# Default folder layout for each path depth. Depths 4 and 5 are ambiguous and are
# overridden below for blobs under 'rivers/' and 'polygons/' respectively.
folder_structure = {
    3: '{type}/{code}', # rain/1234/example.mp4
    4: 'polygons/{type}/{code}', # polygons/normal/1234/example.mp4
    5: '{source}/{type}/{event}/{code}', # comando/bolsão/id-abcd/1234/example.mp4
    6: 'polygons/{source}/{type}/{polygon}/{code}' # polygons/comando/bolsão/0/1234/example.mp4
}

control['folder_structure'] = control['n_folders'].map(folder_structure)

# fix 'rivers/manual' folder structure (depth 4, first folder is 'rivers')
depth_4_blob_names = control['blob_name'][control['n_folders'] == 4]
rivers_msk = depth_4_blob_names.str.startswith('rivers/')
control.loc[depth_4_blob_names.index[rivers_msk], 'folder_structure'] = '{type}/{subtype}/{code}' # rivers/manual/1234/example.mp4

# fix 'polygons' folder structure (depth 5, first folder is 'polygons')
depth_5_blob_names = control['blob_name'][control['n_folders'] == 5]
polygons_msk = depth_5_blob_names.str.startswith('polygons/')
control.loc[depth_5_blob_names.index[polygons_msk], 'folder_structure'] = 'polygons/{type}/{polygon}/{code}' # polygons/flood/0/1234/example.mp4

#### Save blobs control dataset

In [33]:
# Output location, relative to the current working directory.
data_path = '../../../Data Science Projects/Hackaton COR IV - Centro de Operações do RJ/INCUBAÇÃO/Cameras/Dados'
control_path = f'{data_path}/blobs/videos_control_19-04.csv'

# Write the control table as CSV, leaving the DataFrame index out of the file.
control.to_csv(path_or_buf=control_path, index=False)