# Cloud Storage buckets image and video processing pipelines

In [1]:
import pandas as pd, numpy as np
from IPython.display import clear_output as co
from time import time

  from pandas.core.computation.check import NUMEXPR_INSTALLED


#### Google Cloud Storage wrapper module

In [2]:
from modules.googlecloudstorage import GCS, storage

# Path to the JSON credentials file handed to the GCS wrapper (presumably a service-account key).
# NOTE(review): this relative path climbs out of the repository and is machine-specific;
# consider reading it from configuration or an environment variable.
sa_json = '../../../../Apps/Python/cams-rio/auth/octacity-iduff.json' # 'auth/pluvia-sa.json'
# Project name and default bucket passed to the wrapper below.
user_project = 'octacity'
default_bucket_name = 'city-camera-images'

# Storage wrapper instance reused by later cells (`Video(..., gcs=gcs)`, `gcs.list_blobs(...)`).
gcs = GCS(sa_json, user_project, default_bucket_name)

#### Image classification settings - Histogram max. percentage

In [3]:
from modules.histogram import HistogramClassifier

# Threshold passed to HistogramClassifier; per the section title it is presumably a
# maximum histogram percentage — TODO confirm in modules.histogram.
threshold=0.6
clf_hist = HistogramClassifier(threshold)

# Bound method later handed to Video(...) as its `skipper` argument.
skipper = clf_hist.is_histogram_clustered

#### Python module: Controlled pipeline execution (pandas DataFrame-based)

In [4]:
from modules.controlled_pipeline import ControlledPipeline

### EXAMPLE USAGE · Google Cloud Storage image processing python module

In [5]:
from modules.gcs_image import Video

# Uncomment to pass no skipper to Video instead of the histogram-based one.
# skipper = None

# Video helper reused by the upload cells below; it receives the shared `gcs` wrapper
# and the `skipper` callable.
# NOTE(review): `shape` is presumably (width, height) in pixels — confirm in modules.gcs_image.
video = Video(shape=(854, 480), fps=3, ext='.mp4', codec='mp4v', skipper=skipper, gcs=gcs)

#### Upload video from folder of images in cloud storage bucket (simple example)

In [6]:
# Source folder of images and destination blob for the demo upload.
folder = 'flood/1635/2023-02-10 18:10:00/'
from_bucket_name = np.nan  # alternative: 'city-camera-images'
to_blob_name = 'test/TEST.mp4'
to_bucket_name = 'city-camera-images'  # alternative: 'flood-video-collection'

# Time one folder-to-video upload.
s = time()
video.upload_from_bucket_folder_of_images(
    folder,
    from_bucket_name,
    to_blob_name,
    to_bucket_name,
    ext='.jpg',
    content_type='video/mp4',
    overwrite=True,
    report=True,
)
t = time() - s

print(f'EXECUTION TIME: {round(t, 4)} s')


* VIDEO UPLOAD SUCCESS · IMAGES: 7 · BLOB: test/TEST.mp4 · BUCKET: city-camera-images
EXECUTION TIME: 1.7714 s


# Cloud Storage buckets image and video processing pipelines

#### Load adapted cameras dataset

In [33]:
import pandas as pd

# Read the cameras table, cast both id columns to int and index the frame by camera code.
# Alternative location: 'static/city/cameras.csv'
cameras = (
    pd.read_csv('../../../../Apps/Python/cams-rio/static/city/cameras.csv')
    .astype({'Codigo': int, 'cluster_id': int})
    .set_index('Codigo')
)

#### Folder to blob name settings

In [93]:
def folder_to_blob_name(folder, folder_map=None, polygon_blobs=None):
    """Build the destination video blob name for a folder of images.

    `folder` is read as '<prefix>/<code>/<stamp>' (one trailing '/' is dropped):
    the last two path components are the code and the stamp, everything before
    them is the prefix looked up in `folder_map`.

    Parameters
    ----------
    folder : str
        Origin folder name.
    folder_map : dict, optional
        Maps origin prefixes to destination prefixes. Defaults to an empty map.
    polygon_blobs : collection of str, optional
        Origin prefixes whose destination name also includes the `cluster_id`
        read from the notebook-level `cameras` frame for that code.

    Returns
    -------
    str
        '<to_prefix>/<code>/CODE<code> <stamp>.mp4', with '<cluster_id>/'
        inserted before '<code>/' for prefixes listed in `polygon_blobs`.

    Raises
    ------
    KeyError
        If the prefix is not a key of `folder_map`.
    IndexError
        If `folder` has fewer than two path components.
    """
    # None sentinels replace the mutable `{}` / `[]` defaults of the original signature.
    if folder_map is None: folder_map = {}
    if polygon_blobs is None: polygon_blobs = ()
    if folder.endswith('/'): folder = folder[:-1]
    info = folder.split('/')
    prefix, code, stamp = '/'.join(info[:-2]), info[-2], info[-1]
    to_prefix = folder_map[prefix]
    if prefix in polygon_blobs:
        # `cameras` is indexed by integer camera code.
        polygon_id = cameras.loc[int(code), 'cluster_id']
        return f'{to_prefix}/{polygon_id}/{code}/CODE{code} {stamp}.mp4'
    return f'{to_prefix}/{code}/CODE{code} {stamp}.mp4'

def blob_name_to_blob_name(blob_name, folder_map=None, polygon_blobs=None):
    """Translate an origin blob name into its destination blob name.

    `blob_name` is read as '<prefix>/<code>/<filename>'. Names whose prefix is
    not a key of `folder_map` are returned unchanged, and so are names with
    fewer than two path components (the original raised IndexError for those).

    Parameters
    ----------
    blob_name : str
        Origin blob name.
    folder_map : dict, optional
        Maps origin prefixes to destination prefixes. Defaults to an empty map.
    polygon_blobs : collection of str, optional
        Origin prefixes whose destination name also includes the `cluster_id`
        read from the notebook-level `cameras` frame for that code.

    Returns
    -------
    str
        '<to_prefix>/<code>/<filename>', with '<cluster_id>/' inserted before
        '<code>/' for prefixes listed in `polygon_blobs`; or `blob_name` itself
        when no mapping applies.
    """
    # None sentinels replace the mutable `{}` / `[]` defaults of the original signature.
    if folder_map is None: folder_map = {}
    if polygon_blobs is None: polygon_blobs = ()
    info = blob_name.split('/')
    # A bare file name has no code/prefix components, so no mapping can apply.
    if len(info) < 2: return blob_name
    prefix, code, filename = '/'.join(info[:-2]), info[-2], info[-1]
    if prefix not in folder_map: return blob_name
    to_prefix = folder_map[prefix]
    if prefix in polygon_blobs:
        # `cameras` is indexed by integer camera code.
        polygon_id = cameras.loc[int(code), 'cluster_id']
        return f'{to_prefix}/{polygon_id}/{code}/{filename}'
    return f'{to_prefix}/{code}/{filename}'

# Origin prefix -> destination prefix.
folder_map = {
    'pics': 'polygons/flood-unlabeled',
    'manual/2023-08-02': 'polygons/manual',
    'flood': 'polygons/flood',
    'rain': 'rain',
    'auto/flood': 'polygons/flood',
    'auto/rain': 'rain',
}

# Origin prefixes whose destination name also carries the camera's cluster id.
polygon_blobs = ['manual/2023-08-02', 'flood', 'pics', 'auto/flood']


def folder_to_blob_name_map(folder):
    """Apply `folder_to_blob_name` with this notebook's prefix settings."""
    return folder_to_blob_name(folder, folder_map, polygon_blobs)


def blob_name_map(blob_name):
    """Apply `blob_name_to_blob_name` with this notebook's prefix settings."""
    return blob_name_to_blob_name(blob_name, folder_map, polygon_blobs)

---
# Move videos between Cloud Storage buckets

#### List blobs in bucket

In [82]:
bucket_name = 'city-camera-images'
prefix = ''
delimiter = None
ext = '.mp4'

print_each = 1000

blobs = gcs.list_blobs(prefix, delimiter, bucket_name)

# Collect the names of blobs ending with `ext`, reporting progress every `print_each` blobs.
# `searched` starts at 0 so the summary still prints for an empty listing; relying on
# the loop variable alone raised NameError in that case.
names = []
searched = 0
for searched, blob in enumerate(blobs, start=1):
    if blob.name.endswith(ext): names.append(blob.name)
    if searched % print_each == 0: print(f'\n- Blobs Searched: {searched}'); co(True)

print(f'\n- Blobs Searched: {searched}')
print(f'\n · Results: {len(names)}')


- Blobs Searched: 514831

 · Results: 7679


#### Build video files control dataset from blobs names

In [84]:
control_path = 'Dados/blobs/video_control.csv'

# One row per listed blob name; every row starts with status PENDING.
control = pd.DataFrame(names, columns=['blob_name']).assign(
    from_bucket_name=None,
    to_blob_name=lambda frame: frame['blob_name'].map(blob_name_map),
    to_bucket_name='flood-video-collection',
    content_type='video/mp4',
    overwrite=False,
    status='PENDING',
)

control.to_csv(control_path, index=False)

#### Run move-videos pipeline using control dataset

In [12]:
# Values passed to pipe.run(report_each, save_each) at the end of this cell.
save_each = 50
report_each = 10

control_path = f'Dados/blobs/video_control.csv'
# NOTE(review): `control_func` is never assigned in this cell. The line below is commented
# out and `upload_from_blob_name` is not defined anywhere in the visible notebook, so on a
# fresh kernel ControlledPipeline(...) raises NameError unless another cell set `control_func`.
# control_func = upload_from_blob_name

# Control-file column names handed to ControlledPipeline as `params_fields`.
params_fields = ['blob_name', 'from_bucket_name', 'to_blob_name', 'to_bucket_name', 'overwrite']
# Presumably selects the rows still to be processed by status — confirm in modules.controlled_pipeline.
query = {'status': ['PENDING', 'UPLOAD_FAILED']}
status_field = 'status'
status_options = ['UPLOAD_FAILED', 'UPLOADED']
error_flag = 'ERROR'

pipe = ControlledPipeline(
    control_path, control_func, params_fields,
    query, status_field, status_options, error_flag
)

pipe.run(report_each, save_each)


- PROGRESS: 7450 / 7452 ops · PROGRESS-PRCT: 100.0 %

- TOTAL: 15321 / 15323 ops · TOTAL-PRCT: 100.0 %

- RUNNING: 353.1 min · EXPECT-FINISH: 0.1 min · RATE: 0.3517 ops / s

* BLOBS CONTROL UPDATE SUCCESSFUL!

* VIDEO UPLOAD SUCCESS · IMAGES: 6 · BLOB: rain/99/CODE99 2023-02-10 23:45:01.mp4 · BUCKET: flood-video-collection

* VIDEO UPLOAD SUCCESS · IMAGES: 5 · BLOB: rain/99/CODE99 2023-02-11 00:30:01.mp4 · BUCKET: flood-video-collection

- PROGRESS: 7451 / 7452 ops · PROGRESS-PRCT: 100.0 %

- TOTAL: 15322 / 15323 ops · TOTAL-PRCT: 100.0 %

- RUNNING: 353.1 min · EXPECT-FINISH: 0.0 min · RATE: 0.3517 ops / s

* BLOBS CONTROL UPDATE SUCCESSFUL!


#### Reload blobs control

In [13]:
import pandas as pd

# Re-read the video control file and count its rows per status value.
control_path = 'Dados/blobs/video_control.csv'
control = pd.read_csv(control_path)

control['status'].value_counts()

UPLOADED         15322
UPLOAD_FAILED        1
Name: status, dtype: int64

---
# Write video from folder of images in Cloud Storage bucket

#### List blobs in bucket

In [None]:
bucket_name = 'city-camera-images'
prefix = ''
delimiter = None
ext = '/'

drop_first = False
print_each = 1000

blobs = gcs.list_blobs(prefix, delimiter, bucket_name)

# Collect the names of blobs ending with `ext`, reporting progress every `print_each` blobs.
# `searched` starts at 0 so the summary still prints for an empty listing; relying on
# the loop variable alone raised NameError in that case.
names = []
searched = 0
for searched, blob in enumerate(blobs, start=1):
    if blob.name.endswith(ext): names.append(blob.name)
    if searched % print_each == 0: print(f'\n- Blobs Searched: {searched}'); co(True)

print(f'\n- Blobs Searched: {searched}')
# Only drop the first match when there is one (avoids IndexError on an empty result).
if drop_first and names:
    print(f'\n · First item excluded: {names[0]}')
    del names[0]
print(f'\n · Results: {len(names)}')

#### Build videos control dataset from blobs names

In [65]:
control_path = 'Dados/blobs/folders_to_video_control.csv'

# One row per listed folder except the bare 'pics/' entry; every row starts with status PENDING.
control = (
    pd.DataFrame(names, columns=['folder'])
    .loc[lambda frame: frame['folder'] != 'pics/']
    .assign(
        from_bucket_name='city-camera-images',
        to_blob_name=lambda frame: frame['folder'].map(folder_to_blob_name_map),
        to_bucket_name='flood-video-collection',
        ext='.jpg',
        content_type='video/mp4',
        overwrite=True,
        status='PENDING',
    )
)

control.to_csv(control_path, index=False)

#### Run image-video conversion pipeline using control dataset

In [12]:
# Values passed to pipe.run(report_each, save_each) at the end of this cell.
save_each = 50
report_each = 10

control_path = f'Dados/blobs/folders_to_video_control.csv'
# Callable handed to ControlledPipeline: the folder-of-images -> video upload method.
control_func = video.upload_from_bucket_folder_of_images

# Control-file column names handed to ControlledPipeline as `params_fields`.
params_fields = ['folder', 'from_bucket_name', 'to_blob_name', 'to_bucket_name', 'ext', 'content_type', 'overwrite']
# Presumably selects the rows still to be processed by status — confirm in modules.controlled_pipeline.
query = {'status': ['PENDING', 'UPLOAD_FAILED']}
status_field = 'status'
status_options = ['UPLOAD_FAILED', 'UPLOADED']
error_flag = 'ERROR'

pipe = ControlledPipeline(
    control_path, control_func, params_fields,
    query, status_field, status_options, error_flag
)

pipe.run(report_each, save_each)


- PROGRESS: 7450 / 7452 ops · PROGRESS-PRCT: 100.0 %

- TOTAL: 15321 / 15323 ops · TOTAL-PRCT: 100.0 %

- RUNNING: 353.1 min · EXPECT-FINISH: 0.1 min · RATE: 0.3517 ops / s

* BLOBS CONTROL UPDATE SUCCESSFUL!

* VIDEO UPLOAD SUCCESS · IMAGES: 6 · BLOB: rain/99/CODE99 2023-02-10 23:45:01.mp4 · BUCKET: flood-video-collection

* VIDEO UPLOAD SUCCESS · IMAGES: 5 · BLOB: rain/99/CODE99 2023-02-11 00:30:01.mp4 · BUCKET: flood-video-collection

- PROGRESS: 7451 / 7452 ops · PROGRESS-PRCT: 100.0 %

- TOTAL: 15322 / 15323 ops · TOTAL-PRCT: 100.0 %

- RUNNING: 353.1 min · EXPECT-FINISH: 0.0 min · RATE: 0.3517 ops / s

* BLOBS CONTROL UPDATE SUCCESSFUL!


#### Reload blobs control

In [13]:
import pandas as pd

control_path = 'Dados/blobs/folders_to_video_control.csv'

# Re-read the folders-to-video control file from disk.
control = pd.read_csv(control_path)

# Row count per status value; as the last expression it is displayed as the cell output.
control['status'].value_counts()

UPLOADED         15322
UPLOAD_FAILED        1
Name: status, dtype: int64

---
## Check folder/file correspondence between buckets

#### List folders in origin bucket

In [85]:
bucket_name = 'city-camera-images'
prefix = '' # search all blobs
delimiter = None
ext = '/'

drop_first = False
print_each = 1000

blobs = gcs.list_blobs(prefix, delimiter, bucket_name)

# Collect the names of blobs ending with `ext`, reporting progress every `print_each` blobs.
# `searched` starts at 0 so the summary still prints for an empty listing; relying on
# the loop variable alone raised NameError in that case.
names = []
searched = 0
for searched, blob in enumerate(blobs, start=1):
    if blob.name.endswith(ext): names.append(blob.name)
    if searched % print_each == 0: print(f'\n- Blobs Searched: {searched}'); co(True)

print(f'\n- Blobs Searched: {searched}')
# Only drop the first match when there is one (avoids IndexError on an empty result).
if drop_first and names:
    print(f'\n · First item excluded: {names[0]}')
    del names[0]
print(f'\n · Results: {len(names)}')

# Bare top-level folder entries to leave out of the comparison when present.
drop_if_exists = ['pics/', 'flood/', 'rain/', 'comando/', 'waze/']

for name in drop_if_exists:
    if name in names:
        print(f'\n- BLOB: {name} (excluded)')
        names.remove(name)

origin_folders = names[:]


- Blobs Searched: 514831

 · Results: 15323

- BLOB: pics/ (excluded)


#### Convert folder to corresponding blob name

In [90]:
# Destination blob name for each entry of `origin_folders`, via `folder_to_blob_name_map`.
destination_blob_names_corr = pd.Series(origin_folders).map(folder_to_blob_name_map)

#### List blobs in origin bucket

In [86]:
bucket_name = 'city-camera-images'
prefix = '' # search all blobs
delimiter = None
ext = '.mp4'

drop_first = False
print_each = 1000

blobs = gcs.list_blobs(prefix, delimiter, bucket_name)

# Collect the names of blobs ending with `ext`, reporting progress every `print_each` blobs.
# `searched` starts at 0 so the summary still prints for an empty listing; relying on
# the loop variable alone raised NameError in that case.
names = []
searched = 0
for searched, blob in enumerate(blobs, start=1):
    if blob.name.endswith(ext): names.append(blob.name)
    if searched % print_each == 0: print(f'\n- Blobs Searched: {searched}'); co(True)

print(f'\n- Blobs Searched: {searched}')
# Only drop the first match when there is one (avoids IndexError on an empty result).
if drop_first and names:
    print(f'\n · First item excluded: {names[0]}')
    del names[0]
print(f'\n · Results: {len(names)}')

# NOTE(review): every collected name ends with '.mp4', so none of these folder
# entries can match here; the filter is kept for parity with the folder listing.
drop_if_exists = ['pics/', 'flood/', 'rain/', 'comando/', 'waze/']

for name in drop_if_exists:
    if name in names:
        print(f'\n- BLOB: {name} (excluded)')
        names.remove(name)

origin_blob_names = names[:]


- Blobs Searched: 514831

 · Results: 7679


#### Convert blob name to corresponding blob name

In [95]:
# Destination blob name for each entry of `origin_blob_names`, via `blob_name_map`.
origin_blob_names_corr = pd.Series(origin_blob_names).map(blob_name_map)

#### List blobs in destination bucket

In [87]:
bucket_name = 'flood-video-collection'
prefix = ''
delimiter = None
ext = '.mp4'

drop_first = False
print_each = 100

blobs = gcs.list_blobs(prefix, delimiter, bucket_name)

# Collect the names of blobs ending with `ext`, reporting progress every `print_each` blobs.
# `searched` starts at 0 so the summary still prints for an empty listing; relying on
# the loop variable alone raised NameError in that case.
names = []
searched = 0
for searched, blob in enumerate(blobs, start=1):
    if searched % print_each == 0: print(f'\n- Blobs Searched: {searched}'); co(True)
    if blob.name.endswith(ext): names.append(blob.name)


print(f'\n- Blobs Searched: {searched}')
# Only drop the first match when there is one (avoids IndexError on an empty result).
if drop_first and names:
    print(f'\n · First item excluded: {names[0]}'); del names[0]
print(f'\n · Results: {len(names)}')

destination_blob_names = pd.Series(names[:])


- Blobs Searched: 32651

 · Results: 32651


#### Check differences

In [96]:
# Names actually present in the destination bucket.
destination_set = set(destination_blob_names)
# Destination names expected from the origin folders and from the origin .mp4 blobs.
origin_set = set(destination_blob_names_corr)
origin_blobs_set = set(origin_blob_names_corr)

# Set differences in both directions, for each of the two origin sets.
dest_diff = list(origin_set - destination_set)
diff = list(destination_set - origin_set)
orig_diff = list(origin_blobs_set - destination_set)
dest_orig_diff = list(destination_set - origin_blobs_set)

print(f'- Difference from destination (folder-mp4): {len(dest_diff)}')
print(f'- Difference from source (folder-mp4): {len(diff)}')
print(f'- Difference from destination (mp4): {len(orig_diff)}')
print(f'- Difference from source (mp4): {len(dest_orig_diff)}')

- Difference from destination (folder-mp4): 0
- Difference from source (folder-mp4): 17329
- Difference from destination (mp4): 111
- Difference from source (mp4): 25083


#### Check missing blobs

In [124]:
# One row per name in `orig_diff`, one column per '/'-separated path level.
left = pd.DataFrame([name.split('/') for name in orig_diff])

# Show value counts for at most the first three path levels. Slicing the existing columns
# (instead of a fixed range(3)) keeps the cell from raising KeyError when `orig_diff` is
# empty or its names have fewer than three levels.
for i in left.columns[:3]:
    print(f'\nBlob Name Level: {i}\n')
    display(left[i].value_counts())


Blob Name Level: 0



comando    98
waze       13
Name: 0, dtype: int64


Blob Name Level: 1



buraco_na_pista                67
vazamento                      14
acidente_enguiço_sem_vítima     8
accident_major                  7
accident_minor                  6
operação_policial               6
incêndio_em_veículo             3
Name: 1, dtype: int64


Blob Name Level: 2



91700                                   20
92182                                   19
92140                                    9
92146                                    8
90535                                    7
90516                                    7
92227                                    6
d16d470e-5cc5-4489-9af9-0c47b3178c5b     6
93123                                    6
92395                                    6
91305                                    5
93121                                    3
92387                                    2
00fcf469-98c9-44b1-88b1-e3d2432d56bd     2
1ea4ec2d-ecfc-42f3-878a-1b085eeed3b1     1
474f3057-9ea3-42b4-8510-f0fe8a601368     1
74c4dc90-1857-48b0-87c1-faba429bf162     1
1ef454c6-05ba-42f7-b3aa-6d7394a4c916     1
b7770a0d-d063-4228-9107-f130649a6481     1
Name: 2, dtype: int64

---
## EXTRA: Performance comparison

In [2]:
from time import time

def average_time(func, n=100):
    """Return the mean duration, in seconds, of `n` consecutive calls to `func()`.

    Parameters
    ----------
    func : callable
        Called with no arguments; its return value is ignored.
    n : int
        Number of calls to time. `n == 0` raises ZeroDivisionError.

    Returns
    -------
    float
        Total elapsed time divided by `n`.
    """
    # perf_counter is monotonic and high-resolution, so the measurement cannot go
    # negative or jump if the system clock is adjusted mid-run (time.time() can).
    from time import perf_counter

    start = perf_counter()
    for _ in range(n): func()
    elapsed = perf_counter() - start
    return elapsed / n

# Arguments shared by both benchmark paths: expanded as keyword arguments by
# `local_upload_func` and sent as request parameters by `api_upload_func`.
# NOTE(review): this rebinds `query`, which the pipeline cells use for their status
# filter dict; re-run the pipeline cell's own assignment before using it again.
query = {
    'folder': 'manual/2023-08-02/1119/2023-02-08 15:55:00/',
    'to_blob_name': 'test/TEST.mp4',
    'from_bucket_name': 'city-camera-images',
    'to_bucket_name': 'city-camera-images', # 'flood-video-collection'
    'ext': '.jpg',
    'content_type': 'video/mp4',
    'overwrite': True,
    'report': False,
}

def local_upload_func():
    """Run one folder-to-video upload in-process, passing the module-level `query` as keyword arguments."""
    video.upload_from_bucket_folder_of_images(**query)

import requests
from urllib.parse import urlencode

baseurl = 'http://127.0.0.1:5001'
url = f'{baseurl}/write-video' # ?{urlencode(query)}

def api_upload_func(_id=None):
    """Call the `/write-video` endpoint once and print the outcome.

    Sends the module-level `query` as request parameters and prints `_id`, the
    HTTP status code, the reason phrase and the payload's 'uploaded' value.
    When the body is not JSON, or is JSON without an 'uploaded' key, `None` is
    printed in its place so the HTTP status is still reported.

    Parameters
    ----------
    _id : any, optional
        Label printed first to tell parallel calls apart.
    """
    res = requests.get(url, params=query)
    # Parse defensively: an error page must not hide the status line behind a
    # decoding exception (requests' JSON errors derive from ValueError).
    try:
        payload = res.json()
    except ValueError:
        payload = None
    uploaded = payload.get('uploaded') if isinstance(payload, dict) else None
    print(_id, res.status_code, res.reason, uploaded)

import multiprocessing

def run_many(func, params, workers=None):
    """Run `func(*args)` for every tuple in `params` on a process pool.

    Parameters
    ----------
    func : callable
        Must be picklable, since it is sent to worker processes.
    params : sequence of tuples
        One argument tuple per call. An empty sequence is not supported (the
        pool size and the final division would both be zero).
    workers : int, optional
        Pool size; defaults to one worker per entry in `params`.

    Returns
    -------
    float
        Wall-clock seconds spent in `starmap` divided by `len(params)`. Pool
        start-up is not included in the measurement.
    """
    if workers is None: workers = len(params)
    # The context manager tears the pool down even if a task raises; the original
    # never closed the pool, leaving its worker processes alive.
    with multiprocessing.Pool(processes=workers) as pool:
        s = time()
        pool.starmap(func, params)
        t = time()
    return (t - s) / len(params)

# Number of parallel API calls; each task receives its index as the `_id` argument.
n = 2
params = [(i,) for i in range(n)]

# No `workers` argument is given, so run_many uses one worker process per task.
# NOTE(review): the pool pickles `api_upload_func` by reference; from a notebook this
# presumably only works with the 'fork' start method — confirm on the target platform.
avg_par = run_many(api_upload_func, params)