# Build image dataset from collection of flood-videos 

---

In [1]:
# Move the kernel's working directory one level up; the relative `Dados/...` paths and the
# `modules` import used below presumably resolve from there (verify against the repo layout).
# NOTE: this is not idempotent - re-running the cell moves one more level up each time.
cd ../

C:\Users\luisr\Desktop\Repositories\Data Science Projects\Hackaton COR IV - Centro de Operações do RJ\INCUBAÇÃO\Cameras


#### Class to measure execution time

In [2]:
from time import time

# Simple class to report execution time

class Timer:
    """Minimal stopwatch: starts counting on construction and prints the elapsed time on `end`."""

    def __init__(self):
        # reference instant (seconds since the epoch) captured when the timer is created
        self.start = time()

    def end(self, decimals=4):
        """Print the seconds elapsed since construction, rounded to `decimals` places."""
        elapsed = round(time() - self.start, decimals)
        print('\n* TIME TO EXECUTE:', elapsed, 's')

### Function to build image dataset from flood videos dataset

In [3]:
import os, pandas as pd, cv2
from IPython.display import clear_output as co
from modules.controlled_pipeline import Progress

def videos_to_frames(videos, folder, dt_col='timestamp', fps=3, print_each=100, as_datetime=True, dataset_path=None, datetime_unit='ms'):
    """Expand a videos dataset into a dataset with one row per video frame.

    For every row of `videos` the video file `{folder}/{blob_name}` is opened only to
    read its frame count. The row is then repeated once per frame and each copy gets an
    `image_timestamp`: the video timestamp (rounded to the second) plus `k / fps`
    seconds for frame `k`. Frame timestamps are synthesized from `fps`; they are not
    read from the video file.

    Parameters
    ----------
    videos : pandas.DataFrame
        Videos dataset. Must contain a `blob_name` column and the `dt_col` column, whose
        values must be pandas timestamps (they are rounded with `.round('s')`).
    folder : str
        Root folder where the video files are stored.
    dt_col : str
        Name of the column holding the video start timestamp.
    fps : int or float
        Frames per second used to space the frame timestamps.
    print_each : int
        Progress is reported every `print_each` videos and on the last one.
    as_datetime : bool
        If False, `image_timestamp` is converted to strings before saving/returning.
    dataset_path : str or None
        If given, the resulting dataset is written to this csv path (without index).
    datetime_unit : str
        Frequency alias `image_timestamp` is rounded to (e.g. 'ms').

    Returns
    -------
    pandas.DataFrame
        The columns of `videos` plus `image_timestamp`, one row per frame. Empty (same
        columns) when no video produced any frame.

    Notes
    -----
    Videos whose file is missing or cannot be opened are skipped; they are collected
    with a `reason` and displayed at the end.
    """

    # loop preparation
    n_videos = len(videos)
    imgs_rows = []; fail = []
    progress = Progress(n_videos)
    frame_freq = f'{int(1e9/fps)}ns' # time between consecutive frames (loop invariant)

    # iterate rows of videos dataset
    for i, (idx, row) in enumerate(videos.iterrows()):

        # report progress
        if (i + 1) % print_each == 0 or (i + 1) == n_videos:
            fail_cnt = pd.DataFrame(fail)['reason'].value_counts().to_dict() if len(fail) else None
            co(True); print(f'\nBUILD IMAGE DATASET FROM VIDEO FILES · N-FAIL: {len(fail)} · FAIL-CNT: {fail_cnt}')
            progress.report(i)

        # build the file path; ':' is replaced only in the blob name so that a `folder`
        # containing a colon (e.g. a Windows drive letter) is left untouched
        blob_name = str(row["blob_name"]).replace(':', '-') # file path fix
        file_path = f'{folder}/{blob_name}'

        # continue/skip if the video file doesn't exist
        if not os.path.exists(file_path):
            fail.append({'path': file_path, 'reason': 'file-not-found'})
            continue

        # open video file only to read its frame count; always release the handle
        video = cv2.VideoCapture(file_path)
        try:
            if not video.isOpened():
                fail.append({'path': file_path, 'reason': 'cannot-open-file'})
                continue
            # clamp at zero so an unexpected negative count cannot break `date_range`
            num_frames = max(int(video.get(cv2.CAP_PROP_FRAME_COUNT)), 0)
        finally:
            video.release()

        # build video frames' dataset and add `image_timestamp` field
        start = row[dt_col].round('s')
        images_timestamps = pd.date_range(start=start, periods=num_frames, freq=frame_freq)
        images = pd.DataFrame([row], index=range(num_frames)) # single row repeated for every frame
        images['image_timestamp'] = images_timestamps

        # append video frames' dataset
        imgs_rows.append(images)

    # concat video frames' dataset (`pd.concat` raises on an empty list, so build an
    # empty dataset with the expected columns when no video produced frames)
    if imgs_rows:
        images = pd.concat(imgs_rows)
    else:
        images = videos.iloc[0:0].copy()
        images['image_timestamp'] = pd.Series(dtype='datetime64[ns]')

    # format dataset datetime field: `.dt.round` is the datetime rounding API
    images['image_timestamp'] = images['image_timestamp'].dt.round(datetime_unit)
    if not as_datetime:
        images['image_timestamp'] = images['image_timestamp'].astype(str) # store timestamps as text

    if dataset_path is not None:
        images.to_csv(dataset_path, index=False)
        print(f'\n - Images dataset saved to csv file · {dataset_path}')

    # report if any process fails
    if len(fail):
        print(f'PROCESS PARTIALLY SUCCESSFUL · N-FAILED: {len(fail)}')
        display(pd.DataFrame(fail))

    # return full frames' dataset
    return images

### Extra · Execution time testing · Compare methods for checking whether a video file exists

In [4]:
import os, cv2

n = 10
file_path = 'Dados/flood-video-collection/normal/1000/CODE1000 2023-04-07 20-22-43.mp4'  # file does not exist

# Method 1: plain filesystem lookup
timer = Timer()
for i in range(n):
    os.path.exists(file_path)
timer.end(decimals=4)

# Method 2: let OpenCV try to open the file and ask whether it succeeded.
# Every capture is kept so that all of them (not only the last one) can be released;
# releasing happens after the timer stops so the measurement is not affected.
captures = []
timer = Timer()
for i in range(n):
    video = cv2.VideoCapture(file_path)
    video.isOpened()
    captures.append(video)
timer.end(decimals=4)

for video in captures:
    video.release()


* TIME TO EXECUTE: 0.002 s

* TIME TO EXECUTE: 3.3562 s


### Reload flood videos dataset

In [5]:
import pandas as pd

# Date tag of the control files to load. It must be defined here: the paths below are
# built from it, and leaving it undefined raises NameError on a fresh kernel.
date = '2023-04-29'

video_control_path = f'Dados/Controle de vídeos/videos_control_{date}.csv'

videos = pd.read_csv(video_control_path)
# csv stores datetimes as text, so the timestamp column is parsed after loading
videos['timestamp'] = pd.to_datetime(videos['timestamp'])

print(videos.columns)

Index(['blob_name', 'blob_size', 'bucket_name', 'file_name', 'code',
       'n_folders', 'timestamp', 'folder_structure'],
      dtype='object')


### Build image dataset from flood videos dataset

In [8]:
# Destination csv of the frame-level dataset, tagged with the `date` variable
images_dataset_path = f'Dados/Controle de vídeos/images_control_{date}.csv'

# Expand every video into one row per frame and save the result as csv
images_dataset = videos_to_frames(
    videos=videos,
    folder='Dados/flood-video-collection',
    dt_col='timestamp',
    fps=3,
    print_each=250,
    as_datetime=False,
    dataset_path=images_dataset_path,
)


BUILD IMAGE DATASET FROM VIDEO FILES · N-FAIL: 28643 · FAIL-CNT: {'file-not-found': 28643}

- PROGRESS: 44373 / 44374 ops · PROGRESS-PRCT: 100.0 %

- RUNNING: 9.9 min · EXPECT-FINISH: 0.0 min · RATE: 74.3799 ops / s

 - Images dataset saved to csv file · Dados/Controle de vídeos/images_control_2023-04-29.csv
PROCESS PARTIALLY SUCCESSFUL · N-FAILED: 28644


Unnamed: 0,path,reason
0,Dados/flood-video-collection/comando/bolsão/95...,file-not-found
1,Dados/flood-video-collection/comando/bolsão/95...,file-not-found
2,Dados/flood-video-collection/comando/bolsão/95...,file-not-found
3,Dados/flood-video-collection/comando/bolsão/95...,file-not-found
4,Dados/flood-video-collection/comando/bolsão/95...,file-not-found
...,...,...
28639,Dados/flood-video-collection/waze/flood/ff0f2d...,file-not-found
28640,Dados/flood-video-collection/waze/flood/ff5e0e...,file-not-found
28641,Dados/flood-video-collection/waze/flood/ff5e0e...,file-not-found
28642,Dados/flood-video-collection/waze/flood/ff5e0e...,file-not-found


---
## Basic Exploratory Data Analysis

#### Reload flood images dataset

In [9]:
import pandas as pd

image_control_path = 'Dados/Controle de vídeos/images_control_2023-04-29.csv'

images = pd.read_csv(image_control_path)

# csv stores datetimes as text, so both datetime columns are parsed after loading
for dt_field in ('timestamp', 'image_timestamp'):
    images[dt_field] = pd.to_datetime(images[dt_field])

print(images.columns)

Index(['blob_name', 'blob_size', 'bucket_name', 'file_name', 'code',
       'n_folders', 'timestamp', 'folder_structure', 'image_timestamp'],
      dtype='object')


#### Number of images

In [10]:
# `images.shape` is (rows, columns): the first value is the number of frame rows in the dataset
print('Number of images in result: ', images.shape)

Number of images in result:  (678639, 9)


#### Percentage of videos downloaded (videos with at least one frame in the images dataset)

In [12]:
# flag every video whose blob name appears in the frames dataset
a = videos['blob_name'].isin(images['blob_name'])

n_in = a.sum()                    # number of flagged videos
p_in = (100 * a.mean()).round(2)  # share of flagged videos, in percent

print(f'\nPercentage of images downloaded: {p_in} %')
print('Total Downloads:', n_in)


Percentage of images downloaded: 35.13 %
Total Downloads: 15469


#### First rows

In [13]:
# Preview the first rows: consecutive frames of the same video share every column except `image_timestamp`
images.head()

Unnamed: 0,blob_name,blob_size,bucket_name,file_name,code,n_folders,timestamp,folder_structure,image_timestamp
0,comando/alagamento/93898/1993/CODE1993 2023-03...,1317301,flood-video-collection,CODE1993 2023-03-30 19-55-06.mp4,1993,4,2023-03-30 19:55:06,{source}/{type}/{event}/{code},2023-03-30 19:55:06.000
1,comando/alagamento/93898/1993/CODE1993 2023-03...,1317301,flood-video-collection,CODE1993 2023-03-30 19-55-06.mp4,1993,4,2023-03-30 19:55:06,{source}/{type}/{event}/{code},2023-03-30 19:55:06.333
2,comando/alagamento/93898/1993/CODE1993 2023-03...,1317301,flood-video-collection,CODE1993 2023-03-30 19-55-06.mp4,1993,4,2023-03-30 19:55:06,{source}/{type}/{event}/{code},2023-03-30 19:55:06.667
3,comando/alagamento/93898/1993/CODE1993 2023-03...,1317301,flood-video-collection,CODE1993 2023-03-30 19-55-06.mp4,1993,4,2023-03-30 19:55:06,{source}/{type}/{event}/{code},2023-03-30 19:55:07.000
4,comando/alagamento/93898/1993/CODE1993 2023-03...,1317301,flood-video-collection,CODE1993 2023-03-30 19-55-06.mp4,1993,4,2023-03-30 19:55:06,{source}/{type}/{event}/{code},2023-03-30 19:55:07.333
