In [1]:
from pathlib import Path
import json
from datetime import datetime
import re
# Root of the metadata tree. Its children are directories named
# '<location>_<month><year>' (e.g. 'Dapang_Jul2022'), each holding numbered
# JSON files such as 'Ambon_Apr2012/0001.json'.
data_dir = Path('/home/vbs/vbs23/data/information/metadata/')

In [2]:
# Sanity check: show a handful of the entries found under the metadata root.
[*data_dir.iterdir()][:3]

[PosixPath('/home/vbs/vbs23/data/information/metadata/Dapang_Jul2022'),
 PosixPath('/home/vbs/vbs23/data/information/metadata/PhuQuoc1_Jun2022'),
 PosixPath('/home/vbs/vbs23/data/information/metadata/Molokai_Jul2022')]

In [3]:
# Try the directory-name pattern on the first entry only, to confirm that it
# splits into location, month and four-digit year.
for dirname in data_dir.iterdir():
    dirname_pattern = re.compile(r'(\w+)_(\w+)(\d{4})')
    name_match = dirname_pattern.match(dirname.name)
    loc, month, year = name_match.groups()
    print(loc, month, year)
    break  # one sample is enough

Dapang Jul 2022


In [4]:
# One metadata file, used as the worked example when validating the schema.
json_example = '/home/vbs/vbs23/data/information/metadata/Ambon_Apr2012/0001.json'
# IPython shell escape: $json_example expands to the Python variable's value.
!cat $json_example

{
    "camera_model": "Canon PowerShot G12",
    "captions": [
        {
            "ClipCapCaption": [
                "a diver swims over a coral reef.",
                "coral reef outside the island.",
                "coral reef outside the island.",
                "coral reef outside the island.",
                "coral reef outside the island.",
                "a diver swims over a coral reef.",
                "coral reef outside the island.",
                "coral reef outside the island.",
                "a clownfish on a coral reef.",
                "a diver swims over a coral reef."
            ],
            "id": [
                "00001",
                "00002",
                "00003",
                "00004",
                "00005",
                "00006",
                "00007",
                "00008",
                "00009",
                "00010"
            ]
        }
    ],
    "created_time": "2012:04:04 03:08:10",
    "duration": "10.47 s",
    "ex

In [5]:
# Print the local schema module (db_schema.py) that defines the `Video` model.
!cat db_schema.py

from typing import List, Optional
from pydantic import BaseModel

class SelectedFrames(BaseModel):
    """Selected frames of one video, stored as two lists of strings."""
    # Frame ids as strings; zero-padded in the sample file (e.g. '00001').
    id: List[str]
    # Image paths, e.g. 'information/selected_frames/Ambon_Apr2012/0001_00001.jpg'.
    # Presumably path[i] belongs to id[i] -- TODO confirm with the data producer.
    path: List[str]

class Thumbnails(BaseModel):
    """Thumbnails of one video, stored as two lists of strings."""
    # Thumbnail ids as strings; zero-padded in the sample data (e.g. '00001').
    id: List[str]
    # Image paths, e.g. 'information/thumbnails/Dapang_Jul2022/0005_00001.jpg'.
    # Presumably path[i] belongs to id[i] -- TODO confirm with the data producer.
    path: List[str]

class ClipCapCaption(BaseModel):
    """Caption texts of one video, stored alongside a list of string ids."""
    # Ids as strings; zero-padded in the sample file (e.g. '00001').
    id: List[str]
    # One caption sentence per entry; the field name mirrors the JSON key.
    # Presumably ClipCapCaption[i] belongs to id[i] -- TODO confirm.
    ClipCapCaption: List[str]

class Video(BaseModel):
    """Metadata of a single video file, mirroring the keys of its JSON document.

    Every scalar field is declared as a string, including the numeric-looking
    ones (fps, height, width, duration).
    """
    camera_model: str
    # Sample value: '2012:04:04 03:08:10' (colon-separated date, then time).
    created_time: str
    # Sample value: '10.47 s' (number followed by a unit suffix).
    duration: str
    ext: str
    filename: str
    # Sample values: fps '23.976', height '720', width '1280'.
    fps: str
    height: str
    width: str
    # Each of the following is a list of groups; the sample file has a single
    # captions group.
    captions: List[ClipCapCaption]
    selected_frames: List[SelectedFrames]
    thumbnails: List[Thumbnails]

# {
#     "camera_model": "Canon PowerShot G12",
#     "captions": [
#         {
#             "ClipCapCaption": [
#                 "a diver swims over a coral reef.",
#                 "coral reef outside the island.",
#                 "coral reef outside the island.",
#                 "coral reef outside the island.",
#                 "coral reef outside the island.",
#                 "a diver swims over a coral reef.",
#        

In [6]:
from db_schema import Video
import pydantic
# Parse the example file and validate it against the Video schema, then
# display the resulting model instance.
json_valid = pydantic.parse_file_as(path=json_example, type_=Video)
json_valid

Video(camera_model='Canon PowerShot G12', created_time='2012:04:04 03:08:10', duration='10.47 s', ext='mp4', filename='0001.mp4', fps='23.976', height='720', width='1280', captions=[ClipCapCaption(id=['00001', '00002', '00003', '00004', '00005', '00006', '00007', '00008', '00009', '00010'], ClipCapCaption=['a diver swims over a coral reef.', 'coral reef outside the island.', 'coral reef outside the island.', 'coral reef outside the island.', 'coral reef outside the island.', 'a diver swims over a coral reef.', 'coral reef outside the island.', 'coral reef outside the island.', 'a clownfish on a coral reef.', 'a diver swims over a coral reef.'])], selected_frames=[SelectedFrames(id=['00001', '00002', '00003', '00004', '00005', '00006', '00007', '00008', '00009', '00010'], path=['information/selected_frames/Ambon_Apr2012/0001_00001.jpg', 'information/selected_frames/Ambon_Apr2012/0001_00002.jpg', 'information/selected_frames/Ambon_Apr2012/0001_00003.jpg', 'information/selected_frames/Amb

In [44]:
def time2part_of_day(hour: int):
    """Map an hour of the day to a coarse part-of-day label.

    Args:
        hour: Hour on a 24-hour clock, as given by ``datetime.hour``.

    Returns:
        One of 'early morning' (5-8), 'morning' (9-10), 'late morning' (11),
        'early afternoon' (12-14), 'afternoon' (15), 'late afternoon' (16),
        'early evening' (17-18), 'evening' (19-20), 'late evening' (21-22)
        or 'night' (everything else, including values outside 0-23).
    """
    # (exclusive upper bound, label) pairs in ascending order; the first slot
    # starts at 05:00. Every slot has its own label: 19-20 is 'evening' and
    # 21-22 is 'late evening', so neither repeats an earlier slot's label.
    slots = (
        (9, 'early morning'),
        (11, 'morning'),
        (12, 'late morning'),
        (15, 'early afternoon'),
        (16, 'afternoon'),
        (17, 'late afternoon'),
        (19, 'early evening'),
        (21, 'evening'),
        (23, 'late evening'),
    )

    if hour < 5:
        return 'night'
    for upper_bound, label in slots:
        if hour < upper_bound:
            return label
    # 23:00 onwards.
    return 'night'
        
def time2day_of_week(day: int):
    """Return the English weekday name for a ``datetime.weekday()`` index.

    Index 0 is Monday and 6 is Sunday. Ordinary sequence indexing applies, so
    negative values count back from Sunday and values above 6 raise IndexError.
    """
    weekday_names = (
        'Monday', 'Tuesday', 'Wednesday', 'Thursday',
        'Friday', 'Saturday', 'Sunday',
    )
    return weekday_names[day]
def time2month(month: int):
    """Return the English month name for a 1-based month number.

    Args:
        month: Month number, 1 (January) through 12 (December).

    Raises:
        AssertionError: if ``month`` lies outside 1..12.
    """
    assert 0 < month <= 12, f'Invalid month: {month}'
    names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    # Shift to the 0-based position in the tuple.
    return names[month - 1]

In [45]:
# Flatten every JSON metadata file into one record per frame id, keyed by
# '<directory name>_<shot number>_<frame id>'.
result = {}

# Directory names look like '<location>_<month><year>' (e.g. 'Dapang_Jul2022').
# Compiled once up front rather than once per directory.
dirname_pattern = re.compile(r'(\w+)_(\w+)(\d{4})')

for dirname in data_dir.iterdir():
    name_match = dirname_pattern.match(dirname.name)
    if name_match is None:
        # Fail with a readable message instead of AttributeError on None.
        raise ValueError(f'Unexpected directory name: {dirname.name!r}')
    # Only the location is used from the directory name; the month and year
    # stored in each record come from the file's own created_time.
    loc, month, year = name_match.groups()

    for json_file in dirname.iterdir():
        json_valid = pydantic.parse_file_as(path=json_file, type_=Video)

        # File stems are zero-padded shot numbers ('0005' -> 5).
        shot_name = int(json_file.stem)

        # Only the first group of each kind is read.
        captions = json_valid.captions[0]
        thumbnails = json_valid.thumbnails[0]
        frames = json_valid.selected_frames[0]

        cap_ids = captions.id
        thum_ids = thumbnails.id
        fram_ids = frames.id

        assert cap_ids == thum_ids == fram_ids, 'ids not equal'
        # The value lists are paired with the ids by position below, so a
        # length mismatch would silently drop or misalign entries.
        assert (
            len(cap_ids)
            == len(captions.ClipCapCaption)
            == len(thumbnails.path)
            == len(frames.path)
        ), 'ids and values differ in length'

        ids = list(map(int, cap_ids))

        # created_time is per file, so derive the time fields once here
        # rather than once per frame.
        _time = datetime.strptime(json_valid.created_time, '%Y:%m:%d %H:%M:%S')
        date_str = _time.date().strftime('%Y-%m-%d')
        month_name = time2month(_time.month)
        weekday_name = time2day_of_week(_time.weekday())
        part_of_day = time2part_of_day(_time.hour)
        local_time = _time.time().strftime('%H:%M')

        # Pair each id with the values at the same position. This does not
        # depend on the ids being exactly 1..n, unlike indexing by `_id - 1`.
        for _id, thumb_path, frame_path, caption in zip(
            ids, thumbnails.path, frames.path, captions.ClipCapCaption
        ):
            item = {
                'id': _id,
                'location': loc,
                'camera_model': json_valid.camera_model,
                'thumbnails': thumb_path,
                'selected_frames': frame_path,
                'created_time': json_valid.created_time,
                'date': date_str,
                'month': month_name,
                'year': _time.year,
                'day_of_week': weekday_name,
                'part_of_day': part_of_day,
                'local_time': local_time,
                'captions': caption,
                'width': json_valid.width,
                'height': json_valid.height,
                'ext': json_valid.ext,
                'fps': json_valid.fps,
            }
            result[f'{dirname.name}_{shot_name}_{_id}'] = item

In [46]:
# Preview the first few records only; the full dict holds tens of thousands
# of entries and echoing all of them floods the notebook output.
dict(list(result.items())[:3])

{'Dapang_Jul2022_5_1': {'id': 1,
  'location': 'Dapang',
  'camera_model': 'DJI Osmo 3',
  'thumbnails': 'information/thumbnails/Dapang_Jul2022/0005_00001.jpg',
  'selected_frames': 'information/selected_frames/Dapang_Jul2022/0005_00001.jpg',
  'created_time': '2022:08:27 07:48:37',
  'date': '2022-08-27',
  'month': 'August',
  'year': 2022,
  'day_of_week': 'Saturday',
  'part_of_day': 'early morning',
  'local_time': '07:48',
  'captions': 'underwater view of a coral reef.',
  'width': '1920',
  'height': '1080',
  'ext': 'mp4',
  'fps': '30'},
 'Dapang_Jul2022_5_2': {'id': 2,
  'location': 'Dapang',
  'camera_model': 'DJI Osmo 3',
  'thumbnails': 'information/thumbnails/Dapang_Jul2022/0005_00002.jpg',
  'selected_frames': 'information/selected_frames/Dapang_Jul2022/0005_00002.jpg',
  'created_time': '2022:08:27 07:48:37',
  'date': '2022-08-27',
  'month': 'August',
  'year': 2022,
  'day_of_week': 'Saturday',
  'part_of_day': 'early morning',
  'local_time': '07:48',
  'captions':

In [47]:
# Number of frame-level records collected across all directories and files.
len(result)

43797

In [48]:
# Turn the record dict into a table: one row per record, dict keys as row labels.
import pandas as pd
df = (
    pd.DataFrame.from_dict(result, orient='index')
    # Also expose the row labels as an ordinary column named 'index'.
    .assign(index=lambda frame: frame.index)
)
df.head()

Unnamed: 0,id,location,camera_model,thumbnails,selected_frames,created_time,date,month,year,day_of_week,part_of_day,local_time,captions,width,height,ext,fps,index
Dapang_Jul2022_5_1,1,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,2022-08-27,August,2022,Saturday,early morning,07:48,underwater view of a coral reef.,1920,1080,mp4,30,Dapang_Jul2022_5_1
Dapang_Jul2022_5_2,2,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,2022-08-27,August,2022,Saturday,early morning,07:48,the reef is a bit smaller than the ones we saw.,1920,1080,mp4,30,Dapang_Jul2022_5_2
Dapang_Jul2022_5_3,3,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,2022-08-27,August,2022,Saturday,early morning,07:48,the reef is a bit more shallow than the beach.,1920,1080,mp4,30,Dapang_Jul2022_5_3
Dapang_Jul2022_5_4,4,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,2022-08-27,August,2022,Saturday,early morning,07:48,a small group of fish swim.,1920,1080,mp4,30,Dapang_Jul2022_5_4
Dapang_Jul2022_5_5,5,Dapang,DJI Osmo 3,information/thumbnails/Dapang_Jul2022/0005_000...,information/selected_frames/Dapang_Jul2022/000...,2022:08:27 07:48:37,2022-08-27,August,2022,Saturday,early morning,07:48,a small group of fish swim around a coral reef.,1920,1080,mp4,30,Dapang_Jul2022_5_5


In [49]:
# index=True writes the row labels as the first CSV column, so each record key
# ends up in the file twice: in that leading column and in the 'index' column.
df.to_csv('vbs23_meta.csv', index=True)