# Data loading and processing

In [None]:
# Enable the IPython autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to local packages are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
!python -m pip install -r requirements.txt
!python -m pip install -e script/mltools

In [None]:
from huggingface_hub import notebook_login

# Authenticate this session with the Hugging Face Hub via huggingface_hub's
# interactive notebook login prompt.
notebook_login()

In [None]:
import shutil
from mltools.file_util import get_resource
from huggingface_hub import hf_hub_download


# Download the UCF101 subset archive from the Hugging Face Hub, then copy it
# to the project's resource location under the same file name.
filename = "UCF101_subset.tar.gz"
hf_dataset_identifier = "sayakpaul/ucf101-subset"

file_path = hf_hub_download(
    repo_id=hf_dataset_identifier,
    filename=filename,
    repo_type="dataset",
)
print(file_path)

# NOTE(review): get_resource is assumed to map a relative name to a path
# inside the project's resource folder — confirm in mltools.file_util.
archive_destination = get_resource(filename)
shutil.copy(file_path, archive_destination)

In [None]:
import tarfile

# Open the archive in a context manager so the underlying file handle is
# closed as soon as extraction finishes (the bare tarfile.open(...) call
# left it open until garbage collection).
with tarfile.open(get_resource("UCF101_subset.tar.gz")) as archive:
    # filter='data' applies tarfile's safe extraction filter to every member.
    archive.extractall(get_resource("UCF101_subset"), filter='data')

In [None]:
import os
from mltools.file_util import count_files, get_subfolders, get_resource


def ucf101_subset_info(root_dir: str) -> None:
    """Print a short summary of a dataset directory.

    The subfolders of ``root_dir`` are treated as subsets, and the
    subfolders of the first subset as the class names. The number of classes
    and their names are printed, followed by one line per subset with the
    value returned by ``count_files(<subset path>, "avi")``.

    Args:
        root_dir: Directory whose immediate subfolders are the subsets.

    NOTE(review): assumes ``get_subfolders`` returns folder names relative
    to the directory it is given (they are joined back onto ``root_dir``),
    and that every subset contains the same classes as the first one —
    confirm against ``mltools.file_util``.
    """
    subset = get_subfolders(root_dir)
    if not subset:
        # Nothing to inspect: report zero classes instead of failing with an
        # IndexError on subset[0].
        print("Total classes: ", 0)
        print("class: ", [])
        return

    cls_name = get_subfolders(os.path.join(root_dir, subset[0]))
    print("Total classes: ", len(cls_name))
    print("class: ", cls_name)
    for folder in subset:
        # Compute the count outside the f-string: reusing double quotes inside
        # a replacement field is a SyntaxError before Python 3.12.
        avi_count = count_files(os.path.join(root_dir, folder), "avi")
        print(f"{folder}: {avi_count}")
        
# Summarise the extracted dataset.
# NOTE(review): the nested "UCF101_subset/UCF101_subset" path presumably
# reflects a top-level folder inside the archive — confirm after extraction.
ucf101_subset_info(get_resource("UCF101_subset/UCF101_subset"))

In [None]:
from mltools.file_util import get_file_list

# Build a mapping from each subfolder name of the videos directory (used as
# the class name) to the "avi" file list returned by get_file_list for it.
videos_dir = get_resource("bekhoaxe/videos")
cls_name = get_subfolders(videos_dir)
data_files = {
    label: get_file_list(os.path.join(videos_dir, label), "avi")
    for label in cls_name
}

In [None]:
import pandas as pd

# One column per class; shuffle the rows (frac=1 samples every row once) and
# renumber the index from zero.
df = pd.DataFrame(data_files).sample(frac=1).reset_index(drop=True)
print(df.head())

In [None]:
from mltools.list_utl import custom_size_chunking

# Fractions of the shuffled table assigned to train / test / validation.
train_f, test_f, val_f = 0.7, 0.2, 0.1
df_len = len(df)

# NOTE(review): custom_size_chunking is assumed to yield exactly one chunk
# per requested fraction, in the same order — confirm in mltools.list_utl.
train, test, validation = custom_size_chunking(
    df,
    [train_f, test_f, val_f],
)

print("Dataset size: ", df_len)
print("Split sizes: ", len(train), len(test), len(validation))

In [None]:
from mltools.file_util import copy_file, make_dir

# Root folder that will hold the split dataset.
splited_videos = get_resource("bekhoaxe/splited_videos")

# One destination folder per split, each located under the root above.
train_dir = get_resource("bekhoaxe/splited_videos/train")
test_dir = get_resource("bekhoaxe/splited_videos/test")
val_dir = get_resource("bekhoaxe/splited_videos/val")

def initialize_data_subset(data, subset_dir):
    """Copy the files listed in ``data`` into per-class folders of ``subset_dir``.

    For every class name in the module-level ``cls_name``, a folder
    ``subset_dir/<class>`` is prepared with ``make_dir`` and each path in
    ``data[<class>]`` is copied into it under its original base name.

    Args:
        data: Container indexed by class name whose entries iterate over
            source file paths.
        subset_dir: Destination directory for this subset.

    NOTE(review): relies on ``make_dir`` returning the directory path it
    was given or created — confirm in ``mltools.file_util``.
    """
    for label in cls_name:
        target_dir = make_dir(os.path.join(subset_dir, label))
        for src_path in data[label]:
            destination = os.path.join(target_dir, os.path.basename(src_path))
            copy_file(src_path, destination)

# Populate the train, test and validation folders, in that order.
for split_data, split_dir in (
    (train, train_dir),
    (test, test_dir),
    (validation, val_dir),
):
    initialize_data_subset(split_data, split_dir)


In [None]:
# Print the same summary for the newly created split folders as a sanity
# check on what was copied.
ucf101_subset_info(splited_videos)