# 1. Data Preparation

In [1]:
import os
import random
import shutil
import tarfile
from pathlib import Path

import cv2
import requests

In [ ]:
# Function to download a file from a URL
def download_file(url, filename, chunk_size=1 << 20, timeout=60):
    """Stream the resource at ``url`` into the local file ``filename``.

    Args:
        url: Address to fetch with an HTTP GET.
        filename: Path of the file to create (overwritten if it exists).
        chunk_size: Number of bytes requested per streamed chunk. The default
            of 1 MiB avoids the millions of tiny writes a very small chunk
            size causes on a large archive.
        timeout: Seconds to wait for the connection and for each read before
            giving up, so a stalled server cannot hang the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status. The
            check happens before ``filename`` is opened, so an error page is
            never saved in place of the dataset.
    """
    print("Getting the dataset...")
    # The context manager returns the connection to the pool even on failure.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_size):
                f.write(chunk)
    print("Dataset downloaded.")

# Local name the archive is saved under, and the GitHub location it comes from.
dataset_filename = "kaggle-dataset-6classes.tar"
dataset_url = "https://github.com/atiselsts/data/raw/master/kaggle-dataset-6classes.tar"

# The filename is relative, so the archive lands in the current working directory.
download_file(url=dataset_url, filename=dataset_filename)

In [ ]:
# Function to extract a tar file
def extract_tar(filename, path="."):
    """Unpack the tar archive ``filename`` into the directory ``path``.

    The archive comes from the network, so its member names are not trusted:
    a member that would be written outside ``path`` aborts the extraction
    instead of being unpacked.

    Args:
        filename: Path of the tar archive to open.
        path: Destination directory. Defaults to the current working
            directory, which is where the archive was unpacked before this
            parameter existed.

    Raises:
        tarfile.TarError: If the archive cannot be read or a member is
            rejected as unsafe.
    """
    print("Extracting the dataset...")
    with tarfile.open(filename) as tar:
        if hasattr(tarfile, "data_filter"):
            # Interpreters with extraction filters: let the stdlib reject
            # path traversal, unsafe links and special files.
            tar.extractall(path, filter="data")
        else:
            # Older interpreters: at least refuse members that resolve
            # outside the destination before unpacking anything.
            dest_root = os.path.realpath(path)
            for member in tar.getmembers():
                member_target = os.path.realpath(os.path.join(dest_root, member.name))
                if os.path.commonpath([dest_root, member_target]) != dest_root:
                    raise tarfile.ExtractError(
                        f"refusing to extract {member.name!r} outside {dest_root!r}"
                    )
            tar.extractall(path)
    print("Dataset extracted.")
    
# Unpack the archive that the download cell saved as `dataset_filename`.
extract_tar(filename=dataset_filename)

In [2]:
# Function to process videos and separate frames
def process_videos_and_frames(
    input_dir='./kaggle-dataset-6classes',
    output_videos_dir='./kaggle-dataset-6classes-preprocessed/videos',
    output_frames_dir='./kaggle-dataset-6classes-preprocessed/frames',
    n_classes=7,
    test_proportion=0.3,
    seed=123,
):
    """Split the videos into "test"/"trainval" and dump each video's frames as JPEGs.

    For every class directory ``0 .. n_classes - 1`` under ``input_dir``, each
    ``.mp4`` file is assigned to one subset, copied to
    ``output_videos_dir/<subset>/<class>/`` and decoded frame by frame into
    ``output_frames_dir/<subset>/<class>/frame_<n>_<video name>.jpg``. The
    subset is chosen per video, so all frames of a video share one subset.

    Args:
        input_dir: Directory holding one sub-directory per class.
        output_videos_dir: Root directory for the copied videos.
        output_frames_dir: Root directory for the extracted frames.
        n_classes: Number of class directories to read. NOTE(review): the
            default is 7 although the dataset name says "6classes" - confirm
            that directory "6" exists in the archive.
        test_proportion: Probability with which a video goes to "test".
        seed: Seed applied to the global ``random`` generator before the split.

    Raises:
        FileNotFoundError: If an expected class directory is missing from
            ``input_dir`` (raised by ``os.listdir``).
    """
    subsets = ("test", "trainval")
    class_dirs = [str(i) for i in range(n_classes)]

    def process_video(classdir, filename, target_subset):
        """Copy one video into its subset and write out all of its frames."""
        input_fullname = os.path.join(input_dir, classdir, filename)
        output_fullname = os.path.join(output_videos_dir, target_subset, classdir, filename)
        print(input_fullname, "->", target_subset)

        # Copy video file to the target directory without loading it into memory.
        shutil.copyfile(input_fullname, output_fullname)

        # Process frames
        frames_dir = os.path.join(output_frames_dir, target_subset, classdir)
        video_stem = os.path.splitext(filename)[0]
        vidcap = cv2.VideoCapture(input_fullname)
        try:
            frame_number = 0
            success, image = vidcap.read()
            while success:
                out_filename = f"frame_{frame_number}_{video_stem}.jpg"
                cv2.imwrite(os.path.join(frames_dir, out_filename), image)
                frame_number += 1
                success, image = vidcap.read()
        finally:
            # Release the decoder handle even if reading or writing fails.
            vidcap.release()

    random.seed(seed)  # make it repeatable

    # Prepare directories
    for output_root in (output_videos_dir, output_frames_dir):
        for subset in subsets:
            for classdir in class_dirs:
                os.makedirs(os.path.join(output_root, subset, classdir), exist_ok=True)

    # Process each video file
    for classdir in class_dirs:
        # os.listdir returns entries in arbitrary order; sorting makes the
        # seeded split identical on every platform.
        for filename in sorted(os.listdir(os.path.join(input_dir, classdir))):
            # Match on the extension, not on ".mp4" appearing anywhere in the name.
            if not filename.lower().endswith(".mp4"):
                continue
            target_subset = "test" if random.random() < test_proportion else "trainval"
            process_video(classdir, filename, target_subset)

    print("Preprocessing complete.")
    
# Run the split and frame extraction; one "<input path> -> <subset>" line is
# printed per video as it is processed.
process_videos_and_frames()

./kaggle-dataset-6classes\0\HandWash_001_A_11_G_01.mp4 -> test
./kaggle-dataset-6classes\0\HandWash_001_A_12_G_01.mp4 -> test
./kaggle-dataset-6classes\0\HandWash_002_A_11_G_01.mp4 -> trainval
./kaggle-dataset-6classes\0\HandWash_002_A_12_G_01.mp4 -> test
./kaggle-dataset-6classes\0\HandWash_003_A_11_G_01.mp4 -> trainval
./kaggle-dataset-6classes\0\HandWash_003_A_12_G_01.mp4 -> test
./kaggle-dataset-6classes\0\HandWash_004_A_11_G_01.mp4 -> trainval
./kaggle-dataset-6classes\0\HandWash_004_A_12_G_01.mp4 -> trainval
./kaggle-dataset-6classes\0\HandWash_005_A_11_G_01.mp4 -> trainval
./kaggle-dataset-6classes\0\HandWash_005_A_12_G_01.mp4 -> test
./kaggle-dataset-6classes\0\HandWash_006_A_11_G_01.mp4 -> trainval
./kaggle-dataset-6classes\0\HandWash_006_A_12_G_01.mp4 -> trainval
./kaggle-dataset-6classes\0\HandWash_007_A_11_G_01.mp4 -> test
./kaggle-dataset-6classes\0\HandWash_007_A_12_G_01.mp4 -> test
./kaggle-dataset-6classes\0\HandWash_008_A_11_G_01.mp4 -> trainval
./kaggle-dataset-6class