In [41]:
# imports
import os
import csv
import shutil
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import string
import random

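The word-frequency list (`unigram_freq.csv`, used in the Videos section to pick the most common words) comes from the Kaggle "English Word Frequency" dataset: https://www.kaggle.com/datasets/rtatman/english-word-frequency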

# Images

In [42]:
# define input and output folders
source_root = "images_folder"
output_root = "letters"

# define dataset split ratios (train and val are cut first; test receives whatever is left)
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# reproducibility
random.seed(42)

# file extensions treated as images; anything else (e.g. .DS_Store, Thumbs.db) is ignored
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp")


def is_image_file(filename):
    """Return True if `filename` ends with one of IMAGE_EXTENSIONS (case-insensitive)."""
    return filename.lower().endswith(IMAGE_EXTENSIONS)


def split_by_ratio(items, train_frac, val_frac):
    """Cut `items` (in their current order) into train / val / test lists.

    Args:
        items: sequence to split; it is sliced, not modified.
        train_frac: fraction of items for the train split (count is truncated to an int).
        val_frac: fraction of items for the val split (count is truncated to an int).

    Returns:
        dict with keys "train", "val" and "test"; "test" holds every item that
        did not fit into the first two splits.
    """
    n_total = len(items)
    n_train = int(train_frac * n_total)
    n_val = int(val_frac * n_total)
    return {
        "train": items[:n_train],
        "val": items[n_train:n_train + n_val],
        "test": items[n_train + n_val:],
    }


# create output subfolders
splits = ["train", "val", "test"]
for split in splits:
    os.makedirs(os.path.join(output_root, split), exist_ok=True)

# loop through each class subfolder
for root, dirs, files in os.walk(source_root):
    # os.walk yields entries in arbitrary order; sorting in place fixes the
    # traversal order so the seeded shuffle below gives the same split every run
    dirs.sort()

    if root == source_root:  # skip folder itself
        continue

    label = os.path.basename(root)

    # get images (sorted for the same reproducibility reason as above)
    image_files = sorted(f for f in files if is_image_file(f))
    if not image_files:
        # nothing to copy: do not create class folders that would stay empty in every split
        continue

    # shuffle for random splitting
    random.shuffle(image_files)

    # split into train/test/val
    split_sets = split_by_ratio(image_files, train_ratio, val_ratio)

    # copy images into structured folders: <output_root>/<split>/<label>/<label>_<idx><ext>
    for split_name, files_in_split in split_sets.items():
        split_label_folder = os.path.join(output_root, split_name, label)
        os.makedirs(split_label_folder, exist_ok=True)

        for idx, file in enumerate(files_in_split, start=1):
            image_id = f"{label.lower()}_{idx}"
            # keep the real extension instead of relabelling every file as .jpg
            extension = os.path.splitext(file)[1].lower()
            src_path = os.path.join(root, file)
            dst_path = os.path.join(split_label_folder, f"{image_id}{extension}")
            shutil.copy2(src_path, dst_path)


# Videos

In [43]:
# TODO: choose how many words to include in the dataset
n = 40  # number of most common words to include (the frequency-filtered word list is truncated to its first n entries)

In [44]:
def extract_frames(video_path, output_dir, video_id):
    """Extract every frame of a video and save each one as a JPEG in `output_dir`.

    Frames are written in read order as ``frame_0000.jpg``, ``frame_0001.jpg``, ...

    Args:
        video_path: path of the video file to read.
        output_dir: folder the frames are written to; created if it does not exist.
        video_id: identifier of the video. Not used by this function; kept so
            existing callers keep working.

    Returns:
        int: number of frames successfully written. 0 when the video could not
        be opened (a warning is emitted) or contained no frames.
    """
    import warnings

    os.makedirs(output_dir, exist_ok=True)
    vid = cv2.VideoCapture(video_path)
    frames_written = 0
    try:
        if not vid.isOpened():
            # surface unreadable files instead of silently producing an empty folder
            warnings.warn(f"could not open video: {video_path}")
            return 0

        currentframe = 0
        while True:
            ret, frame = vid.read()
            if not ret:
                break
            frame_name = f"frame_{currentframe:04d}.jpg"
            # imwrite reports failure through its return value rather than raising
            if cv2.imwrite(os.path.join(output_dir, frame_name), frame):
                frames_written += 1
            currentframe += 1
    finally:
        # always free the capture handle, even if reading or writing raised
        vid.release()
    # No GUI window is created here, so cv2.destroyAllWindows() is not called
    # (it can raise in OpenCV builds without GUI support).
    return frames_written

In [45]:
# define input and output folders
source_root = "videos_folder"
output_root = "words"

# read and filter word list: one sub-folder per word, single letters excluded
# (sorted because os.listdir returns entries in arbitrary order)
all_words = sorted(d for d in os.listdir(source_root) if os.path.isdir(os.path.join(source_root, d)))
alphabet = list(string.ascii_lowercase)
all_words = [w for w in all_words if w not in alphabet]

# filter by frequency
# keep_default_na=False keeps literal words such as "null" or "nan" as strings
# instead of letting pandas parse them as missing values.
# NOTE(review): the first n rows are taken as "most common", which assumes the CSV
# rows are already ordered from most to least frequent — confirm.
word_freq = pd.read_csv("unigram_freq.csv", keep_default_na=False)
word_freq = word_freq.loc[word_freq["word"].isin(all_words)].reset_index(drop=True)
words_train = word_freq["word"].iloc[:n].tolist()

# create main output folders
for split in ["train", "val", "test"]:
    os.makedirs(os.path.join(output_root, split), exist_ok=True)

# Dedicated, seeded generator: the split no longer depends on how much randomness
# earlier cells consumed, so re-running this cell assigns every video to the same
# split and overwrites its frames instead of scattering them across splits.
split_rng = random.Random(42)

# traverse through dataset
for word in words_train:
    word_folder = os.path.join(source_root, word)
    if not os.path.isdir(word_folder):
        continue

    # collect all video files for that word; hidden files (e.g. .DS_Store) and
    # sub-directories are skipped, and the listing is sorted for reproducibility
    video_files = sorted(
        os.path.join(word_folder, f)
        for f in os.listdir(word_folder)
        if not f.startswith(".") and os.path.isfile(os.path.join(word_folder, f))
    )
    if not video_files:
        continue

    # shuffle for random splitting
    split_rng.shuffle(video_files)

    # split into train/test/val (test receives whatever train and val leave over)
    n_total = len(video_files)
    n_train = int(train_ratio * n_total)
    n_val = int(val_ratio * n_total)

    split_sets = {
        "train": video_files[:n_train],
        "val": video_files[n_train:n_train + n_val],
        "test": video_files[n_train + n_val:]
    }

    # process and extract frames into <output_root>/<split>/<word>/<word>_<idx>/
    for split_name, split_videos in split_sets.items():
        for idx, video_path in enumerate(split_videos, start=1):
            video_id = f"{word}_{idx}"
            output_dir = os.path.join(output_root, split_name, word, video_id)
            os.makedirs(output_dir, exist_ok=True)
            extract_frames(video_path, output_dir, video_id)


# Test Data Loader in PyTorch

In [46]:
from torchvision import datasets, transforms

# letters / images: resize every image to 128x128, then convert it to a tensor
letter_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# one ImageFolder per split, built in train -> val -> test order
letter_train, letter_val, letter_test = [
    datasets.ImageFolder(root=split_dir, transform=letter_transform)
    for split_dir in ('letters/train', 'letters/val', 'letters/test')
]


# words / videos: the extracted frames go through the same resize + tensor pipeline
word_transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# one ImageFolder per split, built in train -> val -> test order
word_train, word_val, word_test = [
    datasets.ImageFolder(root=split_dir, transform=word_transform)
    for split_dir in ('words/train', 'words/val', 'words/test')
]