# 1. Preprocess the dataset

## 1.1 Extract frames from videos
My dataset is organized so that each top-level folder represents a gloss sentence. For instance:
```bash
VSL/
├── Ban_an_chua/     # Gloss: "Ban_an_chua"
│   ├── Signer01/
│   │   ├── video1.mp4
│   │   ├── video2.mp4
│   │   └── video3.mp4
│   ├── Signer02/
│   │   ├── video1.mp4
│   │   ├── video2.mp4
│   │   └── video3.mp4
│   ├── Signer03/
│   └── Signer04/
└── 7_gio_toi/       # Gloss: "7_gio_toi"
    ├── Signer01/
    │   ├── video1.mp4
    │   └── video2.mp4
    ├── Signer02/
    └── ...

```

To do that, I will write code that follows these steps:
1. Traverse the dataset, which is organized by gloss (e.g. "Ban_an_chua", "7_gio_toi").
2. For each gloss folder, iterate over all signer subfolders and their video files.
3. Create one output folder per video, numbered sequentially within each gloss (0, 1, 2, …).
4. Extract every second frame from each video and save it as a PNG image under `./data/interim/VSL`.

The extracted frames will be saved in the following structure:
```bash
interim/
└── VSL/
    ├── Ban_an_chua/
    │   ├── 0/
    │   │   ├── frame0000.png
    │   │   ├── frame0001.png
    │   │   └── ...
    │   ├── 1/
    │   └── ...
    └── 7_gio_toi/
        ├── 0/
        │   ├── frame0000.png
        │   └── ...
        ├── 1/
        └── ...
```

In [None]:
import os
import cv2
from attr import dataclass
from tqdm import tqdm # Progress bar

In [None]:
!cd /home/martinvalentine/Desktop/sign-language-lstm/data

In [None]:
# Relative paths; the notebook is meant to be run from the data/ directory.
# dataset_path: raw videos, traversed below as <gloss>/<signer>/<video file>
dataset_path = "./raw/VSL"
# destination_folder: root under which the extracted frames are written
destination_folder = "./interim/VSL"

In [None]:
# Make sure the output root exists; exist_ok=True keeps re-running this cell harmless
os.makedirs(destination_folder, exist_ok=True)

In [None]:
# Walk the raw dataset as <gloss>/<signer>/<video> and write every 2nd frame of
# each video to <destination_folder>/<gloss>/<n>/frameNNNN.png, where <n> is a
# per-gloss counter shared by all signers of that gloss.
for gloss in sorted(os.listdir(dataset_path)):
    gloss_input_path = os.path.join(dataset_path, gloss)
    if not os.path.isdir(gloss_input_path):
        continue

    # Replace spaces with dashes in the gloss folder name for output
    gloss_out_name = gloss.replace(" ", "-")
    gloss_output_path = os.path.join(destination_folder, gloss_out_name)
    os.makedirs(gloss_output_path, exist_ok=True)

    video_count = 0  # Counter to number each video sequentially
    for signer in sorted(os.listdir(gloss_input_path)):
        signer_input_path = os.path.join(gloss_input_path, signer)
        if not os.path.isdir(signer_input_path):
            continue

        for video_file in sorted(os.listdir(signer_input_path)):
            if not video_file.lower().endswith((".mp4", ".avi", ".mkv", ".mov")):
                continue

            video_path = os.path.join(signer_input_path, video_file)

            cap = cv2.VideoCapture(video_path)
            if not cap.isOpened():
                cap.release()
                print(f"Failed to open {video_path}")
                continue

            # Create the numbered folder only once the video is known to open, so a
            # broken file cannot leave an empty folder for later steps to pick up.
            video_output_folder = os.path.join(gloss_output_path, str(video_count))
            os.makedirs(video_output_folder, exist_ok=True)

            frame_idx = 0      # Index of the frame in the video
            save_idx = 0       # Index used for saved frames in the output folder
            try:
                # Extract frames, saving only every 2nd frame
                while True:
                    ret, frame = cap.read()
                    if not ret:
                        break  # End of video

                    if frame_idx % 2 == 0:
                        out_filename = f"frame{save_idx:04d}.png"
                        out_path = os.path.join(video_output_folder, out_filename)
                        # Only count frames that were actually written, so the
                        # numbering of saved frames stays contiguous.
                        if cv2.imwrite(out_path, frame):
                            save_idx += 1
                        else:
                            print(f"Failed to write {out_path}")

                    frame_idx += 1
            finally:
                # Release the capture even if reading or writing raises
                cap.release()

            print(f"Extracted {save_idx} frames (every 2nd frame) from {video_path} into folder {video_output_folder}")
            video_count += 1

print("Frame extraction complete!")


## 1.2 Generate the annotation file
In this step, I will generate the annotation file for the dataset. It is a CSV file with two fields: video, orth
- video: a path pattern like "Ban-an-chua/0/*.png" (gloss folder name with spaces replaced by dashes)
- orth: the gloss name with the dashes turned back into spaces, e.g., "Ban an chua"

In [None]:
cd /home/martinvalentine/Desktop/sign-language-lstm/data

In [None]:
import os
import csv

In [None]:
# dataset_path is rebound here: it now points at the extracted frames
# (one sub-folder per gloss), not at the raw videos used in section 1.1.
dataset_path = "./interim/VSL"
# Directory that receives the annotation file
annotation_file_path = "./processed"
# Full path of the CSV written below
output_csv = os.path.join(annotation_file_path, "annotations.csv")

In [None]:
# Make sure the annotation directory exists before the CSV is opened for writing
os.makedirs(annotation_file_path, exist_ok=True)

In [None]:
# Write the annotation CSV: a header row, then one row per numbered video folder
# holding the frame glob pattern ("video") and the gloss text ("orth").
with open(output_csv, "w", newline="", encoding="utf-8") as csv_handle:
    csv_rows = csv.writer(csv_handle, delimiter=",")
    csv_rows.writerow(["video", "orth"])

    for gloss_dir in sorted(os.listdir(dataset_path)):
        gloss_root = os.path.join(dataset_path, gloss_dir)
        if os.path.isdir(gloss_root):
            # The folder name is kept as-is in the pattern; the gloss text gets its dashes turned into spaces
            spoken_gloss = gloss_dir.replace("-", " ")

            # Only sub-directories count as videos; stray files are ignored
            numbered_folders = (
                entry
                for entry in sorted(os.listdir(gloss_root))
                if os.path.isdir(os.path.join(gloss_root, entry))
            )
            csv_rows.writerows(
                [f"{gloss_dir}/{entry}/*.png", spoken_gloss]
                for entry in numbered_folders
            )

print(f"Annotation CSV created at {output_csv}")

In [62]:
# Sanity check: read the annotation CSV back from disk
import pandas as pd
df = pd.read_csv(output_csv)

# Show up to the first 20 rows (columns: video, orth)
df.head(20)

Unnamed: 0,video,orth
0,7-gio-toi_nay-ban-ranh-khong/0/*.png,7 gio toi_nay ban ranh khong
1,7-gio-toi_nay-ban-ranh-khong/1/*.png,7 gio toi_nay ban ranh khong
2,7-gio-toi_nay-ban-ranh-khong/2/*.png,7 gio toi_nay ban ranh khong
3,7-gio-toi_nay-ban-ranh-khong/3/*.png,7 gio toi_nay ban ranh khong
4,7-gio-toi_nay-ban-ranh-khong/4/*.png,7 gio toi_nay ban ranh khong
5,7-gio-toi_nay-ban-ranh-khong/5/*.png,7 gio toi_nay ban ranh khong
6,7-gio-toi_nay-ban-ranh-khong/6/*.png,7 gio toi_nay ban ranh khong
7,7-gio-toi_nay-ban-ranh-khong/7/*.png,7 gio toi_nay ban ranh khong
8,7-gio-toi_nay-ban-ranh-khong/8/*.png,7 gio toi_nay ban ranh khong
9,Ba-con-thuong/0/*.png,Ba con thuong


## 1.3 Generate the info dictionary
Before generating the gloss dictionary, I need to create an information dictionary that contains the following information for each video:
- fileid: the unique ID of the video
- folder: the path pattern of the video frames
- signer: the signer of the video (not stored by the code below, because the signer folders are merged during frame extraction)
- label: the gloss label
- num_frames: the number of frames in the video
- original_info: the original information string

In [None]:
import os
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Annotation CSV produced in section 1.2; it is written to ./processed
# (the previous ./interim/annotations location is never created by this notebook).
anno_path = "./processed/annotations.csv"
# Directory that receives the processed information dictionary
info_dict_path = "./processed"

# The prefix (base folder) where frames are stored
prefix = "./interim/VSL"

# Create the destination folder if it does not exist, then build the output file path
os.makedirs(info_dict_path, exist_ok=True)
info_dict_file = os.path.join(info_dict_path, "info_dict.npy")


In [None]:
# Load the annotation CSV; the loop below reads its "video" and "orth" columns
df = pd.read_csv(anno_path)

In [None]:
# Turn the DataFrame into a list with one dict per CSV row (column name -> value)
records = df.to_dict(orient="records")
# Echo the list so the notebook displays it
records

In [None]:
# Seed the info dictionary with the frame root under the 'prefix' key;
# the per-video entries are added under integer keys by the loop that follows.
info_dict = {'prefix': prefix}
print(f"Generate information dict from {anno_path}")

In [None]:
# Add one entry per annotation row, keyed by the row's position in `records`.
for file_id, row in tqdm(enumerate(records), total=len(records)):
    pattern = row["video"]      # e.g., "7-gio-toi_nay-ban-ranh-khong/0/*.png"
    gloss_text = row["orth"]    # The orth column is used as the gloss label

    # Strip the trailing glob part and prepend the frame root to get the real folder
    frames_dir = os.path.join(info_dict['prefix'], pattern.rsplit("/*.png", 1)[0])
    # Count the PNG frames currently on disk for this video
    frame_paths = glob.glob(os.path.join(frames_dir, "*.png"))

    info_dict[file_id] = dict(
        fileid=file_id,
        folder=frames_dir,  # Includes the prefix
        label=gloss_text,
        num_frames=len(frame_paths),
        original_info=f"{pattern},{gloss_text}",
    )

In [None]:
# Preview the first five entries (in insertion order, so 'prefix' comes first)
dict(list(info_dict.items())[:5])

In [None]:
# Save the info_dict as a .npy file.
# np.save wraps the plain dict in an object array, which is why the loading
# cell needs allow_pickle=True and .item() to get the dict back.
np.save(info_dict_file, info_dict)

In [48]:
# Load the saved info_dict file and unwrap the stored dict with .item().
# NOTE: allow_pickle=True executes pickled content; only load files you created yourself.
info_dict = np.load(info_dict_file, allow_pickle=True).item()
# Echo the dictionary so the notebook displays it
info_dict

{'prefix': './interim/VSL',
 0: {'fileid': 0,
  'folder': './interim/VSL/7-gio-toi_nay-ban-ranh-khong/0',
  'label': '7 gio toi_nay ban ranh khong',
  'num_frames': 104,
  'original_info': '7-gio-toi_nay-ban-ranh-khong/0/*.png,7 gio toi_nay ban ranh khong'},
 1: {'fileid': 1,
  'folder': './interim/VSL/7-gio-toi_nay-ban-ranh-khong/1',
  'label': '7 gio toi_nay ban ranh khong',
  'num_frames': 102,
  'original_info': '7-gio-toi_nay-ban-ranh-khong/1/*.png,7 gio toi_nay ban ranh khong'},
 2: {'fileid': 2,
  'folder': './interim/VSL/7-gio-toi_nay-ban-ranh-khong/2',
  'label': '7 gio toi_nay ban ranh khong',
  'num_frames': 108,
  'original_info': '7-gio-toi_nay-ban-ranh-khong/2/*.png,7 gio toi_nay ban ranh khong'},
 3: {'fileid': 3,
  'folder': './interim/VSL/7-gio-toi_nay-ban-ranh-khong/3',
  'label': '7 gio toi_nay ban ranh khong',
  'num_frames': 134,
  'original_info': '7-gio-toi_nay-ban-ranh-khong/3/*.png,7 gio toi_nay ban ranh khong'},
 4: {'fileid': 4,
  'folder': './interim/VSL/7-g

## 1.4 Generate the gloss dictionary
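The gloss dictionary is a Python dictionary that maps each unique gloss token to a two-element list `[id, count]`: an integer ID (starting at 1) and the number of times the token occurs in the labels. The dictionary will be saved as a `.npy` file.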

In [None]:
import os
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm

In [42]:
# info_dict_path is rebound here to the info dictionary FILE
# (in section 1.3 the same name held the ./processed directory).
info_dict_path = "./processed/info_dict.npy"
# Directory and file name for the gloss dictionary
gloss_dict_path = "./processed"
gloss_dict_file = os.path.join(gloss_dict_path, "gloss_dict.npy")

In [None]:
# gloss_dict maps a gloss token to [id, occurrence count]; it is filled by the loop below
gloss_dict = {}
next_id = 1 # Start from 1, 0 is reserved for the blank token

In [None]:
# Read the info dictionary file; .item() unwraps the dict stored inside the object array.
# NOTE: allow_pickle=True executes pickled content; only load files you created yourself.
info_dict = np.load(info_dict_path, allow_pickle=True).item()

In [None]:
# Assign an ID to every gloss token in first-seen order and count its occurrences.
for entry_key, entry in info_dict.items():
    # Video entries live under integer keys; anything else (e.g. 'prefix') is metadata
    if isinstance(entry_key, int):
        # split() without arguments already discards surrounding whitespace and empty
        # pieces, so every token of a multi-word label is counted separately.
        for gloss_token in entry['label'].split():
            known = gloss_dict.get(gloss_token)
            if known is None:
                gloss_dict[gloss_token] = [next_id, 1]
                next_id += 1
            else:
                known[1] += 1

In [None]:
# Persist the gloss dictionary: the cell that follows loads gloss_dict_file from
# disk, and nothing else in this notebook writes that file.
os.makedirs(gloss_dict_path, exist_ok=True)
np.save(gloss_dict_file, gloss_dict)
# Echo the dictionary so the notebook displays it
gloss_dict

In [43]:
# Load the saved gloss_dict file and unwrap the stored dict with .item().
# NOTE: allow_pickle=True executes pickled content; only load files you created yourself.
gloss_dict = np.load(gloss_dict_file, allow_pickle=True).item()
# Echo the dictionary so the notebook displays it
gloss_dict

{'7': [1, 9],
 'gio': [2, 9],
 'toi_nay': [3, 9],
 'ban': [4, 30],
 'ranh': [5, 9],
 'khong': [6, 33],
 'Ba': [7, 12],
 'con': [8, 12],
 'thuong': [9, 12],
 'Ban': [10, 123],
 'an': [11, 48],
 'chua': [12, 12],
 'cung': [13, 33],
 'di': [14, 23],
 'ai': [15, 11],
 'gia': [16, 10],
 'dinh': [17, 10],
 'bao_nhieu': [18, 34],
 'keo': [19, 12],
 'khoe': [20, 24],
 'lam_gi': [21, 9],
 'nha': [22, 9],
 'o_dau': [23, 9],
 'ten_gi': [24, 12],
 'tra': [25, 12],
 'thich': [26, 70],
 'tuoi': [27, 12],
 'Chuc': [28, 12],
 'ngu_ngon': [29, 12],
 'Co': [30, 9],
 'chuyen_gi': [31, 9],
 'Hen': [32, 12],
 'gap_lai': [33, 12],
 'Hom_nay': [34, 12],
 'toi': [35, 24],
 'vui': [36, 12],
 'o_day': [37, 12],
 'Moi_nguoi': [38, 11],
 'gia_dinh': [39, 11],
 'yeu_thuong': [40, 11],
 'Nguoi_ay': [41, 9],
 'la_ai': [42, 9],
 'O': [43, 12],
 'truong': [44, 23],
 'ban_than': [45, 12],
 'co': [46, 12],
 'Tai_sao': [47, 9],
 'cuoi': [48, 9],
 'Toi': [49, 164],
 'sang': [50, 12],
 'bun_dau': [51, 12],
 'buon': [52, 12

In [44]:
# Count the number of unique gloss tokens (one dictionary key per token)
num_labels = len(gloss_dict)

print(f"Total unique labels in gloss_dict: {num_labels}")

Total unique labels in gloss_dict: 64


## 1.5 Generate the ground truth STM file

Next, we need to generate the ground truth STM file. The STM file is a text file that contains the ground truth information for the dataset. Each line in the file represents a segment of the video with the following format:
```python
{fileid} 1 {signer} {start_time} {end_time} {label}
```
Where:
- `fileid`: A unique identifier for the video or video segment.
- `1`: A placeholder for the channel identifier.
- `signer`: The identifier for the signer (if available; you can use a default value if not).
- `start_time`: The starting timestamp (often 0.0 if unknown).
- `end_time`: The ending timestamp (often a large number like 1.79769e+308 to indicate the full duration).
- `label`: The gloss (annotation) for that segment.

Because the videos of all signers were merged during frame extraction, I will use `UnknownSigner` as a placeholder for the signer.

In [50]:
import pandas as pd
import os

In [70]:
# Load CSV file (containing video and orth columns)
df = pd.read_csv("./processed/annotations.csv")

# Directory that receives the STM file
stm_file_path = "./processed"
# Full path of the STM file.
# NOTE(review): the file is written below as plain text, yet it carries a .npy
# extension; confirm what the downstream evaluation expects before renaming it.
stm_file = os.path.join(stm_file_path, "vsl-ground-truth-dev.npy")


In [71]:
# Write one STM line per annotated video:
#   <fileid> 1 <signer> <start_time> <end_time> <gloss sequence>
# The CSV carries no signer information, so every line uses the same placeholder.
signer = "UnknownSigner"

# Explicit UTF-8 (the annotation CSV is written as UTF-8 too) keeps the output
# independent of the platform's default encoding.
with open(stm_file, "w", encoding="utf-8") as f:
    for video_path, orth in zip(df["video"], df["orth"]):
        # video_path example: "7-gio-toi_nay-ban-ranh-khong/0/*.png"
        label = orth.upper()  # Convert gloss labels to uppercase

        # Drop the trailing "*.png" component; keeps "7-gio-toi_nay-ban-ranh-khong/0"
        fileid = "/".join(video_path.split("/")[:-1])

        # Start 0.0 and end 1.79769e+308 mark the segment as spanning the whole video
        f.write(f"{fileid} 1 {signer} 0.0 1.79769e+308 {label}\n")

print(f"Ground truth STM saved to {stm_file}")

Ground truth STM saved to ./processed/vsl-ground-truth-dev.npy


In [72]:
# Preview the first 10 lines of the saved STM file
with open(stm_file, "r") as f:
    lines = f.readlines()

print("Ground Truth STM File:")
print("=" * 60)
# Only the first 10 lines are shown; printing every entry floods the notebook output
for line in lines[:10]:
    # Remove newline and split into parts; split into max 6 parts so the gloss remains intact.
    parts = line.strip().split(" ", 5)
    if len(parts) < 6:
        continue  # Not a complete STM line
    fileid, channel, signer, start_time, end_time, gloss = parts
    print(f"FileID    : {fileid}")
    print(f"Channel   : {channel}")
    print(f"Signer    : {signer}")
    print(f"StartTime : {start_time}")
    print(f"EndTime   : {end_time}")
    print(f"Gloss     : {gloss}")
    print("-" * 60)


Ground Truth STM File:
FileID    : 7-gio-toi_nay-ban-ranh-khong/0
Channel   : 1
Signer    : UnknownSigner
StartTime : 0.0
EndTime   : 1.79769e+308
Gloss     : 7 GIO TOI_NAY BAN RANH KHONG
------------------------------------------------------------
FileID    : 7-gio-toi_nay-ban-ranh-khong/1
Channel   : 1
Signer    : UnknownSigner
StartTime : 0.0
EndTime   : 1.79769e+308
Gloss     : 7 GIO TOI_NAY BAN RANH KHONG
------------------------------------------------------------
FileID    : 7-gio-toi_nay-ban-ranh-khong/2
Channel   : 1
Signer    : UnknownSigner
StartTime : 0.0
EndTime   : 1.79769e+308
Gloss     : 7 GIO TOI_NAY BAN RANH KHONG
------------------------------------------------------------
FileID    : 7-gio-toi_nay-ban-ranh-khong/3
Channel   : 1
Signer    : UnknownSigner
StartTime : 0.0
EndTime   : 1.79769e+308
Gloss     : 7 GIO TOI_NAY BAN RANH KHONG
------------------------------------------------------------
FileID    : 7-gio-toi_nay-ban-ranh-khong/4
Channel   : 1
Signer    : Unkn

Next, I will resize the images to a 256x256 resolution. This is a common size for image-based models and will help reduce the computational requirements for training.

## 1.6 Resize the images

I will resize the images to a resolution of 256x256 pixels. This will help standardize the image sizes and reduce the computational requirements for training the model.


In [8]:
cd /home/martinvalentine/Desktop/sign-language-lstm/data

/home/martinvalentine/Desktop/sign-language-lstm/data


In [2]:
import cv2
import os
import numpy as np
from tqdm import tqdm

In [3]:
# Function to resize an image
def resize_img(img_path, dsize='256x256'):
    """Load an image from disk and resize it.

    Args:
        img_path: Path of the image file to read.
        dsize: Target size as a "WIDTHxHEIGHT" string, e.g. "256x256".

    Returns:
        The resized image, or None when the file cannot be read as an image.

    Raises:
        ValueError: If a component of ``dsize`` is not an integer.
    """
    dsize = tuple(int(res) for res in dsize.split("x"))
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None;
        # pass that on so the caller can skip the file instead of crashing in cv2.resize.
        return None
    # Lanczos interpolation is used for better quality
    return cv2.resize(img, dsize, interpolation=cv2.INTER_LANCZOS4)

In [11]:
# Define the source and destination folders.
# NOTE(review): the source is ./interim/VSL_Sample, not the ./interim/VSL folder
# produced in section 1.1 - presumably a sample subset; confirm before a full run.
source_folder = "./interim/VSL_Sample"
# destination_folder is rebound here to the root for the resized images
destination_folder = "./processed/VSL_Sample_256x256"

In [12]:
# Make sure the output root for the resized images exists; exist_ok=True keeps re-runs harmless
os.makedirs(destination_folder, exist_ok=True)

In [13]:
# Resize every PNG under source_folder and write it to the same relative
# location under destination_folder.
for root, dirs, files in os.walk(source_folder):
    png_files = [name for name in files if name.endswith(".png")]  # Process only PNG images
    if not png_files:
        continue  # Nothing to resize here, so no output directory is created

    # The destination directory is the same for every file in `root`,
    # so compute and create it once per directory instead of once per image.
    rel_path = os.path.relpath(root, source_folder)  # Relative path inside source_folder
    dst_dir = os.path.join(destination_folder, rel_path)  # Matching structure in output folder
    os.makedirs(dst_dir, exist_ok=True)

    for file in png_files:
        src_path = os.path.join(root, file)
        dst_path = os.path.join(dst_dir, file)  # Destination file path
        resized_img = resize_img(src_path)  # Resize image

        if resized_img is None:
            # Report unreadable inputs instead of dropping them silently
            print(f"Skipped unreadable image: {src_path}")
            continue

        # cv2.imwrite returns False when the image could not be written
        if cv2.imwrite(dst_path, resized_img):
            print(f"Saved: {dst_path}")
        else:
            print(f"Failed to save: {dst_path}")


Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0002.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0035.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0033.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0004.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0031.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0007.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0024.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0032.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0014.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0003.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0022.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0009.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0012.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0015.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0023.png
Saved: ./processed/VSL_Sample_256x256/Toi-khoe/5/frame0

We have successfully resized the images to 256x256 pixels. The preprocessing steps are now complete.