# Video Preprocessing

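This notebook runs pose-landmark and motion detection on every video in the target dataset, preprocesses the resulting landmark data, and then renames the output files (and updates their metadata) so that filenames contain no accented characters.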

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import json
import gc
from tqdm.notebook import tqdm
import unicodedata

In [3]:
from preprocess.video_analyzer import VideoAnalyzer, analyze_none_landmarks
from preprocess.preprocessor import Preprocessor

2025-05-30 11:38:13.231804: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748572693.258032   24855 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748572693.266310   24855 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-30 11:38:13.299829: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Settings

In [4]:
# Project root directory. The default is the original developer checkout;
# set the BSL_PROJECT_ROOT environment variable to run the notebook on
# another machine without editing this cell.
path_to_root = os.path.join(
    os.environ.get(
        "BSL_PROJECT_ROOT",
        "/home/ben/projects/SaoPauloBrazilChapter_BrazilianSignLanguage/",
    ),
    "",  # joining with "" guarantees a trailing separator, like the original literal
)

# `timestamp`, `motion_version` and `pose_version` together name the analysis
# output folder read by the preprocessing step.
timestamp = "04242025"
motion_version = "versionB"
pose_version = "versionB"

# Selects the preprocessed landmarks folder and the landmarks metadata CSV.
preprocessing_version = "v4"
preprocessing_version = "v4"

In [5]:
# Load the per-video metadata table of the target dataset; each row
# describes one video and is passed to the analyzer / preprocessor below.
metadata_csv_path = os.path.join(
    path_to_root, "data", "raw", "combined", "target_dataset_video_metadata.csv"
)
metadata = pd.read_csv(metadata_csv_path)

### Run Video Analysis

Gets the Landmark data and motion detection data

In [None]:
# Run pose and motion analysis for every video listed in `metadata` and save
# the per-video analysis info.
n_videos = len(metadata)
for position, (_, metadata_row) in enumerate(metadata.iterrows(), start=1):
    # Count by position rather than by index label so the progress message
    # stays correct even if `metadata` has been filtered or re-indexed.
    print(f"\rProcessing video {position} of {n_videos}: {metadata_row.filename}", end="")

    # NOTE(review): presumably meant to silence TensorFlow native logging
    # during pose detection — confirm this still has an effect once
    # TensorFlow has already been imported.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    analyzer = VideoAnalyzer(
        metadata_row,
        timestamp,
        path_to_root,
        verbose=False,
        motion_detection_version=motion_version,
        pose_detection_version=pose_version
    )

    # Pose: detect first, then analyze. The return values were never used in
    # this notebook, so they are not bound to names.
    analyzer.pose_detect()
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    analyzer.pose_analyze()

    # Restore the default log level before the motion steps.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '0'
    analyzer.motion_detect()
    analyzer.motion_analyze()

    analyzer.save_analysis_info()

### Run Preprocessing

Preprocesses the landmark data

In [6]:
# Folder holding the per-video analysis info for the selected analysis run.
analysis_folder = os.path.join(
    path_to_root,
    "data",
    "interim",
    "Analysis",
    f"{timestamp}_motion{motion_version}_pose{pose_version}",
)

# Parameters that are identical for every video; the per-video frame range
# is added inside the loop.
base_preprocessing_params = {
    "face_width_aim": 0.155,
    "shoulders_width_aim": 0.35,
    "face_midpoint_to_shoulders_height_aim": 0.275,
    "shoulders_y_aim": 0.52,
    "use_statistic": "mean",
    "use_stationary_frames": True,
    "skip_stationary_frames": False,
}

for i, metadata_row in tqdm(metadata[:].iterrows(), total=len(metadata)):
    gc.collect()

    # Load the analysis info saved for this video.
    video_stem = metadata_row["filename"].split(".")[0]
    analysis_info_path = os.path.join(analysis_folder, video_stem + "_analysis_info.json")
    with open(analysis_info_path) as f:
        analysis_info = json.load(f)

    # The frame range to keep comes from the motion analysis of this video.
    motion_analysis = analysis_info['motion_analysis']
    preprocessing_params = {
        **base_preprocessing_params,
        "start_frame": motion_analysis['start_frame'],
        "end_frame": motion_analysis['end_frame'],
    }

    preprocessor = Preprocessor(
        metadata_row,
        preprocessing_params,
        path_to_root,
        preprocess_version=preprocessing_version,
        verbose=False,
        save_intermediate=True,
    )

    preprocessor.preprocess_landmarks()
    # preprocessor.preprocess_video()

    # Force garbage collection after each video
    gc.collect()

  0%|          | 0/150 [00:00<?, ?it/s]

### Rename Files

Colab can't handle non-ASCII characters in the filenames, so we need to rename the files to remove accents

- Need to rename the npy & json files
- Need to edit filenames in the metadata csv and keep the old ones as well
- Need to add the new/old columns to the metadata csv

In [7]:
def detect_accents(text: str) -> bool:
    """Return True if `text` contains at least one accent (combining mark).

    The text is decomposed with Unicode NFD normalization, which splits an
    accented character into a base character followed by combining marks
    (Unicode category ``Mn``). The function reports whether any such mark
    is present and stops scanning at the first one found.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return any(unicodedata.category(ch) == 'Mn' for ch in decomposed)

def strip_accents(text):
    """Return `text` with all accents (combining marks) removed.

    The text is decomposed with Unicode NFD normalization and every
    character of category ``Mn`` is dropped. All other characters are kept,
    whether or not they are ASCII.
    """
    decomposed = unicodedata.normalize('NFD', text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

Remove accents from `filename` & `label` in the metadata csv

In [34]:
# Root of the preprocessed outputs; also holds the landmarks metadata CSV.
preprocessed_folder = os.path.join(path_to_root, "data", "preprocessed")

# Landmark `.npy` files for the selected preprocessing version.
landmarks_folder = os.path.join(
    preprocessed_folder,
    "landmarks",
    preprocessing_version,
)

# Per-video `.json` metadata files that accompany the landmark files.
landmarks_metadata_json_folder = os.path.join(
    landmarks_folder,
    "individual_metadata",
)

In [35]:
landmarks_metadata_path = os.path.join(preprocessed_folder, f"landmarks_metadata_{preprocessing_version}.csv")
landmarks_metadata = pd.read_csv(landmarks_metadata_path)

# This CSV is later overwritten in place with the new columns, so on a re-run
# the `*_accented` columns already exist. `DataFrame.insert` raises on an
# existing column, and copying `filename` again would replace the original
# accented names with already-stripped ones. Only add the columns when they
# are missing, placing each right after the column it preserves.
if 'filename_accented' not in landmarks_metadata.columns:
    landmarks_metadata.insert(
        landmarks_metadata.columns.get_loc('filename') + 1,
        'filename_accented',
        landmarks_metadata['filename'],
    )
if 'label_accented' not in landmarks_metadata.columns:
    landmarks_metadata.insert(
        landmarks_metadata.columns.get_loc('label') + 1,
        'label_accented',
        landmarks_metadata['label'],
    )

# Always derive the accent-free names from the preserved originals.
landmarks_metadata['filename'] = landmarks_metadata['filename_accented'].apply(strip_accents)
landmarks_metadata['label'] = landmarks_metadata['label_accented'].apply(strip_accents)

# Flag rows whose original filename contained an accent.
accented_flags = landmarks_metadata['filename_accented'].apply(detect_accents)
if 'accented' in landmarks_metadata.columns:
    landmarks_metadata['accented'] = accented_flags
else:
    landmarks_metadata.insert(
        landmarks_metadata.columns.get_loc('label_accented') + 1,
        'accented',
        accented_flags,
    )

In [36]:
# Sanity check: number of files with and without accents in their original name.
landmarks_metadata['accented'].value_counts()

accented
False    120
True      30
Name: count, dtype: int64

In [37]:
# Overwrite the landmarks metadata CSV in place with the accent-free names
# and the added columns.
landmarks_metadata_csv = os.path.join(
    preprocessed_folder,
    f"landmarks_metadata_{preprocessing_version}.csv",
)
landmarks_metadata.to_csv(landmarks_metadata_csv, index=False)

Rename the `.npy` and `.json` files that are created by the preprocessing, and used in the modelling

In [10]:
def _swap_extension(filename, new_extension):
    """Return `filename` with only its final extension replaced by `new_extension`."""
    return os.path.splitext(filename)[0] + new_extension


def _rename_once(old_path, new_path):
    """Rename `old_path` to `new_path`, tolerating a rename that already happened.

    Raises FileNotFoundError if neither the old nor the new file exists.
    """
    if old_path == new_path:
        return
    if os.path.exists(old_path):
        os.rename(old_path, new_path)
    elif not os.path.exists(new_path):
        raise FileNotFoundError(f"Neither {old_path} nor {new_path} exists")
    # Otherwise the file was already renamed by a previous (partial) run.


for label, df in landmarks_metadata.groupby('label'):
    # Only rows whose original filename had an accent need renaming. Checking
    # every row (not just the first of the group) avoids skipping files.
    accented_rows = df[df['accented'].astype(bool)]
    if accented_rows.empty:
        continue
    print(f"{accented_rows.iloc[0].label_accented} -> {label} for {len(accented_rows)} sets of files")

    for _, row in accented_rows.iterrows():
        # npy file
        _rename_once(
            os.path.join(landmarks_folder, _swap_extension(row.filename_accented, '.npy')),
            os.path.join(landmarks_folder, _swap_extension(row.filename, '.npy')),
        )

        # json file
        _rename_once(
            os.path.join(landmarks_metadata_json_folder, _swap_extension(row.filename_accented, '.json')),
            os.path.join(landmarks_metadata_json_folder, _swap_extension(row.filename, '.json')),
        )

aniversário -> aniversario for 6 sets of files
bebê -> bebe for 6 sets of files
cabeça -> cabeca for 6 sets of files
café -> cafe for 6 sets of files
família -> familia for 6 sets of files


Edit the `.json` files to update the metadata to be consistent with the csv

In [33]:
# Columns of the CSV that must be mirrored into each JSON file's metadata.
csv_metadata_columns = ['filename', 'filename_accented', 'label', 'label_accented', 'accented']

for _, row in landmarks_metadata.iterrows():
    # Replace only the final extension; a plain substring replace would also
    # rewrite any other "mp4" occurring inside the name.
    json_path = os.path.join(
        landmarks_metadata_json_folder,
        os.path.splitext(row.filename)[0] + '.json',
    )
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Select the CSV fields by name rather than by position, and make sure
    # the flag is a plain bool so it is JSON-serializable.
    new_json_metadata = row[csv_metadata_columns].to_dict()
    new_json_metadata['accented'] = bool(new_json_metadata['accented'])

    # Keep every existing metadata entry that the CSV fields do not replace.
    # NOTE(review): the previous version dropped the first two existing keys
    # by position, presumably `filename` and `label` — confirm.
    old_json_metadata = {
        key: value
        for key, value in data['metadata'].items()
        if key not in new_json_metadata
    }
    data['metadata'] = {**new_json_metadata, **old_json_metadata}

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)