In [None]:
# Setup: Mount Google Drive and install required dependencies
# NOTE: this cell relies on Colab-only facilities (google.colab, `!` shell magics).
from google.colab import drive
drive.mount('/content/drive')

# Install required packages for EgoVLP
# Only timm is version-pinned (0.4.12); every other package resolves to the latest release.
!pip install decord
!pip install transformers ftfy regex tqdm
!pip install timm==0.4.12
!pip install av
!pip install ffmpeg-python

# Work from the project folder on Drive so that relative paths used by this
# notebook ('EgoVLP-main' below, './pretrained' in the extraction cell) resolve there.
import os
os.chdir('/content/drive/MyDrive/AML_Project/3_EgoVLP')

# Clone EgoVLP repository if not already present
# (the clone is persisted on Drive, so it is reused across sessions)
if not os.path.exists('EgoVLP-main'):
    !git clone https://github.com/showlab/EgoVLP.git EgoVLP-main

Mounted at /content/drive
Collecting decord
  Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl.metadata (422 bytes)
Downloading decord-0.6.0-py3-none-manylinux2010_x86_64.whl (13.6 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m13.6/13.6 MB[0m [31m123.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: decord
Successfully installed decord-0.6.0
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.8/44.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting timm==0.4.12
  Downloading timm-0.4.12-py3-none-any.whl.metadata (30 kB)
Downloading timm-0.4.12-py3-none-any.whl (3

# Part 3: Extending Baselines to New Feature Extraction Backbone - EgoVLP

## Overview

This section extends the CaptainCook4D baselines by integrating a new feature extraction backbone: **EgoVLP** (Egocentric Vision-Language Pre-training).

**Objectives:**
- Extract video features using the EgoVLP backbone
- Adapt the CaptainCook4D feature extraction pipeline for EgoVLP
- Generate 256-dimensional feature vectors at 1-second intervals
- Enable comparison of LSTM baseline with different feature modalities

**EgoVLP Architecture:**
- **Video Encoder**: SpaceTimeTransformer (16 frames sampled per second)
- **Text Encoder**: BERT-base-uncased
- **Feature Dimension**: 256
- **Training**: Pre-trained on egocentric video-text pairs

**Key Tasks:**
1. Download and set up the EgoVLP model and its dependencies
2. Extract video features per second from cooking videos
3. Save features as compressed numpy arrays (.npz)
4. Enable reproducibility and efficient batch processing

In [None]:
# Part 3: EgoVLP Feature Extraction Pipeline for CaptainCook4D

import sys
import os
import torch
import numpy as np
import glob
import shutil
import urllib.request
from tqdm import tqdm
from decord import VideoReader, cpu
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import CenterCropVideo, NormalizeVideo

# Path configuration
# Everything except TEMP_WORK_DIR lives under the project folder on Google Drive.
DRIVE_ROOT = '/content/drive/MyDrive/AML_Project/3_EgoVLP'
# Local clone of the EgoVLP repository (appended to sys.path by setup_environment)
REPO_PATH = os.path.join(DRIVE_ROOT, 'EgoVLP-main')
# Pre-trained EgoVLP checkpoint loaded by get_egovlp_model
CHECKPOINT_PATH = os.path.join(DRIVE_ROOT, 'checkpoints/egovlp.pth')
# Input videos (searched non-recursively by process_all_videos)
VIDEO_DIR = os.path.join(DRIVE_ROOT, 'videos')
# Output directory for the per-video .npz feature files
FEATURES_DIR = os.path.join(DRIVE_ROOT, 'features')
# Scratch directory outside Drive; each video is copied here before decoding
TEMP_WORK_DIR = '/content/temp_video_processing'

# --- Setup and Utilities ---
def ensure_pretrained_backbone():
    """
    Downloads ViT backbone weights (ImageNet) if not present.

    Required because EgoVLP/TimeSformer looks for this file statically in ./pretrained/
    (relative to the current working directory).

    The download is written to a temporary '<file>.part' path and only renamed
    to its final name once it has completed. An interrupted or failed download
    therefore never leaves a truncated file that a later run would mistake for
    valid weights.

    Raises:
        RuntimeError: If backbone download fails or connection is unavailable
                      (the original error is chained as __cause__)
    """
    backbone_dir = "pretrained"
    filename = "jx_vit_base_p16_224-80ecf9dd.pth"
    file_path = os.path.join(backbone_dir, filename)
    url = "https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth"

    os.makedirs(backbone_dir, exist_ok=True)

    if os.path.exists(file_path):
        print(f"‚úÖ ImageNet backbone found: {file_path}")
        return

    print(f"Backbone '{filename}' missing. Download in progress...")
    partial_path = file_path + ".part"
    try:
        # Use urllib with simple text progress bar
        with tqdm(unit='B', unit_scale=True, miniters=1, desc=filename) as t:
            def reporthook(blocknum, blocksize, totalsize):
                downloaded = blocknum * blocksize
                # totalsize is not positive when the server sends no Content-Length
                if totalsize > 0:
                    t.total = totalsize
                    # The last block is usually partial; do not overshoot the total
                    downloaded = min(downloaded, totalsize)
                t.update(downloaded - t.n)
            urllib.request.urlretrieve(url, partial_path, reporthook=reporthook)
        # Publish the file under its final name only after a complete download
        os.replace(partial_path, file_path)
        print("‚úÖ Backbone download completed.")
    except Exception as e:
        # Drop the incomplete download so it cannot be picked up later
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"Backbone download error: {e}")
        raise RuntimeError("Unable to download base ImageNet weights. Check your connection.") from e

def setup_environment():
    """
    Prepares the runtime for EgoVLP feature extraction.

    Steps, in order:
    - Appends the EgoVLP repository to sys.path (only if it is not already listed)
    - Creates the temporary video-processing directory (only if it does not exist)
    - Enables cuDNN benchmark mode when CUDA is available
    - Makes sure the ImageNet backbone weights are present
    """
    repo_already_importable = REPO_PATH in sys.path
    if not repo_already_importable:
        sys.path.append(REPO_PATH)

    scratch_dir_present = os.path.exists(TEMP_WORK_DIR)
    if not scratch_dir_present:
        os.makedirs(TEMP_WORK_DIR)

    # GPU optimization: only touched when a CUDA device is actually available
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.backends.cudnn.benchmark = True

    # The base weights must exist BEFORE the model is imported or loaded
    ensure_pretrained_backbone()

# Prepare sys.path, the scratch directory and the backbone weights before
# importing anything from the EgoVLP repository.
setup_environment()

try:
    from model.model import FrozenInTime
except ImportError as e:
    # Dynamic handling of missing dependencies
    missing_module = e.name  # may be None when the import system does not report a name
    print(f"Critical import error: {e}")

    # Module name reported by the ImportError -> pip package that provides it
    pip_packages = {
        'av': 'av',
        'ffmpeg': 'ffmpeg-python',
        'cv2': 'opencv-python',
    }

    # 'model.model' is resolved through REPO_PATH on sys.path; this is where it is expected
    repo_model_file = os.path.join(REPO_PATH, 'model', 'model.py')

    if missing_module in pip_packages:
        suggestion = f"Run: !pip install {pip_packages[missing_module]}"
    elif not os.path.isfile(repo_model_file):
        # The repository itself is missing or incomplete; pip cannot fix that
        suggestion = f"EgoVLP sources not found at {REPO_PATH}. Clone the repository there first."
    elif missing_module:
        suggestion = f"Try running: !pip install {missing_module}"
    else:
        suggestion = "See the import error printed above for details."

    # Chain the original error so the full traceback stays available
    raise RuntimeError(f"Missing critical dependency. {suggestion}") from e

# --- Model Loading ---
def get_egovlp_model(checkpoint_path, device='cuda'):
    """
    Initializes and loads the EgoVLP model.

    Creates a FrozenInTime model with:
    - SpaceTimeTransformer video encoder (num_frames=16)
    - bert-base-uncased text encoder
    - 256-dimensional projection layer

    The checkpoint path is validated BEFORE the model is constructed, so a
    wrong path fails immediately instead of after model initialization.

    Args:
        checkpoint_path: Path to the pre-trained EgoVLP checkpoint
        device: Device to load model on ('cuda' or 'cpu')

    Returns:
        model: Loaded EgoVLP model in evaluation mode

    Raises:
        FileNotFoundError: If checkpoint path does not exist
    """
    if not os.path.exists(checkpoint_path):
        raise FileNotFoundError(f"Checkpoint not found: {checkpoint_path}")

    print(f"Initializing model on {device}...")
    model = FrozenInTime(
        video_params={"model": "SpaceTimeTransformer", "pretrained": True, "num_frames": 16},
        text_params={"model": "bert-base-uncased", "pretrained": True},
        projection_dim=256,
        load_checkpoint=None
    )

    # NOTE(security): weights_only=False unpickles arbitrary Python objects.
    # Only load checkpoints obtained from a trusted source.
    checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)
    state_dict = checkpoint['state_dict'] if 'state_dict' in checkpoint else checkpoint

    # Strip only a LEADING 'module.' (the wrapper prefix); a plain str.replace
    # would also mangle keys that merely contain the substring, e.g. 'submodule.'.
    prefix = 'module.'
    new_state_dict = {
        (k[len(prefix):] if k.startswith(prefix) else k): v
        for k, v in state_dict.items()
    }

    # strict=False tolerates key mismatches; report them so a checkpoint that
    # matches few (or no) parameters does not go unnoticed.
    load_result = model.load_state_dict(new_state_dict, strict=False)
    missing_keys = list(getattr(load_result, 'missing_keys', []))
    unexpected_keys = list(getattr(load_result, 'unexpected_keys', []))
    if missing_keys or unexpected_keys:
        print(
            f"Warning: checkpoint loaded with {len(missing_keys)} missing "
            f"and {len(unexpected_keys)} unexpected keys."
        )

    model.to(device)
    model.eval()
    return model

# --- Preprocessing ---
def get_transform(input_size=224):
    """
    Builds the preprocessing pipeline applied to each video clip.

    The steps run in this order:
    1. Divide pixel values by 255.0
    2. Normalize with the ImageNet channel mean / std
    3. Center crop to (input_size, input_size)

    Args:
        input_size: Side length of the center crop (default: 224)

    Returns:
        Compose: The composed transform
    """
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]

    steps = [
        Lambda(lambda clip: clip / 255.0),
        NormalizeVideo(imagenet_mean, imagenet_std),
        CenterCropVideo(input_size),
    ]
    return Compose(steps)

# --- Feature Extraction ---
def _encode_clip_batch(model, clips, device):
    """Runs one gradient-free forward pass over a list of (T, C, H, W) clips and returns a NumPy array."""
    input_tensor = torch.stack(clips).to(device)
    with torch.no_grad():
        feat_batch = model({'video': input_tensor}, video_only=True)
    return feat_batch.cpu().numpy()


def extract_features_per_second(model, video_path, device, batch_size_inference=16, num_frames=16):
    """
    Extracts EgoVLP features second-by-second from a video.

    For each full second of the video:
    1. Sample `num_frames` frame indices evenly spaced within that second
       (indices repeat when the second holds fewer frames than requested)
    2. Apply preprocessing (see get_transform)
    3. Buffer the clip; every `batch_size_inference` clips are encoded together

    Args:
        model: Pre-loaded EgoVLP model, called as model({'video': x}, video_only=True)
        video_path: Path to video file
        device: Device to process on ('cuda' or 'cpu')
        batch_size_inference: Number of seconds to process in parallel (default: 16).
            Values below 1 are treated as 1.
        num_frames: Frames sampled per second (default: 16). Should match the
            `num_frames` the video encoder was configured with.

    Returns:
        np.ndarray: Features array of shape (num_seconds, feature_dim); feature_dim
                    is 256 for the model built by get_egovlp_model.
                    Returns None if the video is too short (< 1 second) or
                    reports a non-positive frame rate.
    """
    # Load video
    vr = VideoReader(video_path, ctx=cpu(0))
    fps = vr.get_avg_fps()
    total_frames = len(vr)

    # A missing/zero frame rate would otherwise raise ZeroDivisionError below
    if not fps or fps <= 0:
        return None

    duration_sec = int(total_frames / fps)
    if duration_sec == 0:
        return None  # Video too short (< 1 second)

    transform = get_transform()

    # A non-positive batch size would never trigger a flush and would buffer
    # the whole video in memory, so fall back to one clip per batch.
    flush_at = max(1, batch_size_inference)

    features_list = []
    batch_buffer = []

    for sec in range(duration_sec):
        # Frame window [start_frame, end_frame) covering the current second
        start_frame = int(sec * fps)
        end_frame = min(int((sec + 1) * fps), total_frames)  # avoid index out of bounds

        # Keep the window non-empty even for very low frame rates
        last_frame = max(start_frame, end_frame - 1)
        frame_indices = np.linspace(start_frame, last_frame, num_frames, dtype=int)

        raw_frames = vr.get_batch(frame_indices)  # (T, H, W, C)

        if isinstance(raw_frames, torch.Tensor):
            frames_tensor = raw_frames
        else:
            frames_tensor = torch.tensor(raw_frames.asnumpy())

        # (T, H, W, C) -> (C, T, H, W), the layout the video transforms operate on
        frames_tensor = frames_tensor.permute(3, 0, 1, 2).float()
        transformed_frames = transform(frames_tensor)

        # The model receives clips as (T, C, H, W); stacking adds the batch dimension
        transformed_frames = transformed_frames.permute(1, 0, 2, 3)
        batch_buffer.append(transformed_frames)

        # If buffer is full, perform inference
        if len(batch_buffer) >= flush_at:
            features_list.append(_encode_clip_batch(model, batch_buffer, device))
            batch_buffer = []

    # Process any remaining items in buffer
    if batch_buffer:
        features_list.append(_encode_clip_batch(model, batch_buffer, device))

    # Concatenate all batches: output (Total_Seconds, feature_dim)
    if features_list:
        return np.concatenate(features_list, axis=0)
    return np.array([])

def process_all_videos():
    """
    Main processing function: extracts EgoVLP features for all videos.

    Workflow:
    1. Initialize EgoVLP model on GPU/CPU (prints the error and returns if that fails)
    2. Discover all video files in VIDEO_DIR (*.mp4, *.MP4, *.avi, *.mov)
    3. For each video:
       - Skip if features already exist
       - Copy video to temporary local storage
       - Extract features second-by-second
       - Save features as a compressed .npz file (array key: 'features').
         The file is written under a temporary name and renamed when complete,
         so an interrupted run never leaves a truncated .npz that the skip
         logic would accept on the next run.
       - Clean up temporary files
    4. Report how many videos were processed, skipped, or yielded no features
    """
    import traceback  # local import: only used for error reporting in this function

    device = "cuda" if torch.cuda.is_available() else "cpu"

    try:
        model = get_egovlp_model(CHECKPOINT_PATH, device)
    except Exception as e:
        print(f"Critical model error: {e}")
        return

    os.makedirs(FEATURES_DIR, exist_ok=True)
    # Normally created by setup_environment(); ensured here so the copy below cannot fail on it
    os.makedirs(TEMP_WORK_DIR, exist_ok=True)

    video_files = []
    for ext in ['*.mp4', '*.MP4', '*.avi', '*.mov']:
        video_files.extend(glob.glob(os.path.join(VIDEO_DIR, ext)))
    # set(): the patterns can match the same file twice on case-insensitive filesystems
    video_files = sorted(set(video_files))

    print(f"Found {len(video_files)} videos to process.")

    count_success = 0
    count_skipped = 0
    videos_without_features = []

    # Progress bar with dynamic refresh
    pbar = tqdm(video_files, desc="Feature Extraction")

    for drive_video_path in pbar:
        video_name = os.path.basename(drive_video_path)
        feature_filename = f"{video_name}_1s_1s.npz"
        feature_save_path = os.path.join(FEATURES_DIR, feature_filename)

        # --- Skip logic ---
        if os.path.exists(feature_save_path):
            pbar.set_postfix_str(f"‚è© Skip: {video_name}")
            count_skipped += 1
            continue

        local_temp_path = os.path.join(TEMP_WORK_DIR, video_name)
        partial_save_path = feature_save_path + ".partial"

        try:
            pbar.set_postfix_str(f"üîÑ Processing: {video_name}")
            shutil.copy(drive_video_path, local_temp_path)

            features = extract_features_per_second(model, local_temp_path, device)

            if features is not None and len(features) > 0:
                # Writing through a file object keeps NumPy from appending '.npz' to the name
                with open(partial_save_path, 'wb') as partial_file:
                    np.savez_compressed(partial_file, features=features)
                # Publish under the final name only once the file is complete
                os.replace(partial_save_path, feature_save_path)
                count_success += 1
            else:
                print(f"\nNo features extracted for {video_name} (too short?)")
                videos_without_features.append(video_name)

        except Exception as e:
            print(f"\nError with {video_name}: {e}")
            traceback.print_exc()
            videos_without_features.append(video_name)
        finally:
            # Remove the local video copy and any unfinished feature file
            for leftover_path in (local_temp_path, partial_save_path):
                if os.path.exists(leftover_path):
                    os.remove(leftover_path)

    print(f"\n‚úÖ Completed. {count_success} new videos processed.")
    print(
        f"Skipped (features already present): {count_skipped}. "
        f"Without features (error or too short): {len(videos_without_features)}."
    )
    if videos_without_features:
        print("Videos without features: " + ", ".join(videos_without_features))

# Entry point: start the full extraction run when this code is executed directly.
if __name__ == "__main__":
    process_all_videos()



‚úÖ Backbone ImageNet trovato: pretrained/jx_vit_base_p16_224-80ecf9dd.pth
Inizializzazione modello su cuda...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

######USING ATTENTION STYLE:  frozen-in-time
‚úÖ Trovati 384 video da elaborare.


Estrazione Video: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 384/384 [17:03:44<00:00, 159.96s/it, üîÑ Processing: 9_8_360p_224.mp4]


‚úÖ Completato. 223 nuovi video elaborati.



