In [None]:
# Importing the required libraries

import os
from pathlib import Path
import glob
import re
import numpy as np
import pandas as pd
import h5py
import torch
import librosa
import ast
import string
import zipfile
from tqdm.notebook import tqdm
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy.stats import pearsonr
import cv2
import nibabel as nib
from nilearn import plotting
from nilearn.maskers import NiftiLabelsMasker
import ipywidgets as widgets
from ipywidgets import VBox, Dropdown, Button
from IPython.display import Video, display, clear_output
from moviepy.editor import VideoFileClip
from transformers import BertTokenizer, BertModel
from torchvision.transforms import Compose, Lambda, CenterCrop
from torchvision.models.feature_extraction import create_feature_extractor
from pytorchvideo.transforms import Normalize, UniformTemporalSubsample, ShortSideScale

In [2]:
def load_mkv_file(movie_path):
    """
    Load video and audio data from the given .mkv movie file, print related
    information, and display the first 20 seconds of the movie.

    Parameters
    ----------
    movie_path : str
        Path to the .mkv movie file.

    Returns
    -------
    None
        The function only prints information and displays a clip. It returns
        early (after printing an error) when the movie cannot be opened or
        when no valid frame rate is reported.

    """

    # Read the .mkv file
    cap = cv2.VideoCapture(movie_path)
    try:
        if not cap.isOpened():
            print("Error: Could not open movie.")
            return

        # Get video information
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        video_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        video_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        video_total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        # Release the video object on every path, including the early return
        cap.release()

    # Without a positive frame rate the duration cannot be computed
    if video_fps <= 0:
        print("Error: Could not determine the movie frame rate.")
        return

    video_duration = video_total_frames / video_fps
    video_duration_minutes = video_duration / 60

    # Print video information
    print(">>> Video Information <<<")
    print(f"Video FPS: {video_fps}")
    print(f"Video Resolution: {video_width}x{video_height}")
    print(f"Total Frames: {video_total_frames}")
    print(f"Video Duration: {video_duration:.2f} seconds or {video_duration_minutes:.2f} minutes")

    clip = VideoFileClip(movie_path)
    try:
        # Audio information (the movie may not contain an audio track)
        audio = clip.audio
        print("\n>>> Audio Information <<<")
        if audio is None:
            print("No audio track found.")
        else:
            print(f"Audio Duration: {audio.duration:.2f} seconds")
            print(f"Audio FPS (Sample Rate): {audio.fps} Hz")

        # Extract and display the first 20 seconds of the video
        output_video_path = 'first_20_seconds.mp4'
        video_segment = clip.subclip(0, min(20, video_duration))
        print("\nCreating clip of the first 20 seconds of the video...")
        video_segment.write_videofile(output_video_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)

        # Display the video in the notebook
        display(Video(output_video_path, embed=True, width=640, height=480))
    finally:
        # Close the clip so its underlying readers are not leaked
        clip.close()


In [3]:
# Load the .mkv file for the first half of Friends season 1, episode 1, and
# print its video/audio information.
# NOTE(review): this is an absolute, machine-specific path; adjust it to the
# local dataset location before running.
movie_path = r"C:\Projects\algonauts 2025 data\algonauts_2025.competitors\stimuli\movies\friends\s1\friends_s01e01a.mkv"
load_mkv_file(movie_path)

>>> Video Information <<<
Video FPS: 29.968454258675077
Video Resolution: 720x480
Total Frames: 26412
Video Duration: 881.33 seconds or 14.69 minutes

>>> Audio Information <<<
Audio Duration: 881.33 seconds
Audio FPS (Sample Rate): 44100 Hz

Creating clip of the first 20 seconds of the video...


In [4]:
def load_tsv_file(transcript_path):
    """
    Load and visualize language transcript data from the given .TSV file.

    The first (up to) 20 rows are displayed, followed by the overall number
    of rows and columns of the transcript.

    Parameters
    ----------
    transcript_path : str
        Path to the .tsv transcript file.

    """

    # Load the .tsv into a pandas DataFrame
    transcript_df = pd.read_csv(transcript_path, sep='\t')

    # Select the first 20 rows (chunks); shorter transcripts yield fewer rows
    sample_transcript_data = transcript_df.iloc[:20]

    # Display the selected rows (chunks). Leading rows can be empty when no
    # words are spoken at the beginning of the episode.
    # The label reports the index of the last row actually shown, so it stays
    # correct for the 20-row preview (rows 0 to 19) and for short transcripts.
    last_row = max(len(sample_transcript_data) - 1, 0)
    print(f"Transcript data (Rows 0 to {last_row}):")
    display(sample_transcript_data)

    # Print other transcript info
    print(f"\nTranscript has {transcript_df.shape[0]} rows (chunks of 1.49 seconds) and {transcript_df.shape[1]} columns.")


In [5]:
# Load the .tsv transcript matching the movie above and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; adjust it to the
# local dataset location before running.
transcript_path = r"C:\Projects\algonauts 2025 data\algonauts_2025.competitors\stimuli\transcripts\friends\s1\friends_s01e01a.tsv"
load_tsv_file(transcript_path)

Transcript data (Rows 0 to 20):


Unnamed: 0,text_per_tr,words_per_tr,onsets_per_tr,durations_per_tr
0,,[],[],[]
1,,[],[],[]
2,,[],[],[]
3,,[],[],[]
4,,[],[],[]
5,,[],[],[]
6,,[],[],[]
7,,[],[],[]
8,,[],[],[]
9,,[],[],[]



Transcript has 591 rows (chunks of 1.49 seconds) and 4 columns.


In [6]:
def load_transcript(transcript_path):
    """
    Read a tab-separated transcript file into a pandas DataFrame.

    Parameters
    ----------
    transcript_path : str
        Path to the .tsv transcript file.

    Returns
    -------
    DataFrame
        The parsed transcript, one row per line of the file.

    """
    return pd.read_csv(transcript_path, sep='\t')


def get_movie_info(movie_path):
    """
    Extracts the frame rate (FPS) and total duration of a movie.

    Parameters
    ----------
    movie_path : str
        Path to the .mkv movie file.

    Returns
    -------
    fps : float
        Frame rate reported by OpenCV for the movie.
    duration : float
        Movie duration in seconds, computed as frame count divided by FPS.

    Raises
    ------
    OSError
        If the movie file cannot be opened.
    ValueError
        If the movie reports a non-positive frame rate, which would make the
        duration undefined.

    """

    cap = cv2.VideoCapture(movie_path)
    try:
        if not cap.isOpened():
            raise OSError(f"Could not open movie file: {movie_path}")
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    finally:
        # Always free the capture handle, even when an error is raised
        cap.release()

    # Fail with a clear message instead of a bare ZeroDivisionError
    if fps <= 0:
        raise ValueError(f"Movie reports an invalid frame rate ({fps}): {movie_path}")

    return fps, frame_count / fps


def split_movie_into_chunks(movie_path, chunk_duration=1.49):
    """
    Divides a video into fixed-duration chunks.

    Parameters
    ----------
    movie_path : str
        Path to the .mkv movie file.
    chunk_duration : float, optional
        Duration of each chunk in seconds (default is 1.49).

    Returns
    -------
    chunks : list
        List of (start_time, end_time) tuples in seconds. The last chunk is
        shortened so that it ends at the movie duration.

    Raises
    ------
    ValueError
        If chunk_duration is not positive (the loop would never terminate).

    """

    if chunk_duration <= 0:
        raise ValueError("chunk_duration must be a positive number of seconds.")

    _, video_duration = get_movie_info(movie_path)
    chunks = []

    # Derive each onset from the chunk number rather than by repeated
    # addition, so floating-point error does not accumulate over hundreds of
    # chunks (which could otherwise append a spurious sliver-sized chunk).
    chunk_number = 0
    start_time = 0.0
    while start_time < video_duration:
        end_time = min(start_time + chunk_duration, video_duration)
        chunks.append((start_time, end_time))
        chunk_number += 1
        start_time = chunk_number * chunk_duration
    return chunks

def extract_movie_segment_with_sound(movie_path, start_time, end_time,
    output_path='output_segment.mp4'):
    """
    Extracts a specific segment of a video with sound and saves it.

    Parameters
    ----------
    movie_path : str
        Path to the .mkv movie file.
    start_time : float
        Start time of the segment in seconds.
    end_time : float
        End time of the segment in seconds.
    output_path : str, optional
        Path to save the output segment (default is 'output_segment.mp4').

    Returns
    -------
    output_path : str
        Path of the written segment file.

    """

    clip = VideoFileClip(movie_path)
    try:
        # Create movie segment
        movie_segment = clip.subclip(start_time, end_time)
        print(f"\nWriting movie file from {start_time}s until {end_time}s")

        # Write video file
        movie_segment.write_videofile(output_path, codec="libx264",
            audio_codec="aac", verbose=False, logger=None)
    finally:
        # This function runs on every dropdown selection; close the source
        # clip so its readers are not leaked each time.
        clip.close()
    return output_path


def display_transcript_and_movie(chunk_index, transcript_df, chunks,
    movie_path):
    """
    Show the transcript details and the movie segment of one stimulus chunk.

    Parameters
    ----------
    chunk_index : int
        Index of the selected chunk.
    transcript_df : DataFrame
        DataFrame containing transcript data.
    chunks : list
        List of (start_time, end_time) tuples for video chunks.
    movie_path : str
        Path to the .mkv movie file.

    """
    # Time window covered by the requested chunk
    start_time, end_time = chunks[chunk_index]

    # The transcript may have fewer rows than there are movie chunks
    row = None
    if chunk_index < len(transcript_df):
        row = transcript_df.iloc[chunk_index]

    # Chunk numbers are reported 1-based
    print(f"\nChunk number: {chunk_index + 1}")

    # Print the transcript fields, or a placeholder when nothing was spoken
    if row is None or pd.isna(row['text_per_tr']):
        print("<No dialogue in this scene>")
    else:
        print(f"\nText: {row['text_per_tr']}")
        print(f"Words: {row['words_per_tr']}")
        print(f"Onsets: {row.get('onsets_per_tr', 'N/A')}")
        print(f"Durations: {row.get('durations_per_tr', 'N/A')}")

    # Cut the matching movie segment and embed it in the notebook
    segment_path = extract_movie_segment_with_sound(movie_path, start_time,
        end_time)
    display(Video(segment_path, embed=True, width=640, height=480))


def create_dropdown_by_text(transcript_df):
    """
    Build a dropdown widget whose entries are the transcript chunks.

    Each option is labelled with the chunk's text (or a placeholder when the
    chunk has no text) and carries the row index as its value.

    Parameters
    ----------
    transcript_df : DataFrame
        DataFrame containing transcript data.

    """
    placeholder = "<No dialogue in this scene>"

    # One (label, row index) pair per transcript row; NaN text gets the
    # placeholder label
    options = [
        (row['text_per_tr'] if pd.notna(row['text_per_tr']) else placeholder, i)
        for i, row in transcript_df.iterrows()
    ]
    return widgets.Dropdown(options=options, description='Select scene:')


def interface_display_transcript_and_movie(movie_path, transcript_path):
    """
    Interactive interface to browse aligned movie and transcript chunks.

    A dropdown lists the transcript chunks; selecting one shows its
    transcript details and the matching movie segment.

    Parameters
    ----------
    movie_path : str
        Path to the .mkv movie file.
    transcript_path : str
        Path to the transcript file (.tsv).

    """

    # Transcript rows and movie chunks (default chunk duration of 1.49 s)
    transcript_df = load_transcript(transcript_path)
    chunks = split_movie_into_chunks(movie_path)

    # Selector listing the transcript text, plus the area it renders into
    scene_selector = create_dropdown_by_text(transcript_df)
    output_area = widgets.Output()
    display(scene_selector, output_area)

    def _render_selected_chunk(change):
        # Replace whatever was shown for the previously selected chunk
        with output_area:
            output_area.clear_output()
            display_transcript_and_movie(scene_selector.value, transcript_df,
                chunks, movie_path)

    # Re-render whenever the selected value changes
    scene_selector.observe(_render_selected_chunk, names='value')

In [7]:
# Align the .mkv movie and .tsv language transcript: pick a chunk from the
# dropdown to see its transcript details and the matching movie segment
interface_display_transcript_and_movie(movie_path, transcript_path)

Dropdown(description='Select scene:', options=(('<No dialogue in this scene>', 0), ('<No dialogue in this scen…

Output()

Load the fMRI time series of one of the four subjects, for either Friends or Movie10, and visualize their content.

In [8]:
# List the available subjects based on folder names
def list_subjects(fmri_dir):
    """Return the sorted entries of `fmri_dir` whose name starts with 'sub-'."""
    subject_names = [entry for entry in os.listdir(fmri_dir)
                     if entry.startswith('sub-')]
    subject_names.sort()
    return subject_names

# Function to explore HDF5 file structure and organize datasets by season/movie
def explore_h5_file(file_path, selected_dataset):
    """
    Group the datasets of an HDF5 file by season (Friends) or movie (Movie10).

    Parameters
    ----------
    file_path : str
        Path to the HDF5 file.
    selected_dataset : str
        Either 'Friends' or 'Movie10'; selects how the grouping key is
        derived from each dataset name.

    Returns
    -------
    season_movie_dict : dict
        Maps each season/movie key to a list of "<name> (Shape: <shape>)"
        strings, one per dataset found at the top level of the file.

    Raises
    ------
    ValueError
        If selected_dataset is not one of the supported options.

    """
    # How to turn the task label (the part after 'task-') into a grouping key
    key_extractors = {
        'Friends': lambda task_label: task_label[:3],   # Extract season (e.g., 's01')
        'Movie10': lambda task_label: task_label[:-2],  # Extract movie (e.g., 'bourne')
    }

    # Reject unknown options up front; otherwise the grouping key would be
    # undefined (or silently reused from a previous iteration).
    if selected_dataset not in key_extractors:
        raise ValueError(
            f"Unknown dataset '{selected_dataset}'; expected one of {sorted(key_extractors)}."
        )
    extract_key = key_extractors[selected_dataset]

    season_movie_dict = {}
    with h5py.File(file_path, 'r') as h5_file:
        for name, obj in h5_file.items():
            if not isinstance(obj, h5py.Dataset):
                continue
            # Names are expected to look like '<session>_task-<label>'
            task_label = name.split('_')[1].split('-')[1]
            season_movie = extract_key(task_label)
            season_movie_dict.setdefault(season_movie, []).append(f"{name} (Shape: {obj.shape})")
    return season_movie_dict

# Function to display datasets in a DataFrame
def display_datasets_in_table(season_dict):
    """
    Display the grouped dataset names as a table with one column per group.

    Parameters
    ----------
    season_dict : dict
        Maps a season/movie key to a list of dataset description strings.
        Columns are ordered by key; shorter columns are padded with empty
        strings so that all columns have the same length.

    """
    # Nothing to tabulate (max() over an empty sequence would raise)
    if not season_dict:
        print("No datasets found.")
        return

    max_len = max(len(v) for v in season_dict.values())
    df = pd.DataFrame({k: list(v) + [''] * (max_len - len(v)) for k, v in sorted(season_dict.items())})
    display(df)

# Create subject and dataset selector widget
def create_subject_selector(fmri_dir):
    """
    Show dropdowns for subject and dataset plus a button that lists the
    contents of the matching HDF5 file found under `fmri_dir`.
    """
    subject_dropdown = Dropdown(options=list_subjects(fmri_dir), description='Select Subject:')
    dataset_dropdown = Dropdown(options=['Friends', 'Movie10'], description='Select Dataset:')
    button = Button(description="Explore File", button_style='primary')

    def on_button_click(b):
        # Redraw the controls so earlier results are replaced, not appended
        clear_output(wait=True)
        display(VBox([subject_dropdown, dataset_dropdown, button]))

        selected_subject = subject_dropdown.value
        selected_dataset = dataset_dropdown.value

        # The two datasets use different file naming patterns
        if selected_dataset == 'Friends':
            h5_file_name = f"{selected_subject}_task-friends_space-MNI152NLin2009cAsym_atlas-Schaefer18_parcel-1000Par7Net_desc-s123456_bold.h5"
        elif selected_dataset == 'Movie10':
            h5_file_name = f"{selected_subject}_task-movie10_space-MNI152NLin2009cAsym_atlas-Schaefer18_parcel-1000Par7Net_bold.h5"
        h5_file_path = os.path.join(fmri_dir, selected_subject, 'func', h5_file_name)

        if not os.path.exists(h5_file_path):
            print("Error: HDF5 file not found.")
            return

        display_datasets_in_table(explore_h5_file(h5_file_path, selected_dataset))

    button.on_click(on_button_click)
    display(VBox([subject_dropdown, dataset_dropdown, button]))

# Base directory for fMRI data (expected to contain one 'sub-*' folder per
# subject).
# NOTE(review): this is an absolute, machine-specific path; adjust it to the
# local dataset location before running.
fmri_dir = r"C:\Projects\algonauts 2025 data\algonauts_2025.competitors\fmri/"

# Run the subject selector widget
create_subject_selector(fmri_dir)


VBox(children=(Dropdown(description='Select Subject:', options=('sub-01', 'sub-02', 'sub-03', 'sub-05'), value…

Unnamed: 0,s01,s02,s03,s04,s05,s06
0,"ses-001_task-s01e02a (Shape: (482, 1000))","ses-010_task-s02e01a (Shape: (477, 1000))","ses-019_task-s03e01a (Shape: (491, 1000))","ses-027_task-s04e01a (Shape: (468, 1000))","ses-036_task-s05e01a (Shape: (439, 1000))","ses-052_task-s06e01a (Shape: (465, 1000))"
1,"ses-001_task-s01e02b (Shape: (482, 1000))","ses-010_task-s02e01b (Shape: (477, 1000))","ses-019_task-s03e01b (Shape: (491, 1000))","ses-027_task-s04e01b (Shape: (468, 1000))","ses-036_task-s05e01b (Shape: (474, 1000))","ses-052_task-s06e01b (Shape: (499, 1000))"
2,"ses-001_task-s01e03a (Shape: (472, 1000))","ses-011_task-s02e02a (Shape: (450, 1000))","ses-019_task-s03e02a (Shape: (475, 1000))","ses-027_task-s04e02a (Shape: (478, 1000))","ses-036_task-s05e02a (Shape: (495, 1000))","ses-053_task-s06e02a (Shape: (453, 1000))"
3,"ses-001_task-s01e03b (Shape: (472, 1000))","ses-011_task-s02e02b (Shape: (450, 1000))","ses-019_task-s03e02b (Shape: (475, 1000))","ses-027_task-s04e02b (Shape: (478, 1000))","ses-036_task-s05e02b (Shape: (531, 1000))","ses-053_task-s06e02b (Shape: (487, 1000))"
4,"ses-002_task-s01e04a (Shape: (503, 1000))","ses-011_task-s02e03a (Shape: (464, 1000))","ses-019_task-s03e03a (Shape: (454, 1000))","ses-028_task-s04e03a (Shape: (445, 1000))","ses-037_task-s05e03a (Shape: (444, 1000))","ses-053_task-s06e03a (Shape: (439, 1000))"
5,"ses-002_task-s01e04b (Shape: (503, 1000))","ses-011_task-s02e03b (Shape: (464, 1000))","ses-019_task-s03e03b (Shape: (454, 1000))","ses-028_task-s04e03b (Shape: (445, 1000))","ses-037_task-s05e03b (Shape: (474, 1000))","ses-053_task-s06e03b (Shape: (473, 1000))"
6,"ses-002_task-s01e05a (Shape: (468, 1000))","ses-011_task-s02e04a (Shape: (454, 1000))","ses-020_task-s03e04a (Shape: (473, 1000))","ses-028_task-s04e04a (Shape: (453, 1000))","ses-037_task-s05e04a (Shape: (488, 1000))","ses-053_task-s06e04a (Shape: (439, 1000))"
7,"ses-002_task-s01e05b (Shape: (468, 1000))","ses-011_task-s02e04b (Shape: (454, 1000))","ses-020_task-s03e04b (Shape: (473, 1000))","ses-028_task-s04e04b (Shape: (453, 1000))","ses-037_task-s05e04b (Shape: (524, 1000))","ses-054_task-s06e04b (Shape: (474, 1000))"
8,"ses-003_task-s01e01a (Shape: (592, 1000))","ses-011_task-s02e05a (Shape: (455, 1000))","ses-020_task-s03e05a (Shape: (484, 1000))","ses-028_task-s04e05a (Shape: (471, 1000))","ses-041_task-s05e05a (Shape: (486, 1000))","ses-054_task-s06e05a (Shape: (452, 1000))"
9,"ses-003_task-s01e01b (Shape: (592, 1000))","ses-011_task-s02e05b (Shape: (455, 1000))","ses-020_task-s03e05b (Shape: (484, 1000))","ses-028_task-s04e05b (Shape: (471, 1000))","ses-041_task-s05e05b (Shape: (521, 1000))","ses-054_task-s06e05b (Shape: (486, 1000))"


In [9]:
def plot_fmri_on_brain(chunk_index, fmri_file_path, atlas_path, dataset_name,
    hrf_delay):
    """
    Map fMRI responses to brain parcels and plot it on a glass brain.

    Parameters
    ----------
    chunk_index : int
        Index of the selected stimulus chunk, used (together with hrf_delay)
        to determine the fMRI sample.
    fmri_file_path : str
        Path to the HDF5 file containing fMRI data.
    atlas_path : str
        Path to the atlas NIfTI file.
    dataset_name : str
        Name of the dataset inside the HDF5 file.
    hrf_delay : int
        fMRI detects the BOLD (Blood Oxygen Level Dependent) response, a signal
        that reflects changes in blood oxygenation levels in response to
        activity in the brain. Blood flow increases to a given brain region in
        response to its activity. This vascular response, which follows the
        hemodynamic response function (HRF), takes time. Typically, the HRF
        peaks around 5–6 seconds after a neural event: this delay reflects the
        time needed for blood oxygenation changes to propagate and for the fMRI
        signal to capture them. Therefore, this parameter introduces a delay
        between stimulus chunks and fMRI samples for a better correspondence
        between input stimuli and the brain response. For example, with a
        hrf_delay of 3, if the stimulus chunk of interest is 17, the
        corresponding fMRI sample will be 20.

    Raises
    ------
    ValueError
        If the selected fMRI dataset contains no samples.

    """

    print(f"\nLoading fMRI file: {fmri_file_path}")

    # Load the atlas image
    atlas_img = nib.load(atlas_path)
    atlas_data = atlas_img.get_fdata()

    # Open the fMRI responses file, and extract the specific dataset
    with h5py.File(fmri_file_path, 'r') as f:
        print(f"Opening fMRI dataset: {dataset_name}")
        fmri_data = f[dataset_name][()]
        print(f"fMRI dataset shape: {fmri_data.shape}")

    n_samples = len(fmri_data)
    if n_samples == 0:
        raise ValueError(f"fMRI dataset '{dataset_name}' contains no samples.")

    # Extract the corresponding sample from the fMRI responses based on the
    # selected transcript chunk, and on the hrf_delay. Chunks close to the end
    # of the movie are clamped to the last available sample (index
    # n_samples - 1; index n_samples itself would be out of bounds).
    selected_sample = min(chunk_index + hrf_delay, n_samples - 1)
    fmri_sample_data = fmri_data[selected_sample]
    print(f"Extracting fMRI sample {selected_sample+1}.")

    # Map fMRI sample values to the brain parcels in the atlas. Atlas label k
    # (1-based) receives the value of parcel k-1; label 0 is left at zero.
    # The number of parcels is taken from the data instead of being hard-coded.
    output_data = np.zeros_like(atlas_data)
    for parcel_index, parcel_value in enumerate(fmri_sample_data):
        output_data[atlas_data == (parcel_index + 1)] = parcel_value

    # Create the output NIfTI image
    output_img = nib.Nifti1Image(output_data, affine=atlas_img.affine)

    # Plot the glass brain with the mapped fMRI data. The local name avoids
    # shadowing IPython's display() inside this function.
    glass_brain = plotting.plot_glass_brain(
        output_img,
        display_mode='lyrz',
        cmap='inferno',
        colorbar=True,
        plot_abs=False)
    # NOTE(review): `_cbar` is a private attribute of the returned display
    # object and may change between nilearn versions.
    colorbar = glass_brain._cbar
    colorbar.set_label("fMRI activity", rotation=90, labelpad=12, fontsize=12)
    plotting.show()

In [10]:
# Main interactive interface with brain visualization
def interface_display_transcript_movie_brain(movie_path, transcript_path,
    fmri_file_path, atlas_path, dataset_name, hrf_delay):
    """
    Interactive interface showing, for a selected chunk, the movie segment,
    the transcript details, and the fMRI response of the corresponding
    sample plotted on a glass brain.

    This code uses functions from Section 1.2.3.

    Parameters
    ----------
    movie_path : str
        Path to the .mkv movie file.
    transcript_path : str
        Path to the .tsv transcript file.
    fmri_file_path : str
        Path to the fMRI data file.
    atlas_path : str
        Path to the brain atlas file.
    dataset_name : str
        Name of the dataset to display fMRI data from.
    hrf_delay : int
        Delay, in fMRI samples, between a stimulus chunk and the fMRI sample
        shown for it. fMRI measures the BOLD (Blood Oxygen Level Dependent)
        response, which follows the hemodynamic response function (HRF) and
        typically peaks around 5–6 seconds after a neural event. For
        example, with a hrf_delay of 3, if the stimulus chunk of interest is
        17, the corresponding fMRI sample will be 20.

    """

    # Transcript rows and movie chunks (both helpers come from 1.2.3)
    transcript_df = load_transcript(transcript_path)
    chunks = split_movie_into_chunks(movie_path)

    # Selector listing the transcript text (from 1.2.3), plus the area where
    # video, transcript, and brain plot are rendered
    scene_selector = create_dropdown_by_text(transcript_df)
    output_area = widgets.Output()

    def _render_selected_chunk(change):
        with output_area:
            # Drop whatever was rendered for the previous selection
            output_area.clear_output()
            selected_index = scene_selector.value

            # Movie segment and transcript details (from 1.2.3)
            display_transcript_and_movie(selected_index, transcript_df,
                chunks, movie_path)

            # fMRI response for the delayed sample
            plot_fmri_on_brain(selected_index, fmri_file_path, atlas_path,
                dataset_name, hrf_delay)

    # Re-render on every selection change, then show the widgets
    scene_selector.observe(_render_selected_chunk, names='value')
    display(scene_selector, output_area)

In [None]:
# HRF delay parameter: number of fMRI samples by which the displayed brain
# response lags the selected stimulus chunk
hrf_delay = 3  #@param {type:"slider", min:0, max:10, step:1}

# Root folder of the local dataset copy.
# NOTE(review): this is an absolute, machine-specific path; adjust it to the
# local dataset location before running.
root_data_dir = r"C:\Projects\algonauts 2025 data"

# Define file paths and dataset name (subject sub-01, Friends season 1,
# episode 1, first half)
movie_path = root_data_dir + "/algonauts_2025.competitors/stimuli/movies/friends/s1/friends_s01e01a.mkv"
transcript_path = root_data_dir + "/algonauts_2025.competitors/stimuli/transcripts/friends/s1/friends_s01e01a.tsv"
fmri_file_path = root_data_dir + "/algonauts_2025.competitors/fmri/sub-01/func/sub-01_task-friends_space-MNI152NLin2009cAsym_atlas-Schaefer18_parcel-1000Par7Net_desc-s123456_bold.h5"
atlas_path = root_data_dir + "/algonauts_2025.competitors/fmri/sub-01/atlas/sub-01_space-MNI152NLin2009cAsym_atlas-Schaefer18_parcel-1000Par7Net_desc-dseg_parcellation.nii.gz"
dataset_name = "ses-003_task-s01e01a"

# Launch the interface: selecting a chunk in the dropdown shows the movie
# segment, the transcript details, and the matching fMRI sample
interface_display_transcript_movie_brain(movie_path, transcript_path,
    fmri_file_path, atlas_path, dataset_name, hrf_delay)

Dropdown(description='Select scene:', options=(('<No dialogue in this scene>', 0), ('<No dialogue in this scen…

Output()

Stimulus feature extraction

Module to extract visual, audio and language features from the multimodal movie stimuli (focusing on the first half of the first episode from the first season of Friends as an example), and reduce their dimensionality using principal component analysis (PCA). 

In [12]:
def define_frames_transform():
    """Build the preprocessing pipeline applied to the video frames.

    Note that this transform is specific to the slow_r50 model.
    """

    def _scale_to_unit_range(frames):
        # Divide the frame values by 255
        return frames/255.0

    steps = [
        UniformTemporalSubsample(8),
        Lambda(_scale_to_unit_range),
        Normalize([0.45, 0.45, 0.45], [0.225, 0.225, 0.225]),
        ShortSideScale(size=256),
        CenterCrop(256),
    ]
    return Compose(steps)

# Build the frame preprocessing pipeline once; it is passed to the visual
# feature extraction below
transform = define_frames_transform()

In [14]:
# NOTE(review): torch is already imported in the first (imports) cell, so
# this import is redundant there.
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
def get_vision_model(device):
    """
    Load the pre-trained slow_r50 video model and wrap it in a feature
    extractor that returns the activations of a single layer.

    Parameters
    ----------
    device : torch.device
        The device on which the model will run (i.e., 'cpu' or 'cuda').

    Returns
    -------
    feature_extractor : torch.nn.Module
        The feature extractor model, moved to `device` and set to evaluation
        mode.
    model_layer : str
        The layer from which visual features will be extracted.

    """

    # Layer whose output is used as the visual features
    model_layer = 'blocks.5.pool'

    # Fetch the pre-trained network through torch.hub
    slow_r50 = torch.hub.load('facebookresearch/pytorchvideo', 'slow_r50',
        pretrained=True)

    # Expose only the selected layer, then prepare the extractor for inference
    feature_extractor = create_feature_extractor(slow_r50,
        return_nodes=[model_layer])
    feature_extractor.to(device)
    feature_extractor.eval()

    return feature_extractor, model_layer

# Instantiate the slow_r50 feature extractor on the selected device
feature_extractor, model_layer = get_vision_model(device)

Downloading: "https://github.com/facebookresearch/pytorchvideo/zipball/main" to C:\Users\Pratik/.cache\torch\hub\main.zip
Downloading: "https://dl.fbaipublicfiles.com/pytorchvideo/model_zoo/kinetics/SLOW_8x8_R50.pyth" to C:\Users\Pratik/.cache\torch\hub\checkpoints\SLOW_8x8_R50.pyth
100%|██████████| 248M/248M [00:20<00:00, 12.4MB/s] 


In [16]:
def extract_visual_features(episode_path, tr, feature_extractor, model_layer,
    transform, device, save_dir_temp, save_dir_features):
    """
    Extract visual features from a movie using a pre-trained video model.

    The movie is cut into consecutive chunks of `tr` seconds. Each chunk is
    written to a temporary '.mp4' file, read back as frames, preprocessed
    with `transform`, and passed through `feature_extractor`; the flattened
    output of `model_layer` is the feature vector of that chunk.

    Parameters
    ----------
    episode_path : str
        Path to the movie file for which the visual features are extracted.
    tr : float
        Duration of each chunk, in seconds (aligned with the fMRI repetition
        time, or TR).
    feature_extractor : torch.nn.Module
        Pre-trained feature extractor model.
    model_layer : str
        The model layer from which the visual features are extracted.
    transform : torchvision.transforms.Compose
        Transformation pipeline for processing video frames.
    device : torch.device
        Device for computation ('cpu' or 'cuda').
    save_dir_temp : str
        Directory where the chunked movie clips are temporarily stored for
        feature extraction.
    save_dir_features : str
        Directory where the extracted visual features are saved. Currently
        unused, because the saving step below is commented out.

    Returns
    -------
    visual_features : numpy.ndarray
        float32 array with one row per movie chunk, containing the extracted
        visual features.

    """

    # Create the directory where the movie chunks are temporarily saved.
    # The same file is overwritten for every chunk.
    temp_dir = os.path.join(save_dir_temp, 'temp')
    os.makedirs(temp_dir, exist_ok=True)
    chunk_path = os.path.join(temp_dir, 'visual_chunk.mp4')

    # Empty features list
    visual_features = []

    clip = VideoFileClip(episode_path)
    try:
        # Get the onset time of each movie chunk (the last onset returned by
        # np.arange is dropped)
        start_times = [x for x in np.arange(0, clip.duration, tr)][:-1]

        # Loop over chunks
        for start in tqdm(start_times, desc="Extracting visual features"):

            # Divide the movie in chunks of length TR, and save the resulting
            # clips as '.mp4' files
            clip_chunk = clip.subclip(start, start+tr)
            clip_chunk.write_videofile(chunk_path, verbose=False, audio=False,
                logger=None)

            # Load the frames from the chunked movie clip, then close the
            # reader right away: one is opened per chunk, and leaving them
            # open leaks resources and keeps the temporary file in use while
            # the next chunk is written to the same path.
            video_clip = VideoFileClip(chunk_path)
            try:
                chunk_frames = [frame for frame in video_clip.iter_frames()]
            finally:
                video_clip.close()

            # Format the frames to shape:
            # (channels, num_frames, height, width)
            frames_array = np.transpose(np.array(chunk_frames), (3, 0, 1, 2))
            # Convert the video frames to tensor
            inputs = torch.from_numpy(frames_array).float()
            # Preprocess the video frames and add the batch dimension
            inputs = transform(inputs).unsqueeze(0).to(device)

            # Extract the visual features
            with torch.no_grad():
                preds = feature_extractor(inputs)
            visual_features.append(np.reshape(preds[model_layer].cpu().numpy(), -1))
    finally:
        # Close the episode clip even if a chunk fails
        clip.close()

    # Convert the visual features to float32
    visual_features = np.array(visual_features, dtype='float32')

    # Save the visual features
    #out_file_visual = os.path.join(
    #    save_dir_features, f'friends_s01e01a_features_visual.h5')
    #with h5py.File(out_file_visual, 'a' if Path(out_file_visual).exists() else 'w') as f:
    #    group = f.create_group("s01e01a")
    #    group.create_dataset('visual', data=visual_features, dtype=np.float32)
    #print(f"Visual features saved to {out_file_visual}")

    # Output
    return visual_features

In [17]:
# As an example, extract visual features for the first half of season 1,
# episode 1 of Friends
episode_path = root_data_dir + "/algonauts_2025.competitors/stimuli/movies/friends/s1/friends_s01e01a.mkv"

# Duration of each movie chunk, aligned with the fMRI TR of 1.49 seconds
tr = 1.49

# Saving directories: temporary movie chunks, and extracted features
save_dir_temp = "./visual_features"
save_dir_features = root_data_dir +  "/stimulus_features/raw/visual/"

# Execute visual feature extraction
visual_features = extract_visual_features(episode_path, tr, feature_extractor,
    model_layer, transform, device, save_dir_temp, save_dir_features)

Extracting visual features:   0%|          | 0/591 [00:00<?, ?it/s]

In [18]:
# Print the features shape (one row per movie chunk, one column per feature)
print("Visual features shape for 'friends_s01e01a.mkv':")
print(visual_features.shape)
print('(Movie samples × Visual features length)')

# Visualize the features for five movie chunks (chunks 20 to 24)
print("\nVisual feature vectors for 5 movie chunks:\n")
print(visual_features[20:25])

Visual features shape for 'friends_s01e01a.mkv':
(591, 8192)
(Movie samples × Visual features length)

Visual feature vectors for 5 movie chunks:

[[0.57793087 0.6128943  0.6525781  ... 0.04101982 0.03598054 0.0462507 ]
 [0.3488337  0.34398144 0.55811393 ... 0.01288053 0.01017576 0.01288053]
 [0.09742165 0.10592699 0.1143828  ... 0.01131412 0.05298466 0.05829995]
 [0.28679317 0.298879   0.512058   ... 0.09510279 0.06591963 0.06591963]
 [0.25319642 0.31075132 0.46866494 ... 0.01662401 0.01250965 0.01250965]]
