<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/xclip_video_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#X-clip video processing: https://arxiv.org/pdf/2207.07285

In [1]:
!pip -q install av

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m27.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [1]:
import os

# Set a custom Hugging Face home directory
os.environ['HF_HOME'] = '/content'

#Extracting Video Features:(8 frames by default)

In [2]:
import av
import torch
import numpy as np

from transformers import AutoProcessor, AutoModel
from huggingface_hub import hf_hub_download

np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 8 frames
indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
print('video size:', video.shape)

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = AutoModel.from_pretrained("microsoft/xclip-base-patch32")

inputs = processor(videos=list(video), return_tensors="pt")
pixel_values = inputs.pixel_values
pixel_values = torch.cat([pixel_values, pixel_values])

video_features = model.get_video_features(pixel_values=pixel_values)
print('extracted video features size:', video_features.shape)

video size: (8, 360, 640, 3)
vision_outputs[1]: torch.Size([16, 768])
vision_outputs: 2
vision_outputs[0]: torch.Size([16, 50, 768])
cls_features: torch.Size([2, 8, 512])
extracted video features size: torch.Size([2, 512])


In [23]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, XCLIPModel, XCLIPProcessor
model_name = "microsoft/xclip-base-patch32"
model = XCLIPModel.from_pretrained(model_name)
pixel_values = inputs.pixel_values
print('pixel_values:', pixel_values.shape)
video_features = model.get_video_features(pixel_values=pixel_values)
print('video_features:', video_features.shape)
# video_embeds = visual_encoder.vision_model(pixel_values=pixel_values).last_hidden_state

pixel_values: torch.Size([1, 8, 3, 224, 224])
video_features: torch.Size([1, 512])


In [17]:
pixel_values = inputs.pixel_values
pixel_values = torch.cat([pixel_values, pixel_values])
print('pixel_values:', pixel_values.shape)
video_features = model.get_video_features(pixel_values=pixel_values)
print('video_features:', video_features.shape)


pixel_values: torch.Size([2, 8, 3, 224, 224])
video_features: torch.Size([2, 512])


In [10]:
pixel_values = inputs.pixel_values
pixel_values = torch.cat([pixel_values, pixel_values])
print('pixel_values:', pixel_values.shape)
batch_size, num_frames, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(-1, num_channels, height, width)
print('pixel_values:', pixel_values.shape)

# output_attentions = None
# output_hidden_states = None
# return_dict = None
# config = model.vision_model.config
# output_attentions = output_attentions if output_attentions is not None else config.output_attentions
# output_hidden_states = (
#     output_hidden_states if output_hidden_states is not None else config.output_hidden_states
# )
# return_dict = return_dict if return_dict is not None else config.use_return_dict

# framewise_features = model.vision_model(pixel_values=pixel_values,
#             output_attentions=output_attentions,
#             output_hidden_states=output_hidden_states,
#             return_dict=return_dict,).last_hidden_state

framewise_features = model.vision_model(pixel_values=pixel_values)

print('framewise_features[0]:', framewise_features[0].shape)
print('framewise_features[1]:', framewise_features[1].shape)
video_embeds = model.visual_projection(framewise_features[1])
# video_embeds = framewise_features[1]
print('video_embeds:', video_embeds.shape)
cls_features = video_embeds.view(batch_size, num_frames, -1)
print('cls_features:', cls_features.shape)
mit_outputs = model.mit(cls_features)
print('mit_outputs[1]:', mit_outputs[1].shape)
print('mit_outputs[0]:', mit_outputs[0].shape)

pixel_values: torch.Size([2, 8, 3, 224, 224])
pixel_values: torch.Size([16, 3, 224, 224])
framewise_features[0]: torch.Size([16, 50, 768])
framewise_features[1]: torch.Size([16, 768])
video_embeds: torch.Size([16, 512])
cls_features: torch.Size([2, 8, 512])
mit_outputs[1]: torch.Size([2, 512])
mit_outputs[0]: torch.Size([2, 8, 512])


video_embeds: torch.Size([16, 50, 512])


In [None]:
proj = model_xclip.visual_projection
out = proj(last_hidden_state)
mit = model_xclip.mit
out = mit (out.permute(1,0,2))
out[0].shape, out[1].shape

In [None]:
model.vision_model.embeddings

XCLIPVisionEmbeddings(
  (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
  (position_embedding): Embedding(50, 768)
)

In [None]:
model

XCLIPModel(
  (text_model): XCLIPTextTransformer(
    (embeddings): XCLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): XCLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x XCLIPEncoderLayer(
          (self_attn): XCLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): XCLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps

#Extracting Frame-wise Feature (8 frames by default)

In [None]:
import av
import torch
import numpy as np

from transformers import AutoProcessor, XCLIPVisionModel
from huggingface_hub import hf_hub_download

np.random.seed(0)


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        frame_sample_rate (`int`): Sample every n-th frame.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS) https://huggingface.co/datasets/nielsr/video-demo/blob/main/eating_spaghetti.mp4
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 16 frames
indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
print('video size:', video.shape)

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = XCLIPVisionModel.from_pretrained("microsoft/xclip-base-patch32")

pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values

batch_size, num_frames, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(-1, num_channels, height, width)

outputs = model(pixel_values)
last_hidden_state = outputs.last_hidden_state

print('extracted frame wise features size:', last_hidden_state.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


eating_spaghetti.mp4:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

video size: (8, 360, 640, 3)


preprocessor_config.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/965 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/786M [00:00<?, ?B/s]

  return torch.tensor(value)


extracted frame wise features size: torch.Size([8, 50, 768])


#Extracting Frame-wise Feature (5 frames customized)

In [None]:
from transformers import XCLIPVisionConfig
from safetensors.torch import load_file

configuration = XCLIPVisionConfig()
configuration.num_frames = 5
# configuration.mit_num_attention_heads = 5


indices = sample_frame_indices(clip_len=5, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
print('video size:', video.shape)

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = XCLIPVisionModel(configuration)

state_dict = load_file('/content/hub/models--microsoft--xclip-base-patch32/snapshots/a2e27a78a2b5d802e894b8a1ef14f3a8ce490963/model.safetensors')
model.load_state_dict(state_dict, strict=False)

pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values

batch_size, num_frames, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(-1, num_channels, height, width)

outputs = model(pixel_values)
last_hidden_state = outputs.last_hidden_state

print('extracted frame wise features size:', last_hidden_state.shape)

video size: (5, 360, 640, 3)




extracted frame wise features size: torch.Size([5, 50, 768])


#Extracting Frame-wise Feature (2 frames customized)

In [None]:
from safetensors.torch import load_file
from transformers import AutoProcessor, XCLIPVisionModel, XCLIPVisionConfig

num_frames = 2

configuration = XCLIPVisionConfig()
configuration.num_frames = num_frames

indices = sample_frame_indices(clip_len=num_frames, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
print('video size:', video.shape)

processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
model = XCLIPVisionModel(configuration)

state_dict = load_file('/content/hub/models--microsoft--xclip-base-patch32/snapshots/a2e27a78a2b5d802e894b8a1ef14f3a8ce490963/model.safetensors')
model.load_state_dict(state_dict, strict=False)

pixel_values = processor(videos=list(video), return_tensors="pt").pixel_values
print('pixel_values1:', pixel_values.shape)
batch_size, num_frames, num_channels, height, width = pixel_values.shape
pixel_values = pixel_values.reshape(-1, num_channels, height, width)
print('pixel_values2:', pixel_values.shape)

outputs = model(pixel_values)
last_hidden_state = outputs.last_hidden_state

print('extracted frame wise features size:', last_hidden_state.shape)

video size: (2, 360, 640, 3)
pixel_values1: torch.Size([1, 2, 3, 224, 224])
pixel_values2: torch.Size([2, 3, 224, 224])
extracted frame wise features size: torch.Size([2, 50, 768])


In [None]:
model

XCLIPVisionModel(
  (vision_model): XCLIPVisionTransformer(
    (embeddings): XCLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
      (position_embedding): Embedding(50, 768)
    )
    (pre_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (encoder): XCLIPVisionEncoder(
      (layers): ModuleList(
        (0-11): 12 x XCLIPVisionEncoderLayer(
          (message_fc): Linear(in_features=768, out_features=768, bias=True)
          (message_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (message_attn): XCLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (drop_path): Identity()
          (self_attn): XCL

In [None]:
from transformers import AutoProcessor, XCLIPModel, XCLIPProcessor, XCLIPConfig
configuration = XCLIPConfig()
configuration.num_frames = 2
configuration.vision_config.num_frames = 2
configuration.vision_config.mit_num_attention_heads = 2
model_xclip = XCLIPModel(configuration)

In [None]:
proj = model_xclip.visual_projection
out = proj(last_hidden_state)
mit = model_xclip.mit
out = mit (out.permute(1,0,2))
out[0].shape, out[1].shape

(torch.Size([50, 2, 512]), torch.Size([50, 512]))

In [None]:
model_xclip.mit

XCLIPMultiframeIntegrationTransformer(
  (encoder): XCLIPEncoder(
    (layers): ModuleList(
      (0): XCLIPEncoderLayer(
        (self_attn): XCLIPAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=True)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): XCLIPMLP(
          (activation_fn): QuickGELUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
        )
        (layer_norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      )
    )
  )
)

In [None]:
from transformers import AutoProcessor, XCLIPModel, XCLIPProcessor
from transformers import XCLIPConfig

configuration = XCLIPConfig()
configuration.num_frames = 5
configuration.vision_config.num_frames = 5
configuration.vision_config.mit_num_attention_heads = 5

model_xclip= XCLIPModel.from_pretrained("microsoft/xclip-base-patch32")
# model_xclip= XCLIPModel(configuration)

out = model_xclip.visual_projection(last_hidden_state)
print('Final Video Feature:', out.shape)
# out = model_xclip.mit(out.permute(1,0,2))
# print('Final Video Feature:', out.shape)

Final Video Feature: torch.Size([2, 50, 512])


In [None]:
out[1].shape

torch.Size([50, 512])

#Extracting Video Feature (5 frames customized)

In [None]:
from transformers import XCLIPConfig
from safetensors.torch import load_file
from transformers import AutoProcessor, XCLIPModel, XCLIPProcessor

configuration = XCLIPConfig()
configuration.num_frames = 5
configuration.vision_config.num_frames = 5
configuration.vision_config.mit_num_attention_heads = 5


processor = AutoProcessor.from_pretrained("microsoft/xclip-base-patch32")
# processor = XCLIPProcessor(configuration)
model = XCLIPModel(configuration)
state_dict = load_file('/content/hub/models--microsoft--xclip-base-patch32/snapshots/a2e27a78a2b5d802e894b8a1ef14f3a8ce490963/model.safetensors')
model.load_state_dict(state_dict, strict=False)

indices = sample_frame_indices(clip_len=5, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
print('video size:', video.shape)

inputs = processor(videos=list(video), return_tensors="pt")

video_features = model.get_video_features(**inputs)
print('extracted video features size:', video_features.shape)

ValueError: embed_dim must be divisible by num_heads (got `embed_dim`: 512 and `num_heads`: 5).

In [None]:
model.visual_projection

Linear(in_features=768, out_features=512, bias=False)

In [None]:
configuration

XCLIPConfig {
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "xclip",
  "num_frames": 5,
  "projection_dim": 512,
  "prompt_alpha": 0.1,
  "prompt_attention_dropout": 0.0,
  "prompt_hidden_act": "quick_gelu",
  "prompt_layers": 2,
  "prompt_num_attention_heads": 8,
  "prompt_projection_dropout": 0.0,
  "text_config": {
    "model_type": "xclip_text_model"
  },
  "transformers_version": "4.44.2",
  "vision_config": {
    "model_type": "xclip_vision_model",
    "num_frames": 5
  }
}

In [None]:
model.vision_model.config

XCLIPVisionConfig {
  "attention_dropout": 0.0,
  "drop_path_rate": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "mit_hidden_size": 512,
  "mit_intermediate_size": 2048,
  "mit_num_attention_heads": 8,
  "mit_num_hidden_layers": 1,
  "model_type": "xclip_vision_model",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_frames": 8,
  "num_hidden_layers": 12,
  "patch_size": 32,
  "transformers_version": "4.44.2"
}

In [None]:
model.config

XCLIPConfig {
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "xclip",
  "num_frames": 5,
  "projection_dim": 512,
  "prompt_alpha": 0.1,
  "prompt_attention_dropout": 0.0,
  "prompt_hidden_act": "quick_gelu",
  "prompt_layers": 2,
  "prompt_num_attention_heads": 5,
  "prompt_projection_dropout": 0.0,
  "text_config": {
    "model_type": "xclip_text_model"
  },
  "transformers_version": "4.44.2",
  "vision_config": {
    "model_type": "xclip_vision_model"
  }
}