In [1]:
!pip install --upgrade transformers
!pip install pytorchvideo evaluate -q
!pip install --upgrade datasets
!pip install decord
!pip install -U huggingface_hub

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90

In [2]:
# Hugging Face Hub repository id of the IEMOCAP video dataset.
# NOTE(review): no other visible cell reads this name; the download cell
# defines its own `hf_dataset_identifier` with the same value.
video_dataset = "minoosh/IEMOCAP_videos"

In [3]:
import torch
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
import os
from getpass import getpass

from huggingface_hub import login

# Never embed an access token in the notebook: anyone who can read the file
# (or its git history) can use it. Read it from the HF_TOKEN environment
# variable and fall back to an interactive, non-echoing prompt.
# NOTE(review): the token that used to be hard-coded here had write
# permission and must be revoked in the Hugging Face account settings.
login(os.environ.get("HF_TOKEN") or getpass("Hugging Face token: "))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [5]:
from huggingface_hub import hf_hub_download
import pathlib

hf_dataset_identifier = "minoosh/IEMOCAP_videos"
filename = "IEMOCAP_videos"
file_path = hf_hub_download(
    repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset"
)

!tar xf {file_path}

dataset_root_path = "IEMOCAP_videos"
dataset_root_path = pathlib.Path(dataset_root_path)

Downloading IEMOCAP_videos:   0%|          | 0.00/1.13G [00:00<?, ?B/s]

In [6]:
def read_video_pyav(container, indices):
    """Decode the frames at `indices` from a PyAV container into one array.

    Args:
        container: An opened PyAV container. Only ``seek(0)`` and
            ``decode(video=0)`` are used.
        indices: Sequence (list or NumPy array) of frame positions. Its first
            and last elements delimit the range of frames that is scanned.
            A position that occurs several times yields that frame several
            times, so the result has one entry per requested index as long as
            every position exists in the stream.

    Returns:
        np.ndarray: the ``rgb24`` arrays of the selected frames stacked along
        a new first axis, in decode order.

    Raises:
        ValueError: if none of the requested frames could be decoded.
    """
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]

    # Count how often each position is requested. A dict lookup replaces the
    # linear `i in indices` scan per decoded frame and lets repeated indices
    # (e.g. from clips shorter than the requested length) repeat the frame
    # instead of being silently collapsed into a shorter clip.
    repeats = {}
    for index in np.asarray(indices).tolist():
        repeats[index] = repeats.get(index, 0) + 1

    frames = []
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index:
            frames.extend([frame] * repeats.get(i, 0))

    if not frames:
        raise ValueError(
            f"no frames decoded for indices in range [{start_index}, {end_index}]"
        )
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    """Choose `clip_len` frame indices from a randomly placed window.

    A window of ``int(clip_len * frame_sample_rate)`` frames is positioned at
    random inside the first `seg_len` frames (using NumPy's global random
    state) and `clip_len` evenly spaced indices are taken from it.

    When the video is not longer than the window, no random placement is
    possible; the indices are then spread evenly over the whole video and
    repeat if there are fewer than `clip_len` frames.

    Args:
        clip_len: Number of indices to return.
        frame_sample_rate: Stride between sampled frames inside the window.
        seg_len: Number of frames available in the video.

    Returns:
        np.ndarray: `clip_len` non-decreasing ``int64`` indices, each in
        ``[0, seg_len - 1]``.

    Raises:
        ValueError: if `seg_len` is smaller than 1.
    """
    if seg_len < 1:
        raise ValueError(f"seg_len must be at least 1, got {seg_len}")

    converted_len = int(clip_len * frame_sample_rate)
    if seg_len <= converted_len:
        # np.random.randint(converted_len, seg_len) would fail with
        # "low >= high" here, so cover the entire (short) video instead.
        return np.linspace(0, seg_len - 1, num=clip_len).astype(np.int64)

    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # The window is half-open, so pull the last position back inside it.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

In [7]:
import av
import numpy as np
from transformers import AutoFeatureExtractor, AutoModelForVideoClassification

def generate_and_save_output_layer(video_file_paths, model_name=None):
  """Classify every video and collect the softmax outputs and labels.

  Despite its name this function writes nothing to disk; it returns the
  results and leaves saving to the caller.

  Args:
    video_file_paths: Sized collection of paths to ``.mp4`` files. The label
      is the name of each file's parent directory and must be one of
      ``ang``, ``hap``, ``neu``, ``sad``.
      NOTE(review): assumes the layout ``<root>/SessionN/<emotion>/<turn>.mp4``,
      which is what the former fixed offsets (24:27 and 28:) select for the
      root ``IEMOCAP_videos`` - confirm against the extracted dataset.
    model_name: Hub id or local path of the checkpoint to load. Defaults to
      the module-level ``video_mdl`` so existing callers keep working.

  Returns:
    A tuple ``(video_outputs, num_videos, emotions)``:
      video_outputs: tensor on ``device`` with one row of 4 class
        probabilities per video, ordered by file name.
      num_videos: ``len(video_file_paths)``.
      emotions: list of integer labels in the same order as the rows.

  Raises:
    KeyError: if a parent directory name is not a known emotion.
  """
  if model_name is None:
    # Backward compatible default: the run cells set `video_mdl` globally.
    model_name = video_mdl

  feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
  model = AutoModelForVideoClassification.from_pretrained(model_name).to(device)

  # Order clips by file name (the turn id) independently of session/emotion
  # folders, so rows and labels come out in one deterministic order.
  video_samples = sorted(
      (pathlib.Path(item) for item in video_file_paths),
      key=lambda path: path.name,
  )

  emotion_dict = {'ang':0,
                  'hap':1,
                  'neu':2,
                  'sad':3}
  emotions = [emotion_dict[path.parent.name] for path in video_samples]

  per_video_outputs = []
  for video_sample in video_samples:
    # Close the container as soon as the frames are decoded; thousands of
    # clips are processed and each open container holds a file handle.
    with av.open(str(video_sample)) as container:
      # sample 16 frames
      indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
      video = read_video_pyav(container, indices)

    inputs = feature_extractor(list(video), return_tensors="pt")
    # The values are already tensors; moving them avoids the copy-construct
    # warning that torch.tensor(tensor) emits.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
      logits = model(**inputs).logits
      per_video_outputs.append(torch.softmax(logits, dim=1))

  # Concatenate once instead of re-allocating the result on every iteration.
  if per_video_outputs:
    video_outputs = torch.cat(per_video_outputs, 0)
  else:
    video_outputs = torch.empty((0,4)).to(device)

  return video_outputs, len(video_file_paths), emotions

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [10]:
def main(excluded_session):
  """Export classifier outputs and labels for one leave-one-session-out fold.

  Videos of the four remaining sessions form the train split and the videos
  of `excluded_session` form the test split. For each split the outputs of
  generate_and_save_output_layer are written to the current directory as
  ``{split}_video_{n}.pt`` and the labels as ``{split}_video_labels{n}.pt``,
  where ``n`` is `excluded_session`.

  Args:
    excluded_session: Session number (1-5) held out as the test split.

  Raises:
    ValueError: if `excluded_session` is not one of the five sessions.
  """
  sessions = [1, 2, 3, 4, 5]
  sessions.remove(excluded_session)

  # Gather the clips session by session, keeping the session order.
  train_paths = []
  for kept_session in sessions:
    train_paths += list(dataset_root_path.glob(f"Session{kept_session}/*/*.mp4"))

  test_paths = list(dataset_root_path.glob(f"Session{excluded_session}/*/*.mp4"))

  # Train split: run the model, report, then persist the outputs.
  train_probs, train_count, train_labels = generate_and_save_output_layer(train_paths)
  print(f"***Train set - (Session{excluded_session} excluded:)")
  print(f"Number of videos: {train_count}")
  torch.save(train_probs, f'train_video_{excluded_session}.pt')
  print(f"Number of saved output: {train_probs.size()}")

  # Test split: same steps for the held-out session.
  test_probs, test_count, test_labels = generate_and_save_output_layer(test_paths)
  print(f"***Test set - (Session{excluded_session})")
  print(f"Number of videos: {test_count}")
  torch.save(test_probs, f'test_video_{excluded_session}.pt')
  print(f"Number of saved output: {test_probs.size()}")

  # Labels are written last, after both splits have been processed.
  torch.save(train_labels, f'train_video_labels{excluded_session}.pt')
  print(f"{len(train_labels)} train labels_saved successfully.")

  torch.save(test_labels, f'test_video_labels{excluded_session}.pt')
  print(f"{len(test_labels)} test labels_saved successfully.")

In [11]:
# Fold 1: Session 1 is the test split. `video_mdl` is read as a module-level
# name inside generate_and_save_output_layer, so set it before calling main().
excluded_session = 1
video_mdl = "minoosh/videomae-base-finetuned-IEMOCAP_{}".format(excluded_session)
main(excluded_session)

Downloading (…)rocessor_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]



Downloading (…)lve/main/config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/345M [00:00<?, ?B/s]

  return torch.tensor(value)
  inputs = {k:torch.tensor(v).to(device) for k,v in inputs.items()}


***Train set - (Session1 excluded:)
Number of videos: 4446
Number of saved output: torch.Size([4446, 4])
***Test set - (Session1)
Number of videos: 1085
Number of saved output: torch.Size([1085, 4])
4446 train labels_saved successfully.
1085 test labels_saved successfully.


In [12]:
# Fold 2: Session 2 is the test split. `video_mdl` is read as a module-level
# name inside generate_and_save_output_layer, so set it before calling main().
excluded_session = 2
video_mdl = "minoosh/videomae-base-finetuned-IEMOCAP_{}".format(excluded_session)
main(excluded_session)

Downloading (…)rocessor_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/345M [00:00<?, ?B/s]

  inputs = {k:torch.tensor(v).to(device) for k,v in inputs.items()}


***Train set - (Session2 excluded:)
Number of videos: 4508
Number of saved output: torch.Size([4508, 4])
***Test set - (Session2)
Number of videos: 1023
Number of saved output: torch.Size([1023, 4])
4508 train labels_saved successfully.
1023 test labels_saved successfully.


In [13]:
# Fold 3: Session 3 is the test split. `video_mdl` is read as a module-level
# name inside generate_and_save_output_layer, so set it before calling main().
excluded_session = 3
video_mdl = "minoosh/videomae-base-finetuned-IEMOCAP_{}".format(excluded_session)
main(excluded_session)

Downloading (…)rocessor_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/345M [00:00<?, ?B/s]

  inputs = {k:torch.tensor(v).to(device) for k,v in inputs.items()}


***Train set - (Session3 excluded:)
Number of videos: 4380
Number of saved output: torch.Size([4380, 4])
***Test set - (Session3)
Number of videos: 1151
Number of saved output: torch.Size([1151, 4])
4380 train labels_saved successfully.
1151 test labels_saved successfully.


In [14]:
# Fold 4: Session 4 is the test split. `video_mdl` is read as a module-level
# name inside generate_and_save_output_layer, so set it before calling main().
excluded_session = 4
video_mdl = "minoosh/videomae-base-finetuned-IEMOCAP_{}".format(excluded_session)
main(excluded_session)

Downloading (…)rocessor_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/345M [00:00<?, ?B/s]

  inputs = {k:torch.tensor(v).to(device) for k,v in inputs.items()}


***Train set - (Session4 excluded:)
Number of videos: 4500
Number of saved output: torch.Size([4500, 4])
***Test set - (Session4)
Number of videos: 1031
Number of saved output: torch.Size([1031, 4])
4500 train labels_saved successfully.
1031 test labels_saved successfully.


In [15]:
# Fold 5: Session 5 is the test split. `video_mdl` is read as a module-level
# name inside generate_and_save_output_layer, so set it before calling main().
excluded_session = 5
video_mdl = "minoosh/videomae-base-finetuned-IEMOCAP_{}".format(excluded_session)
main(excluded_session)

Downloading (…)rocessor_config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/345M [00:00<?, ?B/s]

  inputs = {k:torch.tensor(v).to(device) for k,v in inputs.items()}


***Train set - (Session5 excluded:)
Number of videos: 4290
Number of saved output: torch.Size([4290, 4])
***Test set - (Session5)
Number of videos: 1241
Number of saved output: torch.Size([1241, 4])
4290 train labels_saved successfully.
1241 test labels_saved successfully.
