In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
os.environ['HUGGINGFACE_HUB_CACHE'] = '/scratch/gsk6me/huggingface_cache'
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

In [3]:
import argparse
import logging
import math
import os
import random
import shutil
from pathlib import Path

import accelerate
import datasets
import numpy as np
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
import torch.utils.data
import transformers
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.state import AcceleratorState
from accelerate.utils import ProjectConfiguration, set_seed
from datasets import load_dataset
from huggingface_hub import create_repo, upload_folder
from packaging import version
from torchvision import transforms
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer
from transformers.utils import ContextManagers

import diffusers
from diffusers import AutoencoderKLTemporalDecoder, DDPMScheduler, StableVideoDiffusionPipeline, UNetSpatioTemporalConditionModel
from diffusers.optimization import get_scheduler
from diffusers.training_utils import EMAModel, compute_snr
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import check_min_version, deprecate, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.import_utils import is_xformers_available
from diffusers.utils.torch_utils import is_compiled_module

from rt1_dataset_wrapper import RT1Dataset


if is_wandb_available():
    import wandb

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
2024-02-29 16:28:20.292144: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load Pipelines

We have a base pipeline, which is only image-conditioned, and we have a LoRA pipeline, which has had low-rank weight updates made.

In [4]:
# Load the base pipeline
pipeline_base = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
)
pipeline_base = pipeline_base.to(device='cuda')

pipeline_lora = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
)
pipeline_lora = pipeline_lora.to(device='cuda')

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

## Inject LoRA

Because the regular pipeline doesn't have an easy way to load LoRA adapters directly, I just manually add them to the U-Net weights. We need to use HuggingFace's SafeTensors library for this.

In [5]:
from peft import LoraConfig

rank = 16
unet_lora_config = LoraConfig(
    r=rank,
    lora_alpha=rank,
    init_lora_weights="gaussian",
    target_modules=["to_k", "to_q", "to_v", "to_out.0"],
)
pipeline_lora.unet.add_adapter(unet_lora_config)

from safetensors import safe_open

tensors = {}
with safe_open("./sd-model-finetuned/unet/diffusion_pytorch_model.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        tensors[k] = f.get_tensor(k)
        # print("Loading tensor", k)

pipeline_lora.unet.load_state_dict(tensors)

<All keys matched successfully>

## Unify with Text Encoder

Here, we create a unified trajectory synthesis pipeline, which is conditioned on text inputs.

In [6]:
from text_and_image_conditioned_video_diffusion_model import VisualTrajectorySynthesizer

# Load the text encoder. This only works for the LoRA model.
text_encoder_path = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"
text_encoder = CLIPTextModel.from_pretrained(text_encoder_path).to(device='cuda', dtype=torch.float16)
tokenizer = CLIPTokenizer.from_pretrained(text_encoder_path)

pipeline_lora_unified = VisualTrajectorySynthesizer.from_stable_video_diffusion_pipeline(pipeline_lora, text_encoder)

## Load Data

We only trained on the first $20$ samples from the RT-1 dataset, and of those, we only trained on the first $25$ frames of each video. Let's see how they turned out. We will use the custom class I wrote, `RT1Dataset`, for this. We also need to do a bit of image processing before we can directly send inputs into the model.

In [7]:
from rt1_dataset_wrapper import RT1Dataset

# Originally from `args`
image_height = 256
image_width = 320
vae_image_processor = pipeline_lora.image_processor

# We don't technically even have to do this because the `vae_image_processor` is already called in the StableVideoDiffusionPipeline
def collate_fn(batch):
    # input: (text, image_sequence)[]
    # return: (text batch, text attention masks, text sequence lengths, images)
    text_batch = [text for (text, imgseq) in batch]
    tokenization = tokenizer(text_batch, padding='longest', return_tensors='pt')
    text_tokens = tokenization['input_ids']
    text_attention_masks = tokenization['attention_mask']

    imgseqs = [vae_image_processor.preprocess(imgseq, height=image_height, width=image_width) for (_, imgseq) in batch]

    return (text_tokens, text_attention_masks, torch.stack(imgseqs))

dataset = RT1Dataset('/scratch/gsk6me/WORLDMODELS/datasets/rt-1-data-release')

In [26]:
from diffusers.utils import export_to_video

for n in range(10, 20):
    text, image_sequence = dataset[n]
    input_ids = tokenizer(text, return_tensors='pt').input_ids
    
    # Use the first image as conditioning
    # Note that because training only occurred over the first 25 frames of each video,
    # the model has not seen robots performing task completion yet.
    import PIL.Image
    
    conditioning_image_tensor = image_sequence[0]
    conditioning_image_np = np.array((image_sequence[0] * 255).permute(1, 2, 0)).astype(np.uint8)
    conditioning_image_pil = PIL.Image.fromarray(conditioning_image_np)

    frames = pipeline_lora_unified.custom_call(conditioning_image_pil, input_ids, width=image_width, height=image_height)
    export_to_video(frames[0], f"generated_lora_trained_more_{n}_raw.mp4", fps=7)

    frames = pipeline_base(conditioning_image_pil, width=image_width, height=image_height)
    export_to_video(frames[0][0], f"generated_non_lora_{n}_raw.mp4", fps=7)

    os.system(f"ffmpeg -i generated_lora_trained_more_{n}_raw.mp4 generated_lora_trained_more_{n}.mp4")
    os.system(f"ffmpeg -i generated_non_lora_{n}_raw.mp4 generated_non_lora_{n}.mp4")
    os.system(f"rm generated_lora_trained_more_{n}_raw.mp4")
    os.system(f"rm generated_non_lora_{n}_raw.mp4")

# conditioning_image_pil

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.3843,  0.6553, -0.3020,  ...,  1.1660,  0.7383, -0.9102],
         [ 0.2107,  0.1082, -2.0918,  ..., -0.1875,  1.8955, -1.3477],
         [ 0.1605,  0.4241,  0.0748,  ..., -0.1385, -0.3008,  0.8154],
         [ 0.3262,  0.2437, -0.3157,  ...,  0.2820, -1.0430, -0.0968]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 0.3262,  0.2437, -0.3157,  ...,  0.2820, -1.0430, -0.0968]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.6206,  1.1445, -0.2727,  ..., -0.5166,  0.4487, -1.1611],
         [ 0.1656, -1.0615,  0.4207,  ..., -0.7412,  2.0137,  0.2788],
         ...,
         [ 1.4023,  0.1138,  0.7871,  ..., -0.4246,  0.2612,  0.5205],
         [-0.1406,  0.5957,  0.0765,  ..., -0.3831,  0.5156, -1.4229],
         [ 1.7959,  1.1963,  0.6157,  ..., -0.4788, -0.0842,  0.7095]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 1.7959,  1.1963,  0.6157,  ..., -0.4788, -0.0842,  0.7095]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     137kB time=00:00:03.14 bitrate= 355.9kbits/s speed=13.3x    
video:135kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.848271%
[libx264 @ 0x5587fd0da540] frame I:1     Avg QP:23.44  size:  6355
[libx264 @ 0x5587fd0da540] frame P:6     Avg QP:23.25  size:  6986
[libx264 @ 0x5587fd0da540] frame B:18    Avg QP:24.50  size:  4982
[libx264 @ 0x5587fd0da540] consecutive B-frames:  4.0%  0.0%  0.0% 96.0%
[libx264 @ 0x5587fd0da540] mb I  I16..4:  0.9% 92.2%  6.9%
[libx264 @ 0x5587fd0da540] mb P  I16..4:  1.5% 58.9%  7.9%  P16..4: 13.8% 12.8%  5.2%  0.0%  0.0%    skip: 0.0%
[libx264 @ 0x5587fd0da540] mb B  I16..4:  1.8% 16.5%  3.1%  B16..8: 36.8% 26.7%  5.4%  direct: 8.6%  skip: 1.2%  L0:52.5% L1:30.0% BI:17.5%
[libx264 @ 0x5587fd0da540] 8x8 transform intra:83.1% inter:79.6%
[libx264 @ 0x5587fd0da540] coded y,uvDC,uvAC intra: 88.3% 87.5% 44.4% inter: 71.6% 78.4% 8.4%
[libx264 @ 0x5587fd0da540] i16 v,h,dc,p:  5% 70% 17%  8%
[

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.3691,  0.2310,  1.3652,  ..., -1.0732, -0.9395,  0.3718],
         [-0.2583, -0.5542,  0.5039,  ...,  1.3369,  0.4434,  0.5811],
         ...,
         [-1.0195,  0.2366, -1.1494,  ...,  0.1632,  1.4785, -2.0684],
         [ 0.9556,  0.6841, -0.4148,  ..., -0.9775, -1.4961, -0.7812],
         [ 1.1035,  0.0421,  0.4514,  ..., -0.3892,  0.5269,  0.5103]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 1.1035,  0.0421,  0.4514,  ..., -0.3892,  0.5269,  0.5103]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     136kB time=00:00:03.14 bitrate= 355.6kbits/s speed=13.7x    
video:135kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.843212%
[libx264 @ 0x5652635742c0] frame I:1     Avg QP:22.11  size:  8228
[libx264 @ 0x5652635742c0] frame P:7     Avg QP:22.77  size:  6601
[libx264 @ 0x5652635742c0] frame B:17    Avg QP:24.14  size:  4906
[libx264 @ 0x5652635742c0] consecutive B-frames:  8.0%  0.0% 12.0% 80.0%
[libx264 @ 0x5652635742c0] mb I  I16..4:  5.3% 84.1% 10.6%
[libx264 @ 0x5652635742c0] mb P  I16..4:  6.5% 56.5% 13.0%  P16..4:  9.2%  9.8%  2.6%  0.0%  0.0%    skip: 2.4%
[libx264 @ 0x5652635742c0] mb B  I16..4:  3.3% 20.9%  5.8%  B16..8: 29.4% 22.9%  5.5%  direct: 8.5%  skip: 3.8%  L0:50.7% L1:26.0% BI:23.3%
[libx264 @ 0x5652635742c0] 8x8 transform intra:73.1% inter:71.7%
[libx264 @ 0x5652635742c0] coded y,uvDC,uvAC intra: 78.1% 90.1% 49.8% inter: 64.9% 78.1% 11.8%
[libx264 @ 0x5652635742c0] i16 v,h,dc,p: 23% 47% 16% 14%


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.6206,  1.1445, -0.2727,  ..., -0.5166,  0.4487, -1.1611],
         [-1.4248,  0.2347, -2.3184,  ...,  0.7769,  0.4304, -0.4253],
         ...,
         [ 1.3877, -0.1938,  0.4194,  ..., -0.3726,  0.0093, -0.1362],
         [-0.0205,  0.8291, -0.1204,  ..., -0.6953,  0.8271, -1.5771],
         [ 1.0820,  1.0430,  0.7803,  ..., -0.0933,  1.1406,  0.3989]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 1.0820,  1.0430,  0.7803,  ..., -0.0933,  1.1406,  0.3989]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=      79kB time=00:00:03.14 bitrate= 207.2kbits/s speed=  16x    
video:78kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 1.435818%
[libx264 @ 0x55e6d8c87cc0] frame I:1     Avg QP:19.69  size:  5135
[libx264 @ 0x55e6d8c87cc0] frame P:10    Avg QP:20.49  size:  3545
[libx264 @ 0x55e6d8c87cc0] frame B:14    Avg QP:21.18  size:  2783
[libx264 @ 0x55e6d8c87cc0] consecutive B-frames: 16.0% 24.0% 12.0% 48.0%
[libx264 @ 0x55e6d8c87cc0] mb I  I16..4: 15.3% 78.8%  5.9%
[libx264 @ 0x55e6d8c87cc0] mb P  I16..4: 10.5% 50.6%  3.5%  P16..4: 20.8% 11.4%  2.8%  0.0%  0.0%    skip: 0.4%
[libx264 @ 0x55e6d8c87cc0] mb B  I16..4:  4.6% 22.7%  1.7%  B16..8: 39.3% 19.1%  2.9%  direct: 6.8%  skip: 2.9%  L0:62.4% L1:25.0% BI:12.6%
[libx264 @ 0x55e6d8c87cc0] 8x8 transform intra:78.3% inter:82.4%
[libx264 @ 0x55e6d8c87cc0] coded y,uvDC,uvAC intra: 59.7% 70.3% 20.1% inter: 49.8% 65.3% 2.6%
[libx264 @ 0x55e6d8c87cc0] i16 v,h,dc,p: 24% 34% 31% 12%
[l

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.6206,  1.1445, -0.2727,  ..., -0.5166,  0.4487, -1.1611],
         [-1.4248,  0.2347, -2.3184,  ...,  0.7769,  0.4304, -0.4253],
         [-0.8965,  1.5078, -0.4758,  ...,  0.9546,  0.4753, -1.5205],
         [ 0.3264,  0.5020, -0.7202,  ..., -0.4895, -1.0039, -1.9541],
         [ 0.8584,  0.9600,  0.8003,  ..., -0.0581,  1.4043, -0.6768]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 0.8584,  0.9600,  0.8003,  ..., -0.0581,  1.4043, -0.6768]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     143kB time=00:00:03.14 bitrate= 372.5kbits/s speed=13.2x    
video:142kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.804507%
[libx264 @ 0x56315c8939c0] frame I:1     Avg QP:23.58  size:  6362
[libx264 @ 0x56315c8939c0] frame P:12    Avg QP:23.95  size:  6401
[libx264 @ 0x56315c8939c0] frame B:12    Avg QP:24.89  size:  5110
[libx264 @ 0x56315c8939c0] consecutive B-frames: 28.0% 24.0%  0.0% 48.0%
[libx264 @ 0x56315c8939c0] mb I  I16..4:  9.1% 77.2% 13.8%
[libx264 @ 0x56315c8939c0] mb P  I16..4:  6.3% 43.8% 13.7%  P16..4: 16.0% 14.5%  5.3%  0.0%  0.0%    skip: 0.4%
[libx264 @ 0x56315c8939c0] mb B  I16..4:  4.7% 20.0%  7.8%  B16..8: 27.8% 24.1%  7.3%  direct: 6.4%  skip: 2.0%  L0:50.6% L1:29.7% BI:19.7%
[libx264 @ 0x56315c8939c0] 8x8 transform intra:67.1% inter:66.8%
[libx264 @ 0x56315c8939c0] coded y,uvDC,uvAC intra: 76.3% 87.3% 48.2% inter: 67.8% 78.5% 12.6%
[libx264 @ 0x56315c8939c0] i16 v,h,dc,p: 20% 63% 12%  5%


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.6206,  0.2366, -0.2837,  ..., -0.3538,  0.2839, -1.4775],
         [ 0.0303, -0.3821, -0.9053,  ...,  0.7842,  0.1418,  0.4258],
         [ 0.0827,  0.1802,  0.0362,  ...,  0.7319, -0.0097,  1.0098],
         [ 0.3481,  0.1459, -0.6421,  ..., -0.3748, -0.9443,  0.2432]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 0.3481,  0.1459, -0.6421,  ..., -0.3748, -0.9443,  0.2432]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     129kB time=00:00:03.14 bitrate= 337.1kbits/s speed=13.5x    
video:128kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.883756%
[libx264 @ 0x562dfb13e9c0] frame I:1     Avg QP:22.54  size:  6785
[libx264 @ 0x562dfb13e9c0] frame P:8     Avg QP:22.42  size:  5923
[libx264 @ 0x562dfb13e9c0] frame B:16    Avg QP:23.58  size:  4775
[libx264 @ 0x562dfb13e9c0] consecutive B-frames: 12.0%  0.0% 24.0% 64.0%
[libx264 @ 0x562dfb13e9c0] mb I  I16..4: 13.8% 72.8% 13.4%
[libx264 @ 0x562dfb13e9c0] mb P  I16..4: 10.7% 44.2% 10.9%  P16..4: 14.1% 15.4%  4.1%  0.0%  0.0%    skip: 0.6%
[libx264 @ 0x562dfb13e9c0] mb B  I16..4:  7.8% 24.3%  4.4%  B16..8: 27.3% 23.5%  5.9%  direct: 5.2%  skip: 1.6%  L0:52.5% L1:28.5% BI:19.0%
[libx264 @ 0x562dfb13e9c0] 8x8 transform intra:67.3% inter:70.8%
[libx264 @ 0x562dfb13e9c0] coded y,uvDC,uvAC intra: 65.2% 76.0% 35.4% inter: 64.2% 74.2% 8.6%
[libx264 @ 0x562dfb13e9c0] i16 v,h,dc,p: 27% 56% 11%  6%
[

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-2.5464e-01, -3.7988e-01,  6.5552e-02,  ...,  1.8372e-01,
           8.1604e-02, -3.8354e-01],
         [ 3.6914e-01,  2.3096e-01,  1.3652e+00,  ..., -1.0732e+00,
          -9.3945e-01,  3.7183e-01],
         [ 5.1147e-02,  1.6699e-01, -9.6729e-01,  ..., -2.3596e-01,
           9.7412e-01,  1.2334e+00],
         ...,
         [ 2.3035e-01,  1.3257e-01,  8.4912e-01,  ..., -1.2383e+00,
           3.0005e-01, -3.1323e-01],
         [ 2.0504e-03, -2.1191e+00,  4.4873e-01,  ..., -5.3711e-01,
           1.1270e+00, -2.4048e-02],
         [ 8.1885e-01, -8.0273e-01,  1.2109e+00,  ...,  4.3286e-01,
           7.1240e-01,  2.0508e-01]]], device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 0.8188, -0.8027,  1.2109,  ...,  0.4329,  0.7124,  0.2051]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     160kB time=00:00:03.14 bitrate= 417.8kbits/s speed=12.6x    
video:159kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.716731%
[libx264 @ 0x556d321b9300] frame I:1     Avg QP:22.63  size:  9511
[libx264 @ 0x556d321b9300] frame P:7     Avg QP:24.72  size:  8092
[libx264 @ 0x556d321b9300] frame B:17    Avg QP:25.61  size:  5654
[libx264 @ 0x556d321b9300] consecutive B-frames:  8.0%  0.0% 12.0% 80.0%
[libx264 @ 0x556d321b9300] mb I  I16..4:  5.9% 75.9% 18.1%
[libx264 @ 0x556d321b9300] mb P  I16..4:  4.0% 45.4% 21.7%  P16..4:  9.3% 13.9%  5.6%  0.0%  0.0%    skip: 0.0%
[libx264 @ 0x556d321b9300] mb B  I16..4:  3.0% 13.8%  6.5%  B16..8: 30.7% 29.5%  9.6%  direct: 6.1%  skip: 0.9%  L0:41.2% L1:34.7% BI:24.1%
[libx264 @ 0x556d321b9300] 8x8 transform intra:63.3% inter:65.9%
[libx264 @ 0x556d321b9300] coded y,uvDC,uvAC intra: 82.3% 93.1% 62.2% inter: 70.9% 77.0% 12.8%
[libx264 @ 0x556d321b9300] i16 v,h,dc,p: 12% 69% 11%  8%


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [-0.2162,  0.4993,  0.3022,  ..., -0.3953,  1.1807, -1.3965],
         [ 1.1836,  0.8750,  0.4929,  ..., -0.0605, -0.3896,  0.8750],
         ...,
         [ 0.6421,  1.0059, -0.5840,  ..., -0.6587,  2.9746,  0.0475],
         [-0.0411,  0.7104,  0.1602,  ..., -0.0659,  0.6299,  0.8633],
         [ 2.2930,  1.5586, -0.4060,  ...,  0.1470,  1.0508,  2.0527]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 2.2930,  1.5586, -0.4060,  ...,  0.1470,  1.0508,  2.0527]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     161kB time=00:00:03.14 bitrate= 420.8kbits/s speed=12.8x    
video:160kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.696890%
[libx264 @ 0x5576320a6800] frame I:2     Avg QP:22.52  size:  6436
[libx264 @ 0x5576320a6800] frame P:10    Avg QP:24.93  size:  7637
[libx264 @ 0x5576320a6800] frame B:13    Avg QP:25.16  size:  5710
[libx264 @ 0x5576320a6800] consecutive B-frames: 16.0% 32.0% 36.0% 16.0%
[libx264 @ 0x5576320a6800] mb I  I16..4:  6.7% 83.0% 10.3%
[libx264 @ 0x5576320a6800] mb P  I16..4:  3.7% 59.9% 14.8%  P16..4:  7.8%  9.7%  4.2%  0.0%  0.0%    skip: 0.1%
[libx264 @ 0x5576320a6800] mb B  I16..4:  4.0% 32.6%  7.2%  B16..8: 21.2% 20.7%  6.6%  direct: 6.4%  skip: 1.2%  L0:46.0% L1:29.7% BI:24.3%
[libx264 @ 0x5576320a6800] 8x8 transform intra:76.5% inter:72.4%
[libx264 @ 0x5576320a6800] coded y,uvDC,uvAC intra: 81.6% 93.5% 58.3% inter: 73.2% 85.7% 31.2%
[libx264 @ 0x5576320a6800] i16 v,h,dc,p: 15% 44% 15% 26%


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.3843,  0.6553, -0.3020,  ...,  1.1660,  0.7383, -0.9102],
         [ 0.0795,  0.8687, -0.8696,  ...,  0.5879,  1.0918, -1.8838],
         [ 0.5923, -0.3479,  1.4609,  ...,  0.1324,  0.5474,  0.1586],
         [ 0.5884,  0.8911, -0.2983,  ...,  0.5786, -1.0576,  0.0876]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 0.5884,  0.8911, -0.2983,  ...,  0.5786, -1.0576,  0.0876]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     161kB time=00:00:03.14 bitrate= 418.8kbits/s speed=13.3x    
video:160kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.719878%
[libx264 @ 0x55dd2510ff40] frame I:1     Avg QP:22.42  size:  9101
[libx264 @ 0x55dd2510ff40] frame P:8     Avg QP:23.00  size:  8142
[libx264 @ 0x55dd2510ff40] frame B:16    Avg QP:24.70  size:  5527
[libx264 @ 0x55dd2510ff40] consecutive B-frames: 12.0%  8.0%  0.0% 80.0%
[libx264 @ 0x55dd2510ff40] mb I  I16..4:  7.8% 80.0% 12.2%
[libx264 @ 0x55dd2510ff40] mb P  I16..4:  3.5% 40.2%  6.2%  P16..4: 21.9% 19.5%  8.6%  0.0%  0.0%    skip: 0.2%
[libx264 @ 0x55dd2510ff40] mb B  I16..4:  1.9% 13.9%  1.5%  B16..8: 38.0% 24.3%  6.0%  direct:11.4%  skip: 3.0%  L0:47.8% L1:23.8% BI:28.4%
[libx264 @ 0x55dd2510ff40] 8x8 transform intra:80.4% inter:70.2%
[libx264 @ 0x55dd2510ff40] coded y,uvDC,uvAC intra: 85.3% 81.0% 28.2% inter: 69.5% 71.8% 4.8%
[libx264 @ 0x55dd2510ff40] i16 v,h,dc,p: 10% 63% 14% 13%
[

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-0.2546, -0.3799,  0.0656,  ...,  0.1837,  0.0816, -0.3835],
         [ 0.6206,  0.2366, -0.2837,  ..., -0.3538,  0.2839, -1.4775],
         [ 0.6841, -0.4045, -2.0879,  ..., -0.3159,  2.0449,  0.4309],
         [ 0.4495,  0.4197,  0.1516,  ..., -0.2869,  0.4019,  0.6602],
         [ 0.1169,  0.0258, -0.1057,  ...,  0.2147, -0.9463,  0.3245]]],
       device='cuda:0', dtype=torch.float16), pooler_output=tensor([[ 0.1169,  0.0258, -0.1057,  ...,  0.2147, -0.9463,  0.3245]],
       device='cuda:0', dtype=torch.float16), hidden_states=None, attentions=None)
torch.Size([1, 1024])


frame=   25 fps=0.0 q=24.0 Lsize=     148kB time=00:00:03.14 bitrate= 384.5kbits/s speed=13.8x    
video:146kB audio:0kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.787333%
[libx264 @ 0x560ba43d6600] frame I:2     Avg QP:21.69  size:  8819
[libx264 @ 0x560ba43d6600] frame P:8     Avg QP:24.17  size:  6794
[libx264 @ 0x560ba43d6600] frame B:15    Avg QP:24.98  size:  5146
[libx264 @ 0x560ba43d6600] consecutive B-frames: 12.0% 24.0%  0.0% 64.0%
[libx264 @ 0x560ba43d6600] mb I  I16..4:  7.2% 81.9% 10.9%
[libx264 @ 0x560ba43d6600] mb P  I16..4:  6.4% 50.1% 12.9%  P16..4: 11.4% 14.6%  4.6%  0.0%  0.0%    skip: 0.0%
[libx264 @ 0x560ba43d6600] mb B  I16..4:  4.3% 23.2%  5.6%  B16..8: 28.5% 24.9%  5.8%  direct: 6.9%  skip: 0.9%  L0:52.1% L1:24.3% BI:23.7%
[libx264 @ 0x560ba43d6600] 8x8 transform intra:73.0% inter:73.6%
[libx264 @ 0x560ba43d6600] coded y,uvDC,uvAC intra: 80.4% 89.0% 47.0% inter: 73.9% 79.2% 9.1%
[libx264 @ 0x560ba43d6600] i16 v,h,dc,p: 12% 68% 14%  6%
[

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

ffmpeg version 5.1.2 Copyright (c) 2000-2022 the FFmpeg developers
  built with gcc 11.3.0 (conda-forge gcc 11.3.0-19)
  configuration: --prefix=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_h_env_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_placehold_plac --cc=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-cc --cxx=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-c++ --nm=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-nm --ar=/home/conda/feedstock_root/build_artifacts/ffmpeg_1674566204550/_build_env/bin/x86_64-conda-linux-gnu-ar --disable-doc --disable-openssl --enable-demuxer=dash --enable-hardcoded-tables --enable-libfreetype --enable-libfontconfig --enable-libopenh264 --enable-gnu