In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/btkg

/content/drive/MyDrive/btkg


In [3]:
!ls

checkpoints   models	     test_inference.ipynb   video_feature_utils
config.py     __pycache__    tmp		    videos
data	      pycocoevalcap  train.py
inference.py  run.py	     utils.py
loader	      splits	     video_feature_configs


# Import necessary libraries


In [4]:
# For video captioning
from loader.MSVD import MSVD
from config import TrainConfig as C
from models.abd_transformer import ABDTransformer
import torch
from utils import dict_to_cls
# Inception-ResNet-V2 for image feature extraction
import os
import cv2 # Thêm cv2 để đọc video
import numpy as np # Thêm numpy để xử lý mảng
import tensorflow as tf
from tensorflow.keras.applications import InceptionResNetV2
from tensorflow.keras.applications.inception_resnet_v2 import preprocess_input
# I3D for motion feature extraction
from models.i3d.extract_i3d import ExtractI3D
from video_feature_utils.utils import build_cfg_path
from omegaconf import OmegaConf
# Mask R-CNN for object feature extraction
import torch.nn.functional as F
from torchvision.models.detection import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# Thiết lập chung


In [5]:
# 1) Thiết lập chung
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

# Load pretrained model for feature extraction


In [6]:
# --- Tải mô hình Inception-ResNet-V2 ---
print("Loading Inception-ResNet-V2 model...")
# include_top=False để lấy đặc trưng, pooling='avg' để ra vector 1D
inception_resnet_model = InceptionResNetV2(weights='imagenet', include_top=False, pooling='avg')
print(">> Model loaded.")

# --- Tải mô hình Mask R-CNN ---
print("Loading Mask R-CNN model...")
# Tải mô hình Mask R-CNN được huấn luyện sẵn trên COCO
weights = MaskRCNN_ResNet50_FPN_Weights.DEFAULT
maskrcnn_model = maskrcnn_resnet50_fpn(weights=weights)
# Chuyển mô hình sang thiết bị và đặt ở chế độ đánh giá
maskrcnn_model = maskrcnn_model.to(DEVICE).eval()
print(">> Model loaded.")

Loading Inception-ResNet-V2 model...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_resnet_v2/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m219055592/219055592[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
>> Model loaded.
Loading Mask R-CNN model...
Downloading: "https://download.pytorch.org/models/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth" to /root/.cache/torch/hub/checkpoints/maskrcnn_resnet50_fpn_coco-bf2d0c1e.pth


100%|██████████| 170M/170M [00:00<00:00, 187MB/s]


>> Model loaded.


# Extract features for video captioning


## Inception-ResNet-V2

In [7]:
def extract_image_features(video_frames):
    """
    Trích xuất đặc trưng hình ảnh từ danh sách khung hình (BGR numpy arrays).
    Trả về mảng NumPy có shape (num_frames, feature_dim).
    """
    # Danh sách để lưu đặc trưng
    all_image_features = []

    # Hàm tiền xử lý khung hình
    def preprocess_frame(frame_bgr):
        """ Tiền xử lý một khung hình (BGR numpy array) cho Inception-ResNet-V2. """
        # Chuyển BGR sang RGB
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        # Thay đổi kích thước về 299x299
        frame_resized = cv2.resize(frame_rgb, (299, 299))
        # Chuyển thành mảng numpy float32
        img_array = np.asarray(frame_resized, dtype=np.float32)
        # Mở rộng chiều để tạo batch (batch_size, height, width, channels)
        img_array_batch = np.expand_dims(img_array, axis=0)
        # Tiền xử lý theo chuẩn ImageNet (chuẩn hóa pixel)
        preprocessed_img = preprocess_input(img_array_batch)
        return preprocessed_img

    print("Extracting features from frames...")
    for i, frame in enumerate(video_frames):
        # In mỗi 10 frames hoặc với i == 0
        if (i + 1) % 10 == 0 or i == 0:
            print(f">> Extract image feature for frame {i+1}/{len(video_frames)}")
        # Tiền xử lý khung hình
        preprocessed_frame = preprocess_frame(frame)
        # Trích xuất đặc trưng: features.shape sẽ là (1, 1536) do dùng pooling='avg'
        features = inception_resnet_model.predict(
            preprocessed_frame, verbose=0)  # verbose=0 để giảm log
        # Loại bỏ chiều batch và thêm vào danh sách
        all_image_features.append(features[0])  # features[0].shape là (1536,)

    # Chuyển đổi danh sách đặc trưng thành mảng NumPy có shape (num_frames, 1536)
    image_feats = np.array(all_image_features)
    print("Feature extraction complete.")

    return image_feats

## Mask RCNN

In [8]:
# --- Cấu hình ---
TOP_K_PER_FRAME = 20  # Số lượng detection tốt nhất được giữ lại *mỗi khung hình*
COORD_MODE = "cxcywh"  # Chế độ mã hóa tọa độ bounding box
CONFIDENCE_THRESHOLD = 0.7  # Ngưỡng confidence cho detection
# Số lượng đặc trưng cuối cùng mong muốn (theo yêu cầu trước)
TARGET_TOTAL_FEATURES = 50
# --- Kết thúc Cấu hình ---

# 4. Hàm tiền xử lý: Chuyển PIL Image -> Tensor (C,H,W), giá trị [0,1]
preproc = transforms.Compose([transforms.ToTensor()])


def compute_box_coords(boxes, image_size, mode="cxcywh_log"):
    """
    Tính toán và chuẩn hóa tọa độ bounding box.
    """
    H, W = image_size
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    w = (x2 - x1).clamp(min=1.0)
    h = (y2 - y1).clamp(min=1.0)
    cx = x1 + 0.5 * w
    cy = y1 + 0.5 * h

    if mode == "xyxy_norm":
        coords = torch.stack([x1 / W, y1 / H, x2 / W, y2 / H], dim=1)
    elif mode == "cxcywh":
        coords = torch.stack([cx / W, cy / H, w / W, h / H], dim=1)
    else:  # cxcywh_log (mặc định)
        coords = torch.stack(
            [cx / W, cy / H, torch.log(w / W), torch.log(h / H)], dim=1)
    return coords


@torch.no_grad()  # Tắt gradient để tăng tốc độ và tiết kiệm bộ nhớ trong suy luận
def extract_instance_feats_with_coords(model, pil_img, topk=50, coord_mode="cxcywh", conf_threshold=0.0):
    """
    Trích xuất đặc trưng 1028-D từ các đối tượng trong hình ảnh sử dụng Mask R-CNN.
    Args:
        model: Mô hình Mask R-CNN đã được tải.
        pil_img: PIL Image.
        topk: Số lượng detection tốt nhất được giữ lại.
        coord_mode: Chế độ mã hóa tọa độ ('cxcywh', 'cxcywh_log', 'xyxy_norm').
        conf_threshold: Ngưỡng confidence tối thiểu.
    Returns:
        torch.Tensor: Tensor đặc trưng có shape (N, 1028).
                      N là số lượng đối tượng được phát hiện (<= topk).
    """
    img = pil_img.convert("RGB")

    # 1) Chạy mô hình để có detections (boxes, scores, labels, masks...)
    # model.forward nhận danh sách tensor
    img_tensor = preproc(img).to(DEVICE)  # Chuyển PIL -> Tensor và lên GPU
    # Trả về danh sách kết quả cho từng ảnh trong batch
    outputs = model([img_tensor])
    out = outputs[0]  # Lấy kết quả cho ảnh đầu tiên (batch size = 1)

    # Trích xuất boxes, scores, labels
    boxes = out.get("boxes", torch.empty(
        (0, 4), device=DEVICE))  # Boxes trên device
    scores = out.get("scores", torch.empty((0,), device=DEVICE))
    labels = out.get("labels", torch.empty(
        (0,), device=DEVICE))  # Có thể dùng nếu cần

    # 2) Lọc theo ngưỡng confidence
    if scores.numel() > 0 and conf_threshold > 0.0:
        keep_conf = scores >= conf_threshold
        boxes = boxes[keep_conf]
        scores = scores[keep_conf]
        labels = labels[keep_conf]  # Nếu dùng labels

    # 3) Xử lý trường hợp không có detection nào sau khi lọc
    if boxes.numel() == 0:
        print("Warning: No objects detected (or survived filtering). Returning empty features.")
        # Trả về tensor rỗng với shape đúng
        return torch.empty((0, 1028), dtype=torch.float32, device='cpu')

    # 4) Chọn top-k detections dựa trên score
    if scores is not None and scores.numel() > 0:
        # torch.topk thường hiệu quả hơn argsort + slice nếu k << total
        k = min(topk, boxes.shape[0])
        top_scores, order = torch.topk(scores, k, largest=True, sorted=True)
        keep = order
        boxes = boxes[keep]
        scores = scores[keep]
        labels = labels[keep]  # Nếu dùng labels
    else:
        # Nếu không có scores hoặc scores rỗng (hiếm khi xảy ra)
        boxes = boxes[:topk]

    # --- Bắt đầu trích xuất đặc trưng ---
    # 5) Tiền xử lý lại ảnh để đưa vào model.transform
    # (model.transform xử lý normalization, resizing nếu cần)
    # img_tensor đã được tạo ở trên
    # images, _ = model.transform([img_tensor]) # Có thể dùng nếu cần transform lại
    # Tuy nhiên, vì img_tensor đã được chuẩn hóa đúng cách bởi preproc và model.transform
    # sẽ không làm gì thêm nếu kích thước phù hợp và không có chuyển đổi khác,
    # ta có thể bỏ qua bước này và dùng trực tiếp img_tensor.
    # Nhưng để đúng logic và chắc chắn, ta vẫn gọi transform.
    # img_tensor_for_transform = preproc(img).unsqueeze(0) # Thêm batch dim
    images, _ = model.transform([img_tensor])  # Trả về ImageList
    images_t = images.tensors.to(DEVICE)    # (1, 3, H', W')
    # [(H', W')] - kích thước sau transform
    image_sizes = images.image_sizes

    # 6) Backbone -> features dict
    features = model.backbone(images_t)  # OrderedDict of feature maps

    # 7) ROI pooling (boxes phải ở không gian tọa độ ảnh gốc, đã được transform xử lý)
    # boxes vẫn đang ở device
    pooled = model.roi_heads.box_roi_pool(
        features, [boxes], image_sizes)  # (N, 256, 7, 7)

    # 8) box head -> (N, 1024)
    # Đây là bước quan trọng để lấy feature vector 1024-D
    box_repr = model.roi_heads.box_head(pooled)  # (N, 1024)

    # 9) Tính toán và chuẩn hóa tọa độ
    H_img, W_img = image_sizes[0]  # ints - kích thước ảnh sau khi transform
    coords = compute_box_coords(boxes, (H_img, W_img), mode=coord_mode)
    coords = coords.to(box_repr.dtype).to(
        box_repr.device)  # Đảm bảo dtype và device khớp

    # 10) Kết hợp đặc trưng và tọa độ -> (N, 1028)
    feat1028 = torch.cat([box_repr, coords], dim=1)  # (N, 1028)

    return feat1028.cpu()  # Chuyển về CPU để dễ xử lý sau này


def aggregate_features(feature_list, target_num_features):
    """
    Tổng hợp danh sách các tensor đặc trưng thành một tensor cố định.
    Args:
        feature_list: List of torch.Tensor, each with shape (N_i, 1028).
        target_num_features: int, số lượng đặc trưng mong muốn.
    Returns:
        torch.Tensor: Tensor với shape (target_num_features, 1028).
    """
    if not feature_list:
        print("Warning: feature_list is empty. Returning zero tensor.")
        return torch.zeros(target_num_features, 1028, dtype=torch.float32)

    # Gộp tất cả các đặc trưng từ các khung hình
    # Bỏ qua các tensor rỗng (0, 1028) nếu có
    non_empty_features = [f for f in feature_list if f.shape[0] > 0]

    if not non_empty_features:
        print("Warning: No features detected in any frame. Returning zero tensor.")
        return torch.zeros(target_num_features, 1028, dtype=torch.float32)

    # Shape: (Total_Detections, 1028)
    all_features = torch.cat(non_empty_features, dim=0)
    total_detections = all_features.shape[0]
    print(
        f"Total object detections across all sampled frames: {total_detections}")

    if total_detections == 0:
        return torch.zeros(target_num_features, 1028, dtype=torch.float32)

    if total_detections >= target_num_features:
        # Lấy mẫu đều để giảm xuống target_num_features
        indices = np.linspace(0, total_detections - 1,
                              target_num_features, dtype=int)
        selected_features = all_features[indices]
    else:
        # Nếu không đủ, pad bằng cách lặp lại đặc trưng cuối cùng
        print(
            f"Warning: Only {total_detections} features found. Padding to {target_num_features}.")
        num_to_pad = target_num_features - total_detections
        if num_to_pad > 0:
            # [num_to_pad, 1028]
            padding = all_features[-1:].repeat(num_to_pad, 1)
            # [target_num_features, 1028]
            selected_features = torch.cat([all_features, padding], dim=0)
        else:
            selected_features = all_features

    # Đảm bảo shape cuối cùng chính xác
    assert selected_features.shape == (
        target_num_features, 1028), f"Aggregation failed: {selected_features.shape}"
    return selected_features  # Shape: (target_num_features, 1028)


def extract_object_features_from_video(video_frames_bgr):
    """
    Trích xuất đặc trưng đối tượng từ danh sách khung hình video (BGR numpy arrays).
    Args:
        video_frames_bgr: List of BGR numpy arrays.
    Returns:
        torch.Tensor: Tensor đặc trưng có shape (1, TARGET_TOTAL_FEATURES, 1028).
    """

    # Tạo placeholder cho relationship features, kích thước (50, 300)
    relationship_feats = torch.zeros((TARGET_TOTAL_FEATURES, 300), dtype=torch.float32)

    # Danh sách lưu trữ đặc trưng từ từng khung hình
    all_frame_features = []

    # Lặp qua từng khung hình BGR
    print("Extracting features from each frame...")
    for i, frame_bgr in enumerate(video_frames_bgr):
        # In mỗi 10 frames hoặc với i == 0
        if (i + 1) % 10 == 0 or i == 0:
            print(f"  Processing frame {i+1}/{len(video_frames_bgr)}")
        # Chuyển đổi BGR (OpenCV) -> RGB -> PIL Image
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        pil_img = Image.fromarray(frame_rgb)

        # Trích xuất đặc trưng từ khung hình
        try:
            frame_features = extract_instance_feats_with_coords(
                maskrcnn_model, pil_img,
                topk=TOP_K_PER_FRAME,
                coord_mode=COORD_MODE,
                conf_threshold=CONFIDENCE_THRESHOLD
            )
            # Thêm tensor (N_i, 1028) hoặc (0, 1028)
            all_frame_features.append(frame_features)
        except Exception as e:
            print(f"    Error extracting features from frame {i+1}: {e}")
            # Thêm tensor rỗng nếu lỗi
            all_frame_features.append(
                torch.empty((0, 1028), dtype=torch.float32))

    # Tổng hợp đặc trưng để có số lượng cố định
    print("Aggregating features to target shape...")
    object_feats = aggregate_features(
        all_frame_features, TARGET_TOTAL_FEATURES)  # Tensor (50, 1028)

    return object_feats, relationship_feats

## I3D

In [9]:
def extract_motion_features(video_path):
    # Select the feature type
    feature_type = 'i3d'

    # Load and patch the config
    args = OmegaConf.load(build_cfg_path(feature_type))
    args.video_paths = [video_path]
    # args.show_pred = True
    args.stack_size = 64
    args.step_size = 64
    args.extraction_fps = 50
    args.flow_type = 'raft'
    # args.streams = 'flow'

    # Load the model
    extractor = ExtractI3D(args)

    # Extract features
    for video_path in args.video_paths:
        print(f'Extracting for {video_path}')
        feature_dict = extractor.extract(video_path)

    # Kết hợp đặc trưng RGB và Flow từ 2 stream
    motion_feats = feature_dict['rgb'] + feature_dict['flow']
    return motion_feats

## Full pipeline for feature extraction


In [10]:
def sample_frames(video_path, target_fps):
    """ Hàm lấy mẫu khung hình từ video ở fps mục tiêu.
    Ảnh trả về là danh sách các mảng numpy (BGR).
    """
    cap = cv2.VideoCapture(video_path)
    orig_fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    interval = max(1, int(round(orig_fps / target_fps)))
    frames = []
    idx = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if idx % interval == 0:
            # Giữ nguyên ở dạng BGR hoặc chuyển sang RGB nếu mô hình yêu cầu (InceptionResNetV2 dùng RGB)
            # frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) # Nếu cần RGB
            frames.append(frame)  # Giữ BGR, sẽ xử lý sau
        idx += 1
    cap.release()
    return frames


def resample_fixed(feats, N):
    """ Hàm lấy mẫu lại chuỗi đặc trưng để có đúng N đặc trưng bằng cách lấy mẫu đều.
    """
    T, D = feats.shape
    idxs = np.linspace(0, T-1, N).astype(int)
    return feats[idxs]


def process_video(video_path, dataset='MSVD'):
    """ Full pipeline để xử lý video và trích xuất đặc trưng.
    """
    N = 50 if dataset == 'MSVD' else 60
    target_fps = 5 if dataset == 'MSVD' else 3

    # --- Tải video và lấy mẫu frames ---
    print("Sampling video frames...")
    # Danh sách các mảng numpy (BGR)
    video_frames = sample_frames(video_path, target_fps)
    print(f"Sampled {len(video_frames)} frames at ~{target_fps} fps.")

    # Nếu có nhiều hơn 50 frames, lấy 50 frames cách đều nhau để giảm bớt
    if len(video_frames) > 50:
        indices = np.linspace(0, len(video_frames) - 1, 50, dtype=int)
        video_frames = [video_frames[i] for i in indices]
        print(f"Reduced to {len(video_frames)} frames for processing.")

    # Trích xuất đặc trưng với mô hình Inception-ResNet-V2
    image_feats = extract_image_features(video_frames)
    # Trích xuất đặc trưng với mô hình Mask R-CNN
    object_feats, rel_feats = extract_object_features_from_video(video_frames)
    # Trích xuất đặc trưng với mô hình I3D
    motion_feats = extract_motion_features(video_path)

    # Resample để có đúng N đặc trưng
    image_feats_res = resample_fixed(image_feats, N)  # (N, 1536)
    object_feats_res = resample_fixed(object_feats.numpy(), N)  # (N, 1028)
    rel_feats_res = resample_fixed(rel_feats.numpy(), N)  # (N, 300)
    motion_feats_res = resample_fixed(motion_feats, N)  # (N, 1024)

    # Đảm bảo tất cả đặc trưng đều có kiểu dữ liệu float32
    image_feats_res = image_feats_res.astype(np.float32)
    object_feats_res = object_feats_res.astype(np.float32)
    rel_feats_res = rel_feats_res.astype(np.float32)
    motion_feats_res = motion_feats_res.astype(np.float32)

    # Chuyển sang tensor và thêm batch dim
    image_feats_tensor = torch.from_numpy(image_feats_res).to(
        DEVICE).unsqueeze(0)  # (1, N, 1536)
    object_feats_tensor = torch.from_numpy(object_feats_res).to(
        DEVICE).unsqueeze(0)  # (1, N, 1028)
    rel_feats_tensor = torch.from_numpy(rel_feats_res).to(
        DEVICE).unsqueeze(0)  # (1, N, 300)
    motion_feats_tensor = torch.from_numpy(motion_feats_res).to(
        DEVICE).unsqueeze(0)  # (1, N, 1024)

    # Trả về kết quả
    return {
        'image_feats': image_feats_tensor,
        'motion_feats': motion_feats_tensor,
        'object_feats': object_feats_tensor,
        'rel_feats': rel_feats_tensor
    }

# Load checkpoint and config


In [11]:
checkpoint = torch.load("checkpoints/best.ckpt", map_location="cpu")
config = dict_to_cls(checkpoint['config'])

In [12]:
corpus = MSVD(config)

# Build Models


In [13]:
vocab = corpus.vocab
""" Build Models """
try:
    model = ABDTransformer(vocab, config.feat.size, config.transformer.d_model, config.transformer.d_ff,
                           config.transformer.n_heads, config.transformer.n_layers, config.transformer.dropout,
                           config.feat.feature_mode, n_heads_big=config.transformer.n_heads_big,
                           select_num=config.transformer.select_num)
except:
    model = ABDTransformer(vocab, config.feat.size, config.transformer.d_model, config.transformer.d_ff,
                           config.transformer.n_heads, config.transformer.n_layers, config.transformer.dropout,
                           config.feat.feature_mode, n_heads_big=config.transformer.n_heads_big)
model.load_state_dict(checkpoint['abd_transformer'])
model.device = DEVICE

# Move model to device
model = model.to(DEVICE)
print(DEVICE)

cuda


# Inference with beam search


In [14]:
def generate_video_caption(video_path):
    # Rút trích đặc trưng từ video
    feats_dict = process_video(video_path)
    feats = (
        feats_dict['image_feats'],
        feats_dict['motion_feats'],
        feats_dict['object_feats'],
        feats_dict['rel_feats']
    )
    # Tạo caption cho video
    model.eval()
    beam_size = config.beam_size
    max_len = config.loader.max_caption_len
    with torch.no_grad():
        r2l_captions, l2r_captions = model.beam_search_decode(feats, beam_size, max_len)
        # r2l_captions = [idxs_to_sentence(caption, vocab.idx2word, BOS_idx) for caption in r2l_captions]
        l2r_captions = [" ".join(caption[0].value) for caption in l2r_captions]
        r2l_captions = [" ".join(caption[0].value) for caption in r2l_captions]

    print(f"Left to Right Captions: {l2r_captions}")
    return l2r_captions

In [15]:
# Duyệt qua từng video trong folder 'videos' và tạo caption
import os
for filename in os.listdir('videos'):
    video_path = os.path.join('videos', filename)
    print(f"Generating caption for {filename}...")
    captions = generate_video_caption(video_path)
    print(f">> Captions for {filename}: {captions}\n")
    print("--------------------------------------------------\n")

Generating caption for bento.mp4...
Sampling video frames...
Sampled 25 frames at ~5 fps.
Extracting features from frames...
>> Extract image feature for frame 1/25
>> Extract image feature for frame 10/25
>> Extract image feature for frame 20/25
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/25
  Processing frame 10/25
  Processing frame 20/25
Aggregating features to target shape...
Total object detections across all sampled frames: 106
Extracting for videos/bento.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a person is putting a piece of food into a']
>> Captions for bento.mp4: ['a person is putting a piece of food into a']

--------------------------------------------------

Generating caption for lifting.mp4...
Sampling video frames...
Sampled 89 frames at ~5 fps.
Reduced to 50 frames for processing.
Extracting features from frames...
>> Extract image feature for frame 1/50
>> Extract image feature for frame 10/50
>> Extract image feature for frame 20/50
>> Extract image feature for frame 30/50
>> Extract image feature for frame 40/50
>> Extract image feature for frame 50/50
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/50
  Processing frame 10/50
  Processing frame 20/50
  Processing frame 30/50
  Processing frame 40/50
  Processing frame 50/50
Aggregating features to target shape...
Total object detections across all sampled frames: 58
Extracting for videos/lifting.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a man is lifting a weight']
>> Captions for lifting.mp4: ['a man is lifting a weight']

--------------------------------------------------

Generating caption for slicing_carrot.mp4...
Sampling video frames...
Sampled 30 frames at ~5 fps.
Extracting features from frames...
>> Extract image feature for frame 1/30
>> Extract image feature for frame 10/30
>> Extract image feature for frame 20/30
>> Extract image feature for frame 30/30
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/30
  Processing frame 10/30
  Processing frame 20/30
  Processing frame 30/30
Aggregating features to target shape...
Total object detections across all sampled frames: 111
Extracting for videos/slicing_carrot.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a woman is cutting a tomato']
>> Captions for slicing_carrot.mp4: ['a woman is cutting a tomato']

--------------------------------------------------

Generating caption for walking_the_dog.mp4...
Sampling video frames...
Sampled 70 frames at ~5 fps.
Reduced to 50 frames for processing.
Extracting features from frames...
>> Extract image feature for frame 1/50
>> Extract image feature for frame 10/50
>> Extract image feature for frame 20/50
>> Extract image feature for frame 30/50
>> Extract image feature for frame 40/50
>> Extract image feature for frame 50/50
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/50
  Processing frame 10/50
  Processing frame 20/50
  Processing frame 30/50
  Processing frame 40/50
  Processing frame 50/50
Aggregating features to target shape...
Total object detections across all sampled frames: 214
Extracting for videos/walking_the_dog.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a man is running']
>> Captions for walking_the_dog.mp4: ['a man is running']

--------------------------------------------------

Generating caption for slicing_cucumber.mp4...
Sampling video frames...
Sampled 35 frames at ~5 fps.
Extracting features from frames...
>> Extract image feature for frame 1/35
>> Extract image feature for frame 10/35
>> Extract image feature for frame 20/35
>> Extract image feature for frame 30/35
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/35
  Processing frame 10/35
  Processing frame 20/35
  Processing frame 30/35
Aggregating features to target shape...
Total object detections across all sampled frames: 74
Extracting for videos/slicing_cucumber.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a woman is slicing a cake']
>> Captions for slicing_cucumber.mp4: ['a woman is slicing a cake']

--------------------------------------------------

Generating caption for pour_water_to_rice.mp4...
Sampling video frames...
Sampled 55 frames at ~5 fps.
Reduced to 50 frames for processing.
Extracting features from frames...
>> Extract image feature for frame 1/50
>> Extract image feature for frame 10/50
>> Extract image feature for frame 20/50
>> Extract image feature for frame 30/50
>> Extract image feature for frame 40/50
>> Extract image feature for frame 50/50
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/50
  Processing frame 10/50
  Processing frame 20/50
  Processing frame 30/50
  Processing frame 40/50
  Processing frame 50/50
Aggregating features to target shape...
Total object detections across all sampled frames: 66
Extracting for videos/pour_water_to_rice.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a man is cooking']
>> Captions for pour_water_to_rice.mp4: ['a man is cooking']

--------------------------------------------------

Generating caption for fry_meet.mp4...
Sampling video frames...
Sampled 65 frames at ~5 fps.
Reduced to 50 frames for processing.
Extracting features from frames...
>> Extract image feature for frame 1/50
>> Extract image feature for frame 10/50
>> Extract image feature for frame 20/50
>> Extract image feature for frame 30/50
>> Extract image feature for frame 40/50
>> Extract image feature for frame 50/50
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/50
  Processing frame 10/50
  Processing frame 20/50
  Processing frame 30/50
  Processing frame 40/50
  Processing frame 50/50
Aggregating features to target shape...
Total object detections across all sampled frames: 84
Extracting for videos/fry_meet.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a person is cooking']
>> Captions for fry_meet.mp4: ['a person is cooking']

--------------------------------------------------

Generating caption for fry_egg.mp4...
Sampling video frames...
Sampled 40 frames at ~5 fps.
Extracting features from frames...
>> Extract image feature for frame 1/40
>> Extract image feature for frame 10/40
>> Extract image feature for frame 20/40
>> Extract image feature for frame 30/40
>> Extract image feature for frame 40/40
Feature extraction complete.
Extracting features from each frame...
  Processing frame 1/40
  Processing frame 10/40
  Processing frame 20/40
  Processing frame 30/40
  Processing frame 40/40
Aggregating features to target shape...
Total object detections across all sampled frames: 74
Extracting for videos/fry_egg.mp4


  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):
  with autocast(enabled=self.mixed_precision):


Left to Right Captions: ['a woman is cooking']
>> Captions for fry_egg.mp4: ['a woman is cooking']

--------------------------------------------------

