In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by
# path from this runtime (force_remount=True is passed on every run).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Notebook shell escape (not plain Python): install the `transformers`
# package that provides the tokenizer imported in a later cell.
!pip install transformers

# Suppress all Python warnings for the remainder of the session.
import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive
Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.7/311.7 kB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.4 MB/s[0

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import os
import logging
from transformers import BertTokenizerFast
import torch.nn.functional as F

# Logger setup: a module-level logger, with the root logger configured to
# emit records at INFO level and above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Tokenizer: the pretrained 'bert-base-uncased' fast tokenizer. It is used as
# a module-level global by VideoDataset and by the training code further down
# (vocab_size for the decoder, pad_token_id for padding and masking).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# 3D CNN Encoder
class EncoderCNN3D(nn.Module):
    """3D-convolutional encoder that maps a video tensor to a feature vector.

    The network is two ``Conv3d`` + ReLU stages (kernel 3, stride 1,
    padding 1, so spatial/temporal sizes are preserved), each followed by a
    max-pool with kernel/stride ``(1, 2, 2)`` that halves the last two axes
    and leaves the third-from-last axis untouched, then a flatten and one
    linear projection.

    Args:
        channel_size: Number of input channels expected by the first
            convolution.
        output_feature_size: Length of the feature vector produced for each
            sample.
        flattened_size: Number of values per sample after the second pooling
            stage, i.e. ``128 * D * (H // 4) * (W // 4)`` for an input of
            shape ``(N, channel_size, D, H, W)``. Defaults to ``2097152``,
            the value that used to be hard-coded, so existing callers are
            unaffected; pass a different value to use other input sizes.
    """

    def __init__(self, channel_size=60, output_feature_size=512, flattened_size=2097152):
        super().__init__()
        self.conv1 = nn.Conv3d(channel_size, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        # The same pooling module is reused after both convolutions; it has
        # no parameters, so sharing it is safe.
        self.pool = nn.MaxPool3d(kernel_size=(1, 2, 2), stride=(1, 2, 2))
        self.conv2 = nn.Conv3d(64, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1))
        self.flatten = nn.Flatten()
        # in_features must equal the flattened size of the pooled conv2
        # output; a mismatch surfaces as a shape error in forward().
        self.fc = nn.Linear(flattened_size, output_feature_size)

    def forward(self, x):
        """Encode ``x`` and return a ``(N, output_feature_size)`` tensor."""
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

# DecoderRNN
class DecoderRNN(nn.Module):
    """Single-layer LSTM caption decoder conditioned on a feature vector.

    At every time step the LSTM input is the per-sample feature vector
    concatenated with the embedding of that step's caption token, and the
    LSTM hidden state is projected to vocabulary-sized scores.

    Args:
        embed_size: Dimension of the token embeddings.
        hidden_size: Dimension of the LSTM hidden state.
        vocab_size: Number of token ids (embedding rows and output scores).
        feature_size: Dimension of the conditioning feature vector.
    """

    def __init__(self, embed_size, hidden_size, vocab_size, feature_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # Features and token embeddings are concatenated on the last axis,
        # so the LSTM sees both widths added together.
        lstm_input_size = embed_size + feature_size
        self.lstm = nn.LSTM(
            input_size=lstm_input_size,
            hidden_size=hidden_size,
            num_layers=1,
            batch_first=True,
        )
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, features, captions):
        """Return scores indexed as (batch, step, vocab) for ``captions``.

        ``features`` is expected to be (batch, feature_size) and ``captions``
        a (batch, steps) tensor of token ids.
        """
        batch_size = captions.size(0)
        num_steps = captions.size(1)

        token_vectors = self.embedding(captions).view(batch_size, num_steps, -1)
        # Copy the feature vector once per time step.
        tiled_features = features.unsqueeze(1).repeat(1, num_steps, 1)

        lstm_input = torch.cat((tiled_features, token_vectors), dim=2)
        hidden_states, _ = self.lstm(lstm_input)
        return self.linear(hidden_states)

# EncoderDecoderModel
class EncoderDecoderModel(nn.Module):
    """Chain an encoder and a decoder into one module.

    Args:
        encoder: Called as ``encoder(frames)`` to produce features.
        decoder: Called as ``decoder(features, captions)``.
        embed_size: Accepted for signature compatibility; this class does
            not use it.
    """

    def __init__(self, encoder, decoder, embed_size):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, frames, captions):
        """Encode ``frames`` and decode against ``captions``.

        The last position along axis 1 of ``captions`` is dropped before the
        decoder sees it, so the decoder receives one step fewer than was
        passed in.
        """
        encoded = self.encoder(frames)
        return self.decoder(encoded, captions[:, :-1])

# Dataset
class VideoDataset(Dataset):
    """Dataset pairing saved frame tensors with tokenized descriptions.

    The CSV is read once at construction. For each row, the first column is
    a file name resolved against ``data_dir`` and loaded with ``torch.load``;
    the third column is the description, tokenized with the module-level
    ``tokenizer`` (truncated to at most 10 tokens).
    """

    def __init__(self, data_dir, csv_file):
        self.data_dir = data_dir
        self.data = pd.read_csv(csv_file)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(frames, captions)`` for row ``idx`` of the CSV."""
        file_name = self.data.iloc[idx, 0]
        frames = torch.load(os.path.join(self.data_dir, file_name))

        text = self.data.iloc[idx, 2]
        tokenized = tokenizer(text, max_length=10, truncation=True, return_tensors='pt')
        # squeeze(0) removes the leading axis when it has size 1, so the
        # caption ids should come out as a 1D tensor.
        captions = tokenized['input_ids'].squeeze(0)

        return frames, captions

class TestDataset(Dataset):
    """One-item dataset backed by a single file loaded with ``torch.load``.

    The loaded object is read through the keys ``'frames'`` and
    ``'input_ids'``; the file is re-loaded on every ``__getitem__`` call.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def __len__(self):
        # Only one item in this dataset
        return 1

    def __getitem__(self, idx):
        """Return ``(frames, captions)``, or ``None`` if loading fails.

        NOTE(review): a ``None`` item is logged but not raised; whatever
        consumes this dataset has to cope with it — confirm that is intended.
        """
        try:
            payload = torch.load(self.file_path)
        except Exception as e:
            logger.error(f"Error loading {self.file_path}: {e}")
            return None

        # Frames are returned whole; only the tokenized description is
        # selected by idx.
        return payload['frames'], payload['input_ids'][idx]

# Model: the encoder is built with its default arguments; the decoder's
# feature_size (512) matches the encoder's default output_feature_size.
encoder = EncoderCNN3D()
decoder = DecoderRNN(embed_size=256, hidden_size=512, vocab_size=tokenizer.vocab_size, feature_size=512)
model = EncoderDecoderModel(encoder, decoder, embed_size=256)

# Optimizer: Adam over every parameter of the combined encoder/decoder model.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# DataLoader: wraps a single pre-processed file, one item per batch.
test_dataset = TestDataset('/content/drive/MyDrive/Video-to-Text/processed_data/batch_1572.pt')
dataloader = DataLoader(test_dataset, batch_size=1)

# NLLLoss
# NOTE(review): `criterion` is not referenced anywhere else in the visible
# code (the training loop calls sequence_loss() instead) — confirm whether it
# is still needed.
criterion = nn.NLLLoss(ignore_index=tokenizer.pad_token_id, reduction='none')

# Define a consistent sequence length for captions; the training loop pads
# each caption batch to this many tokens.
max_len = 20

# Custom sequence loss
def sequence_loss(outputs, targets, mask):
    """Return the mask-weighted mean negative log-likelihood of ``targets``.

    Args:
        outputs: Unnormalised scores indexed as [batch_size, seq_len, vocab];
            a log-softmax is taken over the third axis.
        targets: Class-label indices of shape [batch_size, seq_len].
        mask: Per-position weights of shape [batch_size, seq_len]; positions
            with weight 0 do not contribute to the loss.

    Returns:
        A scalar tensor: the negated sum of the weighted target
        log-probabilities divided by ``mask.sum()``. When the mask sums to
        zero the result is 0 rather than NaN, so a fully padded batch cannot
        poison the optimizer step.

    Raises:
        ValueError: If ``targets`` is not 2-dimensional.
    """
    log_probs = F.log_softmax(outputs, dim=2)

    # targets should be: [batch_size, seq_len] with each element being a
    # class label index
    if targets.dim() != 2:
        raise ValueError(f"targets tensor has incorrect number of dimensions: {targets.dim()}")

    # Pick the log-probability of the target class at every position:
    # [batch_size, seq_len, 1] index -> [batch_size, seq_len] result.
    target_log_probs = log_probs.gather(2, targets.unsqueeze(-1)).squeeze(-1)

    # Out-of-place multiply: avoids mutating an autograd intermediate.
    weighted = target_log_probs * mask

    # Guard the normaliser only when it is exactly zero; any non-zero total
    # weight is used unchanged.
    total_weight = mask.sum()
    safe_weight = torch.where(total_weight != 0, total_weight, torch.ones_like(total_weight))

    return -weighted.sum() / safe_weight

# Training loop: one pass over the dataloader with teacher forcing.
for epoch in range(1):
    for frames, captions in dataloader:
        # Pad captions to a consistent length
        padded_captions = F.pad(captions, (0, max_len - captions.shape[1]), value=tokenizer.pad_token_id)

        # Next-token targets: the output at step t is scored against token
        # t + 1, so the targets are all tokens except the first.
        targets = padded_captions[:, 1:]

        # Forward pass. EncoderDecoderModel.forward already drops the final
        # caption token (captions[:, :-1]) before decoding, so the full
        # padded sequence is passed here. Slicing in the loop as well would
        # shorten the outputs by an extra step and leave the last target
        # position untrained.
        outputs = model(frames, padded_captions)

        # Defensive alignment in case the model returns fewer steps than
        # there are targets.
        if targets.shape[1] > outputs.shape[1]:
            targets = targets[:, :outputs.shape[1]]

        # Mask on the targets, not the inputs: a step whose target is
        # padding must not contribute, otherwise the model is trained to
        # emit the pad token after the last real token.
        mask = (targets != tokenizer.pad_token_id).float()

        # Calculate loss
        loss = sequence_loss(outputs, targets, mask)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch}, Loss: {loss.item()}")

(…)cased/resolve/main/tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

(…)bert-base-uncased/resolve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)base-uncased/resolve/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)rt-base-uncased/resolve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Epoch 0, Loss: 10.341843605041504
