In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

!pip install transformers

import warnings
warnings.filterwarnings('ignore')

Mounted at /content/drive
Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m36.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m71.6 MB/s[0

In [None]:
import os
import pandas as pd

# Paths
gif_dir = '/content/drive/MyDrive/Video-to-Text/gifs'
tsv_path = '/content/drive/MyDrive/Video-to-Text/tgif-v1.0-updated.tsv'
output_path = '/content/drive/MyDrive/Video-to-Text/cleaned_tgif.tsv'

# Get list of all valid gif basenames.
# os.listdir already yields bare file names, so no basename() call is needed;
# sorting makes the result independent of directory listing order.
valid_basenames = sorted(
    name for name in os.listdir(gif_dir) if name.lower().endswith('.gif')
)

# Persist the list, one name per line: the filtering cell reads
# 'valid_basenames.txt' from the working directory, and nothing else in the
# notebook creates that file.
with open('valid_basenames.txt', 'w') as f:
    f.writelines(f"{name}\n" for name in valid_basenames)

In [None]:
import pandas as pd
import os

# Read in original tgif data: tab-separated, no header row, two columns.
original_df = pd.read_csv('/content/drive/MyDrive/Video-to-Text/tgif-v1.0-updated.tsv',
                          sep='\t', names=['column_A', 'column_B'])

# Deduplicate column A (the first row for each distinct value is kept)
deduped_df = original_df.drop_duplicates(subset=['column_A'])
print(f"Total rows after deduplicating: {len(deduped_df)}")

# Load and deduplicate valid basenames.
# Blank lines are skipped so an empty string never ends up in the set.
with open('valid_basenames.txt') as f:
    valid_basenames = {line.strip() for line in f if line.strip()}

print(f"Unique valid basenames: {len(valid_basenames)}")
# Sanity check against the expected number of GIF files on disk.
assert len(valid_basenames) == 100669, "Invalid basenames length"

# Extract basenames from column A (last path component, ignoring a trailing '/').
# .assign builds a new frame instead of writing a column into the result of
# drop_duplicates, which is the chained-assignment pattern pandas warns about.
deduped_df = deduped_df.assign(
    basenames=deduped_df['column_A'].apply(lambda x: os.path.basename(x.rstrip('/')))
)

# Filter rows where basename is in valid_basenames
filtered_df = deduped_df[deduped_df['basenames'].isin(valid_basenames)]

print(f"Rows after filtering: {len(filtered_df)}")
print(filtered_df.head()) # print some rows

# Keep only desired columns
filtered_df = filtered_df[['column_A', 'column_B']]

# Write filtered DataFrame to Google Drive (tab-separated, with a header row)
output_path = '/content/drive/MyDrive/Video-to-Text/filtered.tsv'
filtered_df.to_csv(output_path, sep='\t', index=False)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
from PIL import Image, ImageSequence
import pandas as pd
from tqdm import tqdm
import torchvision.transforms as T
from transformers import BertTokenizerFast
import random
from torch.utils.data.dataloader import default_collate
import gc
import os
import logging

# Logging: configure the root handler at INFO, then take this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Run configuration
CACHE_DATASET = False
NUM_WORKERS = 4
BATCH_SIZE = 64
SAVE_DIR = "/content/drive/MyDrive/Video-to-Text/processed_data"
os.makedirs(SAVE_DIR, exist_ok=True)  # no-op when the directory already exists

# Frame transforms: resize PIL frames to 128x128, then convert to a normalized tensor.
# The mean/std values look like the usual ImageNet channel statistics.
resize_tfm = T.Resize(size=(128, 128), interpolation=T.InterpolationMode.LANCZOS)
tensor_tfm = T.Compose([
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Augmentation: mirror left-right with probability 0.5
augmentation_tfm = T.RandomHorizontalFlip(p=0.5)

# Tokenizer used to encode the descriptions
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class GIFDataset(Dataset):
    """Pairs the sampled frames of each GIF with its tokenized description.

    Rows are read from ``csv_file`` with ``pd.read_csv``. Columns are accessed by
    position: column 1 is the GIF file name (resolved against ``gif_dir``) and
    column 2 is the description text.

    Each item is a dict with:
        frames: tensor of shape (n_frames, 3, 128, 128) with
            ``n_frames <= max_frames``, kept in the GIF's original frame order.
        input_ids, attention_mask: 1-D tensors produced by the module-level
            ``tokenizer`` (padded to the tokenizer's maximum length).
        error: True when the GIF could not be decoded and all-zero placeholder
            frames were substituted.

    Args:
        csv_file: path of the table to load.
        gif_dir: directory containing the GIF files.
        max_frames: upper bound on the number of frames kept per GIF.
    """

    def __init__(self, csv_file, gif_dir, max_frames=16):
        self.csv_data = pd.read_csv(csv_file)
        self.gif_dir = Path(gif_dir)
        self.max_frames = max_frames

    def _sample_frame_indices(self, n_frames):
        """Pick up to ``max_frames`` distinct frame indices, in ascending order."""
        k = min(self.max_frames, n_frames)
        # Sorting keeps the sampled frames in temporal order; random.sample
        # alone returns them in arbitrary order.
        return sorted(random.sample(range(n_frames), k=k))

    def _load_frames(self, gif_path):
        """Decode ``gif_path`` and return the sampled frames as normalized tensors.

        Raises:
            ValueError: if the file decodes to zero frames.
            Exception: whatever PIL raises for unreadable or corrupt files.
        """
        # The context manager closes the underlying file handle; the converted
        # RGB frames are independent copies and remain valid afterwards.
        with Image.open(gif_path) as gif:
            rgb_frames = [f.convert('RGB') for f in ImageSequence.Iterator(gif)]

        if not rgb_frames:
            raise ValueError("GIF contains no frames")

        # Sample first, then resize: only the kept frames pay for the resize.
        keep = self._sample_frame_indices(len(rgb_frames))
        return [tensor_tfm(resize_tfm(rgb_frames[i])) for i in keep]

    def __getitem__(self, idx):
        basename = self.csv_data.iloc[idx, 1]
        gif_path = self.gif_dir / basename
        failed = False

        try:
            frames = self._load_frames(gif_path)
        except Exception as e:
            logger.warning(f"Error processing {gif_path}: {e}")
            frames = [torch.zeros(3, 128, 128) for _ in range(self.max_frames)]  # Match the tensor dimensions
            failed = True

        description = self.csv_data.iloc[idx, 2]
        encoding = tokenizer(description, truncation=True, padding='max_length', return_tensors='pt')

        sample = {
            'frames': torch.stack(frames),
            # squeeze(0) removes only the batch dimension added by return_tensors='pt'
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            # Tracked explicitly: the placeholder list is never empty, so a
            # length check on `frames` could not detect a faulty GIF.
            'error': failed
        }

        return sample

    def __len__(self):
        return len(self.csv_data)

    def __repr__(self):
        return f"GIFDataset(len={len(self)})"

def collate_fn(batch):
    """Collate samples whose 'frames' tensors differ in length along dim 0.

    ``None`` entries are discarded. Every remaining sample's 'frames' tensor is
    extended with zeros (same dtype, same trailing dimensions) up to the longest
    clip in the batch, then the samples are handed to ``default_collate``.
    The sample dicts are updated in place.
    """
    samples = [item for item in batch if item is not None]
    target_len = max(item['frames'].size(0) for item in samples)

    for item in samples:
        clip = item['frames']
        missing = target_len - clip.size(0)
        if missing <= 0:
            continue  # already the longest clip in this batch
        filler = torch.zeros(missing, *clip.shape[1:], dtype=clip.dtype)
        item['frames'] = torch.cat((clip, filler), dim=0)

    return default_collate(samples)

# Dataset and DataLoader
# NOTE(review): the filtering cell writes 'filtered.tsv' (tab-separated), while
# this loads 'filtered.csv' with the default separator — confirm which file is intended.
dataset = GIFDataset('/content/drive/MyDrive/Video-to-Text/filtered.csv', '/content/drive/MyDrive/Video-to-Text/gifs')
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, collate_fn=collate_fn, pin_memory=True)

# Preprocessing loop: transform each batch and save it under SAVE_DIR as batch_<i>.pt.
# Indices of batches that could not be processed or saved are collected so the
# final message reflects what actually happened instead of always claiming success.
failed_batches = []

for i, batch in enumerate(tqdm(dataloader)):
    try:
        # Additional Processing Step
        # Feature Extraction - Taking mean of RGB channels
        # (dim 2 is the channel axis of the collated (batch, frame, channel, H, W) tensor)
        frames = torch.mean(batch['frames'], dim=2)

        # Data Augmentation - Random Horizontal Flip
        # One flip decision per clip: the transform flips the last dimension of
        # the whole (frame, H, W) tensor, so every frame of a clip is mirrored
        # the same way rather than each frame being flipped independently.
        for j in range(frames.size(0)):
            frames[j] = augmentation_tfm(frames[j])
        batch['frames'] = frames

        # Save batch to Google Drive
        save_path = os.path.join(SAVE_DIR, f"batch_{i}.pt")
        torch.save(batch, save_path)

        if i % 100 == 0:
            gc.collect()

    except Exception as e:
        logger.error(f"Error processing batch {i}: {e}")
        failed_batches.append(i)

if failed_batches:
    print(f"Processing finished with {len(failed_batches)} failed batch(es): {failed_batches}")
else:
    print("Processing complete! Saved to Google Drive.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


ERROR:__main__:Error processing batch 1207: Parent directory /content/drive/MyDrive/Video-to-Text/processed_data does not exist.
ERROR:__main__:Error processing batch 1208: Parent directory /content/drive/MyDrive/Video-to-Text/processed_data does not exist.
100%|██████████| 1573/1573 [4:49:09<00:00, 11.03s/it]

Processing complete! Saved to Google Drive.





In [None]:
import torch.nn as nn
from torchvision import models

class EncoderCNN(nn.Module):
    """Image encoder that wraps a pretrained torchvision ResNet-50.

    ``forward`` passes ``images`` straight through the unmodified network and
    returns its output.
    """

    def __init__(self):
        super().__init__()
        # `pretrained=True` is deprecated in recent torchvision releases in
        # favour of the `weights` enum; IMAGENET1K_V1 is the checkpoint that
        # `pretrained=True` selects. Fall back to the legacy flag when the enum
        # is not available (older torchvision).
        weights_enum = getattr(models, "ResNet50_Weights", None)
        if weights_enum is not None:
            self.resnet = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.resnet = models.resnet50(pretrained=True)

    def forward(self, images):
        return self.resnet(images)

class DecoderRNN(nn.Module):
    """Single-layer, batch-first LSTM followed by a linear projection.

    Args:
        embed_size: size of each input feature vector fed to the LSTM.
        hidden_size: size of the LSTM hidden state.
        vocab_size: size of the output of the final linear layer.
    """

    def __init__(self, embed_size, hidden_size, vocab_size):
        super().__init__()
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers=1, batch_first=True)
        self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, features):
        """Run ``features`` through the LSTM and project every step's output."""
        # The LSTM returns (outputs, (h_n, c_n)); only the per-step outputs are used.
        hidden_states = self.lstm(features)[0]
        return self.linear(hidden_states)

# Build the model from the encoder and a decoder with
# embed_size=300, hidden_size=256, vocab_size=10000.
# NOTE(review): `EncoderDecoder` is not defined in any cell visible here; unless
# it is defined elsewhere this line raises NameError — confirm.
model = EncoderDecoder(encoder=EncoderCNN(), decoder=DecoderRNN(300, 256, 10000))

# Training: Adam with default hyperparameters, cross-entropy loss,
# iterating over `dataloader` for a fixed number of epochs.

num_epochs = 10
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):

  for i, batch in enumerate(dataloader):

    # Unpack the collated batch produced by the DataLoader.
    frames = batch['frames']
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']

    outputs = model(frames, input_ids, attention_mask)
    # NOTE(review): `labels` is never assigned in the visible code; presumably
    # the targets should be derived from `input_ids` — confirm.
    loss = criterion(outputs, labels)

    # Standard optimisation step: clear old gradients, backpropagate, update.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # Reports the loss of the last batch of the epoch only, not an epoch average.
  print(f'Epoch {epoch+1} | Loss {loss.item():.4f}')

print('Training complete!')