In [4]:
!pip install torch torchaudio torchvision tqdm decord soundfile lazy-loader resampy

Defaulting to user installation because normal site-packages is not writeable
Collecting lazy-loader
  Downloading lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)
Downloading lazy_loader-0.4-py3-none-any.whl (12 kB)
Installing collected packages: lazy-loader
Successfully installed lazy-loader-0.4


DEPRECATION: Loading egg at c:\program files\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 25.1 will enforce this behaviour change. A possible replacement is to use pip for package installation. Discussion can be found at https://github.com/pypa/pip/issues/12330


In [1]:
# ================================================================
#   MULTIMODAL TRANSFORMER NOTEBOOK
# ================================================================
# This notebook shows how to:
# 1) Build a vocabulary from a chat file
# 2) Precompute embeddings for video (via ResNet) and audio (via a placeholder or pretrained model)
# 3) Save them in .pt files, one per chat line
# 4) Create a dataset + dataloader
# 5) Train a Transformer-based multimodal model
# 6) Run inference (generate comments) with top-k sampling
# ---------------------------------------------------------------

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# We'll import from our local modules:
from data_utils import (
    build_vocabulary,
    TwitchCommentDataset,
    my_collate_fn
)
from preprocess import preprocess_files
from model import MultiModal
from train import train_one_epoch
from inference import generate_comment

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

CHAT_FILE = "data/v2424877187.irc"  # The .irc file with chat lines
special_tokens = ["<PAD>", "<UNK>", "<SOS>", "<EOS>"]

VIDEO_FILE = "data/2424877187.mkv"
AUDIO_FILE = "data/2424877187.wav"
OUTPUT_DIR = "precomputed_data"
MODEL_PATH = "multimodal_transformer.pth"

# ==============================
# 1) BUILD VOCAB
# ==============================

print("Building vocabulary from:", CHAT_FILE)
word2idx, idx2word = build_vocabulary(
    chat_file=CHAT_FILE,
    min_freq=1,
    max_size=5000,           # Or any limit
    special_tokens=special_tokens
)
vocab_size = len(word2idx)
print("Vocabulary size:", vocab_size)

  torchaudio.set_audio_backend("soundfile")
Using cache found in C:\Users\Alexey/.cache\torch\hub\harritaylor_torchvggish_master


Using device: cuda
Building vocabulary from: data/v2424877187.irc
Vocabulary size: 5004


  from tqdm.autonotebook import tqdm


In [1]:
# ==============================
# 2) PREPROCESS & SAVE EMBEDDINGS
# ==============================


if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR, exist_ok=True)

print("\nPreprocessing and embedding video/audio => storing in", OUTPUT_DIR)
preprocess_files(
    video_path=VIDEO_FILE,
    audio_path=AUDIO_FILE,
    chat_file=CHAT_FILE,
    output_dir=OUTPUT_DIR,
    snippet_duration=10.0,  # 10 seconds before each chat line
    sample_rate=16000,      # For audio
    fps_for_sampling=1.0,                # 1 frame per second for video
    word2idx=word2idx
)
print("Preprocessing complete.")

NameError: name 'os' is not defined

In [2]:
# ==============================
# 3) CREATE DATASET/DATALOADER
# ==============================
print("\nCreating dataset and dataloader ...")

dataset = TwitchCommentDataset(
    cache_dir=OUTPUT_DIR,
    chat_file=CHAT_FILE,
    word2idx=word2idx
)

dataloader = DataLoader(
    dataset,
    batch_size=4,        # adjust as needed
    shuffle=True,
    collate_fn=my_collate_fn
)

print("Dataset size:", len(dataset))
sample_batch = next(iter(dataloader))
print("Sample batch shapes:")
print(" video:", sample_batch['video'].shape)
print(" audio:", sample_batch['audio'].shape)
print(" text: ", sample_batch['text'].shape)

  sample_dict = torch.load(out_path)



Creating dataset and dataloader ...
Dataset size: 13923
Sample batch shapes:
 video: torch.Size([4, 1, 2048])
 audio: torch.Size([4, 1, 128])
 text:  torch.Size([4, 7])


In [3]:
# ==============================
# 4) INIT MODEL
# ==============================
print("\nInitializing Transformer model ...")

# If your preprocess_files used single embeddings => (2048,) for video, (128,) for audio
# Then your video_feature_dim=2048, audio_feature_dim=128
model = MultiModal(
    vocab_size=vocab_size,
    d_model=512,
    video_feature_dim=2048,
    audio_feature_dim=128,
    nhead=8,
    num_encoder_layers=4,
    num_decoder_layers=4,
    dim_feedforward=2048,
    dropout=0.1
).to(device)

# ==============================
# 5) TRAIN LOOP
# ==============================
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx["<PAD>"])

num_epochs = 10
for epoch in range(1, num_epochs+1):
    avg_loss = train_one_epoch(model, dataloader, optimizer, criterion, epoch, device)
    print(f"[Epoch {epoch}] Loss: {avg_loss:.4f}")

# Save final model
torch.save(model.state_dict(), MODEL_PATH)
print("\nModel saved to:", MODEL_PATH)


Initializing Transformer model ...


Epoch 1:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 1] Loss: 10.1997


Epoch 2:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 2] Loss: 4.6357


Epoch 3:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 3] Loss: 2.5302


Epoch 4:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 4] Loss: 1.3396


Epoch 5:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 5] Loss: 0.5157


Epoch 6:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 6] Loss: 0.1770


Epoch 7:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 7] Loss: 0.0950


Epoch 8:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 8] Loss: 0.0607


Epoch 9:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 9] Loss: 0.0321


Epoch 10:   0%|          | 0/3481 [00:00<?, ?it/s]

[Epoch 10] Loss: 0.0353

Model saved to: multimodal_transformer.pth


In [30]:
# ==============================
# 6) INFERENCE
# ==============================
print("\nRunning sample inference ...", end='\n')
rev_vocab = {v: k for k, v in word2idx.items()}

# We'll pick an entry from the dataset
SAMPLE_IDX = 5000  # or any
sample_data = dataset[SAMPLE_IDX]
video_emb = sample_data['video']  # (2048,) or (T, 2048)
audio_emb = sample_data['audio']  # (128,) or (A_time, 128)

orig_text = [rev_vocab.get(tid, "<UNK>") for tid in sample_data['text'].cpu().numpy()]
print("Original comment:", " ".join(orig_text))  # Original text
print(f"Original comment (tokens): {sample_data['text']}", end='\n')

start_tok = word2idx.get("<SOS>", 0)
end_tok   = word2idx.get("<EOS>", 0)

model = MultiModal(
    vocab_size=vocab_size,
    d_model=512,
    video_feature_dim=2048,
    audio_feature_dim=128,
    nhead=8,
    num_encoder_layers=4,
    num_decoder_layers=4,
    dim_feedforward=2048,
    dropout=0.1
).to(device)

model.load_state_dict(torch.load('multimodal_transformer_alpha_1.pth', weights_only=True))

gen_tokens = generate_comment(
    model=model,
    video_tensor=video_emb,
    audio_tensor=audio_emb,
    start_token_idx=start_tok,
    end_token_idx=end_tok,
    max_len=20,
    device=device,
    temperature=0.8,
    top_k=5
)
# Convert token IDs => text
decoded = [rev_vocab.get(tid, "<UNK>") for tid in gen_tokens]
print("Generated comment:", " ".join(decoded))
print(f"Generated comment (tokens): {gen_tokens}")


Running sample inference ...
Original comment: <SOS> ww <EOS>
Original comment (tokens): tensor([ 2, 10,  3])
Generated comment: @mer01337 взаимно ало? с аганом ))) делал смотрят <EOS>
Generated comment (tokens): [108, 4942, 4099, 30, 1920, 164, 1403, 1174, 3]
