In [1]:
%cd /home/hice1/mdoutre3/CS7643_Project_1


/home/hice1/mdoutre3/CS7643_Project_1


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import numpy as np
from pathlib import Path
from glob import glob
import math
import json

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")

Running on: cuda


In [32]:
class PositionalEncoding(nn.Module):
    """Positional encoding for temporal sequences."""
    def __init__(self, d_model, max_len=600, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                            (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
    
    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


# =============================================================================
# INDIVIDUAL MODALITY MODELS
# =============================================================================
class VideoOnlyModel(nn.Module):
    """Video-only baseline using transformer."""
    def __init__(self, input_dim=768, d_model=256, nhead=4, num_layers=2,
                 num_classes=16, dropout=0.1, max_seq_len=100):
        super().__init__()
        self.input_proj = nn.Linear(input_dim, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len, dropout=dropout)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_model,
            dropout=dropout, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes)
        )
    
    def forward(self, video_x, video_mask, text_emb, audio_emb):
        x = self.input_proj(video_x)
        x = self.pos_encoder(x)
        x = self.encoder(x, src_key_padding_mask=~video_mask)
        x = x.mean(dim=1)  # Average pooling
        return self.classifier(x)


class TextOnlyModel(nn.Module):
    """Text-only baseline."""
    def __init__(self, input_dim=768, hidden_dim=256, num_classes=16, dropout=0.1):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, num_classes)
        )
    
    def forward(self, video_x, video_mask, text_emb, audio_emb):
        return self.classifier(text_emb)


class AudioOnlyModel(nn.Module):
    """Audio-only baseline."""
    def __init__(self, input_dim=1024, hidden_dim=256, num_classes=16, dropout=0.1):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, num_classes)
        )
    
    def forward(self, video_x, video_mask, text_emb, audio_emb):
        return self.classifier(audio_emb)


# =============================================================================
# BIMODAL FUSION MODELS
# =============================================================================
class VideoTextFusion(nn.Module):
    """Video + Text fusion."""
    def __init__(self, video_dim=768, text_dim=768, d_model=256, nhead=4,
                 num_layers=2, num_classes=16, dropout=0.1, max_seq_len=100):
        super().__init__()
        
        self.video_proj = nn.Sequential(nn.Linear(video_dim, d_model), nn.Dropout(dropout))
        self.text_proj = nn.Sequential(nn.Linear(text_dim, d_model), nn.Dropout(dropout))
        
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len + 1, dropout=dropout)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_model,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )
    
    def forward(self, video_x, video_mask, text_emb, audio_emb):
        B = video_x.size(0)
        video_x = self.video_proj(video_x)
        text_x = self.text_proj(text_emb).unsqueeze(1)
        
        x = torch.cat([text_x, video_x], dim=1)
        text_mask = torch.ones(B, 1, dtype=torch.bool, device=video_mask.device)
        full_mask = torch.cat([text_mask, video_mask], dim=1)
        
        x = self.pos_encoder(x)
        x = self.encoder(x, src_key_padding_mask=~full_mask)
        
        return self.classifier(x[:, 0])


class VideoAudioFusion(nn.Module):
    """Video + Audio fusion."""
    def __init__(self, video_dim=768, audio_dim=1024, d_model=256, nhead=4,
                 num_layers=2, num_classes=16, dropout=0.1, max_seq_len=100):
        super().__init__()
        
        self.video_proj = nn.Sequential(nn.Linear(video_dim, d_model), nn.Dropout(dropout))
        self.audio_proj = nn.Sequential(nn.Linear(audio_dim, d_model), nn.Dropout(dropout))
        
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len + 1, dropout=dropout)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_model,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )
    
    def forward(self, video_x, video_mask, text_emb, audio_emb):
        B = video_x.size(0)
        video_x = self.video_proj(video_x)
        audio_x = self.audio_proj(audio_emb).unsqueeze(1)
        
        x = torch.cat([audio_x, video_x], dim=1)
        audio_mask = torch.ones(B, 1, dtype=torch.bool, device=video_mask.device)
        full_mask = torch.cat([audio_mask, video_mask], dim=1)
        
        x = self.pos_encoder(x)
        x = self.encoder(x, src_key_padding_mask=~full_mask)
        
        return self.classifier(x[:, 0])


class TextAudioFusion(nn.Module):
    """Text + Audio fusion."""
    def __init__(self, text_dim=768, audio_dim=1024, hidden_dim=256,
                 num_classes=16, dropout=0.1):
        super().__init__()
        
        self.text_proj = nn.Linear(text_dim, hidden_dim)
        self.audio_proj = nn.Linear(audio_dim, hidden_dim)
        
        self.fusion = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, num_classes)
        )
    
    def forward(self, video_x, video_mask, text_emb, audio_emb):
        text_x = self.text_proj(text_emb)
        audio_x = self.audio_proj(audio_emb)
        x = torch.cat([text_x, audio_x], dim=1)
        return self.fusion(x)


# =============================================================================
# TRIMODAL FUSION MODEL
# =============================================================================
class TrimodalFusion(nn.Module):
    """Video + Text + Audio fusion."""
    def __init__(self, video_dim=768, text_dim=768, audio_dim=1024,
                 d_model=256, nhead=4, num_layers=2, num_classes=16,
                 dropout=0.1, max_seq_len=100):
        super().__init__()
        
        self.video_proj = nn.Sequential(nn.Linear(video_dim, d_model), nn.Dropout(dropout))
        self.text_proj = nn.Sequential(nn.Linear(text_dim, d_model), nn.Dropout(dropout))
        self.audio_proj = nn.Sequential(nn.Linear(audio_dim, d_model), nn.Dropout(dropout))
        
        self.pos_encoder = PositionalEncoding(d_model, max_seq_len + 2, dropout=dropout)
        
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=d_model,
            dropout=dropout, batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )
    
    def forward(self, video_x, video_mask, text_emb, audio_emb):
        B = video_x.size(0)
        
        video_x = self.video_proj(video_x)
        text_x = self.text_proj(text_emb).unsqueeze(1)
        audio_x = self.audio_proj(audio_emb).unsqueeze(1)
        
        x = torch.cat([text_x, audio_x, video_x], dim=1)
        
        special_mask = torch.ones(B, 2, dtype=torch.bool, device=video_mask.device)
        full_mask = torch.cat([special_mask, video_mask], dim=1)
        
        x = self.pos_encoder(x)
        x = self.encoder(x, src_key_padding_mask=~full_mask)
        
        return self.classifier(x[:, 0])


# =============================================================================
# DATASET
# =============================================================================
class SoccerTrimodalDataset(Dataset):
    """Dataset for video + text + audio fusion."""
    def __init__(self, video_paths, text_paths, audio_paths, labels,
                 max_seq_len=100, augment=False):
        self.video_paths = video_paths
        self.text_paths = text_paths
        self.audio_paths = audio_paths
        self.labels = labels
        self.max_seq_len = max_seq_len
        self.augment = augment
    
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, idx):
        # Load video features (768-dim)
        video = np.load(self.video_paths[idx])
        video = torch.from_numpy(video).float()
        T = video.shape[0]
        
        # Pad or truncate video
        if T > self.max_seq_len:
            video = video[:self.max_seq_len]
            mask = torch.ones(self.max_seq_len, dtype=torch.bool)
        else:
            pad = torch.zeros(self.max_seq_len - T, 768)
            video = torch.cat([video, pad], dim=0)
            mask = torch.cat([
                torch.ones(T, dtype=torch.bool),
                torch.zeros(self.max_seq_len - T, dtype=torch.bool)
            ])
        
        # Load text embedding (768-dim)
        text_data = torch.load(self.text_paths[idx])
        text_emb = text_data["embedding"]
        if text_emb.dim() > 1:
            text_emb = text_emb.squeeze(0)
        
        # Load audio embedding (1024-dim)
        audio_emb = torch.load(self.audio_paths[idx])
        if audio_emb.dim() > 1:
            audio_emb = audio_emb.squeeze(0)
        
        return video, mask, text_emb, audio_emb, self.labels[idx]


# =============================================================================
# DATA PREPARATION
# =============================================================================
def match_video_text_pairs(video_paths, text_paths):
    """Match video files with corresponding text embeddings."""
    def get_base_key(video_path):
        stem = Path(video_path).stem
        parts = stem.split("_")
        return "_".join(parts[:-1])
    
    matched_pairs = []
    for v in video_paths:
        base = get_base_key(v)
        matches = [t for t in text_paths if Path(t).stem.startswith(base)]
        if matches:
            matched_pairs.append((v, matches[0]))
    
    return matched_pairs


def extract_event_label(video_path):
    """Extract event type from video filename."""
    stem = Path(video_path).stem
    return stem.split("_")[-1]


def prepare_dataset(video_dir="fusion/embeddings 2",
                   text_dir="fusion/text_embeddings_events",
                   audio_dir="fusion/audio_embeddings_events",
                   max_seq_len=100):
    """Prepare matched dataset with labels."""
    
    video_paths = sorted(glob(f"{video_dir}/*.npy"))
    text_paths = sorted(glob(f"{text_dir}/*.pt"))
    audio_paths = sorted(glob(f"{audio_dir}/*.pt"))
    
    print(f"Found {len(video_paths)} video files")
    print(f"Found {len(text_paths)} text files")
    print(f"Found {len(audio_paths)} audio files")
    
    # Match pairs
    if Path("fusion/matched_pairs.json").exists():
        with open("fusion/matched_pairs.json", "r") as f:
            matched_pairs = [tuple(x) for x in json.load(f)]
    else:
        matched_pairs = match_video_text_pairs(video_paths, text_paths)
        with open("fusion/matched_pairs.json", "w") as f:
            json.dump([(v, t) for v, t in matched_pairs], f)
    
    video_paths_matched = [v for v, t in matched_pairs]
    text_paths_matched = [t for v, t in matched_pairs]
    
    # Match audio
    def get_base_key(video_path):
        stem = Path(video_path).stem
        parts = stem.split("_")
        return "_".join(parts[:-1])
    
    audio_paths_matched = []
    valid_indices = []
    for idx, v in enumerate(video_paths_matched):
        base = get_base_key(v)
        audio_matches = [a for a in audio_paths if Path(a).stem.startswith(base)]
        if audio_matches:
            audio_paths_matched.append(audio_matches[0])
            valid_indices.append(idx)
    
    video_paths_matched = [video_paths_matched[i] for i in valid_indices]
    text_paths_matched = [text_paths_matched[i] for i in valid_indices]
    
    event_types = [extract_event_label(v) for v in video_paths_matched]
    
    unique_events = sorted(set(event_types))
    event_to_idx = {ev: i for i, ev in enumerate(unique_events)}
    labels = [event_to_idx[ev] for ev in event_types]
    
    print(f"\nMatched {len(video_paths_matched)} complete triplets")
    print(f"Event classes: {len(unique_events)}")
    
    return (video_paths_matched, text_paths_matched, audio_paths_matched,
            labels, event_to_idx, max_seq_len)


# =============================================================================
# TRAINING FUNCTIONS
# =============================================================================
def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    
    for video, mask, text_emb, audio_emb, y in loader:
        video = video.to(device)
        mask = mask.to(device)
        text_emb = text_emb.to(device)
        audio_emb = audio_emb.to(device)
        y = y.to(device)
        
        optimizer.zero_grad()
        logits = model(video, mask, text_emb, audio_emb)
        loss = criterion(logits, y)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
        optimizer.step()
        
        total_loss += loss.item()
        correct += (logits.argmax(1) == y).sum().item()
        total += y.size(0)
    
    return total_loss / len(loader), correct / total


def validate(model, loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for video, mask, text_emb, audio_emb, y in loader:
            video = video.to(device)
            mask = mask.to(device)
            text_emb = text_emb.to(device)
            audio_emb = audio_emb.to(device)
            y = y.to(device)
            
            logits = model(video, mask, text_emb, audio_emb)
            loss = criterion(logits, y)
            
            total_loss += loss.item()
            correct += (logits.argmax(1) == y).sum().item()
            total += y.size(0)
    
    return total_loss / len(loader), correct / total


def train_model(model, train_loader, val_loader, epochs=30, lr=1e-3,
                patience=15, device='cuda', model_name='model'):
    model.to(device)
    
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=0.05)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=30, eta_min=1e-6
    )
    
    best_val_acc = 0
    patience_counter = 0
    
    for epoch in range(epochs):
        train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, device)
        val_loss, val_acc = validate(model, val_loader, criterion, device)
        
        scheduler.step()
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), f'best_{model_name}.pt')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break
    
    model.load_state_dict(torch.load(f'best_{model_name}.pt'))
    _, final_acc = validate(model, val_loader, criterion, device)
    return final_acc


# =============================================================================
# MAIN EXECUTION
# =============================================================================


In [33]:
if __name__ == "__main__":
    # Prepare data
    (video_paths, text_paths, audio_paths, labels,
     event_to_idx, max_seq_len) = prepare_dataset(max_seq_len=100)
    
    # Split indices
    indices = list(range(len(labels)))
    np.random.seed(42)
    np.random.shuffle(indices)
    train_size = int(0.8 * len(indices))
    train_indices = indices[:train_size]
    val_indices = indices[train_size:]
    
    # Create datasets
    train_dataset = SoccerTrimodalDataset(
        video_paths=[video_paths[i] for i in train_indices],
        text_paths=[text_paths[i] for i in train_indices],
        audio_paths=[audio_paths[i] for i in train_indices],
        labels=[labels[i] for i in train_indices],
        max_seq_len=max_seq_len
    )
    
    val_dataset = SoccerTrimodalDataset(
        video_paths=[video_paths[i] for i in val_indices],
        text_paths=[text_paths[i] for i in val_indices],
        audio_paths=[audio_paths[i] for i in val_indices],
        labels=[labels[i] for i in val_indices],
        max_seq_len=max_seq_len
    )
    
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                             num_workers=2, pin_memory=True, drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,
                           num_workers=2, pin_memory=True)
    
    print(f"\nTrain: {len(train_dataset)} samples")
    print(f"Val: {len(val_dataset)} samples")
    
    # Define all experiments
    experiments = [
        ("Video Only (V)", VideoOnlyModel(num_classes=len(event_to_idx), dropout=0.1)),
        ("Text Only (T)", TextOnlyModel(num_classes=len(event_to_idx), dropout=0.1)),
        ("Audio Only (A)", AudioOnlyModel(num_classes=len(event_to_idx), dropout=0.1)),
        ("Video + Text (VT)", VideoTextFusion(num_classes=len(event_to_idx), dropout=0.1)),
        ("Video + Audio (VA)", VideoAudioFusion(num_classes=len(event_to_idx), dropout=0.1)),
        ("Text + Audio (TA)", TextAudioFusion(num_classes=len(event_to_idx), dropout=0.1)),
        ("Video + Text + Audio (VTA)", TrimodalFusion(num_classes=len(event_to_idx), dropout=0.1)),
    ]
    
    results = {}
    
    print("\n" + "="*70)
    print("COMPLETE MULTIMODAL FUSION COMPARISON")
    print("="*70)
    
    for name, model in experiments:
        print(f"\n[Training {name}]")
        num_params = sum(p.numel() for p in model.parameters())
        print(f"Parameters: {num_params:,}")
        
        model_key = name.split('(')[1].strip(')')
        val_acc = train_model(
            model, train_loader, val_loader,
            epochs=30, lr=1e-3, patience=15,
            device=device, model_name=model_key
        )
        
        results[name] = val_acc
        print(f"{name} Val Acc: {val_acc:.3f}")
    
    # Print summary
    print("\n" + "="*70)
    print("FINAL RESULTS SUMMARY")
    print("="*70)
    print(f"{'Model':<30} | {'Val Accuracy'}")
    print("-"*70)
    
    for name in ["Video Only (V)", "Text Only (T)", "Audio Only (A)",
                 "Video + Text (VT)", "Video + Audio (VA)", "Text + Audio (TA)",
                 "Video + Text + Audio (VTA)"]:
        print(f"{name:<30} | {results[name]:.3f}")
    
    print("="*70)
    
    # Calculate improvements
    best_single = max(results["Video Only (V)"], results["Text Only (T)"], results["Audio Only (A)"])
    best_bimodal = max(results["Video + Text (VT)"], results["Video + Audio (VA)"], results["Text + Audio (TA)"])
    trimodal = results["Video + Text + Audio (VTA)"]
    
    print(f"\nBest single modality: {best_single:.3f}")
    print(f"Best bimodal fusion: {best_bimodal:.3f}")
    print(f"Trimodal fusion: {trimodal:.3f}")
    print(f"\nBimodal improvement over single: +{(best_bimodal - best_single):.3f}")
    print(f"Trimodal improvement over bimodal: +{(trimodal - best_bimodal):.3f}")
    print("="*70)

Found 5820 video files
Found 5832 text files
Found 5832 audio files

Matched 5816 complete triplets
Event classes: 16

Train: 4652 samples
Val: 1164 samples

COMPLETE MULTIMODAL FUSION COMPARISON

[Training Video Only (V)]
Parameters: 1,023,376


  output = torch._nested_tensor_from_mask(


Video Only (V) Val Acc: 0.658

[Training Text Only (T)]
Parameters: 232,592
Text Only (T) Val Acc: 0.527

[Training Audio Only (A)]
Parameters: 298,128
Audio Only (A) Val Acc: 0.293

[Training Video + Text (VT)]
Parameters: 1,189,392
Video + Text (VT) Val Acc: 0.747

[Training Video + Audio (VA)]
Parameters: 1,254,928
Video + Audio (VA) Val Acc: 0.662

[Training Text + Audio (TA)]
Parameters: 626,320
Text + Audio (TA) Val Acc: 0.516

[Training Video + Text + Audio (VTA)]
Parameters: 1,451,792
Video + Text + Audio (VTA) Val Acc: 0.740

FINAL RESULTS SUMMARY
Model                          | Val Accuracy
----------------------------------------------------------------------
Video Only (V)                 | 0.658
Text Only (T)                  | 0.527
Audio Only (A)                 | 0.293
Video + Text (VT)              | 0.747
Video + Audio (VA)             | 0.662
Text + Audio (TA)              | 0.516
Video + Text + Audio (VTA)     | 0.740

Best single modality: 0.658
Best bimodal fusio

In [26]:
import torch
from glob import glob
from pathlib import Path

# Get one audio file
audio_paths = sorted(glob("fusion/audio_embeddings_events/*.pt"))

if audio_paths:
    print(f"Testing audio file: {audio_paths[0]}")
    audio_data = torch.load(audio_paths[0])
    
    print(f"\nType: {type(audio_data)}")
    
    if isinstance(audio_data, dict):
        print(f"Dict keys: {audio_data.keys()}")
        if "embedding" in audio_data:
            emb = audio_data["embedding"]
            print(f"Embedding shape: {emb.shape}")
            print(f"Embedding dtype: {emb.dtype}")
    elif isinstance(audio_data, torch.Tensor):
        print(f"Direct tensor shape: {audio_data.shape}")
        print(f"Direct tensor dtype: {audio_data.dtype}")
    else:
        print(f"Unknown format: {audio_data}")
    
    # Compare with text file format
    text_paths = sorted(glob("fusion/text_embeddings_events/*.pt"))
    if text_paths:
        print(f"\n\nFor comparison, testing text file: {text_paths[0]}")
        text_data = torch.load(text_paths[0])
        print(f"Type: {type(text_data)}")
        if isinstance(text_data, dict):
            print(f"Dict keys: {text_data.keys()}")
            if "embedding" in text_data:
                emb = text_data["embedding"]
                print(f"Embedding shape: {emb.shape}")

Testing audio file: fusion/audio_embeddings_events/2014-11-05 - 22-45 Bayern Munich 2 - 0 AS Roma_1_224p_2709.00.pt

Type: <class 'torch.Tensor'>
Direct tensor shape: torch.Size([1024])
Direct tensor dtype: torch.float32


For comparison, testing text file: fusion/text_embeddings_events/2014-11-05 - 22-45 Bayern Munich 2 - 0 AS Roma_1_224p_2709.00.pt
Type: <class 'dict'>
Dict keys: dict_keys(['match_name', 'event_key', 'embedding', 'label', 'timestamp', 'audio_path'])
Embedding shape: torch.Size([768])


In [35]:
import json
from pathlib import Path
from datetime import datetime



print(f"\nBest single modality: {best_single:.3f}")
print(f"Best bimodal fusion: {best_bimodal:.3f}")
print(f"Trimodal fusion: {trimodal:.3f}")
print(f"\nBimodal improvement over single: +{(best_bimodal - best_single):.3f}")
print(f"Trimodal improvement over bimodal: +{(trimodal - best_bimodal):.3f}")
print("="*70)
# ---------- SAVE RESULTS TO FILE ----------
# Build a summary dict
results_summary = {
    "results_per_model": results,  # e.g. {"Video Only (V)": 0.73, ...}
    "best_single": float(best_single),
    "best_bimodal": float(best_bimodal),
    "trimodal": float(trimodal),
    "bimodal_minus_single": float(best_bimodal - best_single),
    "trimodal_minus_bimodal": float(trimodal - best_bimodal),
    "num_train_samples": len(train_dataset),
    "num_val_samples": len(val_dataset),
    "timestamp": datetime.now().isoformat(timespec="seconds")
}
# Output folder + filename
out_dir = Path("fusion/experiment_results")
out_dir.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
out_path = out_dir / f"trimodal_fusion_results_{timestamp}.json"
with out_path.open("w") as f:
    json.dump(results_summary, f, indent=2)
print(f"\nSaved results to: {out_path}")



Best single modality: 0.658
Best bimodal fusion: 0.747
Trimodal fusion: 0.740

Bimodal improvement over single: +0.089
Trimodal improvement over bimodal: +-0.008

Saved results to: fusion/experiment_results/trimodal_fusion_results_20251130-224251.json


In [39]:
per_class_metrics = {}  # model_key -> per-class metric vector

for name, model in experiments:
    print(f"\n[Training {name}]")
    num_params = sum(p.numel() for p in model.parameters())
    print(f"Parameters: {num_params:,}")
    
    model_key = name.split('(')[1].strip(')')
    
    # ↓ assume train_model now returns (val_acc, per_class_vals)
    val_acc, per_class_vals = train_model(
        model, train_loader, val_loader,
        epochs=30, lr=1e-3, patience=15,
        device=device, model_name=model_key
    )
    
    results[name] = val_acc
    per_class_metrics[model_key] = per_class_vals  # shape: [num_classes]
    print(f"{name} Val Acc: {val_acc:.3f}")



[Training Video Only (V)]
Parameters: 1,023,376


TypeError: cannot unpack non-iterable float object

In [36]:
    # ---------- PER-CLASS TABLE ----------
    print("\n" + "="*70)
    print("PER-CLASS VALIDATION METRICS BY MODEL")
    print("="*70)

    # Invert event_to_idx to get ordered class names
    idx_to_event = {idx: cls for cls, idx in event_to_idx.items()}
    num_classes = len(idx_to_event)
    class_names = [idx_to_event[i] for i in range(num_classes)]

    # Column order (must match keys in per_class_metrics)
    model_keys = ["V", "T", "A", "VT", "VA", "TA", "VTA"]

    # Header
    header = f"{'Class':<25}" + "".join(f"{mk:>10}" for mk in model_keys)
    print(header)
    print("-" * len(header))

    # Rows: one per event class
    for i, cls_name in enumerate(class_names):
        row = f"{cls_name:<25}"
        for mk in model_keys:
            val = per_class_metrics[mk][i]
            val = float(val)  # in case it's a tensor
            row += f"{val:10.3f}"
        print(row)



PER-CLASS VALIDATION METRICS BY MODEL
Class                             V         T         A        VT        VA        TA       VTA
-----------------------------------------------------------------------------------------------


NameError: name 'per_class_metrics' is not defined