In [None]:
# This file is to extract the raw video frames, raw audio waveform, and character-level input from the dataset.

import torch
import torch.nn as nn
import torchaudio
import torchvision

class EarlyFusionVideoClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        
        # Visual stream (3D CNN for raw video frames)
        self.visual_frontend = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3, 7, 7), padding=(1, 3, 3)),
            nn.BatchNorm3d(64),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2)),
            
            nn.Conv3d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm3d(128),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=2)
        )
        
        # Audio stream (1D CNN for raw waveform)
        self.audio_frontend = nn.Sequential(
            nn.Conv1d(1, 64, kernel_size=80, stride=4),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4),
            
            nn.Conv1d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=4)
        )
        
        # Text stream (1D CNN for character-level input)
        self.text_frontend = nn.Sequential(
            nn.Embedding(256, 64),  # Character-level embedding
            nn.Conv1d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        
        # Fusion layers
        self.fusion_net = nn.Sequential(
            nn.Linear(128 * 3, 512),  # Concatenated features from all modalities
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, num_classes)
        )
        
    def forward(self, video_frames, audio_waveform, text_chars):
        """
        Args:
            video_frames: Raw video tensor (B, C, T, H, W)
                B: batch size
                C: channels (3 for RGB)
                T: number of frames
                H: height
                W: width
            
            audio_waveform: Raw audio tensor (B, 1, T)
                B: batch size
                1: mono audio channel
                T: number of time steps
            
            text_chars: Character indices (B, L)
                B: batch size
                L: sequence length
        """
        # Process each modality
        visual_features = self.visual_frontend(video_frames)
        visual_features = torch.mean(visual_features, dim=[2, 3, 4])  # Pool spatial-temporal dims
        
        audio_features = self.audio_frontend(audio_waveform)
        audio_features = torch.mean(audio_features, dim=2)  # Pool temporal dim
        
        text_embedded = self.text_frontend(text_chars).transpose(1, 2)
        text_features = torch.mean(text_features, dim=2)  # Pool sequence dim
        
        # Early fusion by concatenation
        fused_features = torch.cat([visual_features, audio_features, text_features], dim=1)
        
        # Classification
        output = self.fusion_net(fused_features)
        return output

# Example preprocessing functions
def preprocess_video(video_path, num_frames=32):
    """Load and preprocess raw video frames."""
    transform = torchvision.transforms.Compose([
        torchvision.transforms.Resize((112, 112)),
        torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], 
                                      std=[0.229, 0.224, 0.225])
    ])
    
    # Load video frames (implementation depends on your video loading library)
    # frames = load_video_frames(video_path, num_frames)
    # frames = transform(frames)
    return frames

def preprocess_audio(audio_path, sample_rate=16000):
    """Load and preprocess raw audio waveform."""
    waveform, sr = torchaudio.load(audio_path)
    if sr != sample_rate:
        waveform = torchaudio.transforms.Resample(sr, sample_rate)(waveform)
    return waveform

def preprocess_text(text, max_length=512):
    """Convert text to character indices."""
    chars = list(text.lower())
    char_indices = [ord(c) % 256 for c in chars[:max_length]]
    return torch.tensor(char_indices)

# Training loop
def train_step(model, batch, optimizer, criterion):
    video_frames, audio_waveform, text_chars, labels = batch
    
    # Forward pass
    predictions = model(video_frames, audio_waveform, text_chars)
    loss = criterion(predictions, labels)
    
    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    return loss.item()

def train(model, train_loader, num_epochs=10):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        
        for batch in train_loader:
            loss = train_step(model, batch, optimizer, criterion)
            total_loss += loss
        
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

In [1]:
# visual
import os
from PIL import Image
import torchvision
import torch
import numpy as np
from torch import nn
from tqdm import tqdm

class VisualFrontend(nn.Module):
    def __init__(self):
        super().__init__()
        self.frontend = nn.Sequential(
            nn.Conv3d(3, 64, kernel_size=(3, 7, 7), padding=(1, 3, 3)),
            nn.BatchNorm3d(64),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 2, 2)),

            nn.Conv3d(64, 1024, kernel_size=3, padding=1),  # Changed to 1024 channels
            nn.BatchNorm3d(1024),
            nn.ReLU(),
            nn.MaxPool3d(kernel_size=(1, 28, 28))  # Global spatial pooling
        )
        
        self.temporal_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
        
    def forward(self, x):
        features = self.frontend(x)  # Shape: (B, 1024, T', 1, 1)
        features = self.temporal_pool(features)  # Shape: (B, 1024, 1, 1, 1)
        features = features.view(-1, 1024)  # Shape: (B, 1024)
        return features

def process_videos(folder_list, visual_frontend, device='cuda', chunk_size=32):
    result = {}
    transform = nn.Sequential(
        torchvision.transforms.Resize((112, 112)),
        torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]
        )
    )
    
    for folder in tqdm(folder_list):
        frame_files = sorted(os.listdir('video/Frames/'+folder))
        all_features = []
        
        # Process in chunks
        for i in range(0, len(frame_files), chunk_size):
            chunk_files = frame_files[i:i + chunk_size]
            frames = []
            for filename in chunk_files:
                img_path = os.path.join('video/Frames/'+folder, filename)
                img = np.array(Image.open(img_path).convert('RGB'))
                frames.append(img)
            
            frames = np.stack(frames).transpose(0, 3, 1, 2)
            frames = torch.from_numpy(frames).float() / 255.0
            
            with torch.no_grad():
                frames = transform(frames)
                frames = frames.unsqueeze(0).to(device)
                frames = frames.permute(0, 2, 1, 3, 4)
                features = visual_frontend(frames)
                all_features.append(features.cpu())
            
            # Clear GPU memory
            torch.cuda.empty_cache()
        
        # Average features from all chunks
        result[folder] = torch.cat(all_features, dim=0).mean(dim=0)
    
    return result


In [3]:
folders = os.listdir('video/Frames')
visual_frontend = VisualFrontend().to('cuda')
visual_frontend.eval()
visual_features = process_videos(folders, visual_frontend, device='cuda')


100%|██████████| 690/690 [13:26<00:00,  1.17s/it]


In [15]:
for key,val in visual_features.items():
    visual_features[key] = {'visual':val}

In [17]:
import json

with open('raw_features.json', 'w') as f:    
    json.dump(visual_features, f, indent=4)

In [35]:
import torchaudio
def preprocess_audio(audio_path, sample_rate=16000):
    """Load and preprocess raw audio waveform."""
    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if len(waveform.shape) == 2:  # (channels, time)
        waveform = waveform.unsqueeze(0)  # (batch, channels, time)
    if sr != sample_rate:
        waveform = torchaudio.transforms.Resample(sr, sample_rate)(waveform)
    return waveform

class AudioFrontend(nn.Module):
   def __init__(self):
       super().__init__()
       self.frontend = nn.Sequential(
           nn.Conv1d(1, 64, kernel_size=80, stride=4),
           nn.BatchNorm1d(64),
           nn.ReLU(), 
           nn.MaxPool1d(kernel_size=4),

           nn.Conv1d(64, 1024, kernel_size=3, stride=1, padding=1),  # Changed to 1024
           nn.BatchNorm1d(1024),
           nn.ReLU(),
           nn.MaxPool1d(kernel_size=4)
       )
       
       self.temporal_pool = nn.AdaptiveAvgPool1d(1)
       
   def forward(self, x):
       x = self.frontend(x)  # Shape: (B, 1024, T)
       x = self.temporal_pool(x)  # Shape: (B, 1024, 1) 
       x = x.squeeze(-1)  # Shape: (B, 1024)
       return x

audio = 'audio/final/1_60.wav'
waveform = preprocess_audio(audio)
model = AudioFrontend()
output = model(waveform).detach().numpy()

In [36]:
audios = os.listdir('audio/final')
features = {}
for audio in tqdm(audios):
    waveform = preprocess_audio('audio/final/'+audio)
    output = model(waveform).detach().numpy()
    features[audio] = output

100%|██████████| 690/690 [00:20<00:00, 33.33it/s]


In [43]:
for key, val in features.items():
    visual_features[key[:-4]]['audio'] = val.tolist()[0]

In [41]:
features['1_60.wav'].tolist()[0]

[0.6470228433609009,
 0.5100659132003784,
 0.6061540246009827,
 0.5412774682044983,
 0.8191898465156555,
 0.5914872288703918,
 0.754771888256073,
 0.7260295748710632,
 0.6955481171607971,
 0.7063736319541931,
 0.4698455333709717,
 0.7866929769515991,
 0.6400021910667419,
 0.651842474937439,
 0.8297082781791687,
 0.5800343751907349,
 0.7268152832984924,
 0.8067913055419922,
 0.4963915944099426,
 0.7817744612693787,
 0.5690537095069885,
 0.8221748471260071,
 0.8021746873855591,
 0.7453657388687134,
 0.7331979274749756,
 0.6898176670074463,
 0.6568590998649597,
 0.535631000995636,
 0.499861478805542,
 0.7407749891281128,
 0.6003255844116211,
 0.4865726828575134,
 0.696954607963562,
 0.6917916536331177,
 0.8010429739952087,
 0.8060300350189209,
 0.6841771006584167,
 0.6115818619728088,
 0.7608847618103027,
 0.7353417873382568,
 0.7712544202804565,
 0.736838698387146,
 0.7368763089179993,
 0.8027270436286926,
 0.6138143539428711,
 0.5349960327148438,
 0.7852173447608948,
 0.690779447555542,

In [57]:
with open('raw_features.json', 'w') as f:    
    json.dump(visual_features, f, indent=4)

In [53]:
def preprocess_text(text, max_length=512):
    """Convert text to character indices."""
    chars = list(text.lower())
    char_indices = [ord(c) % 256 for c in chars[:max_length]]
    return torch.tensor(char_indices)

class TextFrontend(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding = nn.Embedding(256, 64)  # Character-level embedding
        self.frontend = nn.Sequential(
            nn.Conv1d(64, 1024, kernel_size=3, padding=1),  # Changed to 1024 channels
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2)
        )
        self.temporal_pool = nn.AdaptiveAvgPool1d(1)
        
    def forward(self, x):
        x = self.embedding(x)  # Shape: (B, L, 64)
        x = x.transpose(1, 2)  # Shape: (B, 64, L) - Conv1d expects channels first
        x = self.frontend(x)  # Shape: (B, 1024, L/2)
        x = self.temporal_pool(x)  # Shape: (B, 1024, 1)
        x = x.squeeze(-1)  # Shape: (B, 1024)
        return x

def preprocess_text(text, max_length=512):
    """Convert text to character indices."""
    chars = list(text.lower())
    char_indices = [ord(c) % 256 for c in chars[:max_length]]
    # Pad if necessary
    if len(char_indices) < max_length:
        char_indices += [0] * (max_length - len(char_indices))
    return torch.tensor(char_indices).unsqueeze(0)  # Add batch dimension

data = json.load(open('sarcasm_data.json'))

text_features = {}
for key, val in data.items():
    x = preprocess_text(data[key]['utterance'])
    model = TextFrontend()
    x = model(x)
    visual_features[key]['text'] = x.tolist()[0]

[0.2998717427253723,
 0.2896241843700409,
 0.31718719005584717,
 0.3120085597038269,
 0.2946896553039551,
 0.27652832865715027,
 0.34906071424484253,
 0.2736313045024872,
 0.41159266233444214,
 0.296808660030365,
 0.29254665970802307,
 0.3052225112915039,
 0.26303333044052124,
 0.4202508330345154,
 0.28006142377853394,
 0.26119375228881836,
 0.3642711043357849,
 0.29003772139549255,
 0.4056095480918884,
 0.30147552490234375,
 0.31137964129447937,
 0.4261818528175354,
 0.41670045256614685,
 0.27882489562034607,
 0.3705172538757324,
 0.294374942779541,
 0.287452757358551,
 0.3394344449043274,
 0.3451014757156372,
 0.26192593574523926,
 0.30810341238975525,
 0.2878558337688446,
 0.40629374980926514,
 0.2829652726650238,
 0.3109474182128906,
 0.27351492643356323,
 0.24514618515968323,
 0.39172953367233276,
 0.32213279604911804,
 0.2856083810329437,
 0.2856599688529968,
 0.2914198040962219,
 0.29276949167251587,
 0.2542950510978699,
 0.3003697395324707,
 0.3991069197654724,
 0.3019622266292