In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        return x + self.pe[:x.size(0), :]

class ComplexTransformerModel(nn.Module):
    def __init__(self, input_size=128, num_classes=6, d_model=128, nhead=8, num_encoder_layers=6, dim_feedforward=512, dropout=0.1, max_len=73):
        super(ComplexTransformerModel, self).__init__()

        # Embedding layer to project input to model dimension (d_model)
        self.embedding = nn.Linear(input_size, d_model)

        # Positional Encoding
        self.pos_encoder = PositionalEncoding(d_model, max_len)

        # Transformer Encoder
        encoder_layers = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers=num_encoder_layers)

        # Classification Head (Linear Layer)
        self.fc = nn.Linear(d_model, num_classes)

        # Dropout layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # x shape: (batch_size, input_size, seq_length) -> (batch_size, seq_length, input_size)
        x = x.transpose(1, 2)  # Reshape to (batch_size, seq_length, input_size)
        # print(f"Input shape after transpose: {x.shape}")

        # Embedding
        x = self.embedding(x)  # (batch_size, seq_length, d_model)
        # print(f"Shape after embedding: {x.shape}")

        # Transpose for transformer input: (seq_length, batch_size, d_model)
        x = x.transpose(0, 1)  # (seq_length, batch_size, d_model)
        # print(f"Shape after transpose for transformer: {x.shape}")

        # Add positional encoding
        x = self.pos_encoder(x)

        # Transformer Encoder
        x = self.transformer_encoder(x)  # (seq_length, batch_size, d_model)

        # Pooling: Taking the mean of the sequence output
        x = x.mean(dim=0)  # (batch_size, d_model)

        # Apply Dropout and Classification layer
        x = self.dropout(x)
        x = self.fc(x)  # (batch_size, num_classes)

        # Return log-softmax for better numerical stability in classification
        return F.log_softmax(x, dim=-1)

# Example usage
if __name__ == "__main__":
    model = ComplexTransformerModel(input_size=128, num_classes=6, max_len=73)
    example_input = torch.randn(32, 128, 73)  # Batch size of 32, 128 mel bins, 73 time steps
    output = model(example_input)
    print(output.shape)  # Expected: torch.Size([32, 6])




torch.Size([32, 6])


## Load train and text csv files

In [2]:
import pandas as pd

# Load train and test CSV files
train_df = pd.read_csv('/content/drive/MyDrive/Crema/train.csv', sep="\t")
test_df = pd.read_csv('/content/drive/MyDrive/Crema/test.csv', sep="\t")

# Check data
print(train_df.head())  # Should show filepath and label columns


              name                                               path  emotion
0  1061_TSI_ANG_XX  /content/drive/MyDrive/Crema/angry/1061_TSI_AN...    angry
1  1055_ITS_FEA_XX  /content/drive/MyDrive/Crema/fear/1055_ITS_FEA...     fear
2  1037_ITH_SAD_XX  /content/drive/MyDrive/Crema/sadness/1037_ITH_...  sadness
3  1039_TAI_HAP_XX  /content/drive/MyDrive/Crema/happy/1039_TAI_HA...    happy
4  1040_IEO_DIS_LO  /content/drive/MyDrive/Crema/disgust/1040_IEO_...  disgust


##Generate Mel Spectograms

In [3]:
import librosa
import numpy as np

def generate_mel_spectrogram(file_path, n_mels=128, fixed_length=73):
    # Load audio file (sr=None keeps original sampling rate)
    y, sr = librosa.load(file_path, sr=None)

    # Generate mel spectrogram with n_mels
    mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)

    # Convert to log scale (dB)
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Check the time dimension (number of frames)
    time_steps = mel_spec_db.shape[1]

    # If too short, pad with zeros; if too long, truncate
    if time_steps < fixed_length:
        pad_width = fixed_length - time_steps
        mel_spec_db = np.pad(mel_spec_db, pad_width=((0, 0), (0, pad_width)), mode='constant')
    elif time_steps > fixed_length:
        mel_spec_db = mel_spec_db[:, :fixed_length]

    return mel_spec_db

# Example usage
file_path = train_df['path'].iloc[0]  # First file
mel_spectrogram = generate_mel_spectrogram(file_path)
print(mel_spectrogram.shape)  # Output shape will be (128, 73)


(128, 73)


## Create PyTorch Dataset

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

class AudioDataset(Dataset):
    def __init__(self, df, n_mels=128):
        self.df = df
        self.n_mels = n_mels
        self.label_encoder = LabelEncoder()
        self.labels = self.label_encoder.fit_transform(df['emotion'])  # Convert labels to numeric

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_path = self.df['path'].iloc[idx]
        label = self.labels[idx]

        # Generate mel spectrogram
        mel_spec = generate_mel_spectrogram(file_path, n_mels=self.n_mels)

        # Convert to torch tensor
        mel_spec = torch.tensor(mel_spec, dtype=torch.float32)

        return mel_spec, torch.tensor(label, dtype=torch.long)

# Create dataset
train_dataset = AudioDataset(train_df)
test_dataset = AudioDataset(test_df)

# DataLoader to batch and shuffle the data
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


## Train Model

In [None]:
import torch.optim as optim
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from tqdm import tqdm

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = ComplexTransformerModel(input_size=128, num_classes=6).to(device)

# Loss and optimizer
criterion = CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, criterion, optimizer, num_epochs=10):
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0

        for mel_specs, labels in tqdm(train_loader):
            mel_specs = mel_specs.to(device)
            labels = labels.to(device)

            # Zero gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(mel_specs)
            loss = criterion(outputs, labels)

            # Backward pass and optimize
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}")

# Train the model
train_model(model, train_loader, criterion, optimizer, num_epochs=10)


  6%|▋         | 12/187 [04:47<1:10:38, 24.22s/it]

In [None]:
# After training is completed
torch.save(model.state_dict(), "/content/drive/MyDrive/Crema/complex_transformer_model.pth")
print("Model saved successfully!")


In [None]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for mel_specs, labels in test_loader:
            mel_specs = mel_specs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(mel_specs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}%')

# Evaluate the model
evaluate_model(model, test_loader)


In [11]:
import torch.optim as optim

# Create dummy dataset for example purposes
def generate_dummy_data(batch_size, seq_length, input_size, num_classes):
    # Input tensor with random data
    inputs = torch.randn(batch_size, seq_length, input_size)
    # Random labels from 0 to num_classes-1
    labels = torch.randint(0, num_classes, (batch_size,))
    return inputs, labels

# Training function
def train_model(model, criterion, optimizer, num_epochs=10, batch_size=3200, seq_length=50, input_size=41, num_classes=6):
    for epoch in range(num_epochs):
        # Generate some dummy data
        inputs, labels = generate_dummy_data(batch_size, seq_length, input_size, num_classes)

        # Move data to the appropriate device (if using GPU)
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Print training stats
        if (epoch + 1) % 2 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Initialize model, loss function, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = ComplexTransformerModel().to(device)
criterion = nn.NLLLoss()  # Cross-entropy loss with log-softmax
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
train_model(model, criterion, optimizer, num_epochs=50)


Epoch [2/50], Loss: 2.5093
Epoch [4/50], Loss: 2.0127
Epoch [6/50], Loss: 1.8258
Epoch [8/50], Loss: 1.8528
Epoch [10/50], Loss: 1.8169
Epoch [12/50], Loss: 1.8139
Epoch [14/50], Loss: 1.8195
Epoch [16/50], Loss: 1.8160
Epoch [18/50], Loss: 1.8068
Epoch [20/50], Loss: 1.8042
Epoch [22/50], Loss: 1.8083
Epoch [24/50], Loss: 1.8106
Epoch [26/50], Loss: 1.8052
Epoch [28/50], Loss: 1.8048
Epoch [30/50], Loss: 1.8048
Epoch [32/50], Loss: 1.8039
Epoch [34/50], Loss: 1.8034
Epoch [36/50], Loss: 1.8054
Epoch [38/50], Loss: 1.8042
Epoch [40/50], Loss: 1.8037
Epoch [42/50], Loss: 1.8053
Epoch [44/50], Loss: 1.8042
Epoch [46/50], Loss: 1.8018
Epoch [48/50], Loss: 1.8012
Epoch [50/50], Loss: 1.8010


In [12]:
# Save the trained model
torch.save(model.state_dict(), 'transformer_model.pth')
print("Model saved!")


Model saved!


In [13]:
# Load the model (ensure the model architecture is defined exactly the same way)
loaded_model = ComplexTransformerModel().to(device)

# Load the saved model weights
loaded_model.load_state_dict(torch.load('transformer_model.pth'))
loaded_model.eval()  # Set the model to evaluation mode
print("Model loaded and ready for inference!")


Model loaded and ready for inference!


  loaded_model.load_state_dict(torch.load('transformer_model.pth'))


In [14]:
# Inference function
def make_predictions(model, input_data):
    # Ensure the model is in evaluation mode
    model.eval()

    # No need to compute gradients during inference
    with torch.no_grad():
        # Move input data to the device (GPU or CPU)
        input_data = input_data.to(device)

        # Forward pass
        output = model(input_data)

        # Get the predicted class (index with the maximum log-probability)
        predictions = torch.argmax(output, dim=1) + 1  # Add 1 to shift range from 0-5 to 1-6

        return predictions

# Example new data (shape: batch_size, seq_length, input_size)
new_data = torch.randn(8, 50, 41).to(device)  # Batch of 8 sequences of length 50 and input size 41

# Make predictions with the loaded model
predicted_labels = make_predictions(loaded_model, new_data)

# Display the predictions
print("Predicted labels:", predicted_labels)


Predicted labels: tensor([2, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')
