In [None]:
import os
import numpy as np
import pandas as pd
import wave
from google.colab import drive
import tensorflow.compat.v1 as tf
import zipfile
import librosa
from keras import layers
import math
import shutil
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

# Root folder of the DAIC-WOZ data, addressed through the Colab Google Drive path.
# NOTE(review): no `drive.mount(...)` call is visible in this notebook — presumably
# Drive is mounted beforehand; confirm, otherwise the file reads below fail on a
# fresh kernel.
prefix = '/content/drive/MyDrive/DAIC-WOZ'

In [None]:
# Load the AVEC2017 split CSVs. Note that the held-out set used in this notebook is
# the *dev* split, even though the variables are named `test_*`.
train_split_df = pd.read_csv(os.path.join(prefix, 'train_split_Depression_AVEC2017.csv'))
test_split_df = pd.read_csv(os.path.join(prefix, 'dev_split_Depression_AVEC2017.csv'))

# Participant IDs and binary PHQ-8 labels as plain Python lists, in CSV row order.
train_split_num = train_split_df['Participant_ID'].tolist()
test_split_num = test_split_df['Participant_ID'].tolist()
train_split_label = train_split_df['PHQ8_Binary'].tolist()
test_split_label = test_split_df['PHQ8_Binary'].tolist()

# Directory holding the pre-extracted feature archives (.npz) read below.
save_dir = os.path.join('/content/drive/MyDrive', 'DAIC-Features')

# Training archives. A later cell indexes them per participant with the keys
# 'audio_<id>', 'text_<id>' and 'label_<id>'.
audio_train = np.load(os.path.join(save_dir, 'train_audio_clf.npz'))
text_train = np.load(os.path.join(save_dir, 'train_text_clf.npz'))
# NOTE: `lablels_train` is misspelled, but a later cell reads it under this exact
# name, so it must be renamed in both places at once.
lablels_train = np.load(os.path.join(save_dir, 'train_label_clf.npz'))

# Held-out archives, same key scheme as the training ones.
audio_test = np.load(os.path.join(save_dir, 'test_audio_clf.npz'))
text_test = np.load(os.path.join(save_dir, 'test_text_clf.npz'))
# NOTE: a later cell rebinds `labels_test` to a list of per-subject labels, so after
# that cell has run this name no longer refers to the loaded archive.
labels_test = np.load(os.path.join(save_dir, 'test_label_clf.npz'))

In [None]:
# Participant IDs (in CSV order) drive every lookup below, so the text, audio and
# label lists of a split stay index-aligned with each other.
train_numbers = list(train_split_num)
test_numbers = list(test_split_num)

# One entry per participant, pulled out of the archives by their '<kind>_<id>' key.
text_subs_train = [text_train[f'text_{num}'] for num in train_numbers]
audio_subs_train = [audio_train[f'audio_{num}'] for num in train_numbers]
labels_train = [lablels_train[f'label_{num}'] for num in train_numbers]

text_subs_test = [text_test[f'text_{num}'] for num in test_numbers]
audio_subs_test = [audio_test[f'audio_{num}'] for num in test_numbers]
# `labels_test` starts out as the loaded archive and is replaced by a plain list here.
# The guard keeps the cell idempotent: without it, running the cell a second time
# would index the list with a string key and raise TypeError.
if not isinstance(labels_test, list):
    labels_test = [labels_test[f'label_{num}'] for num in test_numbers]

In [None]:
# Shape of the audio feature array of the second training subject. The recorded
# output is (46, 1, 256); presumably (segments, 1, feature_dim) — TODO confirm.
audio_subs_train[1].shape

(46, 1, 256)

In [None]:
class MultimodalDataset(Dataset):
    """Subject-level dataset that pairs tokenized sentences with audio features.

    Every index addresses one subject. The subject's sentences are tokenized one by
    one to a fixed length, its audio features are converted to a float32 tensor, and
    the label is passed through unchanged.

    Args:
        text_list: Per-subject sequences of sentences.
        audio_list: Per-subject array-likes of audio features.
        labels_list: Per-subject labels.
        tokenizer: Callable applied to each sentence; its result must expose
            'input_ids' and 'attention_mask' tensors with a leading batch axis of 1.
        max_len: Token length every sentence is padded / truncated to.
    """

    def __init__(self, text_list, audio_list, labels_list, tokenizer, max_len):
        self.text_list, self.audio_list, self.labels_list = text_list, audio_list, labels_list
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of subjects (taken from the text list)."""
        return len(self.text_list)

    def __getitem__(self, index):
        """Return the tensors for one subject, or ``None`` if it has no text or audio.

        ``None`` items are expected to be dropped by the collate function.
        """
        sentences = self.text_list[index]
        audio = self.audio_list[index]
        subject_label = self.labels_list[index]

        # Subjects missing either modality cannot be batched.
        if not len(sentences) or not len(audio):
            return None

        ids_rows, mask_rows = [], []
        for sentence in sentences:
            encoded = self.tokenizer(
                sentence,
                padding='max_length',
                truncation=True,
                max_length=self.max_len,
                return_tensors="pt",
            )
            # Drop the tokenizer's batch axis so rows stack to (sentences, max_len).
            ids_rows.append(encoded['input_ids'].squeeze(0))
            mask_rows.append(encoded['attention_mask'].squeeze(0))

        return {
            'input_ids': torch.stack(ids_rows),
            'attention_mask': torch.stack(mask_rows),
            'audio_features': torch.tensor(audio, dtype=torch.float32),
            'label': subject_label,
        }

def multimodal_collate_fn(batch):
    """Collate per-subject items into zero-padded batch tensors.

    ``None`` items (subjects without text or audio) are dropped. Text tensors are
    padded along the sentence axis to the longest subject in the batch; audio
    features are padded along their own first axis to the longest audio sequence in
    the batch. The trailing audio shape is read from the data rather than assumed,
    so feature widths other than 256 and subjects whose audio and text lengths
    differ are handled.

    Args:
        batch: List of dicts with 'input_ids', 'attention_mask' (2-D tensors),
            'audio_features' (tensor) and 'label', or ``None`` entries.

    Returns:
        A dict with stacked 'input_ids', 'attention_mask', 'audio_features' and a
        long tensor 'label', or an empty dict if no usable item remains.
    """
    batch = [item for item in batch if item is not None]
    if len(batch) == 0:
        return {}

    def _pad_rows(rows, target_rows, dtype):
        # Append all-zero rows so the first axis reaches `target_rows`.
        missing = target_rows - rows.size(0)
        filler = torch.zeros((missing, *rows.shape[1:]), dtype=dtype)
        return torch.cat([rows, filler], dim=0)

    max_sentences = max(len(item['input_ids']) for item in batch)
    # Audio gets its own target length; it equals `max_sentences` whenever every
    # subject has one audio row per sentence.
    max_segments = max(len(item['audio_features']) for item in batch)

    text_padded_input_ids = []
    text_padded_attention_masks = []
    audio_padded_features = []
    labels = []

    for item in batch:
        text_padded_input_ids.append(_pad_rows(item['input_ids'], max_sentences, torch.long))
        # Padded sentences get an all-zero attention mask.
        text_padded_attention_masks.append(_pad_rows(item['attention_mask'], max_sentences, torch.long))
        audio_padded_features.append(_pad_rows(item['audio_features'], max_segments, torch.float32))
        labels.append(item['label'])

    return {
        'input_ids': torch.stack(text_padded_input_ids),
        'attention_mask': torch.stack(text_padded_attention_masks),
        'audio_features': torch.stack(audio_padded_features),
        'label': torch.tensor(labels, dtype=torch.long)
    }




###############################################################################
def train_model(model, dataloader, optimizer, criterion, device):
    """Run one training epoch and return ``(mean_batch_loss, accuracy)``.

    Empty batches (the collate function yields ``{}`` when every item of a batch was
    dropped) are skipped. The mean loss is taken over the batches that were actually
    processed, so skipped batches no longer pull the average towards zero and an
    empty dataloader no longer raises ``ZeroDivisionError``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, audio_features)``.
        dataloader: Iterable of batch dicts with 'input_ids', 'attention_mask',
            'audio_features' and 'label' tensors.
        optimizer: Optimizer stepping the model parameters.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple of mean per-batch loss and accuracy; both are 0 when nothing was
        processed.
    """
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    for batch in dataloader:
        if not batch:
            continue
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        audio_features = batch['audio_features'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, audio_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total if total > 0 else 0
    mean_loss = total_loss / num_batches if num_batches > 0 else 0.0
    return mean_loss, accuracy

def evaluate_model(model, dataloader, criterion, device):
    """Evaluate the model without gradients and return ``(mean_batch_loss, accuracy)``.

    Empty batches (``{}`` from the collate function) are skipped. The mean loss is
    taken over the batches that were actually processed, so skipped batches do not
    deflate it and an empty dataloader does not raise ``ZeroDivisionError``.

    Args:
        model: Module called as ``model(input_ids, attention_mask, audio_features)``.
        dataloader: Iterable of batch dicts with 'input_ids', 'attention_mask',
            'audio_features' and 'label' tensors.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple of mean per-batch loss and accuracy; both are 0 when nothing was
        processed.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    with torch.no_grad():
        for batch in dataloader:
            if not batch:
                continue
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            audio_features = batch['audio_features'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask, audio_features)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            num_batches += 1
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total if total > 0 else 0
    mean_loss = total_loss / num_batches if num_batches > 0 else 0.0
    return mean_loss, accuracy


In [None]:
class TransformerClassifier(nn.Module):
    """Text-only subject classifier: BERT sentence encoder + sentence-level Transformer.

    Each sentence is encoded with BERT and represented by its first-token ([CLS])
    hidden state. A Transformer encoder then attends across the sentences of a
    subject, the result is mean-pooled and passed through dropout and a linear head.

    Sentences whose attention mask is entirely zero (the padding rows added when
    subjects are batched together) are excluded from both the sentence-level
    attention and the mean pool, so a subject's logits do not depend on how much
    padding its batch happened to need.

    Args:
        pretrained_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Size of the output layer.
        n_heads: Attention heads of the sentence-level encoder.
        n_layers: Number of sentence-level encoder layers.
        dim_feedforward: Feed-forward width of the sentence-level encoder.
        dropout: Dropout used in the encoder layers and before the output layer.
    """

    def __init__(self, pretrained_model_name, num_classes, n_heads=4, n_layers=1, dim_feedforward=256, dropout=0.5):
        super(TransformerClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.hidden_size = self.bert.config.hidden_size
        # Sequence-first layout (batch_first is left at its default), hence the
        # permutes in forward().
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.hidden_size,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu'
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(self.hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask):
        """Compute class logits for a batch of subjects.

        Args:
            input_ids: Token ids of shape (batch, sentences, tokens).
            attention_mask: Token mask of the same shape; all-zero rows mark padded
                sentences.

        Returns:
            Logits of shape (batch, num_classes).
        """
        batch_size, num_sentences, seq_len = input_ids.size()

        # Fold the sentence axis into the batch axis so BERT sees plain sentences.
        outputs = self.bert(
            input_ids=input_ids.reshape(-1, seq_len),
            attention_mask=attention_mask.reshape(-1, seq_len),
        )
        sentence_embeddings = outputs.last_hidden_state[:, 0, :]
        sentence_embeddings = sentence_embeddings.reshape(batch_size, num_sentences, self.hidden_size)

        # A sentence is real if at least one of its tokens is attended to.
        valid = attention_mask.sum(dim=-1) > 0
        # If a subject has no real sentence at all, keep every row: masking all keys
        # would make the attention softmax produce NaNs.
        valid = valid | ~valid.any(dim=1, keepdim=True)

        transformer_output = self.transformer_encoder(
            sentence_embeddings.permute(1, 0, 2),
            src_key_padding_mask=~valid,  # True marks positions to ignore
        )
        transformer_output = transformer_output.permute(1, 0, 2)

        # Mean over real sentences only.
        weights = valid.unsqueeze(-1).to(transformer_output.dtype)
        subject_embeddings = (transformer_output * weights).sum(dim=1) / weights.sum(dim=1)

        x = self.dropout(subject_embeddings)
        x = self.fc(x)
        return x


class MultimodalTransformerClassifier(nn.Module):
    """Subject classifier fusing BERT sentence embeddings with Transformer-encoded audio.

    Text branch: every sentence is encoded with BERT, represented by its first-token
    ([CLS]) hidden state, and the sentence vectors of a subject are mean-pooled.
    Audio branch: the per-segment audio features of a subject run through a
    Transformer encoder and are mean-pooled. Both subject vectors are concatenated
    and passed through dropout and a linear head.

    Padding is excluded from both branches so a subject's logits do not depend on
    how much padding its batch needed: padded sentences are recognised by an all-zero
    attention mask, padded audio rows by being all-zero feature vectors.

    Args:
        pretrained_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Size of the output layer.
        n_heads: Attention heads of the audio encoder.
        n_layers: Number of audio encoder layers.
        dim_feedforward: Feed-forward width of the audio encoder.
        dropout: Dropout used in the audio encoder and before the output layer.
        audio_hidden_size: Width of one audio feature vector (default 256, the value
            that used to be hard-coded).
    """

    def __init__(self, pretrained_model_name, num_classes, n_heads=4, n_layers=2, dim_feedforward=256, dropout=0.3,
                 audio_hidden_size=256):
        super(MultimodalTransformerClassifier, self).__init__()
        self.text_bert = BertModel.from_pretrained(pretrained_model_name)
        self.text_hidden_size = self.text_bert.config.hidden_size
        self.audio_hidden_size = audio_hidden_size
        # Kept as an attribute so the module's parameter / state_dict layout stays
        # the same as before; forward() only uses `audio_transformer_encoder`.
        self.audio_encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.audio_hidden_size,
            nhead=n_heads,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation='relu'
        )
        self.audio_transformer_encoder = nn.TransformerEncoder(self.audio_encoder_layer, num_layers=n_layers)
        self.fc = nn.Linear(self.text_hidden_size + self.audio_hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def _valid_rows(valid):
        """Return `valid`, but with rows that have no valid entry marked fully valid.

        Masking every key of a sequence would make the attention softmax (and the
        masked mean) produce NaNs, so such degenerate rows are left unmasked.
        """
        return valid | ~valid.any(dim=1, keepdim=True)

    @staticmethod
    def _masked_mean(values, valid):
        """Mean of `values` (batch, steps, dim) over the steps flagged in `valid`."""
        weights = valid.unsqueeze(-1).to(values.dtype)
        return (values * weights).sum(dim=1) / weights.sum(dim=1)

    def forward(self, input_ids, attention_mask, audio_features):
        """Compute class logits for a batch of subjects.

        Args:
            input_ids: Token ids of shape (batch, sentences, tokens).
            attention_mask: Token mask of the same shape; all-zero rows mark padded
                sentences.
            audio_features: Tensor whose axis 2 is a singleton that gets squeezed,
                leaving (batch, segments, audio_hidden_size); all-zero rows are
                treated as padding.

        Returns:
            Logits of shape (batch, num_classes).
        """
        batch_size, num_sentences, seq_len = input_ids.size()

        # --- text branch ---------------------------------------------------------
        # Fold the sentence axis into the batch axis so BERT sees plain sentences.
        text_outputs = self.text_bert(
            input_ids=input_ids.reshape(-1, seq_len),
            attention_mask=attention_mask.reshape(-1, seq_len),
        )
        text_sentence_embeddings = text_outputs.last_hidden_state[:, 0, :]
        text_sentence_embeddings = text_sentence_embeddings.reshape(
            batch_size, num_sentences, self.text_hidden_size
        )
        # A sentence is real if at least one of its tokens is attended to.
        text_valid = self._valid_rows(attention_mask.sum(dim=-1) > 0)
        text_subject_embeddings = self._masked_mean(text_sentence_embeddings, text_valid)

        # --- audio branch --------------------------------------------------------
        audio_features = audio_features.squeeze(2)  # drop the singleton axis
        audio_valid = self._valid_rows(audio_features.ne(0).any(dim=-1))
        # The encoder uses the sequence-first layout, hence the permutes.
        audio_transformer_output = self.audio_transformer_encoder(
            audio_features.permute(1, 0, 2),
            src_key_padding_mask=~audio_valid,  # True marks positions to ignore
        )
        audio_transformer_output = audio_transformer_output.permute(1, 0, 2)
        audio_subject_embeddings = self._masked_mean(audio_transformer_output, audio_valid)

        # --- fusion --------------------------------------------------------------
        multimodal_embeddings = torch.cat((text_subject_embeddings, audio_subject_embeddings), dim=1)
        x = self.dropout(multimodal_embeddings)
        x = self.fc(x)
        return x



In [None]:
sentences_list = text_subs_train
labels_list = [int(elem) for elem in labels_train]

sentences_list_test = text_subs_test
labels_list_test = [int(elem) for elem in labels_test]

# Hyperparameters.
max_len = 40          # tokens per sentence after padding / truncation
batch_size = 8        # subjects per batch
num_classes = 2
learning_rate = 4e-5
num_epochs = 30
seed = 42

pretrained_model_name = 'bert-base-uncased'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the RNGs before anything random is created (weight initialisation of the new
# layers, DataLoader shuffling, dropout) so that a re-run starts from the same state.
torch.manual_seed(seed)
np.random.seed(seed)

tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)
train_data = MultimodalDataset(text_subs_train, audio_subs_train, labels_list, tokenizer, max_len)
test_data = MultimodalDataset(text_subs_test, audio_subs_test, labels_list_test, tokenizer, max_len)

train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=multimodal_collate_fn)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=multimodal_collate_fn)

model = MultimodalTransformerClassifier(pretrained_model_name, num_classes).to(device)

criterion = nn.CrossEntropyLoss()
# All model parameters, including the BERT weights, are handed to the optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
    train_loss, train_acc = train_model(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}")

# Single evaluation after the last epoch on the held-out subjects.
test_loss, test_acc = evaluate_model(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
print("Training Complete.")

Epoch 1/30, Loss: 0.6497, Accuracy: 0.7009
Epoch 2/30, Loss: 0.6045, Accuracy: 0.7196
Epoch 3/30, Loss: 0.5879, Accuracy: 0.7196
Epoch 4/30, Loss: 0.5572, Accuracy: 0.7196
Epoch 5/30, Loss: 0.5293, Accuracy: 0.7196
Epoch 6/30, Loss: 0.5006, Accuracy: 0.7196
Epoch 7/30, Loss: 0.5010, Accuracy: 0.7570
Epoch 8/30, Loss: 0.4871, Accuracy: 0.8131
Epoch 9/30, Loss: 0.4213, Accuracy: 0.8785
Epoch 10/30, Loss: 0.3226, Accuracy: 0.8785
Epoch 11/30, Loss: 0.2294, Accuracy: 0.9346
Epoch 12/30, Loss: 0.1154, Accuracy: 0.9720
Epoch 13/30, Loss: 0.0426, Accuracy: 1.0000
Epoch 14/30, Loss: 0.0205, Accuracy: 1.0000
Epoch 15/30, Loss: 0.0099, Accuracy: 1.0000
Epoch 16/30, Loss: 0.0089, Accuracy: 1.0000
Epoch 17/30, Loss: 0.0055, Accuracy: 1.0000
Epoch 18/30, Loss: 0.0052, Accuracy: 1.0000
Epoch 19/30, Loss: 0.0038, Accuracy: 1.0000
Epoch 20/30, Loss: 0.0030, Accuracy: 1.0000
Epoch 21/30, Loss: 0.0027, Accuracy: 1.0000
Epoch 22/30, Loss: 0.0025, Accuracy: 1.0000
Epoch 23/30, Loss: 0.0021, Accuracy: 1.00