In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
df = pd.read_pickle('Annomi_VGGish.pkl')

In [4]:
# Check the type of the embeddings
embedding_type = type(df.iloc[0]['client_vggish_emb'])  # Assuming the embeddings column is named 'client_vggish_embeddings'
print("Embedding Type:", embedding_type)

# Check the shape or length of a few random embeddings
import random

sample_indices = random.sample(range(len(df)), 5)
for idx in sample_indices:
    embedding_sample = df.iloc[idx]['client_vggish_emb']
    
    # Checking shape if it's a numpy array or length if it's a list
    if isinstance(embedding_sample, list):
        print(f"Sample {idx} Embedding Length:", len(embedding_sample))
    elif isinstance(embedding_sample, np.ndarray):
        print(f"Sample {idx} Embedding Shape:", embedding_sample.shape)
    else:
        print(f"Sample {idx} has a different type.")


Embedding Type: <class 'NoneType'>
Sample 9985 has a different type.
Sample 5309 has a different type.
Sample 7185 has a different type.
Sample 12082 has a different type.
Sample 10600 has a different type.


In [5]:
# Two dataset where one interlocutor is client and the other is therapist
client_df = df[df['interlocutor'] == 'client']
therapist_df = df[df['interlocutor'] == 'therapist']

In [6]:
# # Determine the maximum sequence length for client embeddings
# max_sequence_length_client = client_df['client_vggish_emb'].apply(lambda x: x.shape[0] if isinstance(x, np.ndarray) else 0).max()

# # Padding function using PyTorch
# def pad_sequences_torch(embeddings, max_len):
#     padded_embeddings = torch.zeros((len(embeddings), max_len, 128))  # Assuming embeddings have 128 features
#     for idx, emb in enumerate(embeddings):
#         if isinstance(emb, np.ndarray):
#             length = min(emb.shape[0], max_len)
#             padded_embeddings[idx, :length] = torch.tensor(emb[:length])
#     return padded_embeddings

# # Pad client embeddings
# padded_client_embeddings = pad_sequences_torch(client_df['client_vggish_emb'].tolist(), max_sequence_length_client)

# print(max_sequence_length_client)
# print(padded_client_embeddings.shape)

In [7]:
def pad_sequences(sequences, padding_value=0):
    """
    Pad the sequences to the same length with the given padding value.
    
    Args:
    - sequences (list of torch.Tensor): List of sequences to be padded.
    - padding_value (float): Value used for padding.
    
    Returns:
    - torch.Tensor: Padded sequences.
    """
    
    # Get the feature size from the first 2D tensor in the list
    feature_size = next(seq.size(1) for seq in sequences if seq.dim() == 2)
    
    # Ensure all sequences have at least two dimensions with correct feature size
    sequences = [seq.unsqueeze(-1).repeat(1, feature_size) if seq.dim() == 1 else seq for seq in sequences]
    
    # Get the maximum sequence length
    max_len = max([seq.size(0) for seq in sequences])
    
    # Create a tensor for padded sequences
    padded_seqs = torch.full((len(sequences), max_len, feature_size), padding_value)
    
    # Copy sequences to the padded tensor
    for i, seq in enumerate(sequences):
        padded_seqs[i, :seq.size(0)] = seq
    
    return padded_seqs

In [8]:
# Convert embeddings column to a list of tensors
sequences = [torch.tensor(embedding) for embedding in client_df['client_vggish_emb']]
padded_client_embeddings = pad_sequences(sequences)

In [9]:
X = padded_client_embeddings
y = client_df['client_talk_type'].values  # Adjust if the column name is different or if encoding is required

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
class SimpleLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super(SimpleLSTM, self).__init__()
        
        # LSTM layer
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        
        # Only take the output from the final timestep
        final_out = lstm_out[:, -1, :]
        
        # Fully connected layer
        out = self.fc(final_out)
        return out
    
class DeepLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dims, num_classes, dropout_prob=0.1):
        super(DeepLSTM, self).__init__()
        
        # First LSTM layer
        self.lstm1 = nn.LSTM(input_dim, hidden_dims[0], batch_first=True)
        self.dropout1 = nn.Dropout(dropout_prob)
        
        # Second LSTM layer
        self.lstm2 = nn.LSTM(hidden_dims[0], hidden_dims[1], batch_first=True)
        self.dropout2 = nn.Dropout(dropout_prob)
        
        # Third LSTM layer
        self.lstm3 = nn.LSTM(hidden_dims[1], hidden_dims[2], batch_first=True)
        self.dropout3 = nn.Dropout(dropout_prob)
        
        # Fourth LSTM layer
        self.lstm4 = nn.LSTM(hidden_dims[2], hidden_dims[3], batch_first=True)
        self.dropout4 = nn.Dropout(dropout_prob)
        
        # Fully connected layers
        self.fc1 = nn.Linear(hidden_dims[3], 128)
        self.dropout5 = nn.Dropout(dropout_prob)
        self.fc2 = nn.Linear(128, 64)
        self.dropout6 = nn.Dropout(dropout_prob)
        self.fc3 = nn.Linear(64, num_classes)
        
    def forward(self, x):
        # First LSTM
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        
        # Second LSTM
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        
        # Third LSTM
        x, _ = self.lstm3(x)
        x = self.dropout3(x)
        
        # Fourth LSTM (only keep the output of the last sequence)
        x, _ = self.lstm4(x)
        x = self.dropout4(x[:, -1, :])
        
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.dropout5(x)
        x = F.relu(self.fc2(x))
        x = self.dropout6(x)
        x = self.fc3(x)
        
        return x
    
class LSTMModel(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(LSTMModel, self).__init__()
        
        self.lstm1 = nn.LSTM(input_dim, 256, batch_first=True)
        self.dropout1 = nn.Dropout(0.1)
        
        self.lstm2 = nn.LSTM(256, 128, batch_first=True)
        self.dropout2 = nn.Dropout(0.1)
        
        self.lstm3 = nn.LSTM(128, 64, batch_first=True)
        self.dropout3 = nn.Dropout(0.1)
        
        self.lstm4 = nn.LSTM(64, 32, batch_first=True)
        self.dropout4 = nn.Dropout(0.1)
        
        self.fc1 = nn.Linear(32, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, x):
        x, _ = self.lstm1(x)
        x = self.dropout1(x)
        
        x, _ = self.lstm2(x)
        x = self.dropout2(x)
        
        x, _ = self.lstm3(x)
        x = self.dropout3(x)
        
        x, _ = self.lstm4(x)
        x = self.dropout4(x)
        
        x = x[:, -1, :]  # Use the last time step's output
        
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout4(x)
        
        x = self.fc2(x)
        x = self.relu(x)
        x = self.dropout4(x)
        
        x = self.fc3(x)
        x = self.softmax(x)
        
        return x


In [11]:
# Initialize and fit the encoder on y_train
encoder = LabelEncoder()
encoded_y_train = encoder.fit_transform(y_train)

# Transform y_val using the same encoder
encoded_y_val = encoder.transform(y_val)

# Convert the encoded labels to PyTorch tensors
y_train_tensor = torch.tensor(encoded_y_train)
y_val_tensor = torch.tensor(encoded_y_val)

# For verification, print out the unique classes the encoder recognized
print(encoder.classes_)


['change' 'neutral' 'sustain']


In [12]:
# Convert X_train, X_val, encoded_y_train, and encoded_y_val to tensors if they aren't already
if not isinstance(X_train, torch.Tensor):
    X_train_tensor = torch.stack([torch.Tensor(e) for e in X_train])
else:
    X_train_tensor = X_train

if not isinstance(X_val, torch.Tensor):
    X_val_tensor = torch.stack([torch.Tensor(e) for e in X_val])
else:
    X_val_tensor = X_val

y_train_tensor = torch.tensor(encoded_y_train, dtype=torch.long) if not isinstance(encoded_y_train, torch.Tensor) else encoded_y_train
y_val_tensor = torch.tensor(encoded_y_val, dtype=torch.long) if not isinstance(encoded_y_val, torch.Tensor) else encoded_y_val

# Create TensorDatasets and DataLoaders
train_data = TensorDataset(X_train_tensor, y_train_tensor)
val_data = TensorDataset(X_val_tensor, y_val_tensor)


In [13]:
batch_size = 8
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_data, shuffle=False, batch_size=batch_size)

In [14]:
# # Hyperparameters
# input_dim = 128
# hidden_dim = 64
# output_dim = len(encoder.classes_)  # Number of unique classes in the target label
# num_layers = 2

# model = SimpleLSTM(input_dim, hidden_dim, output_dim, num_layers).to(device)

In [15]:
# # Hyperparameters
# input_dim = 128  # Embedding size
# hidden_dims = [256, 128, 64, 32]
# num_classes = len(encoder.classes_)  # Number of unique classes

# model = DeepLSTM(input_dim, hidden_dims, num_classes).to(device)

In [16]:
model = LSTMModel(input_dim=128, num_classes=3).to(device)

In [17]:
num_epochs = 100
best_f1 = 0  # For tracking the best F1 score

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs.float())
        loss = criterion(outputs, labels)
        
        # Backward pass and optimize
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * inputs.size(0)
    
    # Compute average training loss for the epoch
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_loss:.4f}")
    
    # Evaluate the model on the validation set
    model.eval()
    val_loss = 0.0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            
            outputs = model(inputs.float())
            loss = criterion(outputs, labels)
            val_loss += loss.item() * inputs.size(0)
            
            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    
    # Compute average validation loss and F1 score for the epoch
    val_loss = val_loss / len(val_loader.dataset)
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    print(f"Validation Loss: {val_loss:.4f}, F1 Macro: {f1_macro:.4f}")
    
    # Save model with best F1 score
    if f1_macro > best_f1:
        best_f1 = f1_macro
        torch.save(model.state_dict(), 'best_model_audio_client_vggish.pth')

Epoch 1/100, Train Loss: 0.9328
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 2/100, Train Loss: 0.9269
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 3/100, Train Loss: 0.9269
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 4/100, Train Loss: 0.9269
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 5/100, Train Loss: 0.9269
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 6/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 7/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 8/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 9/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 10/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 11/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 12/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 13/100, Train Loss: 0.9268
Validation Loss: 0.9268, F1 Macro: 0.2563
Epoch 14/100, Train Loss: 0.9268


KeyboardInterrupt: 

In [None]:
# Classification report
from sklearn.metrics import classification_report

# Load the best model
model.load_state_dict(torch.load('best_model_audio_client_vggish.pth'))

# Get predictions
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        outputs = model(inputs.float())
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Print classification report
print(classification_report(all_labels, all_preds, target_names=encoder.classes_))