In [12]:
import pandas as pd 
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import f1_score

In [3]:
df = pd.read_pickle('Annomi_with_mfccs_scaled_features.pkl')

In [4]:
# Get a dataframe where interlocutor is cleint
df_client = df[df['interlocutor'] == 'client']

In [5]:
X = np.array(df_client['client_mfccs_scaled_features'].tolist())
y = np.array(df_client['client_talk_type'].tolist())

In [6]:
le = LabelEncoder()
le.fit(y)
print('Original labels: ', le.classes_)
y = le.transform(y)
print('Transformed labels: ', np.unique(y))

Original labels:  ['change' 'neutral' 'sustain']
Transformed labels:  [0 1 2]


In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [8]:
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)  # Assuming X is a float. Adjust dtype if necessary.
y_train_tensor = torch.tensor(y_train, dtype=torch.long)  # Assuming y is integer labels
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Define DataLoader instances
batch_size = 64  # You can adjust the batch size as required

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [9]:
# Calculate f1_macro score
def f1_macro(y_true, y_pred):
    y_true = y_true.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    return f1_score(y_true, y_pred, average='macro')

class AudioClassifier(nn.Module):
    def __init__(self):
        super(AudioClassifier, self).__init__()
        
        # First layer
        self.fc1 = nn.Linear(40, 100)
        self.dropout1 = nn.Dropout(0.5)
        
        # Second layer
        self.fc2 = nn.Linear(100, 200)
        self.dropout2 = nn.Dropout(0.5)
        
        # Third layer
        self.fc3 = nn.Linear(200, 100)
        self.dropout3 = nn.Dropout(0.5)
        
        # Final layer
        self.fc4 = nn.Linear(100, 3)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        
        x = F.relu(self.fc3(x))
        x = self.dropout3(x)
        
        x = F.softmax(self.fc4(x), dim=1)
        return x

In [10]:
model = AudioClassifier()

In [14]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

best_val_f1 = 0.0

num_epochs = 100  # For example, train for 10 epochs

# Training loop
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    
    # Training
    train_loss = 0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()   # Reset gradients
        outputs = model(data)   # Forward pass
        loss = criterion(outputs, target)   # Compute loss
        loss.backward()   # Backward pass
        optimizer.step()   # Update weights
        train_loss += loss.item()
    
    # Validation
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    correct = 0
    total = 0
    val_f1 = 0.0
    with torch.no_grad():  # Deactivate autograd
        for batch_idx, (data, target) in enumerate(val_loader):
            outputs = model(data)
            loss = criterion(outputs, target)
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            total += target.size(0)
            correct += predicted.eq(target).sum().item()
    
    # Calculate metrics
    val_f1 = f1_macro(target, predicted)

    # Save the best model
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), 'best_mfcc_model.pth')

    train_loss /= len(train_loader)
    val_loss /= len(val_loader)
    val_accuracy = 100. * correct / total
    
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2f}%")

Epoch 1/100, Train Loss: 0.9252, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 2/100, Train Loss: 0.9244, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 3/100, Train Loss: 0.9258, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 4/100, Train Loss: 0.9266, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 5/100, Train Loss: 0.9245, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 6/100, Train Loss: 0.9273, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 7/100, Train Loss: 0.9280, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 8/100, Train Loss: 0.9252, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 9/100, Train Loss: 0.9259, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 10/100, Train Loss: 0.9252, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 11/100, Train Loss: 0.9259, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 12/100, Train Loss: 0.9266, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 13/100, Train Loss: 0.9245, Val Loss: 0.9316, Val Accuracy: 61.99%
Epoch 14/100, Train Loss: 0.9232, Val Loss: 0.9316, Val Accu

In [None]:
# Classification report
from sklearn.metrics import classification_report
model.load_state_dict(torch.load('best_mfcc_model.pth'))
model.eval()

y_pred = []
y_true = []

with torch.no_grad():
    for data, target in val_loader:
        outputs = model(data)
        _, predicted = outputs.max(1)
        y_pred.extend(predicted.tolist())
        y_true.extend(target.tolist())

print(classification_report(y_true, y_pred, target_names=le.classes_))