In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import TrainerCallback, TrainerState, TrainerControl

In [2]:
# Load the source frame from a local pickle file (presumably AnnoMI utterances with
# wav2vec features, judging by the file name and the columns used below — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load this file if it comes from a trusted source.
df = pd.read_pickle('AnnoMI-wav2vec-new.pkl')

In [3]:
# Use the GPU when PyTorch reports that CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# (Disabled) Group-wise split: assign whole videos, identified by their unique video_title,
# to either the train set (80% of the videos) or the test set (20% of the videos).
# NOTE: the commented-out code below operates on df_client, not df_therapist.

# video_titles = df_client['video_title'].unique()

# train_video_titles, test_video_titles = train_test_split(video_titles, test_size=0.2, random_state=42)

# train_df = df_client[df_client['video_title'].isin(train_video_titles)]
# test_df = df_client[df_client['video_title'].isin(test_video_titles)]

In [5]:
# Build the therapist-side dataset: keep only rows whose interlocutor is the therapist and
# expose the embedding / behaviour columns under the generic names 'inputs' and 'labels'.
# (A client-side variant based on 'client_wav2vec_emb' / 'client_talk_type' is disabled here.)
df_therapist = df.loc[
    df['interlocutor'] == 'therapist',
    ['therapist_wav2vec_emb', 'main_therapist_behaviour'],
].rename(columns={'therapist_wav2vec_emb': 'inputs', 'main_therapist_behaviour': 'labels'})

In [6]:
# Distinct therapist labels in order of first appearance; enumerate(labels) in the next
# cell turns this order into the class ids.
labels = pd.unique(df_therapist['labels'])
labels

array(['question', 'therapist_input', 'reflection', 'other'], dtype=object)

In [7]:
# Number the labels in the order given by `labels` and keep both lookup directions.
# Ids are plain integers on both sides (keys of id2label, values of label2id).
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}


In [8]:
# Spot-check the id -> label mapping by displaying the label stored under id 2.
id2label[2]

'reflection'

In [9]:
# Random row-level split of the therapist rows: 20% go to the test set, with a fixed seed.
# NOTE(review): rows are split individually, so rows that share a video_title can land on both
# sides of the split — confirm this is intended, given the disabled video_title-based split above.
#train_data, test_data = train_test_split(df_client, test_size=0.2, random_state=42)
train_data, test_data = train_test_split(df_therapist, test_size=0.2, random_state=42)

In [10]:
def _first_input_values(item):
    """Return the first `input_values` entry of a mapping-like item.

    Items without a `keys` attribute are assumed to have been extracted already
    (presumably a raw array — TODO confirm) and are returned unchanged, so
    re-running this cell does not fail on already-converted data.
    """
    if hasattr(item, 'keys'):
        return item['input_values'][0]
    return item


# `assign` builds new frames instead of writing into the objects returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning.
train_data = train_data.assign(inputs=train_data['inputs'].apply(_first_input_values))
test_data = test_data.assign(inputs=test_data['inputs'].apply(_first_input_values))

In [11]:
class CustomDataset(Dataset):
    """Map-style dataset over a frame with 'inputs' and 'labels' columns.

    Each item is a dict holding the row's 'inputs' value under "input_values"
    and the integer id of the row's label under "labels".
    """

    def __init__(self, dataframe, label2id):
        """Keep references to the frame and to the label -> id mapping.

        Args:
            dataframe: frame providing the 'inputs' and 'labels' columns.
            label2id: mapping from label value to integer class id.
        """
        self.label2id = label2id
        self.data = dataframe

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the item at positional index `idx`.

        Raises:
            KeyError: if the row's label is missing from `label2id`.
        """
        # Positional row lookup, done once and reused for both fields.
        row = self.data.iloc[idx]
        return {
            "input_values": row['inputs'],
            "labels": self.label2id[row['labels']],
        }

# Wrap both splits with the same label mapping so class ids agree between training and evaluation.
train_dataset, eval_dataset = (
    CustomDataset(split, label2id) for split in (train_data, test_data)
)


In [12]:
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer, AutoFeatureExtractor

In [13]:
# Load the feature extractor of the "facebook/wav2vec2-base" checkpoint; it is later handed
# to the Trainer through its `tokenizer` argument.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")



In [14]:
import evaluate
from sklearn.metrics import f1_score


# Accuracy metric loaded through the `evaluate` library; used by compute_metrics below.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for an evaluation result.

    Args:
        eval_pred: object exposing `predictions` (per-class scores with the
            classes on axis 1) and `label_ids` (reference class ids).

    Returns:
        The result of `accuracy.compute` for the arg-max predictions.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    references = eval_pred.label_ids
    return accuracy.compute(references=references, predictions=predicted_ids)

def f1_score_macro(eval_pred):
    """Compute the macro-averaged F1 score for an evaluation result.

    Args:
        eval_pred: a pair `(predictions, labels)`; predictions hold per-class
            scores with the classes on axis 1, labels the reference class ids.

    Returns:
        dict with the single key "f1_macro".
    """
    scores, references = eval_pred
    # Reduce the per-class scores to one predicted class id per sample.
    predicted_ids = np.argmax(scores, axis=1)
    macro_f1 = f1_score(references, predicted_ids, average="macro")
    return {"f1_macro": macro_f1}

class FocalLoss(nn.Module):
    """Multi-class focal loss computed from raw logits.

    Per sample the loss is ``-alpha * (1 - p_t) ** gamma * log(p_t)``, where
    ``p_t`` is the softmax probability of the target class.

    Args:
        alpha: scalar factor applied to the loss. The default of 1 leaves the
            loss unscaled.
        gamma: focusing exponent; ``gamma=0`` reduces the loss to cross-entropy
            (scaled by ``alpha``).
        reduction: reduction mode forwarded to ``F.nll_loss``
            ('mean', 'sum' or 'none').
    """

    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for `inputs` (logits, classes on the last axis) and `targets` (class ids)."""
        log_prob = F.log_softmax(inputs, dim=-1)
        prob = torch.exp(log_prob)
        # Down-weight well-classified samples before picking the target-class entry.
        modulated_log_prob = ((1 - prob) ** self.gamma) * log_prob
        loss = F.nll_loss(modulated_log_prob, targets, reduction=self.reduction)
        # `alpha` used to be stored but never applied; scaling after the reduction
        # is equivalent for a scalar factor and keeps the default result unchanged.
        return self.alpha * loss
    
class ThresholdEarlyStoppingCallback(TrainerCallback):
    """Request a training stop once an evaluation metric exceeds a threshold.

    Both constructor arguments have defaults, so the callback can still be
    created without arguments.

    Args:
        threshold: value the metric must exceed (strictly) to stop training.
        metric_name: key looked up in the metrics dict passed to `on_evaluate`.
    """

    def __init__(self, threshold=0.73, metric_name='eval_f1_macro'):
        self.threshold = threshold
        self.metric_name = metric_name

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Set `control.should_training_stop` when the watched metric is above the threshold.

        An evaluation that does not report the watched metric is ignored instead
        of raising a KeyError.
        """
        score = (metrics or {}).get(self.metric_name)
        if score is not None and score > self.threshold:
            control.should_training_stop = True
        return control


In [15]:
# One output class per entry of the id -> label mapping.
num_labels = len(id2label)

# Load the "facebook/wav2vec2-base" checkpoint for audio classification and attach
# both label mappings to the model.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of Wav2Vec2ForSequenceClassification were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['classifier.bias', 'projector.weight', 'classifier.weight', 'projector.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Training configuration handed to the Trainer below.
training_args = TrainingArguments(
    output_dir="./output_wav2vec",
    # Evaluate and save a checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=256,
    # With a per-device batch of 256, 4 accumulation steps presumably give
    # 256 * 4 = 1024 samples per optimizer step and device — TODO confirm.
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=256,
    # NOTE(review): very large epoch budget; the run relies on ThresholdEarlyStoppingCallback
    # (passed to the Trainer) to request an earlier stop.
    num_train_epochs=2000,
    warmup_ratio=0.1,
    logging_steps=100,
    load_best_model_at_end=True,
    # "f1_macro" is the key returned by f1_score_macro.
    metric_for_best_model="f1_macro",
    save_total_limit=1,
    push_to_hub=False,
)

In [17]:
# Assemble the Trainer from the model, the training arguments and both datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # The feature extractor is passed through the `tokenizer` argument.
    tokenizer=feature_extractor,
    # f1_score_macro returns {"f1_macro": ...}; the callback reads 'eval_f1_macro',
    # presumably the same value with a prefix added by the Trainer — TODO confirm.
    compute_metrics=f1_score_macro,
    # The callback is passed as a class, not as an instance.
    callbacks=[ThresholdEarlyStoppingCallback],
)

In [18]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

  return F.conv1d(input, weight, bias, self.stride,


Epoch,Training Loss,Validation Loss,F1 Macro
0,No log,1.377346,0.157401
1,No log,1.376328,0.157924
2,No log,1.374138,0.159069
4,No log,1.372701,0.156693
4,No log,1.374278,0.155668
5,No log,1.373642,0.152824
6,No log,1.37055,0.150319
8,No log,1.36645,0.144859
8,No log,1.367353,0.143556
9,No log,1.365476,0.15093


TrainOutput(global_step=10000, training_loss=0.5505112571716309, metrics={'train_runtime': 40644.6434, 'train_samples_per_second': 253.514, 'train_steps_per_second': 0.246, 'total_flos': 8.909368196471194e+19, 'train_loss': 0.5505112571716309, 'epoch': 1904.76})

In [19]:
from sklearn.metrics import classification_report

# Get per-class scores and reference ids for the evaluation set, then reduce the
# scores to one predicted class id per sample.
# NOTE(review): `labels` rebinds the name that held the label strings in an earlier cell;
# re-running the label2id cell after this one would use these ids instead.
predictions, labels, _ = trainer.predict(eval_dataset)
predictions = np.argmax(predictions, axis=1)

# Pass the class ids explicitly and derive the names from id2label, so that names and ids
# are guaranteed to line up and a class missing from the evaluation data does not cause a
# mismatch between the number of classes and the number of target names.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]

# Print classification report
print(classification_report(labels, predictions, labels=class_ids, target_names=class_names))

                 precision    recall  f1-score   support

       question       0.48      0.34      0.40       364
therapist_input       0.33      0.23      0.27       187
     reflection       0.42      0.33      0.37       282
          other       0.50      0.74      0.59       455

       accuracy                           0.46      1288
      macro avg       0.43      0.41      0.41      1288
   weighted avg       0.45      0.46      0.44      1288

