In [1]:
from tqdm.notebook import tqdm
import pickle
import torch
from torch import nn

In [2]:
# Prefer the GPU when one is present, but fall back to the CPU so the notebook
# still runs end-to-end (Restart & Run All) on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load datasets

In [3]:
# NOTE: pickle.load can execute arbitrary code while deserialising, so
# "test_data.pkl" must come from a trusted source.
with open("test_data.pkl", "rb") as dataset_file:
    test_dataset = pickle.load(dataset_file)

In [4]:
# Cross-entropy criterion passed to the evaluator as `loss_function`;
# reduction="mean" is the library default, written out here for clarity.
loss_fn = nn.CrossEntropyLoss(reduction="mean")

In [6]:
# Checkpoint locations for the two networks used during evaluation.
evs_path = 'weights/evs_model.pth'
main_model_path = "weights/main_model.pth"

# Both checkpoints are used directly as modules later on (`.eval()` is called on
# them and they are invoked as callables), so they must be un-pickled as full
# objects rather than as state dicts. Recent PyTorch releases default
# `weights_only` to True, which rejects such checkpoints, hence the explicit
# `weights_only=False`.
# NOTE: this performs full unpickling and can execute arbitrary code; only load
# checkpoint files that come from a trusted source.
evs = torch.load(evs_path, map_location=device, weights_only=False)
main_model = torch.load(main_model_path, map_location=device, weights_only=False)

In [7]:
import torch
import pandas as pd
from sklearn.metrics import f1_score, average_precision_score
import torch.nn.functional as F
import numpy as np

def pad(batch, classifier=None, dim: int = 512, max_len: int = None, device="cpu"):
    """Pad variable-length sequences to a common length and build a padding mask.

    The input list is left untouched; padded copies are returned instead.

    Args:
        batch: Non-empty list of tensors. Each tensor is extended along its
            first dimension until it has ``max_len`` rows.
        classifier: Optional object exposing ``position_embeddings`` (callable on
            a long tensor of indices) and ``pad_token``. When given, padding rows
            are ``position_embeddings`` evaluated at ``pad_token``; otherwise they
            are zeros of width ``dim``.
        dim: Feature width of the zero padding (only used without ``classifier``).
        max_len: Target length. Defaults to the longest sequence in ``batch``.
        device: Device on which the padded tensors and the mask are created.

    Returns:
        A ``(padded, attention_mask)`` tuple. ``padded`` is the stack of all padded
        sequences. ``attention_mask`` is an integer tensor holding 0 for real
        positions and 1 for padded ones, with one extra leading 0 per row, so it
        is one column wider than ``max_len``.
        NOTE(review): the extra leading column presumably accounts for a token the
        model prepends to each sequence -- confirm against the model code.

    Raises:
        ValueError: If ``batch`` is empty or a sequence is longer than ``max_len``.
    """
    if len(batch) == 0:
        raise ValueError("pad() requires a non-empty batch")

    lengths = [len(seq) for seq in batch]
    if max_len is None:
        max_len = max(lengths)

    padded = []
    attention_mask = []
    with torch.no_grad():
        for seq, length in zip(batch, lengths):
            pad_length = max_len - length
            if pad_length < 0:
                raise ValueError(
                    f"sequence of length {length} exceeds max_len={max_len}"
                )
            seq = seq.to(device)
            if pad_length > 0:
                if classifier is not None:
                    padding = classifier.position_embeddings(
                        torch.ones((pad_length), dtype=torch.long, device=device)
                        * classifier.pad_token
                    )
                else:
                    # Create the zeros directly on the target device; building them
                    # on the CPU makes torch.cat fail whenever `device` is a GPU.
                    padding = torch.zeros(pad_length, dim, device=device)
                seq = torch.cat([seq, padding])
            padded.append(seq)
            # One leading 0, then 0 for every real position and 1 for every pad.
            attention_mask.append([0] * (length + 1) + [1] * pad_length)
        return torch.stack(padded), torch.tensor(attention_mask).to(device)

class Evaluate:
    """Batch-wise evaluation of the audio/video model on a labelled dataset.

    Each dataset item is indexed as ``item[0]`` (a sequence of segments, where
    ``segment[0]`` is the video part and ``segment[1]`` the audio part),
    ``item[1]`` (label tensor) and ``item[-1]`` (video identifier).

    Labels are treated as ordinal classes per label slot: a class >= 1 counts as
    "aspect present" and a class >= 2 counts as "complaint".
    """

    def __init__(
        self,
        main_model,
        evs_model,
        device,
        loss_function,
        multi_model=False,
        batch_size=12,
        num_labels=5,
        num_classes=3,
    ):
        """Store the models (switched to eval mode) and the evaluation settings.

        Args:
            main_model: Model producing the label logits; called as
                ``main_model(inputs, attention_mask=mask)``.
            evs_model: Model applied to the padded video segments of each item.
            device: Device on which padding, masks and labels are placed.
            loss_function: Callable ``loss(logits, targets)`` returning a scalar tensor.
            multi_model: If True the padded audio and video features are summed
                before being fed to ``main_model``; otherwise only audio is used.
            batch_size: Number of dataset items processed per forward pass.
            num_labels: Number of label slots per item in the model output.
            num_classes: Number of classes predicted for each label slot.
        """
        self.main_model = main_model.eval()
        self.evs_model = evs_model.eval()
        self.device = device
        self.multi_model = multi_model
        self.loss = loss_function
        self.batch_size = batch_size
        self.num_labels = num_labels
        self.num_classes = num_classes

    def eval(self, dataset):
        """Run the models over ``dataset`` and compute summary metrics.

        Args:
            dataset: Sliceable sequence of items laid out as described on the class.

        Returns:
            A 5-tuple ``(f1_aspects, f1_complaint, mean_loss, frame, y_scores)``:
            micro-F1 of ``class >= 1``, micro-F1 of ``class >= 2``, the mean of the
            per-batch losses, a DataFrame with per-item targets / predictions /
            scores, and the softmax probabilities for every item and label slot.
        """
        real = []
        pred = []
        y_scores = []
        video_ids = []
        losses = []
        with torch.no_grad():
            for start in range(0, len(dataset), self.batch_size):
                # Slice once per batch instead of once per extracted field.
                samples = dataset[start : start + self.batch_size]
                batch = [sample[0] for sample in samples]
                labels = torch.stack([sample[1] for sample in samples])
                video_ids.extend(sample[-1] for sample in samples)

                audio_data = [
                    torch.stack([segment[1] for segment in item]).squeeze(1)
                    for item in batch
                ]
                video_data = [[segment[0] for segment in item] for item in batch]
                # A segment's video part may be a list of tensors; stack those.
                video_data = [
                    [
                        torch.stack(part).squeeze(1) if type(part) is list else part
                        for part in item
                    ]
                    for item in video_data
                ]
                # Each item's video segments are padded and passed through evs_model.
                video_data = [
                    self.evs_model(*pad(item, self.evs_model, device=self.device))
                    for item in video_data
                ]
                # NOTE(review): the video padding mask is never used below; only the
                # audio mask reaches main_model. Confirm this is intended.
                video_data, _video_mask = pad(
                    video_data, self.main_model, device=self.device
                )
                audio_data, a_mask = pad(
                    audio_data, self.main_model, device=self.device
                )
                if self.multi_model:
                    # NOTE(review): element-wise sum assumes the padded audio and
                    # video tensors have matching shapes -- confirm.
                    final_data = audio_data + video_data
                else:
                    final_data = audio_data
                label_output = self.main_model(final_data, attention_mask=a_mask)
                losses.append(
                    self.loss(
                        label_output.view(-1, self.num_classes),
                        labels.to(self.device).view(-1),
                    )
                )

                per_label_logits = label_output.view(
                    -1, self.num_labels, self.num_classes
                )
                # Class probabilities for every label slot.
                y_scores.append(F.softmax(per_label_logits, dim=-1))
                pred.append(torch.argmax(per_label_logits, dim=-1))
                real.append(labels)

        real, pred = torch.cat(real).cpu(), torch.cat(pred).cpu()
        y_scores = torch.cat(y_scores).cpu()
        # Ranking scores: highest probability among classes >= 1 (aspects) and
        # among classes >= 2 (complaint).
        y_scores_aspects = y_scores[:, :, 1:].max(dim=-1).values.numpy()
        y_scores_complaint = y_scores[:, :, 2:].max(dim=-1).values.numpy()

        return (
            f1_score(real >= 1, pred >= 1, average="micro"),
            f1_score(real >= 2, pred >= 2, average="micro"),
            # Mean of per-batch losses; a smaller final batch therefore carries
            # the same weight as a full one.
            torch.mean(torch.stack(losses)).item(),
            pd.DataFrame(
                {
                    "Video ID": video_ids,
                    "Aspects real": (real >= 1).to(torch.long).tolist(),
                    "Aspects pred": (pred >= 1).to(torch.long).tolist(),
                    "Complaint real": (real >= 2).to(torch.long).tolist(),
                    "Complaint pred": (pred >= 2).to(torch.long).tolist(),
                    "Complaint scores": y_scores_complaint.tolist(),
                    "Aspects scores": y_scores_aspects.tolist(),
                }
            ),
            y_scores,
        )


In [8]:
# Build the evaluator with audio+video fusion enabled and score the test set.
evaluator = Evaluate(
    main_model=main_model,
    evs_model=evs,
    device=device,
    loss_function=loss_fn,
    multi_model=True,
)
(f1_aspect, f1_complaint, loss_cal, data, y_scores) = evaluator.eval(test_dataset)
f1_aspect

0.6025641025641025

In [9]:
# Work on a separate copy so the frame returned by the evaluator stays untouched
# (deep=True is the pandas default, written out for clarity).
df = data.copy(deep=True)
df

Unnamed: 0,Video ID,Aspects real,Aspects pred,Complaint real,Complaint pred,Complaint scores,Aspects scores
0,51e4YIHE5sU.mp4,"[0, 1, 0, 0, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0.2906479835510254, 0.07512342184782028, 0.28...","[0.2906479835510254, 0.07512342184782028, 0.30..."
1,FI65fJQ6bEM.mp4,"[0, 0, 0, 1, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]","[0.016603317111730576, 0.08133836090564728, 0....","[0.2702391743659973, 0.1317998617887497, 0.031..."
2,1705732629717258718.mp4,"[0, 0, 0, 0, 1]","[1, 0, 0, 1, 0]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 0]","[0.08062469214200974, 0.06945031881332397, 0.0...","[0.659968376159668, 0.12646417319774628, 0.032..."
3,1705745405131051310.mp4,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0.014080013148486614, 0.023925885558128357, 0...","[0.02765473537147045, 0.08400814235210419, 0.2..."
4,yrgnxaAOuws.mp4,"[0, 1, 0, 0, 0]","[0, 0, 0, 0, 0]","[0, 1, 0, 0, 0]","[0, 0, 0, 0, 0]","[0.12284839898347855, 0.3853851556777954, 0.02...","[0.12284839898347855, 0.3853851556777954, 0.06..."
...,...,...,...,...,...,...,...
59,k4gZTf2LVdU.mp4,"[1, 0, 1, 1, 0]","[0, 0, 1, 1, 0]","[0, 0, 0, 0, 0]","[0, 0, 0, 0, 0]","[0.09134558588266373, 0.019946636632084846, 0....","[0.4150772988796234, 0.11562275886535645, 0.83..."
60,J8-pMPud7gE.mp4,"[0, 1, 1, 1, 0]","[0, 0, 1, 1, 0]","[0, 1, 0, 1, 0]","[0, 0, 0, 0, 0]","[0.011677383445203304, 0.010007177479565144, 0...","[0.1897369623184204, 0.045635491609573364, 0.6..."
61,b6PolFcMVpI.mp4,"[0, 0, 1, 1, 1]","[0, 0, 1, 1, 0]","[0, 0, 0, 1, 0]","[0, 0, 0, 0, 0]","[0.07080940157175064, 0.05303401127457619, 0.3...","[0.3958565592765808, 0.13174274563789368, 0.51..."
62,1468611338192797702.mp4,"[0, 0, 0, 0, 1]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 1]","[0, 0, 0, 0, 1]","[0.09551646560430527, 0.12740866839885712, 0.0...","[0.09551646560430527, 0.12740866839885712, 0.0..."


In [10]:
def expand_list_columns(frame):
    """Return a copy of ``frame`` with every list-valued column split into scalars.

    A column whose first entry is a ``list`` is replaced by columns named
    ``"<column>_1" ... "<column>_k"`` (one per list position). All other columns
    are kept first, in their original order, followed by the expanded columns in
    the order their source columns appeared. ``frame`` itself is not modified.
    """
    kept_columns = []
    expanded_parts = []
    for col in frame.columns:
        series = frame[col]
        # Positional access: works for any index and is guarded for empty frames.
        if len(series) > 0 and isinstance(series.iloc[0], list):
            values = np.array(series.tolist())
            names = [f'{col}_{i+1}' for i in range(values.shape[1])]
            expanded_parts.append(
                pd.DataFrame(values, index=frame.index, columns=names)
            )
        else:
            kept_columns.append(col)
    return pd.concat([frame[kept_columns], *expanded_parts], axis=1)


df = expand_list_columns(df)
df.columns

Index(['Video ID', 'Aspects real_1', 'Aspects real_2', 'Aspects real_3',
       'Aspects real_4', 'Aspects real_5', 'Aspects pred_1', 'Aspects pred_2',
       'Aspects pred_3', 'Aspects pred_4', 'Aspects pred_5',
       'Complaint real_1', 'Complaint real_2', 'Complaint real_3',
       'Complaint real_4', 'Complaint real_5', 'Complaint pred_1',
       'Complaint pred_2', 'Complaint pred_3', 'Complaint pred_4',
       'Complaint pred_5', 'Complaint scores_1', 'Complaint scores_2',
       'Complaint scores_3', 'Complaint scores_4', 'Complaint scores_5',
       'Aspects scores_1', 'Aspects scores_2', 'Aspects scores_3',
       'Aspects scores_4', 'Aspects scores_5'],
      dtype='object')

In [11]:
from sklearn.metrics import *
# Aggregate multi-label metrics over all label slots, for both tasks.

def label_columns(prefix, num_labels=5):
    """Return the expanded column names '<prefix>_1' ... '<prefix>_<num_labels>'."""
    return [f'{prefix}_{i}' for i in range(1, num_labels + 1)]


# Binary indicator matrices (rows = videos, columns = label slots).
y_true_aspects = df[label_columns('Aspects real')].values
y_pred_aspects = df[label_columns('Aspects pred')].values

y_true_complaint = df[label_columns('Complaint real')].values
y_pred_complaint = df[label_columns('Complaint pred')].values

# Continuous scores used by the ranking-based metrics.
y_complaint_scores = df[label_columns('Complaint scores')].values
y_aspects_scores = df[label_columns('Aspects scores')].values

# Hamming loss
hamming_loss_aspects = hamming_loss(y_true_aspects, y_pred_aspects)
print(f'Hamming Loss (Aspects): {hamming_loss_aspects}')
hamming_loss_complaint = hamming_loss(y_true_complaint, y_pred_complaint)
print(f'Hamming Loss (Complaint): {hamming_loss_complaint}')

# Coverage error
coverage_error_aspect = coverage_error(y_true_aspects, y_aspects_scores)
print(f'Coverage Error (Aspect): {coverage_error_aspect}')
coverage_error_complaint = coverage_error(y_true_complaint, y_complaint_scores)
print(f'Coverage Error (Complaint): {coverage_error_complaint}')

# Average precision
average_precision_score_aspect = average_precision_score(y_true_aspects, y_aspects_scores)
print(f'Average Precision Score (Aspect): {average_precision_score_aspect}')
average_precision_score_complaint = average_precision_score(y_true_complaint, y_complaint_scores)
print(f'Average Precision Score (Complaint): {average_precision_score_complaint}')

# Label ranking loss
ranking_loss_aspect = label_ranking_loss(y_true_aspects, y_aspects_scores)
print(f'Ranking Loss (Aspect): {ranking_loss_aspect}')
ranking_loss_complaint = label_ranking_loss(y_true_complaint, y_complaint_scores)
print(f'Ranking Loss (Complaint): {ranking_loss_complaint}')

# Micro / macro F1 for aspects
micro_f1_aspects = f1_score(y_true_aspects, y_pred_aspects, average='micro')
macro_f1_aspects = f1_score(y_true_aspects, y_pred_aspects, average='macro')
print(f'Micro-F1 Score (Aspects): {micro_f1_aspects}')
print(f'Macro-F1 Score (Aspects): {macro_f1_aspects}')

# Micro / macro F1 for complaints
micro_f1_complaint = f1_score(y_true_complaint, y_pred_complaint, average='micro')
macro_f1_complaint = f1_score(y_true_complaint, y_pred_complaint, average='macro')
print(f'Micro-F1 Score (Complaint): {micro_f1_complaint}')
print(f'Macro-F1 Score (Complaint): {macro_f1_complaint}')

# Subset accuracy: a row counts as correct only if every label slot matches.
print(f'Accuracy (Aspect): {accuracy_score(y_true_aspects, y_pred_aspects)}')
print(f'Accuracy (Complaint): {accuracy_score(y_true_complaint, y_pred_complaint)}')

# Zero-one loss (complement of the subset accuracy above)
zero_one_loss_aspects = zero_one_loss(y_true_aspects, y_pred_aspects)
print(f'Zero-One Loss (Aspects): {zero_one_loss_aspects}')

zero_one_loss_complaint = zero_one_loss(y_true_complaint, y_pred_complaint)
print(f'Zero-One Loss (Complaint): {zero_one_loss_complaint}')


Hamming Loss (Aspects): 0.19375
Hamming Loss (Complaint): 0.115625
Coverage Error (Aspect): 2.484375
Coverage Error (Complaint): 1.171875
Average Precision Score (Aspect): 0.5468275390329145
Avrerage Precision Score (Complaint): 0.4113288894706388
Ranking Loss (Aspect): 0.22526041666666669
Ranking Loss (Complaint): 0.15104166666666666
Micro-F1 Score (Aspects): 0.6025641025641025
Macro-F1 Score (Aspects): 0.45386310604096447
Micro-F1 Score (Complaint): 0.24489795918367346
Macro-F1 Score (Complaint): 0.34523809523809523
Accuracy (Aspect): 0.203125
Accuracy (Complaint): 0.46875
Zero-One Loss (Aspects): 0.796875
Zero-One Loss (Complaint): 0.53125


In [12]:
# Per-label metrics: every label slot is scored on its own as a binary problem.

def per_label_metrics(frame, task, label_idx):
    """Compute the binary metrics for one label slot of one task.

    Reads the columns '<task> real_<label_idx>', '<task> pred_<label_idx>' and
    '<task> scores_<label_idx>' from ``frame`` and returns a dict mapping the
    metric name to its value.

    Coverage error and label ranking loss are not reported here: they are
    defined for multi-label indicator matrices, not for a single label column.
    """
    y_true = frame[f'{task} real_{label_idx}'].values
    y_pred = frame[f'{task} pred_{label_idx}'].values
    y_score = frame[f'{task} scores_{label_idx}'].values
    return {
        'Hamming Loss': hamming_loss(y_true, y_pred),
        'Accuracy': accuracy_score(y_true, y_pred),
        'Average Precision Score': average_precision_score(y_true, y_score),
        'Zero-One Loss': zero_one_loss(y_true, y_pred),
        'Micro-F1 Score': f1_score(y_true, y_pred, average='micro'),
        'Macro-F1 Score': f1_score(y_true, y_pred, average='macro'),
    }


def print_metric_table(title, results):
    """Print ``title`` followed by each entry of ``results`` and its metrics."""
    print(title)
    for entry_name, metrics in results.items():
        print(f"{entry_name}:")
        for metric_name, value in metrics.items():
            print(f"\t{metric_name}: {value}")


# Results keyed by 'Aspect_<i>' / 'Complaint_<i>' for label slots 1..5.
metrics_results_aspects = {
    f'Aspect_{i}': per_label_metrics(df, 'Aspects', i) for i in range(1, 6)
}
metrics_results_complaint = {
    f'Complaint_{i}': per_label_metrics(df, 'Complaint', i) for i in range(1, 6)
}

print_metric_table("Metrics Results for Aspects:", metrics_results_aspects)
print_metric_table("\nMetrics Results for Complaint:", metrics_results_complaint)

Metrics Results for Aspects:
Aspect_1:
	Hamming Loss: 0.234375
	Accuracy: 0.765625
	Average Precision Score: 0.34822315018292777
	Zero-One Loss: 0.234375
	Micro-F1 Score: 0.765625
	Macro-F1 Score: 0.5727636849132176
Aspect_2:
	Hamming Loss: 0.1875
	Accuracy: 0.8125
	Average Precision Score: 0.19883875318027003
	Zero-One Loss: 0.1875
	Micro-F1 Score: 0.8125
	Macro-F1 Score: 0.518796992481203
Aspect_3:
	Hamming Loss: 0.078125
	Accuracy: 0.921875
	Average Precision Score: 0.8668977592768867
	Zero-One Loss: 0.078125
	Micro-F1 Score: 0.921875
	Macro-F1 Score: 0.8885405781957506
Aspect_4:
	Hamming Loss: 0.28125
	Accuracy: 0.71875
	Average Precision Score: 0.8367561235511078
	Zero-One Loss: 0.28125
	Micro-F1 Score: 0.71875
	Macro-F1 Score: 0.708502024291498
Aspect_5:
	Hamming Loss: 0.1875
	Accuracy: 0.8125
	Average Precision Score: 0.4834219089733796
	Zero-One Loss: 0.1875
	Micro-F1 Score: 0.8125
	Macro-F1 Score: 0.5714285714285714

Metrics Results for Complaint:
Complaint_1:
	Hamming Loss: 0