In [2]:
import pandas as pd

# Ground-truth genre labels and model scores for the evaluation set.
# NOTE(review): the file names carry different version tags (v1 vs v2) —
# confirm both files come from the same evaluation split.
actual_df = pd.read_csv('./evaluation_test/v1_actual.csv')
predictions_df = pd.read_csv('./evaluation_test/v2_predictions.csv')

# Every metric below pairs the two frames row by row, so a reordered or
# truncated file would silently produce wrong scores. Fail loudly instead.
assert len(actual_df) == len(predictions_df), "actual/prediction row counts differ"
assert (actual_df['id'].to_numpy() == predictions_df['id'].to_numpy()).all(), \
    "actual/prediction rows are not aligned on `id`"

In [3]:
# Preview the ground-truth frame: the displayed rows hold an `id` plus one 0/1 column per genre.
# NOTE(review): the `Unnamed: 0` column looks like a saved index — presumably the CSV
# was written with its index; confirm and drop it at export time if so.
actual_df.head()

Unnamed: 0,id,action,adventure,arcade,cardboard,indie,puzzle,quiz,rpg,simulation,strategy
0,226372,0,0,0,0,1,0,0,1,0,1
1,26491,0,1,0,0,1,0,0,0,0,0
2,176024,0,0,0,1,0,0,0,0,0,0
3,186954,1,0,1,0,0,1,0,0,0,0
4,12910,1,0,0,0,0,0,0,0,0,0


In [4]:
# Preview the prediction frame: same column layout as the ground-truth frame,
# with a fractional score per genre instead of a 0/1 label.
predictions_df.head()

Unnamed: 0,id,action,adventure,arcade,cardboard,indie,puzzle,quiz,rpg,simulation,strategy
0,226372,0.00881,0.159731,0.007491,0.020236,0.72606,0.016342,0.00122,0.170064,0.050393,0.986232
1,26491,0.029391,0.71836,0.012294,0.005714,0.544241,0.346233,0.002419,0.060064,0.035923,0.058145
2,176024,0.004948,0.026186,0.045605,0.677274,0.014303,0.092662,0.109726,0.009931,0.114815,0.413238
3,186954,0.244318,0.058361,0.291578,0.008872,0.027254,0.87321,0.002611,0.002738,0.006501,0.085331
4,12910,0.906381,0.428268,0.109081,0.00102,0.014678,0.079773,0.001986,0.005496,0.018491,0.013735


In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
import string
import torchmetrics

from transformers import pipeline
from tqdm.notebook import tqdm
# Enable tqdm's pandas integration (the `progress_*` methods on pandas objects).
# NOTE(review): no visible cell calls a `progress_*` method — confirm this is still needed.
tqdm.pandas()

In [6]:
%pip install -q torchmetrics watermark

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
# Load the watermark IPython extension; it provides the %watermark magic used in the next cell.
%load_ext watermark

In [8]:
# Record the versions of the imported packages so the scores below can be tied to an environment.
%watermark --iversions

torch       : 2.0.0
transformers: 4.25.1
numpy       : 1.23.5
pandas      : 1.4.3
sklearn     : 1.1.2
torchmetrics: 0.11.4



In [9]:
# Genre columns scored in this notebook. Both frames are indexed with this list,
# so its order fixes the column order of the multilabel metrics.
label_columns = [
    "action",
    "adventure",
    "arcade",
    "cardboard",
    "indie",
    "puzzle",
    "quiz",
    "rpg",
    "simulation",
    "strategy",
]

# 📊 AUC ROC Score
---

In [10]:
# Per-genre ROC AUC, computed with both sklearn and torchmetrics as a cross-check.

for cat in label_columns:
    print(f"Category: {cat}")

    cat_true = actual_df[cat]
    cat_score = predictions_df[cat]

    # sklearn takes (y_true, y_score).
    sklearn_auc = metrics.roc_auc_score(cat_true, cat_score)
    print(f"Sklearn score: {sklearn_auc}")

    # torchmetrics takes (preds, target) — the reverse argument order.
    torchmetrics_auc = torchmetrics.functional.classification.binary_auroc(
        torch.tensor(cat_score.values),
        torch.tensor(cat_true.values),
        thresholds=None,
    )
    print(f"torchmetrics score: {torchmetrics_auc}")

    print("#" * 30)
    print()

Category: action
Sklearn score: 0.9071859540966768
torchmetrics score: 0.907185971736908
##############################

Category: adventure
Sklearn score: 0.870325932159899
torchmetrics score: 0.8703259229660034
##############################

Category: arcade
Sklearn score: 0.8571348652803908
torchmetrics score: 0.8571348190307617
##############################

Category: cardboard
Sklearn score: 0.9344433371676153
torchmetrics score: 0.9344433546066284
##############################

Category: indie
Sklearn score: 0.8713088723805181
torchmetrics score: 0.8713088631629944
##############################

Category: puzzle
Sklearn score: 0.9040885258420909
torchmetrics score: 0.9040884375572205
##############################

Category: quiz
Sklearn score: 0.9461386404054666
torchmetrics score: 0.9461386203765869
##############################

Category: rpg
Sklearn score: 0.9128770294579969
torchmetrics score: 0.9128769636154175
##############################

Category: simulation
Sklea

In [11]:
# Aggregate ROC AUC over all genre columns: first the mean of the column-wise
# scores (each library's default averaging), then the `weighted` average.

labels_true = actual_df[label_columns].values
labels_score = predictions_df[label_columns].values

# Build the tensors once; torchmetrics takes (preds, target), the reverse of
# sklearn's (y_true, y_score).
preds_tensor = torch.tensor(labels_score)
target_tensor = torch.tensor(labels_true)

# Derived from the column list so the two cannot drift apart.
num_labels = len(label_columns)

sklearn_macro = metrics.roc_auc_score(labels_true, labels_score)
print(f"Sklearn score: {sklearn_macro}")
torchmetrics_macro = torchmetrics.functional.classification.multilabel_auroc(
    preds_tensor, target_tensor, num_labels=num_labels, thresholds=None
)
print(f"torchmetrics score: {torchmetrics_macro}")

print()
sklearn_weighted = metrics.roc_auc_score(labels_true, labels_score, average='weighted')
print(f"Sklearn score (weighted): {sklearn_weighted}")
torchmetrics_weighted = torchmetrics.functional.classification.multilabel_auroc(
    preds_tensor, target_tensor, num_labels=num_labels, thresholds=None, average='weighted'
)
print(f"torchmetrics score (weighted): {torchmetrics_weighted}")

Sklearn score: 0.8989146256798497
torchmetrics score: 0.898914635181427

Sklearn score (weighted): 0.8867533519179122
torchmetrics score (weighted): 0.8867533206939697


# 📊 Average Precision Score
---

In [12]:
# Per-genre average precision, computed with both sklearn and torchmetrics as a cross-check.

for cat in label_columns:
    print(f"Category: {cat}")

    cat_true = actual_df[cat]
    cat_score = predictions_df[cat]

    # sklearn takes (y_true, y_score).
    sklearn_ap = metrics.average_precision_score(cat_true, cat_score)
    print(f"Sklearn score: {sklearn_ap}")

    # torchmetrics takes (preds, target) — the reverse argument order.
    torchmetrics_ap = torchmetrics.functional.classification.binary_average_precision(
        torch.tensor(cat_score.values),
        torch.tensor(cat_true.values),
        thresholds=None,
    )
    print(f"torchmetrics score: {torchmetrics_ap}")

    print("#" * 30)
    print()

Category: action
Sklearn score: 0.7679323501636987
torchmetrics score: 0.7679323554039001
##############################

Category: adventure
Sklearn score: 0.7946430805518749
torchmetrics score: 0.7946430444717407
##############################

Category: arcade
Sklearn score: 0.4235999002864591
torchmetrics score: 0.42359989881515503
##############################

Category: cardboard
Sklearn score: 0.39719487731151587
torchmetrics score: 0.39719486236572266
##############################

Category: indie
Sklearn score: 0.8096693450871373
torchmetrics score: 0.8096693754196167
##############################

Category: puzzle
Sklearn score: 0.6633332205879935
torchmetrics score: 0.6633331775665283
##############################

Category: quiz
Sklearn score: 0.4950955990761418
torchmetrics score: 0.4950955808162689
##############################

Category: rpg
Sklearn score: 0.7197555892964795
torchmetrics score: 0.7197555303573608
##############################

Category: simulation


In [13]:
# Aggregate average precision over all genre columns: first the mean of the
# column-wise scores (each library's default averaging), then the `weighted` average.

labels_true = actual_df[label_columns].values
labels_score = predictions_df[label_columns].values

# Build the tensors once; torchmetrics takes (preds, target), the reverse of
# sklearn's (y_true, y_score).
preds_tensor = torch.tensor(labels_score)
target_tensor = torch.tensor(labels_true)

# Derived from the column list so the two cannot drift apart.
num_labels = len(label_columns)

sklearn_macro = metrics.average_precision_score(labels_true, labels_score)
print(f"Sklearn score: {sklearn_macro}")
torchmetrics_macro = torchmetrics.functional.classification.multilabel_average_precision(
    preds_tensor, target_tensor, num_labels=num_labels, thresholds=None
)
print(f"torchmetrics score: {torchmetrics_macro}")

print()
sklearn_weighted = metrics.average_precision_score(labels_true, labels_score, average='weighted')
print(f"Sklearn score (weighted): {sklearn_weighted}")
torchmetrics_weighted = torchmetrics.functional.classification.multilabel_average_precision(
    preds_tensor, target_tensor, num_labels=num_labels, thresholds=None, average='weighted'
)
print(f"torchmetrics score (weighted): {torchmetrics_weighted}")

Sklearn score: 0.6645848360293142
torchmetrics score: 0.6645848155021667

Sklearn score (weighted): 0.760599373954096
torchmetrics score (weighted): 0.7605993747711182
