In [1]:
import pandas as pd

from sklearn.metrics import classification_report, matthews_corrcoef

import pickle, random

random.seed(0)

In [2]:
gt = pd.read_csv('../dataset/cleaned/test.csv')

true_labels_int = gt['label'].tolist()

with open('../dataset/cleaned/class2idx.pkl', 'rb') as f:
    class2idx = pickle.load(f)

In [7]:
def eval_result(result_file_path):

    if result_file_path != 'baseline':
        model_name = result_file_path.replace('../generated_result/result_from_','').replace('-post-processed','').replace('.txt','')

        with open(result_file_path) as f:
            predictions = f.readlines()

        if model_name == 'llama2-7b':
            ## some lines contain more than 1 words, so get only the first word

            predictions = [s.strip() if len(s.split()) > 0 else '[BLANK]' for s in predictions]
            predictions = [s.split()[0].strip() for s in predictions]

            ## some lines contain uppercase letters, so make them lowercase so that they match keys in dictionary
            predictions = [s.lower() for s in predictions]

            predictions = [class2idx.get(s, -1) for s in predictions]

            labels = list(class2idx.values()) + [-1]

        #     predictions = [int(s.strip()) for s in predictions]

        else:
            predictions = [int(s.strip()) for s in predictions]
            labels = None

    else:
        model_name = 'baseline'
        seq = list(class2idx.values())
        weight = [14.67, 14.43, 12.66, 9.56, 9.34, 7.42, 6.16, 5.53, 5.17, 4.79, 4.07, 3.28, 2.91]

        predictions = random.choices(seq, weight,k=len(true_labels_int))
        labels = None

    print('evaluating results from', model_name)

    print(classification_report(
        true_labels_int, 
        predictions, 
        labels = labels,
        target_names=list(class2idx.keys())
        ))

    print('MCC:', round(matthews_corrcoef(true_labels_int, predictions),4)*100,'%')

In [8]:
eval_result('../generated_result/result_from_llama2-7b-post-processed.txt')

evaluating results from llama2-7b
              precision    recall  f1-score   support

       sport       0.97      0.79      0.87       917
        film       0.86      0.87      0.86       895
       music       0.86      0.89      0.87       785
     culture       0.73      0.18      0.29       598
        food       0.91      0.85      0.88       580
       world       0.86      0.38      0.53       467
    business       0.58      0.75      0.65       387
 environment       0.82      0.28      0.42       344
       money       0.95      0.11      0.20       321
     fashion       0.97      0.67      0.79       300
  technology       0.60      0.75      0.67       254
     science       0.55      0.50      0.52       204
       games       0.93      0.79      0.86       177

    accuracy                           0.65      6229
   macro avg       0.76      0.56      0.60      6229
weighted avg       0.84      0.65      0.69      6229

MCC: 63.65 %


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
eval_result('../generated_result/result_from_bert.txt')

evaluating results from bert
              precision    recall  f1-score   support

       sport       0.99      0.98      0.99       917
        film       0.86      0.95      0.90       895
       music       0.89      0.94      0.92       785
     culture       0.84      0.64      0.72       598
        food       0.92      0.97      0.94       580
       world       0.91      0.84      0.87       467
    business       0.77      0.69      0.73       387
 environment       0.82      0.85      0.84       344
       money       0.88      0.82      0.85       321
     fashion       0.93      0.95      0.94       300
  technology       0.75      0.90      0.82       254
     science       0.81      0.85      0.83       204
       games       0.92      0.95      0.94       177

    accuracy                           0.88      6229
   macro avg       0.87      0.87      0.87      6229
weighted avg       0.88      0.88      0.88      6229

MCC: 87.11 %


In [13]:
eval_result('../generated_result/result_from_deberta.txt')

evaluating results from deberta
              precision    recall  f1-score   support

       sport       0.98      0.99      0.99       917
        film       0.88      0.94      0.91       895
       music       0.88      0.96      0.92       785
     culture       0.79      0.65      0.72       598
        food       0.97      0.92      0.95       580
       world       0.95      0.81      0.87       467
    business       0.73      0.78      0.75       387
 environment       0.81      0.86      0.84       344
       money       0.91      0.76      0.83       321
     fashion       0.98      0.89      0.93       300
  technology       0.78      0.90      0.84       254
     science       0.69      0.92      0.79       204
       games       0.94      0.93      0.94       177

    accuracy                           0.88      6229
   macro avg       0.87      0.87      0.87      6229
weighted avg       0.89      0.88      0.88      6229

MCC: 87.02 %


In [21]:
eval_result('baseline')

evaluating results from baseline
              precision    recall  f1-score   support

       sport       0.16      0.16      0.16       917
        film       0.14      0.14      0.14       895
       music       0.12      0.12      0.12       785
     culture       0.09      0.08      0.08       598
        food       0.11      0.11      0.11       580
       world       0.07      0.06      0.07       467
    business       0.08      0.08      0.08       387
 environment       0.06      0.06      0.06       344
       money       0.04      0.04      0.04       321
     fashion       0.03      0.03      0.03       300
  technology       0.05      0.05      0.05       254
     science       0.05      0.05      0.05       204
       games       0.01      0.02      0.02       177

    accuracy                           0.10      6229
   macro avg       0.08      0.08      0.08      6229
weighted avg       0.10      0.10      0.10      6229

MCC: 0.06 %
