In [None]:
# Colab setup: attach Google Drive, then switch to the project folder
import os
from google.colab import drive, userdata

# force_remount=True re-attaches the drive even when it is already mounted
drive.mount('/content/drive', force_remount=True)

# The folder to work from is read from the Colab user-data entry 'CURRENT_DIR'
project_dir = userdata.get('CURRENT_DIR')
os.chdir(project_dir)

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the evaluation metrics for each model.
# Every file sits in the same folder and follows the '<name>_metrics.csv'
# pattern, so the folder and the naming rule are written down once.
SCORES_DIR = 'sentiment_scores'


def load_metrics(name, scores_dir=SCORES_DIR):
    """Read the metrics CSV of one model variant.

    Parameters
    ----------
    name : str
        File stem without the '_metrics.csv' suffix, e.g. 'raw_bert'.
    scores_dir : str
        Folder that holds the metrics files.

    Returns
    -------
    pandas.DataFrame
        The frame parsed by pd.read_csv from '<scores_dir>/<name>_metrics.csv'.
    """
    return pd.read_csv(f'{scores_dir}/{name}_metrics.csv')


# BERT and FinBERT
raw_bert_metrics = load_metrics('raw_bert')
finetuned_bert_metrics = load_metrics('finetuned_bert')
raw_finbert_metrics = load_metrics('raw_finbert')
finetuned_finbert_metrics = load_metrics('finetuned_finbert')

# Llama files without a size tag (labelled as the 3B model further down)
raw_llama_0_shot_metrics = load_metrics('raw_llama_0_shot')
finetuned_llama_0_shot_metrics = load_metrics('finetuned_llama_0_shot')
raw_llama_1_shot_metrics = load_metrics('raw_llama_1_shot')
finetuned_llama_1_shot_metrics = load_metrics('finetuned_llama_1_shot')

# Llama 1B files
raw_llama_1B_0_shot_metrics = load_metrics('raw_llama_1B_0_shot')
finetuned_llama_1B_0_shot_metrics = load_metrics('finetuned_llama_1B_0_shot')
raw_llama_1B_1_shot_metrics = load_metrics('raw_llama_1B_1_shot')
finetuned_llama_1B_1_shot_metrics = load_metrics('finetuned_llama_1B_1_shot')

In [None]:
# Inspect the frame loaded from raw_finbert_metrics.csv
raw_finbert_metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
0,0.325698,0.548631,0.325698,0.290084,0.022785,0.106509,0.037539,169,0.691466,0.121585,0.206806,2599,0.350204,0.724526,0.472178,1423


In [None]:
# Inspect the frame loaded from finetuned_finbert_metrics.csv (note the eval_-prefixed columns)
finetuned_finbert_metrics

Unnamed: 0,eval_loss,eval_accuracy,eval_precision_weighted,eval_recall_weighted,eval_f1_weighted,eval_class_0_precision,eval_class_0_recall,eval_class_0_f1,eval_class_0_support,eval_class_1_precision,...,eval_class_1_f1,eval_class_1_support,eval_class_2_precision,eval_class_2_recall,eval_class_2_f1,eval_class_2_support,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,1.558079,0.4932,0.607289,0.4932,0.484269,0.141762,0.218935,0.172093,169,0.741694,...,0.469629,2599,0.417095,0.799016,0.548084,1423,11.8947,352.342,44.053,5.0


In [None]:
# Inspect the frame loaded from raw_llama_0_shot_metrics.csv
raw_llama_0_shot_metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
0,0.338821,0.534675,0.338821,0.209655,0.075691,0.307692,0.121495,169,0.646154,0.01616,0.031532,2599,0.385577,0.931834,0.545455,1423


In [None]:
# Inspect the frame loaded from finetuned_llama_0_shot_metrics.csv
finetuned_llama_0_shot_metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
0,0.346218,0.550669,0.346218,0.211511,0.075885,0.266272,0.11811,169,0.671875,0.016545,0.032294,2599,0.385682,0.957836,0.549929,1423


In [None]:
# Inspect the frame loaded from raw_llama_1_shot_metrics.csv
raw_llama_1_shot_metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
0,0.341685,0.749196,0.341685,0.177689,0.323529,0.065089,0.108374,169,1.0,0.000385,0.000769,2599,0.341675,0.997892,0.509052,1423


In [None]:
# Inspect the frame loaded from finetuned_llama_1_shot_metrics.csv
finetuned_llama_1_shot_metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
0,0.419232,0.56428,0.419232,0.365298,0.461538,0.035503,0.065934,169,0.681818,0.196229,0.304751,2599,0.361808,0.872101,0.511436,1423


In [None]:
# The fine-tuned FinBERT answers were scored with compute_metrics() instead of our
# custom get_metrics() function, which is why its columns are named differently.
# First step of the renaming: collect that model's column names.
eval_columns = list(finetuned_finbert_metrics.columns)
eval_columns

['eval_loss',
 'eval_accuracy',
 'eval_precision_weighted',
 'eval_recall_weighted',
 'eval_f1_weighted',
 'eval_class_0_precision',
 'eval_class_0_recall',
 'eval_class_0_f1',
 'eval_class_0_support',
 'eval_class_1_precision',
 'eval_class_1_recall',
 'eval_class_1_f1',
 'eval_class_1_support',
 'eval_class_2_precision',
 'eval_class_2_recall',
 'eval_class_2_f1',
 'eval_class_2_support',
 'eval_runtime',
 'eval_samples_per_second',
 'eval_steps_per_second',
 'epoch']

In [None]:
# Discard the loss / runtime / throughput / epoch columns; everything else is a metric we keep
non_metric_columns = {'eval_loss', 'eval_runtime', 'eval_samples_per_second', 'eval_steps_per_second', 'epoch'}
eval_columns = [name for name in eval_columns if name not in non_metric_columns]
eval_columns

['eval_accuracy',
 'eval_precision_weighted',
 'eval_recall_weighted',
 'eval_f1_weighted',
 'eval_class_0_precision',
 'eval_class_0_recall',
 'eval_class_0_f1',
 'eval_class_0_support',
 'eval_class_1_precision',
 'eval_class_1_recall',
 'eval_class_1_f1',
 'eval_class_1_support',
 'eval_class_2_precision',
 'eval_class_2_recall',
 'eval_class_2_f1',
 'eval_class_2_support']

In [None]:
# Keep only the metric columns for the two models whose files use the eval_ layout.
# .copy() makes each result an independent DataFrame instead of a slice of the
# loaded one, so adding columns to it later does not raise SettingWithCopyWarning.
finetuned_finbert_metrics = finetuned_finbert_metrics[eval_columns].copy()
finetuned_bert_metrics = finetuned_bert_metrics[eval_columns].copy()
finetuned_finbert_metrics

Unnamed: 0,eval_accuracy,eval_precision_weighted,eval_recall_weighted,eval_f1_weighted,eval_class_0_precision,eval_class_0_recall,eval_class_0_f1,eval_class_0_support,eval_class_1_precision,eval_class_1_recall,eval_class_1_f1,eval_class_1_support,eval_class_2_precision,eval_class_2_recall,eval_class_2_f1,eval_class_2_support
0,0.4932,0.607289,0.4932,0.484269,0.141762,0.218935,0.172093,169,0.741694,0.343594,0.469629,2599,0.417095,0.799016,0.548084,1423


In [None]:
# Fine-tuned BERT metrics after keeping only the columns listed in eval_columns
finetuned_bert_metrics

Unnamed: 0,eval_accuracy,eval_precision_weighted,eval_recall_weighted,eval_f1_weighted,eval_class_0_precision,eval_class_0_recall,eval_class_0_f1,eval_class_0_support,eval_class_1_precision,eval_class_1_recall,eval_class_1_f1,eval_class_1_support,eval_class_2_precision,eval_class_2_recall,eval_class_2_f1,eval_class_2_support
0,0.499642,0.593818,0.499642,0.496726,0.15847,0.171598,0.164773,169,0.725291,0.383994,0.502138,2599,0.405395,0.749824,0.526264,1423


In [None]:
# Remove the eval_ prefix from each column's name.
# The pattern is anchored with '^' so only a leading 'eval_' is removed (never an
# 'eval_' in the middle of a name), and regex=True is passed explicitly because
# the default of str.replace is not the same in every pandas version.
finetuned_finbert_metrics.columns = finetuned_finbert_metrics.columns.str.replace(r'^eval_', '', regex=True)
finetuned_bert_metrics.columns = finetuned_bert_metrics.columns.str.replace(r'^eval_', '', regex=True)
finetuned_finbert_metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
0,0.4932,0.607289,0.4932,0.484269,0.141762,0.218935,0.172093,169,0.741694,0.343594,0.469629,2599,0.417095,0.799016,0.548084,1423


In [None]:
# Fine-tuned BERT metrics after the eval_ prefix has been removed from the column names
finetuned_bert_metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
0,0.499642,0.593818,0.499642,0.496726,0.15847,0.171598,0.164773,169,0.725291,0.383994,0.502138,2599,0.405395,0.749824,0.526264,1423


In [None]:
# Add a column to specify the model.
# .assign returns a new DataFrame with the extra column instead of writing into
# the existing one, so frames that were derived by column selection do not
# trigger SettingWithCopyWarning, and re-running this cell gives the same result.
raw_bert_metrics = raw_bert_metrics.assign(model='pretrained_bert')
finetuned_bert_metrics = finetuned_bert_metrics.assign(model='finetuned_bert')
raw_finbert_metrics = raw_finbert_metrics.assign(model='pretrained_finbert')
finetuned_finbert_metrics = finetuned_finbert_metrics.assign(model='finetuned_finbert')

# The Llama files without a size tag are labelled as the 3B model
raw_llama_0_shot_metrics = raw_llama_0_shot_metrics.assign(model='pretrained_llama_3B_0_shot')
finetuned_llama_0_shot_metrics = finetuned_llama_0_shot_metrics.assign(model='finetuned_llama_3B_0_shot')
raw_llama_1_shot_metrics = raw_llama_1_shot_metrics.assign(model='pretrained_llama_3B_1_shot')
finetuned_llama_1_shot_metrics = finetuned_llama_1_shot_metrics.assign(model='finetuned_llama_3B_1_shot')

raw_llama_1B_0_shot_metrics = raw_llama_1B_0_shot_metrics.assign(model='pretrained_llama_1B_0_shot')
finetuned_llama_1B_0_shot_metrics = finetuned_llama_1B_0_shot_metrics.assign(model='finetuned_llama_1B_0_shot')
raw_llama_1B_1_shot_metrics = raw_llama_1B_1_shot_metrics.assign(model='pretrained_llama_1B_1_shot')
finetuned_llama_1B_1_shot_metrics = finetuned_llama_1B_1_shot_metrics.assign(model='finetuned_llama_1B_1_shot')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  finetuned_finbert_metrics['model'] = 'finetuned_finbert'


In [None]:
# Stack the per-model frames on top of each other.
# Order: BERT, FinBERT, Llama 1B (0-shot, 1-shot), Llama 3B (0-shot, 1-shot),
# each given as the pretrained frame followed by the fine-tuned one.
frames_in_order = [
    raw_bert_metrics,
    finetuned_bert_metrics,
    raw_finbert_metrics,
    finetuned_finbert_metrics,
    raw_llama_1B_0_shot_metrics,
    finetuned_llama_1B_0_shot_metrics,
    raw_llama_1B_1_shot_metrics,
    finetuned_llama_1B_1_shot_metrics,
    raw_llama_0_shot_metrics,
    finetuned_llama_0_shot_metrics,
    raw_llama_1_shot_metrics,
    finetuned_llama_1_shot_metrics,
]
metrics = pd.concat(frames_in_order, axis='index')
metrics

Unnamed: 0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support,model
0,0.040325,0.001626,0.040325,0.003126,0.040325,1.0,0.077523,169,0.0,0.0,0.0,2599,0.0,0.0,0.0,1423,pretrained_bert
0,0.499642,0.593818,0.499642,0.496726,0.15847,0.171598,0.164773,169,0.725291,0.383994,0.502138,2599,0.405395,0.749824,0.526264,1423,finetuned_bert
0,0.325698,0.548631,0.325698,0.290084,0.022785,0.106509,0.037539,169,0.691466,0.121585,0.206806,2599,0.350204,0.724526,0.472178,1423,pretrained_finbert
0,0.4932,0.607289,0.4932,0.484269,0.141762,0.218935,0.172093,169,0.741694,0.343594,0.469629,2599,0.417095,0.799016,0.548084,1423,finetuned_finbert
0,0.572417,0.508777,0.572417,0.52557,0.0,0.0,0.0,169,0.622747,0.811081,0.704545,2599,0.361042,0.204498,0.261104,1423,pretrained_llama_1B_0_shot
0,0.488189,0.497492,0.488189,0.487291,0.0,0.0,0.0,169,0.612455,0.525972,0.565928,2599,0.346605,0.477161,0.401538,1423,finetuned_llama_1B_0_shot
0,0.461465,0.588249,0.461465,0.429816,0.0,0.0,0.0,169,0.743408,0.282032,0.408926,2599,0.374727,0.843992,0.519015,1423,pretrained_llama_1B_1_shot
0,0.347888,0.651837,0.347888,0.190953,0.0,0.0,0.0,169,0.863636,0.014621,0.028755,2599,0.342416,0.997892,0.509874,1423,finetuned_llama_1B_1_shot
0,0.338821,0.534675,0.338821,0.209655,0.075691,0.307692,0.121495,169,0.646154,0.01616,0.031532,2599,0.385577,0.931834,0.545455,1423,pretrained_llama_3B_0_shot
0,0.346218,0.550669,0.346218,0.211511,0.075885,0.266272,0.11811,169,0.671875,0.016545,0.032294,2599,0.385682,0.957836,0.549929,1423,finetuned_llama_3B_0_shot


In [None]:
# Use the model label as the row index.
# Reassigning (instead of inplace=True) keeps the step explicit, and the guard
# makes the cell safe to run again: once 'model' is the index there is no
# 'model' column left, so a second set_index call would raise KeyError.
if metrics.index.name != 'model':
    metrics = metrics.set_index('model')
metrics

Unnamed: 0_level_0,accuracy,precision_weighted,recall_weighted,f1_weighted,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
pretrained_bert,0.040325,0.001626,0.040325,0.003126,0.040325,1.0,0.077523,169,0.0,0.0,0.0,2599,0.0,0.0,0.0,1423
finetuned_bert,0.499642,0.593818,0.499642,0.496726,0.15847,0.171598,0.164773,169,0.725291,0.383994,0.502138,2599,0.405395,0.749824,0.526264,1423
pretrained_finbert,0.325698,0.548631,0.325698,0.290084,0.022785,0.106509,0.037539,169,0.691466,0.121585,0.206806,2599,0.350204,0.724526,0.472178,1423
finetuned_finbert,0.4932,0.607289,0.4932,0.484269,0.141762,0.218935,0.172093,169,0.741694,0.343594,0.469629,2599,0.417095,0.799016,0.548084,1423
pretrained_llama_1B_0_shot,0.572417,0.508777,0.572417,0.52557,0.0,0.0,0.0,169,0.622747,0.811081,0.704545,2599,0.361042,0.204498,0.261104,1423
finetuned_llama_1B_0_shot,0.488189,0.497492,0.488189,0.487291,0.0,0.0,0.0,169,0.612455,0.525972,0.565928,2599,0.346605,0.477161,0.401538,1423
pretrained_llama_1B_1_shot,0.461465,0.588249,0.461465,0.429816,0.0,0.0,0.0,169,0.743408,0.282032,0.408926,2599,0.374727,0.843992,0.519015,1423
finetuned_llama_1B_1_shot,0.347888,0.651837,0.347888,0.190953,0.0,0.0,0.0,169,0.863636,0.014621,0.028755,2599,0.342416,0.997892,0.509874,1423
pretrained_llama_3B_0_shot,0.338821,0.534675,0.338821,0.209655,0.075691,0.307692,0.121495,169,0.646154,0.01616,0.031532,2599,0.385577,0.931834,0.545455,1423
finetuned_llama_3B_0_shot,0.346218,0.550669,0.346218,0.211511,0.075885,0.266272,0.11811,169,0.671875,0.016545,0.032294,2599,0.385682,0.957836,0.549929,1423


In [None]:
# Split into 2 sets of metrics: overall_metrics and class_metrics
overall_metrics = metrics.iloc[:, :4]
class_metrics = metrics.iloc[:, 4:]

In [None]:
# Overall (accuracy and weighted) metrics, one row per model label
overall_metrics

Unnamed: 0_level_0,accuracy,precision_weighted,recall_weighted,f1_weighted
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
pretrained_bert,0.040325,0.001626,0.040325,0.003126
finetuned_bert,0.499642,0.593818,0.499642,0.496726
pretrained_finbert,0.325698,0.548631,0.325698,0.290084
finetuned_finbert,0.4932,0.607289,0.4932,0.484269
pretrained_llama_1B_0_shot,0.572417,0.508777,0.572417,0.52557
finetuned_llama_1B_0_shot,0.488189,0.497492,0.488189,0.487291
pretrained_llama_1B_1_shot,0.461465,0.588249,0.461465,0.429816
finetuned_llama_1B_1_shot,0.347888,0.651837,0.347888,0.190953
pretrained_llama_3B_0_shot,0.338821,0.534675,0.338821,0.209655
finetuned_llama_3B_0_shot,0.346218,0.550669,0.346218,0.211511


Conclusions on the overall performance of each model

1. The fine-tuned models are generally better than the raw models
2. Raw versions: FinBERT performs at nearly the same level as Llama 0-shot (except for a superior f1_weighted), while Llama 1-shot outperforms both on every metric except, once again, f1_weighted
3. Fine-tuned versions: FinBERT attains vastly superior performance over both Llama models, and the Llama 1-shot model also performs generally better than the Llama 0-shot model
4. The overall performances of the models are not that great: while fine-tuned FinBERT is the overall winner here, its metrics stay around the 0.5 mark, with the exception of weighted precision that reaches up to 0.6


In [None]:
# Per-class precision / recall / f1 / support, one row per model label
class_metrics

Unnamed: 0_level_0,class_0_precision,class_0_recall,class_0_f1,class_0_support,class_1_precision,class_1_recall,class_1_f1,class_1_support,class_2_precision,class_2_recall,class_2_f1,class_2_support
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
pretrained_bert,0.040325,1.0,0.077523,169,0.0,0.0,0.0,2599,0.0,0.0,0.0,1423
finetuned_bert,0.15847,0.171598,0.164773,169,0.725291,0.383994,0.502138,2599,0.405395,0.749824,0.526264,1423
pretrained_finbert,0.022785,0.106509,0.037539,169,0.691466,0.121585,0.206806,2599,0.350204,0.724526,0.472178,1423
finetuned_finbert,0.141762,0.218935,0.172093,169,0.741694,0.343594,0.469629,2599,0.417095,0.799016,0.548084,1423
pretrained_llama_1B_0_shot,0.0,0.0,0.0,169,0.622747,0.811081,0.704545,2599,0.361042,0.204498,0.261104,1423
finetuned_llama_1B_0_shot,0.0,0.0,0.0,169,0.612455,0.525972,0.565928,2599,0.346605,0.477161,0.401538,1423
pretrained_llama_1B_1_shot,0.0,0.0,0.0,169,0.743408,0.282032,0.408926,2599,0.374727,0.843992,0.519015,1423
finetuned_llama_1B_1_shot,0.0,0.0,0.0,169,0.863636,0.014621,0.028755,2599,0.342416,0.997892,0.509874,1423
pretrained_llama_3B_0_shot,0.075691,0.307692,0.121495,169,0.646154,0.01616,0.031532,2599,0.385577,0.931834,0.545455,1423
finetuned_llama_3B_0_shot,0.075885,0.266272,0.11811,169,0.671875,0.016545,0.032294,2599,0.385682,0.957836,0.549929,1423


Conclusions on the performance for each class:

**1. Across the 3 classes:**
- Overall, all the models struggled with the minority class (the negative class)
- Between neutral & positive, the models tend to achieve better precision for the former, but vastly better recall & f1 for the latter. Specifically, all models achieve a recall greater than 0.7 for the positive class. This might indicate that positive sentiment is easier for LLMs to pick up than neutral sentiment, but also that the LLMs are more prone to wrongly classifying headlines as positive instead of neutral
=> This likely indicates that the poor overall performance of the models is largely explained by the severe class imbalance in the dataset

**2. Negative (class 0, by far the minority class):**
- Fine-tuning helps FinBERT attain better performance across all metrics, but it has ambiguous effects on Llama 0-shot and Llama 1-shot. For the Llama models, while fine-tuning improves precision, it makes recall and f1-score worse, which means making the model worse at classifying negative news as negative
- The raw FinBERT is strictly inferior to the raw Llama models and the fine-tuned ones
- Using 1-shot prompts instead of 0-shot improves precision, but reduces recall & f1, which means making Llama worse at classifying negative news as negative. This pattern exists for both the raw Llama and the fine-tuned Llama models
- The fine-tuned FinBERT has vastly lower precision, but higher recall & f1 than the 2 fine-tuned Llama models.
- Taking f1 as the deciding factor, fine-tuned FinBERT has the best performance on the negative class

**3. Neutral (class 1, the majority class):**
- Fine-tuning does help the models gain better performance. For FinBERT & Llama 0-shot, it helps the model achieve better metrics, while for Llama 1-shot, it makes the model more balanced by improving the recall and the f1, despite a reduction in precision. However, for Llama 0-shot, performance gains are rather insignificant
- Most notably, the raw Llama 1-shot model attains perfect precision but near-zero recall & f1, meaning that every sample classified as neutral is neutral, but hardly any neutral data point is classified as neutral. This indicates that while the model never wrongly classifies samples as neutral, it has a general bias against predicting neutral, choosing "positive" or "negative" instead. Both observations point to the model correctly predicting very few samples as neutral. The same pattern exists for the raw Llama 0-shot model, albeit to a lesser extent.
- Raw FinBERT also does exhibit the same pattern, but to an even lesser extent, hence it can be considered more balanced than the raw Llama models
- The fine-tuned FinBERT generally achieves superior performance over the 2 fine-tuned Llama models for the neutral class.
- Using f1 as the deciding factor, fine-tuned FinBERT is once again the winner

**4. Positive (class 2, the lesser majority class):**
- Generally, fine-tuning does improve the models' performances here, though not as dramatically
- The performance of the models does not differ greatly, except for recall, where the 2 Llama models achieve a superior recall of around 0.9, compared with FinBERT's recall of 0.7
- As mentioned before, all the models attain better recall than precision here, which might point to a general pattern where LLMs are good at extracting more intense sentiment (in this case positive), as opposed to neutral sentiment
- Using f1, fine-tuned Llama on 0-shot prompts is the winner, followed by fine-tuned FinBERT

From the evaluations above, I decide that **fine-tuned FinBERT is the overall
best model**, followed by the 2 fine-tuned Llama models. While **Llama 1-shot** is not clearly better than Llama 0-shot when we examine class metrics, it is better on the overall metrics and thus **can be considered the slightly better model.**  

In [None]:
# Save the 2 metrics tables to .csv files.
# to_csv does not create missing folders, so make sure the output folder exists
# first; exist_ok=True keeps this a no-op when the folder is already there.
EVAL_METRICS_DIR = 'sentiment_eval_metrics'
os.makedirs(EVAL_METRICS_DIR, exist_ok=True)
overall_metrics.to_csv(f'{EVAL_METRICS_DIR}/overall_metrics.csv')
class_metrics.to_csv(f'{EVAL_METRICS_DIR}/class_metrics.csv')