In [1]:
import pandas as pd
import numpy as np

#models
import torch
from transformers import AutoTokenizer, AutoModel
from transformers import EvalPrediction
from torch.utils.data import DataLoader, Dataset
from torch.nn import BCEWithLogitsLoss
from torch.optim import Adam

#metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(f"Device: {device}")

Device: cuda


## Task 1 - Corpus

In [2]:
def load_and_merge_data():
    
    #encodng the data into pandas.DataFrame objects
    url_a_test = '/kaggle/input/dataset/arguments-test.tsv'
    df_a_test = pd.read_csv(url_a_test, sep='\t')

    url_a_training = '/kaggle/input/dataset/arguments-training.tsv'
    df_a_training = pd.read_csv(url_a_training, sep='\t')

    url_a_validation = '/kaggle/input/dataset/arguments-validation.tsv'
    df_a_validation = pd.read_csv(url_a_validation, sep='\t')

    url_l_test = '/kaggle/input/dataset/labels-test.tsv'
    df_l_test = pd.read_csv(url_l_test, sep='\t')

    url_l_training = '/kaggle/input/dataset/labels-training.tsv'
    df_l_training = pd.read_csv(url_l_training, sep='\t')

    url_l_validation = '/kaggle/input/dataset/labels-validation.tsv'
    df_l_validation = pd.read_csv(url_l_validation, sep='\t')

    #merge argument dataframes with label dataframes
    df_test = pd.merge(df_a_test, df_l_test, on='Argument ID')
    df_training = pd.merge(df_a_training, df_l_training, on='Argument ID')
    df_validation = pd.merge(df_a_validation, df_l_validation, on='Argument ID')

    return df_test, df_training, df_validation

In [3]:
df_test, df_training, df_validation = load_and_merge_data()

In [4]:
def merge_and_drop_columns(df):
    # Merge level 2 annotations to level 3 categories
    df['Openess to change'] = df[['Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism']].any(axis=1).astype(int)
    df['Self-enhancement'] = df[['Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face']].any(axis=1).astype(int)
    df['Conservation'] = df[['Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility']].any(axis=1).astype(int)
    df['Self-transcendence'] = df[['Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']].any(axis=1).astype(int)
    
    # Drop unuseful columns
    columns_to_drop = ['Argument ID', 'Self-direction: thought', 'Self-direction: action', 'Stimulation', 'Hedonism', 'Achievement', 'Power: dominance', 'Power: resources', 'Face', 'Security: personal', 'Security: societal', 'Tradition', 'Conformity: rules', 'Conformity: interpersonal', 'Humility', 'Benevolence: caring', 'Benevolence: dependability', 'Universalism: concern', 'Universalism: nature', 'Universalism: tolerance', 'Universalism: objectivity']
    df = df.drop(columns=columns_to_drop)
    
    return df

df_test = merge_and_drop_columns(df_test)
df_training = merge_and_drop_columns(df_training)
df_validation = merge_and_drop_columns(df_validation)


In [5]:
df_test.head()

Unnamed: 0,Conclusion,Stance,Premise,Openess to change,Self-enhancement,Conservation,Self-transcendence
0,We should end affirmative action,against,affirmative action helps with employment equity.,0,1,1,1
1,We should end affirmative action,in favor of,affirmative action can be considered discrimin...,0,1,0,1
2,We should ban naturopathy,in favor of,naturopathy is very dangerous for the most vul...,0,1,1,1
3,We should prohibit women in combat,in favor of,women shouldn't be in combat because they aren...,0,1,0,0
4,We should ban naturopathy,in favor of,once eradicated illnesses are returning due to...,0,1,1,1


### DATA EXPLORATION
Ancora da inserire - sicuramente c'è da mettere una metrica per vedere la lunghezza dentro a conclusion, premise e stance per giustificare max length = 100
per il resto non saprei cosa altro mettere, se vuoi la presenza di in favor of e against della colonna stance nei vari dataset e altro

### DATA PREPROCESSING

Encoding 'Stance' column into numerical format  

In [6]:
df_test['Stance'] = df_test['Stance'].replace({'in favor of': 1, 'against': 0}).astype(str)
df_training['Stance'] = df_training['Stance'].replace({'in favor of': 1, 'against': 0}).astype(str)
df_validation['Stance'] = df_validation['Stance'].replace({'in favor of': 1, 'against': 0}).astype(str)

  df_test['Stance'] = df_test['Stance'].replace({'in favor of': 1, 'against': 0}).astype(str)
  df_training['Stance'] = df_training['Stance'].replace({'in favor of': 1, 'against': 0}).astype(str)
  df_validation['Stance'] = df_validation['Stance'].replace({'in favor of': 1, 'against': 0}).astype(str)


Preparing data for tokenization input

In [7]:
labels_test = df_test.iloc[:, 3:7].values
labels_training = df_training.iloc[:, 3:7].values
labels_validation = df_validation.iloc[:, 3:7].values

stance_test = df_test['Stance'].values
stance_training = df_training['Stance'].values
stance_validation = df_validation['Stance'].values

Tokenization process and creation of a dataset structure compatible with the bert model 

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
max_length = 100

class BertDatasetCreator(Dataset):
    def __init__(self, encodings, labels, tokenizer, max_length):
        self.encodings = encodings
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.encodings)
    
    def __getitem__(self, idx):
        item = str(self.encodings[idx])
        item = ' '.join(item.split())
        
        encoded_dict = self.tokenizer.encode_plus(
            item,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
        )
        input_ids = encoded_dict['input_ids']
        attention_masks = encoded_dict['attention_mask']
        token_type_ids = encoded_dict['token_type_ids']

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_masks, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

#### Applying the BertDatasetCreator and preparing the datasets for the three different type of BERT models

##### BERT w/C dataset

In [9]:
test_dataset_c = BertDatasetCreator(df_test['Conclusion'], labels_test, tokenizer, max_length)
train_dataset_c = BertDatasetCreator(df_training['Conclusion'], labels_training, tokenizer, max_length)
val_dataset_c = BertDatasetCreator(df_validation['Conclusion'], labels_validation, tokenizer, max_length)

DataLoader definition - which will supply the data to the neural network in batches for efficient training and processing

In [11]:
batch_size = 16
test_dataloader_c = DataLoader(test_dataset_c, batch_size=batch_size)
train_dataloader_c = DataLoader(train_dataset_c, batch_size=batch_size)
val_dataloader_c = DataLoader(val_dataset_c, batch_size=batch_size)

##### BERT w/CP

In [12]:
test_dataset_cp = BertDatasetCreator(df_test['Conclusion'] + ' ' + df_test['Premise'], labels_test, tokenizer, max_length)
train_dataset_cp = BertDatasetCreator(df_training['Conclusion'] + ' ' + df_training['Premise'], labels_training, tokenizer, max_length)
val_dataset_cp = BertDatasetCreator(df_validation['Conclusion'] + ' ' + df_validation['Premise'], labels_validation, tokenizer, max_length)

test_dataloader_cp = DataLoader(test_dataset_cp, batch_size=batch_size)
train_dataloader_cp = DataLoader(train_dataset_cp, batch_size=batch_size)
val_dataloader_cp = DataLoader(val_dataset_cp, batch_size=batch_size)

##### BERT w/CPS

In [13]:
test_dataset_cps = BertDatasetCreator(df_test['Conclusion'] + ' ' + df_test['Premise'] + ' ' + df_test['Stance'], labels_test, tokenizer, max_length)
train_dataset_cps = BertDatasetCreator(df_training['Conclusion'] + ' ' + df_training['Premise'] + ' ' + df_training['Stance'], labels_training, tokenizer, max_length)
val_dataset_cps = BertDatasetCreator(df_validation['Conclusion'] + ' ' + df_validation['Premise'] + ' ' + df_validation['Stance'], labels_validation, tokenizer, max_length)

test_dataloader_cps = DataLoader(test_dataset_cps, batch_size=batch_size)
train_dataloader_cps = DataLoader(train_dataset_cps, batch_size=batch_size)
val_dataloader_cps = DataLoader(val_dataset_cps, batch_size=batch_size)

## Task 2 - Models Definitions

### BASELINE MODELS

Random uniform classifier

In [14]:
def create_random_uniform_classifier(label):
    def random_uniform_classifier(size):
        return np.random.choice([0, 1], size=size)
    return random_uniform_classifier

Majority classifier - always predicting the most frequent valorization for the column

In [15]:
def create_majority_classifier(label, train_data):
    # Calcola il majority_value come la moda della colonna corrispondente alla label nel dataset di train
    majority_value = train_data[label].mode()[0]
    def majority_classifier(size):
        return np.full(size, majority_value)
    return majority_classifier

Creating the baseline models for every category and saving them in a classifiers dictionary

In [16]:
classifiers = {}

labels = ['Openess to change', 'Self-enhancement', 'Conservation', 'Self-transcendence']

#create classifiers for each category and save them in the dictionary
for label in labels:
    #random uniform classifier
    random_uniform_name = f'random_uniform_classifier_{label}'
    classifiers[random_uniform_name] = create_random_uniform_classifier(label)

    #majority classifier
    majority_name = f'majority_classifier_{label}'
    classifiers[majority_name] = create_majority_classifier(label, df_training)

### BERT Model Definition

In [17]:
class Bert_Model(torch.nn.Module):
    def __init__(self):
        super(Bert_Model, self).__init__()
        self.bert = AutoModel.from_pretrained(
            pretrained_model_name_or_path= 'bert-base-uncased', 
            problem_type='multi_label_classification', 
            num_labels = 4, 
            return_dict=False)
        self.dropout = torch.nn.Dropout(p=0.3)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, 4)

    def forward(self, input_ids, attention_mask, token_type_ids):
        _, outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        outputs = self.dropout(outputs)
        outputs = self.classifier(outputs)
        return outputs

#### Bert Models

In [18]:
c_model = Bert_Model()
cp_model = Bert_Model()
cps_model = Bert_Model()

c_model.to(device)
cp_model.to(device)
cps_model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Bert_Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_af

## Task 3 - Metrics

### Baseline models metrics function

In [None]:
def baseline_metrics(predicted_results, test_dataset, labels):
    f1_scores = {}
    binary_f1_scores = []
    for label in labels:
        true_values = test_dataset[label]
        predictions = predicted_results[label]
        f1 = f1_score(true_values, predictions, average='binary')
        f1 = round(f1, 2) #round to 2 decimal places
        f1_score_name = f'{label} F1: '
        f1_scores[f1_score_name] = f1

        #binary f1 score for every category
        binary_f1 = f1_score(true_values, predictions, average='binary')
        binary_f1_scores.append(binary_f1)

     #macro f1 score
    macro_f1 = round(np.mean(binary_f1_scores), 2)
    f1_scores['macro_f1: '] = macro_f1
    return f1_scores

Baseline models metrics printing function

In [None]:
def print_baseline_metrics(metrics, type):
    if type == 'random_uniform':
        classifier_type = 'Random Uniform'
    elif type == 'majority':
        classifier_type = 'Majority'
    print(f"Classifier Type: {classifier_type}")
    for label, score in metrics.items():
        print(f"{label}: {score}")

### BERT models metrics function

In [19]:
def generate_classification_report(model, X_test, Y_test, thresholds):
    
    # Mettere il modello in modalità di valutazione
    model.eval()
    Y_pred = []
    num_labels = len(thresholds)
    
    with torch.no_grad():# Disabilitare il calcolo dei gradienti per la valutazione
        for _, batch in enumerate(X_test, 0):
            input_ids = batch['input_ids'].to(device, dtype = torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype = torch.long)
            #labels = batch['labels'].to(device, dtype = torch.long)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            
            # Spostare il tensore degli output sulla CPU e convertirlo in un array NumPy
            outputs = outputs.cpu().numpy()

        # Applica le soglie ottimali per ottenere le previsioni binarie
            preds = np.zeros_like(outputs)
            for i in range(num_labels):
                preds[:, i] = (outputs[:, i] > thresholds[i]).astype(int)
            Y_pred.extend(preds)
            
    Y_pred = np.array(Y_pred)
    '''
    # Convertire Y_test in array multiclass se è multilabel
    if len(Y_test.shape) > 1 and Y_test.shape[1] > 1:
        Y_test = np.argmax(Y_test, axis=1)
    '''
    
    # Generare il classification report
    report = classification_report(Y_test, Y_pred, zero_division=1, output_dict=True)
    
    # Mappatura delle labels
    label_names = {
        '0': "Openness to change",
        '1': "Self-enhancement",
        '2': "Conservation",
        '3': "Self-transcendence"
    }
    
    # Estrarre e stampare l'F1 score per ogni etichetta
    print("\nF1 Scores per Label:")
    for label, metrics in report.items():
        if label not in ['accuracy', 'macro avg', 'weighted avg']:
            label_name = label_names.get(label, f"Label {label}")
            f1_score = metrics['f1-score']
            print(f"{label_name} F1: = {f1_score:.2f}")
    
    # Estrarre e stampare il macro F1 score
    macro_f1_score = report['macro avg']['f1-score']
    print(f"Macro F1 Score: {macro_f1_score:.2f}")


## Task 4 - Training and evaluation

## Baseline models training and metrics

### Random uniform classifiers

In [None]:
def random_uniform_classifiers(classifiers, test_dataset, labels):
    predicted_results = {}

    for label in labels:
        random_uniform_name = f'random_uniform_classifier_{label}'
        classifier = classifiers[random_uniform_name]
        test_data = test_dataset[label]
        size = len(test_data)
        predictions = classifier(size)
        predicted_results[label] = predictions

    return predicted_results

Metrics for the random uniform classifiers - F1 score for each category and macro F1 score 

In [None]:
metrics_random_uniform = baseline_metrics(random_uniform_classifiers(classifiers, df_test, labels), df_test, labels)
print_baseline_metrics(metrics_random_uniform, 'random_uniform')

### Majority classifiers

In [None]:
def majority_classifiers(classifiers, test_dataset, labels):
    predicted_results = {}

    for label in labels:
        majority_name = f'majority_classifier_{label}'
        classifier = classifiers[majority_name]
        test_data = test_dataset[label]
        size = len(test_data)
        predictions = classifier(size)
        predicted_results[label] = predictions

    return predicted_results

Metrics for the majority classifiers - F1 score for each category and macro F1 score 

In [None]:
metrics_majority = baseline_metrics(majority_classifiers(classifiers, df_test, labels), df_test, labels)
print_baseline_metrics(metrics_majority, 'majority')

## BERT model training and evaluation

Training process utils

In [20]:
#definition of the loss function
def loss_function(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

#definition of the optimizers
optimizer = Adam(c_model.parameters(), lr = 1e-5)

# Set seeds for reproducibility
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

seeds = [42, 123, 2024]
epochs = 5

#### Training function definition

In [21]:
def trainBert(model, dataloader, optimizer, loss_function):
    model.train()
    running_loss = 0.0
    for _, data in enumerate(dataloader, 0):
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        labels = data['labels'].to(device)
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
    
        loss_value = loss_function(outputs, labels)
        loss_value.backward()
        optimizer.step()
        running_loss += loss_value.item()
        avg_train_loss = running_loss / len(dataloader)
    
    return avg_train_loss

#### Validation function definition

In [22]:
def validate_model(model, dataloader):
    model.eval()
    all_labels = []
    all_outputs = []

    with torch.no_grad():
        for batch_idx, data in enumerate(dataloader, 0):
            input_ids = data['input_ids'].to(device, dtype = torch.long)
            attention_mask = data['attention_mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            labels = data['labels'].to(device, dtype = torch.float)
            outputs = model(input_ids, attention_mask, token_type_ids)
            all_labels.extend(labels.cpu().numpy())
            all_outputs.extend(outputs.cpu().numpy())

    all_labels = np.array(all_labels)
    all_outputs = np.array(all_outputs)

    # Numero di etichette
    num_labels = all_labels.shape[1]

    # Inizializza una lista per memorizzare le soglie ottimali per ogni etichetta
    optimal_thresholds = []
    auc_scores = []

    for i in range(num_labels):
        # Calcola la curva ROC
        fpr, tpr, thresholds = roc_curve(all_labels[:, i], all_outputs[:, i])
        # Calcola l'AUC
        roc_auc = auc(fpr, tpr)
        auc_scores.append(roc_auc)
        # Trova la soglia che massimizza la somma di sensibilità e specificità
        optimal_idx = np.argmax(tpr - fpr)
        optimal_threshold = thresholds[optimal_idx]
        optimal_thresholds.append(optimal_threshold)

    # Applica le soglie ottimali per ottenere le previsioni binarie
    all_preds = np.zeros_like(all_outputs)
    for i in range(num_labels):
        all_preds[:, i] = (all_outputs[:, i] > optimal_thresholds[i]).astype(int)

    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    return accuracy, precision, recall, f1, optimal_thresholds


### Training and evaluating BERT w/C

Defining a dictionary to store all the classification reports - one for each model per seed. 

In [23]:
classification_reports = {}

Training, evaluation and metrics 

In [24]:
for seed in seeds:
    set_seed(seed)
    print(f'\nSeed {seed}\n-------------------------------')
    for epoch in range(epochs):
        #training
        print(f"\nEpoch {epoch + 1}\n-------------------------------")
        avg_train_loss = trainBert(c_model, train_dataloader_c, optimizer, loss_function)
        print(f'Average Train Loss: {avg_train_loss:.4f}')
        #validation
        accuracy, precision, recall, f1, c_thresholds = validate_model(c_model, val_dataloader_c)
        print(f'Validation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')
    #generate classification-report
    print(f'\nClassification report \n-------------------------------')
    classification_report_name = f'classification_report_c_model_{seed}'
    classification_reports[classification_report_name] = generate_classification_report(c_model, test_dataloader_c, labels_test, c_thresholds)
    
    torch.save(c_model.state_dict(), f'c_model_{seed}.pth')


Seed 42
-------------------------------

Epoch 1
-------------------------------
Average Train Loss: 0.5964
Validation - Accuracy: 0.0628, Precision: 0.7128, Recall: 0.5429, F1 Score: 0.5645

Epoch 2
-------------------------------
Average Train Loss: 0.5639
Validation - Accuracy: 0.1756, Precision: 0.7552, Recall: 0.6002, F1 Score: 0.6676

Epoch 3
-------------------------------
Average Train Loss: 0.5530
Validation - Accuracy: 0.2104, Precision: 0.7559, Recall: 0.6359, F1 Score: 0.6899

Epoch 4
-------------------------------
Average Train Loss: 0.5464
Validation - Accuracy: 0.1519, Precision: 0.7429, Recall: 0.5581, F1 Score: 0.6270

Epoch 5
-------------------------------
Average Train Loss: 0.5432
Validation - Accuracy: 0.2104, Precision: 0.7577, Recall: 0.6053, F1 Score: 0.6711

Classification report 
-------------------------------

F1 Scores per Label:
Openness to change F1: = 0.05
Self-enhancement F1: = 0.19
Conservation F1: = 0.64
Self-transcendence F1: = 0.77
Label micro av

### Training and evaluating BERT w/CP

Training, evaluation and metrics 

In [25]:
for seed in seeds:
    set_seed(seed)
    print(f'\nSeed {seed}\n-------------------------------')
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}\n-------------------------------")
        avg_train_loss = trainBert(cp_model, train_dataloader_cp, optimizer, loss_function)
        print(f'Loss: {avg_train_loss:.4f}')
        #validation
        accuracy, precision, recall, f1, cp_thresholds = validate_model(cp_model, val_dataloader_cp)
        print(f'Validation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')
    #generate classification-report
    print(f'\nClassification report \n-------------------------------')
    classification_report_name = f'classification_report_cp_model_{seed}'
    classification_reports[classification_report_name] = generate_classification_report(cp_model, test_dataloader_cp, labels_test, cp_thresholds)
    
    torch.save(cp_model.state_dict(), f'cp_model_{seed}.pth')


Seed 42
-------------------------------

Epoch 1
-------------------------------
Loss: 0.7450
Validation - Accuracy: 0.0432, Precision: 0.6771, Recall: 0.5116, F1 Score: 0.4484

Epoch 2
-------------------------------
Loss: 0.7431
Validation - Accuracy: 0.0432, Precision: 0.6771, Recall: 0.5116, F1 Score: 0.4484

Epoch 3
-------------------------------
Loss: 0.7436
Validation - Accuracy: 0.0432, Precision: 0.6771, Recall: 0.5116, F1 Score: 0.4484

Epoch 4
-------------------------------
Loss: 0.7443
Validation - Accuracy: 0.0432, Precision: 0.6771, Recall: 0.5116, F1 Score: 0.4484

Epoch 5
-------------------------------
Loss: 0.7438
Validation - Accuracy: 0.0432, Precision: 0.6771, Recall: 0.5116, F1 Score: 0.4484

Classification report 
-------------------------------

F1 Scores per Label:
Openness to change F1: = 0.40
Self-enhancement F1: = 0.56
Conservation F1: = 0.76
Self-transcendence F1: = 0.04
Label micro avg F1: = 0.50
Label samples avg F1: = 0.46
Macro F1 Score: 0.44

Seed 1

### Training and evaluating BERT w/CPS

Training, evaluation and metrics 

In [26]:
for seed in seeds:
    set_seed(seed)
    print(f'Seed {seed}\n-------------------------------')
    for epoch in range(epochs):
        print(f"\nEpoch {epoch + 1}\n-------------------------------")
        avg_train_loss = trainBert(cps_model, train_dataloader_cps, optimizer, loss_function)
        print(f'Loss: {avg_train_loss:.4f}')
        #validation
        accuracy, precision, recall, f1, cps_thresholds = validate_model(cps_model, val_dataloader_cps)
        print(f'Validation - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')
    #generate classification-report
    print(f'\nClassification report \n-------------------------------')
    classification_report_name = f'classification_report_cps_model_{seed}'
    classification_reports[classification_report_name] = generate_classification_report(cps_model, test_dataloader_cps, labels_test, cps_thresholds)
    
    torch.save(cps_model.state_dict(), f'cps_model_{seed}.pth')

Seed 42
-------------------------------

Epoch 1
-------------------------------
Loss: 0.6833
Validation - Accuracy: 0.0654, Precision: 0.6881, Recall: 0.5905, F1 Score: 0.5471

Epoch 2
-------------------------------
Loss: 0.6828
Validation - Accuracy: 0.0654, Precision: 0.6881, Recall: 0.5905, F1 Score: 0.5471

Epoch 3
-------------------------------
Loss: 0.6815
Validation - Accuracy: 0.0654, Precision: 0.6881, Recall: 0.5905, F1 Score: 0.5471

Epoch 4
-------------------------------
Loss: 0.6818
Validation - Accuracy: 0.0654, Precision: 0.6881, Recall: 0.5905, F1 Score: 0.5471

Epoch 5
-------------------------------
Loss: 0.6834
Validation - Accuracy: 0.0654, Precision: 0.6881, Recall: 0.5905, F1 Score: 0.5471

Classification report 
-------------------------------

F1 Scores per Label:
Openness to change F1: = 0.46
Self-enhancement F1: = 0.03
Conservation F1: = 0.34
Self-transcendence F1: = 0.88
Label micro avg F1: = 0.56
Label samples avg F1: = 0.54
Macro F1 Score: 0.43
Seed 123

## Task 5 - Error Analysis