# Imports and Reading Files

In [None]:
!pip install datasets pandas transformers transformers[torch] scikit-multilearn optuna
!git clone https://github.com/lucasadelino/thesis.git

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadat

In [None]:
import ast
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
from google.colab import files
from torch.nn.modules.loss import BCEWithLogitsLoss
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertTokenizer,
    BertForSequenceClassification,
    BertForTokenClassification,
    DataCollatorForTokenClassification,
    EvalPrediction,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import f1_score, roc_auc_score, hamming_loss
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    precision_score,
    recall_score,
    precision_recall_fscore_support,
    accuracy_score,
    f1_score,
)

# Run configuration read by the helper functions and cells below.
# NOTE(review): the tag on the next line does not match label_format = "maj_multi";
# it may be left over from another configuration — confirm.
# token_subgroup_single
model_type = "bert-base-uncased"  # checkpoint name used for the tokenizer and in model_init()
tokenizer = AutoTokenizer.from_pretrained(model_type)
num_labels = 3  # passed to the model head in model_init(); also read by the metric/display helpers
label_format = "maj_multi"  # must be one of the formats accepted by subset_labels()


def id_array_to_labels(id_array, num_classes=9):
    """Convert a collection of class indices into a multi-hot float vector.

    e.g. with the default num_classes=9 the indices [0, 2, 3, 4, 5] become
    [1. 0. 1. 1. 1. 1. 0. 0. 0.]

    Args:
        id_array: sequence (list, tuple or array) of integer class indices.
        num_classes: length of the returned vector. Defaults to 9, the size
            that was previously hard-coded.

    Returns:
        1-D float numpy array of length ``num_classes`` holding 1.0 at every
        listed index and 0.0 elsewhere. An empty ``id_array`` yields all zeros.

    Raises:
        IndexError: if an index does not fit in a vector of ``num_classes``.
    """
    labels = np.zeros(num_classes, dtype=float)
    indices = np.asarray(id_array)
    # An empty array is float-typed and cannot be used as an index, so skip it.
    if indices.size:
        labels[indices] = 1
    return labels


def tokenize_and_align_labels(example, single_label=True):
    """Tokenize one sentence pair and align its word-level labels to the tokens.

    Args:
        example: mapping (e.g. a DataFrame row) with the keys
            "sentence1_tokenized", "sentence2_tokenized" (lists of words) and
            "s1_token_labs", "s2_token_labs" (one label entry per word).
        single_label: when True each word label is used as-is and positions
            without a word get -100; when False each word label is converted
            with ``.tolist()`` and positions without a word get a list of
            ``num_labels`` values of -100.

    Returns:
        The tokenizer output with an added "labels" entry holding one label per
        token position (padded/truncated to 90 positions).
    """
    # Tokenize the sentence pair
    tokenized_inputs = tokenizer(
        example["sentence1_tokenized"],
        example["sentence2_tokenized"],
        padding="max_length",
        max_length=90,
        truncation=True,
        is_split_into_words=True,
    )

    label_array_1 = example["s1_token_labs"]  # Label array for the first sentence
    label_array_2 = example["s2_token_labs"]  # Label array for the second sentence
    word_ids = tokenized_inputs.word_ids(batch_index=0)

    pad_value = -100 if single_label else [-100.] * num_labels

    label_ids = []
    # False while reading the first sentence, True once its closing separator was seen
    sentence_switch = False

    for index, word_id in enumerate(word_ids):
        if word_id is None:
            # Position without a source word (special or padding token): ignore it in the loss.
            label_ids.append(pad_value)
            if index > 0:
                # Any such position after index 0 means the first sentence has ended.
                sentence_switch = True
            continue

        # Normal token: every word piece receives the label of the word it belongs to.
        current_label_array = label_array_2 if sentence_switch else label_array_1
        word_label = current_label_array[word_id]
        label_ids.append(word_label if single_label else word_label.tolist())

    tokenized_inputs["labels"] = label_ids

    return tokenized_inputs


def apply_tokenization(train_df, test_df, val_df, single_label=True):
    """Tokenize every split in place.

    Adds two columns to each of the three DataFrames:
    "tokenized_sentences" (output of tokenize_and_align_labels for the row) and
    "inputs" (the same encoding converted to PyTorch tensors). Returns None.
    """
    splits = (train_df, test_df, val_df)

    # Row-wise tokenization and label alignment for each split
    for split in splits:
        split["tokenized_sentences"] = split.apply(
            tokenize_and_align_labels, single_label=single_label, axis=1
        )

    # Tensor versions of the encodings; these are the inputs to the (PyTorch) model
    for split in splits:
        split["inputs"] = split["tokenized_sentences"].apply(
            lambda encoding: encoding.convert_to_tensors("pt")
        )


def compute_metrics(p: EvalPrediction):
    """Token-level metrics for the single-label setting.

    Takes the argmax over the last axis of the predictions, drops every
    position whose label is -100 and returns a dict with:
    "f1" (weighted F1), "accuracy" (micro F1 restricted to labels
    1..num_labels-1), "0accuracy" (plain accuracy over all labels),
    "precision" and "recall" (both weighted).
    """
    raw_scores, labels = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    # Predictions at positions that carry a real label
    y_pred = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        for predicted_token, label_token in zip(predicted_row, label_row):
            if label_token != -100:
                y_pred.append(predicted_token)

    # The matching gold labels
    y_true = []
    for label_row in labels:
        for label_token in label_row:
            if label_token != -100:
                y_true.append(label_token)

    non_zero_labels = list(range(1, num_labels))

    return {
        "f1": f1_score(y_true=y_true, y_pred=y_pred, average="weighted"),
        "accuracy": f1_score(y_true, y_pred, average="micro", labels=non_zero_labels),
        "0accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true=y_true, y_pred=y_pred, average="weighted"),
        "recall": recall_score(y_true=y_true, y_pred=y_pred, average="weighted"),
    }


def optuna_hp_space(trial):
    """Define the Optuna search space for the hyperparameter search.

    Args:
        trial: Optuna trial (anything exposing suggest_float, suggest_int,
            suggest_categorical and set_user_attr).

    Returns:
        Dict mapping hyperparameter names to the values sampled for this trial,
        plus a fixed "seed" of 3.
    """
    seed = 3
    space = {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 3e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 4, 8),
        "warmup_steps": trial.suggest_int("warmup_steps", 10, 300),
        "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.05, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32, 64, 128]),
    }
    # set_user_attr() only records metadata on the trial and returns None, so its
    # result must not be used as the hyperparameter value.
    trial.set_user_attr("seed", seed)
    space["seed"] = seed
    return space

def model_init():
    """Return a newly loaded BertForTokenClassification for `model_type` with `num_labels` outputs."""
    model = BertForTokenClassification.from_pretrained(
        model_type,
        num_labels=num_labels,
    )
    return model


def get_accuracy(input):
    """Return the value stored under "eval_accuracy" in a metrics mapping."""
    accuracy = input["eval_accuracy"]
    return accuracy

# Defining evaluation metrics
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def test_metrics(p):
    """Class-wise and overall token-level metrics for the single-label setting.

    `p` is a (predictions, labels) pair. Predictions are reduced with argmax
    over axis 2, positions labelled -100 are dropped, and all scores are
    restricted to labels 1..num_labels-1. Returns a dict whose 'F1',
    'Precision' and 'Recall' entries are per-class lists and whose remaining
    entries are overall (macro/micro) scores.
    """
    raw_scores, labels = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    # Predictions at positions that carry a real label
    y_pred = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        for predicted_token, label_token in zip(predicted_row, label_row):
            if label_token != -100:
                y_pred.append(predicted_token)

    # The matching gold labels
    y_true = []
    for label_row in labels:
        for label_token in label_row:
            if label_token != -100:
                y_true.append(label_token)

    # Arguments shared by every scorer call below
    shared = dict(y_true=y_true, y_pred=y_pred, labels=list(range(1, num_labels)))

    f1_macro = f1_score(average='macro', **shared).tolist()
    f1_micro = f1_score(average='micro', **shared).tolist()
    f1_per_class = f1_score(average=None, **shared).tolist()
    precision_micro = precision_score(average='micro', **shared)
    recall_micro = recall_score(average='micro', **shared)
    precision_per_class = precision_score(average=None, **shared).tolist()
    recall_per_class = recall_score(average=None, **shared).tolist()

    return {
        'F1': f1_per_class,
        'Overal F1 Macro': f1_macro,
        'Overall Accuracy': f1_micro,
        'Precision': precision_per_class,
        'Recall': recall_per_class,
        'Precision Overall': precision_micro,
        'Recall Overall': recall_micro,
    }

def multilabel_test_metrics(predictions, labels, thresholds=None):
    """Overall and class-wise metrics for the multi-label token classifier.

    Args:
        predictions: raw logits with the class dimension last.
        labels: array/tensor of the same shape; rows made up entirely of -100
            are treated as ignored positions and dropped.
        thresholds: one probability cut-off per class. Defaults to 0.5 for
            every class, sized from the last dimension of `predictions` (the
            previous default was frozen from `num_labels` at definition time).

    Returns:
        Dict with micro F1, micro ROC AUC, Hamming loss, per-class
        precision/recall/F1, samples-averaged F1, subset accuracy and micro
        precision/recall.
    """
    # First, apply sigmoid on predictions
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))

    if thresholds is None:
        thresholds = [0.5] * probs.shape[-1]
    thresholds = torch.Tensor(thresholds)

    # Flatten probs and labels
    # Originally of dims [batch_size, sequence_length, num_labels] to [batch_size * sequence_length, num_labels]
    flat_probs = probs.view(-1, probs.shape[-1])
    flat_labels = labels.reshape(-1, labels.shape[-1])

    # Filter rows where all labels are -100
    mask = ~(flat_labels == -100).all(axis=1)
    filtered_probs = flat_probs[mask]
    filtered_labels = flat_labels[mask]

    # Generate predictions using threshold
    y_pred = np.zeros(filtered_probs.shape)
    y_pred[np.where(filtered_probs > thresholds)] = 1

    y_true = filtered_labels

    # Compute overall metrics
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    precision_overall = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_overall = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the probabilities, not on the
    # thresholded 0/1 predictions.
    roc_auc = roc_auc_score(y_true, filtered_probs.numpy(), average='micro')
    hamming = hamming_loss(y_true, y_pred)

    # Compute class-wise Precision, Recall, F1 Score
    precision_classwise = precision_score(y_true, y_pred, average=None).tolist()
    recall_classwise = recall_score(y_true, y_pred, average=None).tolist()
    f1_classwise = f1_score(y_true, y_pred, average=None).tolist()

    # Samples
    f1_samples = f1_score(y_true, y_pred, average='samples').tolist()
    accuracy = accuracy_score(y_true, y_pred)

    # Return metrics in a dictionary
    metrics = {
        'f1': f1_micro_average,
        'roc_auc': roc_auc,
        'hamming_loss': hamming,
        'precision_per_class': precision_classwise,
        'recall_per_class': recall_classwise,
        'f1_per_class': f1_classwise,
        'f1_samples': f1_samples,
        'accuracy': accuracy,
        'precision_overall': precision_overall,
        'recall_overall': recall_overall
    }
    return metrics

def subset_labels(df, label_format):
    """Return a new frame with the text columns and the labels for `label_format`.

    Args:
        df: DataFrame containing 'sentence1', 'sentence2', 'sentence1_tokenized',
            'sentence2_tokenized', 'collapsed_labels' and the columns
            's1_token_labs_<label_format>' / 's2_token_labs_<label_format>'.
        label_format: one of "sub_single", "maj_single", "sub_multi", "maj_multi".

    Returns:
        A new DataFrame with those seven columns, the two label columns renamed
        to 's1_token_labs' and 's2_token_labs'. `df` itself is not modified.

    Raises:
        AssertionError: if `label_format` is not one of the accepted values.
    """
    assert label_format in ["sub_single", "maj_single", "sub_multi", "maj_multi"], "Invalid label_format"
    s1_column = f's1_token_labs_{label_format}'
    s2_column = f's2_token_labs_{label_format}'
    columns = ['sentence1', 'sentence2', 'sentence1_tokenized', 'sentence2_tokenized', 'collapsed_labels',
               s1_column, s2_column]
    # rename() without inplace returns a fresh frame, which avoids the
    # SettingWithCopyWarning raised when renaming a slice in place.
    return df[columns].rename(columns={s1_column: 's1_token_labs',
                                       s2_column: 's2_token_labs'})

def show_test_result(trainer, test_df):
    """Predict on a test split and tabulate the single-label class-wise metrics.

    Args:
        trainer: object whose predict(dataset) result exposes `.metrics`,
            `.predictions` and `.label_ids`.
        test_df: DataFrame with an 'inputs' column holding the encoded examples.

    Returns:
        DataFrame built from test_metrics(), indexed by class name.
    """
    # Use the split that was passed in; this previously read the notebook-global
    # `test` and ignored the `test_df` argument.
    test_result = trainer.predict(test_df['inputs'].values)

    # Print default metrics collected during prediction
    for item, value in test_result.metrics.items():
        print(f"{item}: {value}")

    predictions = torch.Tensor(test_result.predictions)
    labels = torch.Tensor(test_result.label_ids)

    # Compute class-wise metrics
    results = test_metrics((predictions, labels))
    df = pd.DataFrame.from_dict(results)
    # test_metrics scores labels 1..num_labels-1, so num_labels == 4 gives three rows.
    if num_labels == 4:
        indices = [['1. Addition/Deletion', '2. Change of Order', '3. Substitution']]
    else:
        indices = [['1. Add/Del - Function Word', '2. Add/Del - Content Word', '3. Change of Order',
             '4. Substitution - Synonym', '5. Substitution - Contextual Synonym', '6. Substitution - Morphological',
             '7. Substitution - Spelling and Format', '8. Add/Del - Punctuation']]
    df.index = indices
    return df

def show_multilabel_test_result(trainer, test_df):
    """Predict on a test split and tabulate the multi-label class-wise metrics.

    Args:
        trainer: object whose predict(dataset) result exposes `.metrics`,
            `.predictions` and `.label_ids`.
        test_df: DataFrame with an 'inputs' column holding the encoded examples.

    Returns:
        DataFrame built from multilabel_test_metrics() without the 'f1',
        'roc_auc' and 'hamming_loss' columns, indexed by class name.
    """
    # Use the split that was passed in; this previously read the notebook-global
    # `test` and ignored the `test_df` argument.
    test_result = trainer.predict(test_df['inputs'].values)

    # Print default metrics collected during prediction
    for item, value in test_result.metrics.items():
        print(f"{item}: {value}")

    predictions = torch.Tensor(test_result.predictions)
    labels = torch.Tensor(test_result.label_ids)

    # Compute class-wise metrics
    results = multilabel_test_metrics(predictions, labels)
    df = pd.DataFrame.from_dict(results)
    df.drop(columns = ['f1', 'roc_auc', 'hamming_loss'], inplace=True)
    if num_labels == 3:
        indices = [['1. Addition/Deletion', '2. Change of Order', '3. Substitution']]
    else:
        indices = [['1. Add/Del - Function Word', '2. Add/Del - Content Word', '3. Change of Order',
             '4. Substitution - Synonym', '5. Substitution - Contextual Synonym', '6. Substitution - Morphological',
             '7. Substitution - Spelling and Format', '8. Add/Del - Punctuation']]
    df.index = indices
    return df

class MultiLabelTrainer(Trainer):
    """Trainer that optimises a BCE-with-logits loss over per-token multi-hot labels.

    Args:
        class_weights: optional tensor passed as `weight` to BCEWithLogitsLoss;
            it is moved to the training device. All other arguments are
            forwarded to Trainer unchanged.
    """

    def __init__(self, *args, class_weights = None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is not None:
            class_weights = class_weights.to(self.args.device)
        self.loss_fct = BCEWithLogitsLoss(weight=class_weights)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the BCE loss over the positions that carry real labels.

        `num_items_in_batch` is accepted (and ignored) because newer
        transformers releases pass it to compute_loss; older releases simply
        never supply it.

        Returns the loss, or (loss, outputs) when `return_outputs` is True.
        """
        labels  = inputs.pop("labels")
        outputs = model(**inputs)

        # Keep whole token rows rather than flattening element-wise: the class
        # dimension must survive so a per-class `weight` can broadcast.
        # Assumes ignored positions are filled with -100 across all classes.
        token_mask = (labels != -100).any(dim=-1)
        active_logits = outputs.logits[token_mask]
        active_labels = labels[token_mask]

        loss = self.loss_fct(active_logits, active_labels.float())

        return (loss, outputs) if return_outputs else loss

def multilabel_metrics(predictions, labels, threshold=0.5):
    """Micro F1, micro ROC AUC and Hamming loss for multi-label token predictions.

    Args:
        predictions: raw logits with the class dimension last.
        labels: array/tensor of the same shape; rows made up entirely of -100
            are treated as ignored positions and dropped.
        threshold: probability cut-off used to binarise the predictions.

    Returns:
        Dict with the keys 'f1', 'roc_auc' and 'hamming loss'.
    """
    # First, apply sigmoid on predictions
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # Flatten probs and labels
    # Originally of dims [batch_size, sequence_length, num_labels] to [batch_size * sequence_length, num_labels]
    flat_probs = probs.view(-1, probs.shape[-1])
    flat_labels = labels.reshape(-1, labels.shape[-1])

    # Filter rows where all labels are -100
    mask = ~(flat_labels == -100).all(axis=1)
    filtered_probs = flat_probs[mask]
    y_true = flat_labels[mask]

    # Generate predictions using threshold
    y_pred = np.zeros(filtered_probs.shape)
    y_pred[np.where(filtered_probs > threshold)] = 1

    # Now we can compute metrics:
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric: score it on the probabilities, not on the
    # thresholded 0/1 predictions.
    roc_auc = roc_auc_score(y_true, filtered_probs.numpy(), average='micro')
    hamming = hamming_loss(y_true, y_pred)
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'hamming loss': hamming}
    return metrics

def compute_multilabel_metrics(p: EvalPrediction):
    """Adapter that feeds an EvalPrediction to multilabel_metrics.

    When `p.predictions` is a tuple only its first element is used.
    """
    raw_predictions = p.predictions
    if isinstance(raw_predictions, tuple):
        raw_predictions = raw_predictions[0]
    return multilabel_metrics(predictions=raw_predictions, labels=p.label_ids)

In [None]:
# Load the pickled dataset from the repository cloned in the install cell.
# NOTE(review): unpickling can execute arbitrary code — only load this file from a trusted source.
data = pd.read_pickle('thesis/datasets/etpc_reannotated.pkl')
# Keep the text columns plus the token-label columns for the configured label_format.
data = subset_labels(data, label_format)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns={f's1_token_labs_{label_format}': 's1_token_labs',


Unnamed: 0,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,collapsed_labels,s1_token_labs,s2_token_labs
0,"Amrozi accused his brother, whom he called ""th...","Referring to him as only ""the witness"", Amrozi...","[Amrozi, accused, his, brother, ,, whom, he, c...","[Referring, to, him, as, only, ``, the, witnes...","[0, 2, 3, 4]","[[0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, ...","[[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, ..."
2,They had published an advertisement on the Int...,"On June 10, the ship's owners had published an...","[They, had, published, an, advertisement, on, ...","[On, June, 10, ,, the, ship, 's, owners, had, ...","[0, 2, 3, 4, 5]","[[0.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, ...","[[0.0, 1.0, 0.0], [0.0, 1.0, 0.0], [0.0, 1.0, ..."
4,"The stock rose $2.11, or about 11 percent, to ...",PG&E Corp. shares jumped $1.63 or 8 percent to...,"[The, stock, rose, $, 2.11, ,, or, about, 11, ...","[PG, &, E, Corp., shares, jumped, $, 1.63, or,...","[0, 1, 2, 3, 4, 5, 8]","[[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, ...","[[0.0, 0.0, 1.0], [0.0, 0.0, 1.0], [0.0, 0.0, ..."
5,Revenue in the first quarter of the year dropp...,With the scandal hanging over Stewart's compan...,"[Revenue, in, the, first, quarter, of, the, ye...","[With, the, scandal, hanging, over, Stewart, '...","[0, 1, 2]","[[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 0.0, ...","[[1.0, 0.0, 0.0], [1.0, 0.0, 0.0], [1.0, 0.0, ..."
7,The DVD-CCA then appealed to the state Supreme...,The DVD CCA appealed that decision to the U.S....,"[The, DVD-CCA, then, appealed, to, the, state,...","[The, DVD, CCA, appealed, that, decision, to, ...","[0, 1, 2, 4, 7]","[[0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [1.0, 0.0, ...","[[0.0, 0.0, 0.0], [0.0, 0.0, 1.0], [0.0, 0.0, ..."


# Preprocessing

## Train/Test Split

In [None]:
# Indices selected for each set
# It's easier to just use indices rather than having to deal with sentence pairs here
indices = np.array(data.index.tolist())
# Add a trailing axis so the index values form an (n_rows, 1) matrix for the split below.
indices = np.expand_dims(indices, axis=1)
indices.shape

(3900, 1)

In [None]:
# Labels, converted
# Each row's list of label ids becomes a fixed-length multi-hot vector (see id_array_to_labels).
labels = data['collapsed_labels'].values
labels = [id_array_to_labels(each_list) for each_list in labels]
# Keep the vectors on the frame as well as in a stacked array.
data['labels'] = labels
labels = np.array(labels)

## Stratification

In [None]:
# ITERATIVE STRATIFICATION
# NOTE(review): presumably the splitter draws from numpy's global RNG, which is why it is seeded here — confirm.
np.random.seed(3)
# 80/20 split into train and temp sets (temp set will be further split below)
x_train, y_train, x_val_test, y_val_test = iterative_train_test_split(indices, labels, test_size=0.2)
# 50/50 split temp set into validation and test sets
x_val, y_val, x_test, y_test  = iterative_train_test_split(x_val_test, y_val_test, test_size=0.5)

# Sanity check: number of examples that ended up in each split
print(x_train.shape)
print(x_val.shape)
print(x_test.shape)

(3087, 1)
(414, 1)
(399, 1)


In [None]:
# Converting indices in x matrices back to 1-d
# Take the single column explicitly: unlike np.squeeze, this stays 1-d even when a
# split contains exactly one row (squeeze would collapse it to a 0-d scalar).
indices_train = x_train[:, 0]
indices_test = x_test[:, 0]
indices_val = x_val[:, 0]

In [None]:
# Select the rows of each split by index label.
train = data.loc[indices_train, :]
test = data.loc[indices_test, :]
val = data.loc[indices_val, :]

## Tokenizing

In [None]:
# Adds the "tokenized_sentences" and "inputs" columns to all three splits in place.
# single_label=False keeps one label vector per token (multi-label setting).
apply_tokenization(train, test, val, single_label=False)

# Training

In [None]:
# Fixed hyperparameters for this run
params = {
    'learning_rate': 2.367719983396521e-05,
    'num_train_epochs': 7,
    'warmup_steps': 78,
    'weight_decay': 0.03871726302598747,
    'per_device_train_batch_size': 8,
}
# Validation metric used to pick the best checkpoint
metric_name = "f1"

args = TrainingArguments(
    output_dir="bert-paraop",
    eval_strategy="epoch",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=params['learning_rate'],
    per_device_train_batch_size=params['per_device_train_batch_size'],
    # Evaluation deliberately reuses the training batch size
    per_device_eval_batch_size=params['per_device_train_batch_size'],
    num_train_epochs=params['num_train_epochs'],
    warmup_steps=params['warmup_steps'],
    weight_decay=params['weight_decay'],
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model=metric_name,
    seed=3,
)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# model=None: the model is created through model_init instead
trainer = MultiLabelTrainer(
    model=None,
    args=args,
    train_dataset=train['inputs'].values,
    eval_dataset=val['inputs'].values,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_multilabel_metrics,
    model_init=model_init,
)

def f1(input):
    """Return the value stored under 'eval_f1' in a metrics mapping."""
    score = input['eval_f1']
    return score

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune with the arguments configured above; evaluation and checkpointing happen every epoch.
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Hamming loss
1,0.189,0.176469,0.55111,0.721952,0.065798
2,0.1843,0.155314,0.604024,0.75767,0.06133
3,0.1218,0.158818,0.627321,0.773348,0.058843
4,0.1061,0.169232,0.636613,0.784112,0.058905
5,0.077,0.169428,0.648585,0.800917,0.059473
6,0.0756,0.177444,0.639852,0.791543,0.059795
7,0.0667,0.181027,0.642963,0.792194,0.059043


TrainOutput(global_step=2702, training_loss=0.1301419171115189, metrics={'train_runtime': 286.4767, 'train_samples_per_second': 75.43, 'train_steps_per_second': 9.432, 'total_flos': 992533476220020.0, 'train_loss': 0.1301419171115189, 'epoch': 7.0})

# Evaluation

In [None]:
# Evaluate on the validation split and list every reported metric
eval_result = trainer.evaluate()
for name, score in eval_result.items():
    print(f"{name}: {score}")

eval_loss: 0.16942796111106873
eval_f1: 0.6485849056603774
eval_roc_auc: 0.8009168210017643
eval_hamming loss: 0.05947281966256774
eval_runtime: 1.096
eval_samples_per_second: 377.743
eval_steps_per_second: 47.446
epoch: 7.0


# Testing

In [None]:
# Predict on the held-out test split and display the class-wise metric table.
show_multilabel_test_result(trainer, test)

test_loss: 0.1862102448940277
test_f1: 0.6625010847869478
test_roc_auc: 0.7991621949025983
test_hamming loss: 0.06382420035120542
test_runtime: 1.0439
test_samples_per_second: 382.232
test_steps_per_second: 47.899


  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


Unnamed: 0,precision_per_class,recall_per_class,f1_per_class,f1_samples,accuracy,precision_overall,recall_overall
1. Addition/Deletion,0.770734,0.707222,0.737614,0.187731,0.832751,0.70101,0.628003
2. Change of Order,0.621387,0.446985,0.519952,0.187731,0.832751,0.70101,0.628003
3. Substitution,0.674126,0.63151,0.652122,0.187731,0.832751,0.70101,0.628003


# Optional: Hyperparameter Search

In [None]:
# Optimise the validation F1, the same metric used for checkpoint selection.
# The trainer's compute_metrics (compute_multilabel_metrics) reports eval_f1 but no
# eval_accuracy, so get_accuracy would raise KeyError here.
best_trials = trainer.hyperparameter_search(
    direction="maximize",
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=15,
    compute_objective=f1
)

[I 2024-08-01 23:04:29,453] A new study created in memory with name: no-name-6ae1f40d-91a1-4435-9a62-480a4441911a
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,0accuracy,Precision,Recall
1,1.0377,0.802727,0.628894,0.0,0.739603,0.547013,0.739603
2,0.7302,0.575984,0.774364,0.453159,0.796021,0.755583,0.796021
3,0.5806,0.510594,0.801866,0.548438,0.821305,0.795793,0.821305
4,0.4546,0.459336,0.822911,0.596686,0.822503,0.824289,0.822503
5,0.3707,0.471696,0.83028,0.612268,0.838668,0.82978,0.838668


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
[I 2024-08-01 23:06:30,513] Trial 0 finished with value: 0.6122684074109628 and parameters: {'learning_rate': 0.0001148646311538582, 'num_train_epochs': 5, 'warmup_steps': 130, 'weight_decay': 0.012740828164462818, 'per_device_train_batch_size': 128}. Best is trial 0 with value: 0.6122684074109628.
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1,Accuracy,0accuracy,Precision,Recall
1,0.4933,0.502941,0.808001,0.558203,0.830194,0.808801,0.830194
2,0.4249,0.416505,0.843796,0.651714,0.846175,0.842492,0.846175


[W 2024-08-01 23:08:04,052] Trial 1 failed with parameters: {'learning_rate': 4.5916782139053155e-05, 'num_train_epochs': 4, 'warmup_steps': 177, 'weight_decay': 0.019537492806101862, 'per_device_train_batch_size': 8} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/optuna/study/_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/integrations/integration_utils.py", line 211, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 1932, in train
    return inner_training_loop(
  File "/usr/local/lib/python3.10/dist-packages/transformers/trainer.py", line 2273, in _inner_training_loop
    and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
KeyboardInterrupt
[W 2024-08-01 23:08:04,053] Trial 1 failed with value N

KeyboardInterrupt: 

In [None]:
# Display the result returned by hyperparameter_search above.
# NOTE(review): the recorded output below does not match the search log shown above — it may be from an earlier run; confirm.
best_trials

BestRun(run_id='1', objective=0.6022401184856059, hyperparameters={'learning_rate': 0.00021437583926908025, 'num_train_epochs': 7, 'warmup_steps': 261, 'weight_decay': 0.03953905927366381, 'per_device_train_batch_size': 16}, run_summary=None)