In [1]:
# Importing all libraries
import pandas as pd
import numpy as np
import ast
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
import torch

KeyboardInterrupt: 

In [24]:
# Read the train and validation splits (same order as the path tuple below)
df_train, df_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/home/matus/NLPD_18/part1/outputs/output_train.csv',
        '/home/matus/NLPD_18/part1/outputs/output_valid.csv',
    )
)

In [25]:
# Functions to extract and insert XML tags (entities) into statement text

def merge_adjacent_entities(entities):
    """Merge touching or overlapping entity spans that share the same type.

    Args:
        entities: List of dicts, each with at least the keys ``'entity'``
            (type label), ``'start'`` and ``'end'`` (offsets into the text).

    Returns:
        A new list of entity dicts sorted by ``'start'``. Two consecutive
        entities of the same type are fused when the later one starts at
        most one position after the end of the earlier one. The dicts passed
        in are never modified; an empty or missing input yields ``[]``.
    """
    if not entities:
        return []

    # Sort entities by start position
    ordered = sorted(entities, key=lambda ent: ent['start'])

    # Copy every dict we keep, so that widening 'end' below does not leak
    # back into the caller's data.
    merged = [dict(ordered[0])]

    for current in ordered[1:]:
        last = merged[-1]

        # Merge adjacent entities of the same type
        if current['entity'] == last['entity'] and current['start'] <= last['end'] + 1:
            # max() prevents the span from shrinking when `current` lies
            # entirely inside `last`.
            last['end'] = max(last['end'], current['end'])
        else:
            merged.append(dict(current))

    return merged

def insert_xml_tags(text, entities):
    """Wrap every entity span of ``text`` in ``<TYPE>...</TYPE>`` tags.

    Args:
        text: The statement to annotate.
        entities: A list of entity dicts (keys ``'entity'``, ``'start'``,
            ``'end'``), or the string representation of such a list.
            Any other value -- ``None``, or the float ``NaN`` that pandas
            yields for an empty CSV cell -- is treated as "no entities".

    Returns:
        ``text`` with the tags inserted, or ``text`` unchanged when there
        are no usable entities or the string form cannot be parsed (a
        message is printed in the latter case).
    """
    if not entities:
        return text

    # If entities are a string, try to evaluate it
    if isinstance(entities, str):
        try:
            entities = ast.literal_eval(entities)
        except (ValueError, SyntaxError) as e:
            print(f"Error parsing entities: {entities} - {e}")
            return text  # Skip this row or handle it differently

    # Missing values (e.g. NaN is truthy and not a string) and parsed-but-empty
    # lists carry nothing to tag; return early instead of failing in sorted().
    if not isinstance(entities, (list, tuple)) or not entities:
        return text

    # Merge adjacent entities; the helper returns them ordered by start
    # position, so no further sorting is needed here.
    merged_entities = merge_adjacent_entities(entities)

    # Each inserted tag pair shifts every later span to the right; `offset`
    # tracks the number of characters added so far.
    offset = 0
    for ent in merged_entities:
        ent_type = ent['entity']
        start = ent['start'] + offset
        end = ent['end'] + offset

        open_tag = f"<{ent_type}>"
        close_tag = f"</{ent_type}>"

        text = text[:start] + open_tag + text[start:end] + close_tag + text[end:]
        offset += len(open_tag) + len(close_tag)

    return text

In [26]:
# Register tqdm with pandas so progress_apply shows a progress bar
tqdm.pandas()

# Add a column holding each statement with its entities wrapped in XML tags;
# the same transformation is applied to the train and validation frames.
for frame in (df_train, df_valid):
    frame['A_XML_statement'] = frame.progress_apply(
        lambda row: insert_xml_tags(row['statement'], row['A_raw_entities']),
        axis=1,
    )

100%|███████████████████████████████████| 18369/18369 [00:01<00:00, 9536.28it/s]
100%|█████████████████████████████████████| 2297/2297 [00:00<00:00, 9242.53it/s]


In [31]:
# Preview the first five tagged statements (head() defaults to five rows)
df_train['A_XML_statement'].head()

0    90 percent of <MISC>Americans</MISC> "support ...
1    Last year was one of the deadliest years ever ...
2    <PER>Bernie Sanders</PER>'s plan is "to raise ...
3    Voter ID is supported by an overwhelming major...
4    Says <PER>Barack Obama</PER> "robbed <MISC>Med...
Name: A_XML_statement, dtype: object

In [35]:
# Tokenizer for bert-base-uncased, extended with the XML entity tags
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Add the special XML tags (one opening and one closing tag per entity type)
special_tokens = [
    '<PER>', '</PER>',
    '<ORG>', '</ORG>',
    '<LOC>', '</LOC>',
    '<MISC>', '</MISC>',
]
tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})

# Load BERT for two-class classification and resize its token embeddings
# to match the enlarged tokenizer vocabulary
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.resize_token_embeddings(len(tokenizer))

# Target labels, one per statement
train_labels = df_train['label_binary'].tolist()
valid_labels = df_valid['label_binary'].tolist()

# Tokenize the tagged statements of each split with truncation and padding enabled
train_encodings = tokenizer(df_train['A_XML_statement'].tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(df_valid['A_XML_statement'].tolist(), truncation=True, padding=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
# Dataset wrapper around the tokenizer output
class NERDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with per-example labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the label list
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoding field of example `idx` to a tensor,
        # then attach its label under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a NERDataset
train_dataset, valid_dataset = (
    NERDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
    )
)

In [38]:
# Grid-search parameters: every combination of epochs x batch size x learning
# rate becomes one experiment, numbered from 1 in iteration order
# (learning rate varies fastest, epochs slowest).
experiment_configs = []

for exp_id, (epochs, batch_size, lr) in enumerate(
    (
        (n_epochs, bs, rate)
        for n_epochs in [2, 3, 4, 5, 6]
        for bs in [4, 8, 16, 32, 64]
        for rate in [1e-5, 2e-5, 3e-5, 4e-5]
    ),
    start=1,
):
    experiment_configs.append(
        dict(name=f"exp_{exp_id}", epochs=epochs, batch_size=batch_size, lr=lr)
    )

# Leave exp_id pointing at the next unused experiment number
exp_id += 1

In [39]:

def compute_metrics(pred):
    """Score a prediction object with accuracy, F1, precision and recall.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict keyed by metric name.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    return {metric: scorer(y_true, y_pred) for metric, scorer in scorers}

In [41]:
# TRAIN: fine-tune one freshly initialised model per grid-search configuration
# and collect the validation metrics of each run in `all_results`.

all_results = []

# fp16 mixed precision needs a CUDA device; enable it only when one is
# available so the loop also runs (in full precision) on a CPU-only machine
# instead of failing while building TrainingArguments.
use_fp16 = torch.cuda.is_available()

for config in experiment_configs:
    print(f"Training {config['name']}...")

    # Start every experiment from the pretrained checkpoint, with the
    # embedding matrix resized for the added XML tag tokens.
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    model.resize_token_embeddings(len(tokenizer))

    training_args = TrainingArguments(
        output_dir=f"./results/{config['name']}",
        evaluation_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=config['batch_size'],
        per_device_eval_batch_size=config['batch_size'],
        num_train_epochs=config['epochs'],
        learning_rate=config['lr'],
        weight_decay=0.01,
        logging_dir=f"./logs/{config['name']}",
        save_strategy="epoch",
        report_to="none",  # don't use wandb/huggingface
        metric_for_best_model="f1",        # use F1 to pick the best checkpoint / early stopping
        greater_is_better=True,            # higher F1 = better
        load_best_model_at_end=True,
        fp16=use_fp16
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    trainer.train()
    # With load_best_model_at_end=True this evaluates the best checkpoint
    eval_result = trainer.evaluate()

    best_checkpoint = trainer.state.best_model_checkpoint

    all_results.append({
        "name": config["name"],
        "epochs": config["epochs"],
        "batch_size": config["batch_size"],
        "lr": config["lr"],
        "accuracy": eval_result.get("eval_accuracy", None),
        "f1": eval_result.get("eval_f1", None),
        "precision": eval_result.get("eval_precision", None),
        "recall": eval_result.get("eval_recall", None),
        "best_f1": trainer.state.best_metric,
        "final_epoch": trainer.state.epoch,
        "best_checkpoint": best_checkpoint,
        # Checkpoint directories end in "-<step>"; recover the step number
        "best_step": int(best_checkpoint.split("-")[-1]) if best_checkpoint else None
    })


Training exp_1...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5781,0.560258,0.703091,0.632939,0.663657,0.604938
2,0.4843,0.581341,0.699608,0.685219,0.615574,0.772634


In [43]:
# Build the results table, best F1 first, with a fresh 0..n-1 index, and show it
results_df = (
    pd.DataFrame(all_results)
    .sort_values(by="f1", ascending=False)
    .reset_index(drop=True)
)
display(results_df)

Unnamed: 0,name,epochs,batch_size,lr,accuracy,f1,precision,recall,best_f1,final_epoch,best_checkpoint,best_step
0,exp_1,2,64,4e-05,0.699608,0.685219,0.615574,0.772634,0.685219,2.0,./results/exp_1/checkpoint-576,576


In [2]:
# Write the sorted results table to CSV without the index column
results_df.to_csv(path_or_buf="delete_test_bert_experiment_results.csv", index=False)

NameError: name 'results_df' is not defined

In [3]:
# After running all of the above as a .py script on the cluster, read the CSV file it created and pick the best model

In [13]:
import pandas as pd
import numpy as np
# Load the grid-search results CSV and preview its first three rows
df_exp = pd.read_csv(filepath_or_buffer='/home/matus/NLPD_18/part2/bert_parameteres_gridsearch.csv')
df_exp.head(n=3)

Unnamed: 0,name,epochs,batch_size,lr,accuracy,f1,precision,recall,best_f1,final_epoch,best_checkpoint,best_step
0,exp_14,2,32,2e-05,0.708315,0.6949,0.623366,0.784979,0.6949,2.0,./results/exp_14/checkpoint-1150,1150
1,exp_53,4,32,1e-05,0.703091,0.694444,0.615079,0.797325,0.694444,4.0,./results/exp_53/checkpoint-1725,1725
2,exp_6,2,8,2e-05,0.717022,0.693685,0.64,0.757202,0.693685,2.0,./results/exp_6/checkpoint-4594,4594


In [16]:
# Select the three experiments with the highest F1
top3 = df_exp.sort_values(by='f1', ascending=False).head(3)

# Print details of the top 3.
# Number the models by their rank in the sorted frame: sort_values keeps the
# original index labels, so using the label (+1) as the model number is only
# correct when the CSV already happens to be sorted by F1.
for rank, (_, row) in enumerate(top3.iterrows(), start=1):
    print(f"--- Model {rank} ---")
    print(f"name: {row['name']}")
    print(f"F1: {row['f1']}")
    print(f"Accuracy: {row['accuracy']}")
    print(f"Recall: {row['recall']}")
    print(f"batch_size: {row['batch_size']}")
    print(f"lr: {row['lr']}")
    print(f"final_epoch: {row['final_epoch']}\n")

--- Model 1 ---
name: exp_14
F1: 0.6948998178506375
Accuracy: 0.7083151937309534
Recall: 0.7849794238683128
batch_size: 32
lr: 2e-05
final_epoch: 2.0

--- Model 2 ---
name: exp_53
F1: 0.6944444444444444
Accuracy: 0.7030909882455376
Recall: 0.7973251028806584
batch_size: 32
lr: 1e-05
final_epoch: 4.0

--- Model 3 ---
name: exp_6
F1: 0.6936852026390198
Accuracy: 0.717022202873313
Recall: 0.757201646090535
batch_size: 8
lr: 2e-05
final_epoch: 2.0

