In [None]:
#!pip install transformers seqeval[gpu]
#!pip install torch
#!pip install pandas

In [1]:
### SETUP ###

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from collections import defaultdict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import json
import csv
import wandb  

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

In [2]:
### PREPROCESS DATA ### 

def preprocess(file_path: str):
   """
   Read and process a CoNLL format file into a structured DataFrame with sentence-level annotations.
   
   The function processes files where each line contains a word and its tag, sentences are separated
   by blank lines, and returns processed data with sentence-level aggregation.
   
   Args:
       file_path (str): Path to the CoNLL format file to be processed
       
   Returns:
       tuple: Contains:
           - DataFrame: Processed data with columns ['sentence', 'word_labels']
           - dict: Mapping of labels to unique IDs (label2id)
           - dict: Mapping of IDs back to labels (id2label)
           
   Note:
       - Words in sentences are joined with spaces
       - Tags are joined with commas
       - Empty values are replaced with 'NaN'
   """
   # initialize lists to store sentence data
   sentences = []
   sentence_no = 1
   sentence_words = []
   sentence_tags = []
   
   # process file line by line
   with open(file_path, "r", encoding="utf-8") as f:
       for line in f:
           line = line.strip()
           
           # handle sentence boundaries
           if not line:
               if sentence_words and sentence_tags:
                   sentences.append((sentence_no, sentence_words, sentence_tags))
               sentence_no += 1
               sentence_words = []
               sentence_tags = []
           
           # extract word and tag from line
           else:
               word, tag = line.split()[0], line.split()[-1]
               sentence_words.append(word or "NaN")
               sentence_tags.append(tag or "NaN")
   
   # add final sentence if exists
   if sentence_words and sentence_tags:
       sentences.append((sentence_no, sentence_words, sentence_tags))
   
   # create dataframe from processed sentences
   data = pd.DataFrame({
       "sentence_no": [s[0] for s in sentences],
       "word": [" ".join(s[1]) for s in sentences],
       "tag": [",".join(s[2]) for s in sentences]
   })
   
   # create label mappings
   unique_tags = sorted(set(tag for tags in data["tag"] for tag in tags.split(",")))
   label2id = {tag: idx for idx, tag in enumerate(unique_tags)}
   id2label = {idx: tag for tag, idx in label2id.items()}
   
   # prepare final dataframe
   data = data[["word", "tag"]].drop_duplicates().reset_index(drop=True)
   data.columns = ["sentence", "word_labels"]
   
   return data, label2id, id2label


data, label2id, id2label = preprocess('data_title.conll')

In [3]:
### HYPERPARAMS ### 

default_parameters = {
    'MAX_LEN': 512,
    'TRAIN_BATCH_SIZE': 21, 
    'VALID_BATCH_SIZE': 21, 
    'EPOCHS': 30, 
    'MAX_GRAD_NORM': 10,
    'model_name': 'xlm-roberta-large',  # Default model
    'tokenizer_name': 'xlm-roberta-large'  # Default tokenizer
}

In [4]:
### INITIALIZE TOKENIZER ### 

tokenizer = AutoTokenizer.from_pretrained(default_parameters['tokenizer_name'])



In [5]:
### DEFINE WORD TOKENIZER ### 

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):

    """
    Function and description (below) modified from https://github.com/chambliss/Multilingual_NER/blob/8d3afffd4c99774e0585f4c7d721bb99481fd60f/python/utils/main_utils.py#L118
    
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.strip().split(), text_labels.split(",")):
        tokenized_word = tokenizer.tokenize(word)
        tokenized_sentence.extend(tokenized_word)
        labels.extend([label] * len(tokenized_word))

    return tokenized_sentence, labels

In [6]:
### DATASET CLASS ### 

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len, label2id):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label2id = label2id

    def __getitem__(self, index):
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index].split(",")

        # Tokenize the sentence with offsets
        encoding = self.tokenizer(
            sentence.split(),
            is_split_into_words=True,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )

        labels = [self.label2id[label] for label in word_labels]
        word_ids = encoding.word_ids()

        # Create label_ids aligned with tokens
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # Ignored when computing loss
            else:
                label_ids.append(labels[word_idx])

        # Prepare item
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(label_ids)
        item['texts'] = sentence  # Include original text for prediction
        return item

    def __len__(self):
        return len(self.data)

In [7]:
### CREATE DATA SPLITS ### 

train_size = 0.6
val_size = 0.2
test_size = 0.2

data = data.sample(frac=1, random_state=42).reset_index(drop=True)

train_data, temp_data = train_test_split(data, test_size=(1 - train_size), random_state=1234)
val_data, test_data = train_test_split(temp_data, test_size=(test_size / (val_size + test_size)), random_state=1234)

print(f"FULL Dataset: {data.shape}")
print(f"TRAIN Dataset: {train_data.shape}")
print(f"VALIDATION Dataset: {val_data.shape}")
print(f"TEST Dataset: {test_data.shape}")

train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

FULL Dataset: (401, 2)
TRAIN Dataset: (240, 2)
VALIDATION Dataset: (80, 2)
TEST Dataset: (81, 2)


In [9]:
train_data.to_csv("train_titles.csv") 
val_data.to_csv("val_titles.csv") 
test_data.to_csv("test_titles.csv") 

In [9]:
def collate_fn(batch):
    keys = batch[0].keys()
    collated = {key: [] for key in keys}

    for item in batch:
        for key in keys:
            collated[key].append(item[key])

    # Stack tensors for input_ids, attention_mask, labels
    for key in ['input_ids', 'attention_mask', 'labels']:
        collated[key] = torch.stack(collated[key])

    # Pad offset_mapping
    max_len = max([len(o) for o in collated['offset_mapping']])
    padded_offsets = []
    for offsets in collated['offset_mapping']:
        if len(offsets) < max_len:
            padding = torch.zeros((max_len - len(offsets), 2), dtype=torch.long)
            offsets = torch.cat((offsets, padding), dim=0)
        padded_offsets.append(offsets)
    collated['offset_mapping'] = torch.stack(padded_offsets)

    # 'texts' is already a list of strings, no need to process further
    # Ensure 'texts' is included in collated
    # collated['texts'] = collated['texts']  # This line is unnecessary

    return collated

In [10]:
### BUILD DATALOADERS ### 

training_set = CustomDataset(train_data, tokenizer, default_parameters['MAX_LEN'], label2id)
validation_set = CustomDataset(val_data, tokenizer, default_parameters['MAX_LEN'], label2id)
testing_set = CustomDataset(test_data, tokenizer, default_parameters['MAX_LEN'], label2id)


training_loader = DataLoader(
    training_set,
    batch_size=default_parameters['TRAIN_BATCH_SIZE'],
    shuffle=True,
    collate_fn=collate_fn
)

validation_loader = DataLoader(
    validation_set,
    batch_size=default_parameters['VALID_BATCH_SIZE'],
    shuffle=False,
    collate_fn=collate_fn
)

testing_loader = DataLoader(
    testing_set,
    batch_size=default_parameters['VALID_BATCH_SIZE'],
    shuffle=False,
    collate_fn=collate_fn
)

In [11]:
### INITIALIZE MODEL ### 

def prepare_model(config):
    model = AutoModelForTokenClassification.from_pretrained(
        config['model_name'],
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
        ignore_mismatched_sizes=True
    )
    model.to(device)
    return model

In [12]:
### CONSTRUCT FULL NAMED ENTITIES FROM TOKENS ### 

def reconstruct_entities(tokens, labels, offsets):
    entities = []
    current_entity = None

    for idx, (token, label, offset) in enumerate(zip(tokens, labels, offsets)):
        start, end = offset

        # Skip tokens with no offset (special tokens, padding)
        if start == end:
            continue

        # Replace special characters if necessary (adjust for your tokenizer)
        word = token.replace('▁', ' ')

        # Aggregate I-O-B tagged entities
        if label.startswith('B-'):
            if current_entity:
                entities.append(current_entity)
            current_entity = {'type': label[2:], 'text': word.strip(), 'start': start, 'end': end}
        elif label.startswith('I-'):
            if current_entity and label[2:] == current_entity['type']:
                current_entity['text'] += word
                current_entity['end'] = end
            else:
                if current_entity:
                    entities.append(current_entity)
                current_entity = {'type': label[2:], 'text': word.strip(), 'start': start, 'end': end}
        else:
            # Deal with 'O' tag
            if current_entity:
                entities.append(current_entity)
                current_entity = None

    if current_entity:
        entities.append(current_entity)

    # Merge entities of the same type if they are adjacent
    merged_entities = []
    for entity in entities:
        if merged_entities and merged_entities[-1]['type'] == entity['type'] and merged_entities[-1]['end'] == entity['start']:
            merged_entities[-1]['text'] += entity['text']
            merged_entities[-1]['end'] = entity['end']
        else:
            merged_entities.append(entity)

    # Clean up extra spaces
    for entity in merged_entities:
        entity['text'] = ' '.join(entity['text'].split())
        entity['start'] = int(entity['start'])
        entity['end'] = int(entity['end'])

    return merged_entities


In [13]:
### F1 METRICS ### 

def compute_f1_scores(true_entities_list, pred_entities_list):
   """
   Compute precision, recall, and F1 scores for named entity recognition.
   
   Args:
       true_entities_list (list): Ground truth entities
       pred_entities_list (list): Predicted entities
       
   Returns:
       dict: F1 scores including per-class, micro, macro and weighted averages
   """
   # initialize metric counters
   class_metrics = defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0})
   total_metrics = {"tp": 0, "fp": 0, "fn": 0}
   total_support = 0
   
   # compute true/false positives/negatives
   for ground_truth, predictions in zip(true_entities_list, pred_entities_list):
       ground_truth_set = set((e['type'], e['text']) for e in ground_truth)
       predictions_set = set((e['type'], e['text']) for e in predictions)
       
       # count true positives and false negatives
       for entity in ground_truth_set:
           if entity in predictions_set:
               class_metrics[entity[0]]["tp"] += 1
               total_metrics["tp"] += 1
           else:
               class_metrics[entity[0]]["fn"] += 1
               total_metrics["fn"] += 1
       
       # count false positives
       for entity in predictions_set:
           if entity not in ground_truth_set:
               class_metrics[entity[0]]["fp"] += 1
               total_metrics["fp"] += 1
   
   # compute per-class metrics
   f1_scores = {}
   total_precision = 0
   total_recall = 0
   total_f1 = 0
   
   for entity_class, metrics in class_metrics.items():
       tp = metrics["tp"]
       fp = metrics["fp"]
       fn = metrics["fn"]
       support = tp + fn
       
       # calculate precision, recall, f1
       precision = tp / (tp + fp) if (tp + fp) > 0 else 0
       recall = tp / (tp + fn) if (tp + fn) > 0 else 0
       f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
       
       f1_scores[entity_class] = {
           "precision": precision,
           "recall": recall,
           "f1-score": f1,
           "support": support
       }
       
       total_precision += precision
       total_recall += recall
       total_f1 += f1
       total_support += support
   
   # compute macro averages
   num_classes = len(class_metrics)
   macro_precision = total_precision / num_classes if num_classes > 0 else 0
   macro_recall = total_recall / num_classes if num_classes > 0 else 0
   macro_f1 = total_f1 / num_classes if num_classes > 0 else 0
   
   # compute micro averages
   tp_total = total_metrics["tp"]
   fp_total = total_metrics["fp"]
   fn_total = total_metrics["fn"]
   micro_precision = tp_total / (tp_total + fp_total) if (tp_total + fp_total) > 0 else 0
   micro_recall = tp_total / (tp_total + fn_total) if (tp_total + fn_total) > 0 else 0
   micro_f1 = 2 * micro_precision * micro_recall / (micro_precision + micro_recall) if (micro_precision + micro_recall) > 0 else 0
   
   # compute weighted averages
   weighted_precision = sum((metrics["precision"] * metrics["support"]) for metrics in f1_scores.values()) / total_support if total_support > 0 else 0
   weighted_recall = sum((metrics["recall"] * metrics["support"]) for metrics in f1_scores.values()) / total_support if total_support > 0 else 0
   weighted_f1 = sum((metrics["f1-score"] * metrics["support"]) for metrics in f1_scores.values()) / total_support if total_support > 0 else 0
   
   # add average metrics to results
   f1_scores.update({
       "micro avg": {
           "precision": micro_precision,
           "recall": micro_recall,
           "f1-score": micro_f1,
           "support": total_support
       },
       "macro avg": {
           "precision": macro_precision,
           "recall": macro_recall,
           "f1-score": macro_f1,
           "support": total_support
       },
       "weighted avg": {
           "precision": weighted_precision,
           "recall": weighted_recall,
           "f1-score": weighted_f1,
           "support": total_support
       }
   })
   
   return f1_scores


In [14]:
### TRAIN FOR A SINGLE EPOCH ### 

def train_epoch(model, optimizer, train_loader, epoch, config):
   """
   Train model for one epoch.
   
   Args:
       model: NER model
       optimizer: Optimizer instance
       train_loader: Training data loader
       epoch (int): Current epoch number 
       config (dict): Training configuration parameters
   """
   # set training mode
   model.train()
   total_loss = 0
   nb_tr_steps = 0
   
   # train on batches
   for idx, batch in enumerate(train_loader):
       # move batch to device
       ids = batch['input_ids'].to(device)
       mask = batch['attention_mask'].to(device)
       targets = batch['labels'].to(device)
       
       # forward pass
       outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
       loss = outputs.loss
       
       # accumulate loss
       total_loss += loss.item()
       nb_tr_steps += 1
       
       # backward pass with gradient clipping
       torch.nn.utils.clip_grad_norm_(
           parameters=model.parameters(), 
           max_norm=config['MAX_GRAD_NORM']
       )
       optimizer.zero_grad()
       loss.backward()
       optimizer.step()
   
   # compute average loss and log
   avg_epoch_loss = total_loss / nb_tr_steps
   wandb.log({'train_loss': avg_epoch_loss, 'epoch': epoch + 1})
   print(f"Epoch {epoch + 1}/{config['EPOCHS']}, Training Loss: {avg_epoch_loss}")


def validate(model, val_loader, epoch, config):
    model.eval()
    eval_loss = 0
    nb_eval_steps = 0
    true_entities_list = []
    pred_entities_list = []

    with torch.no_grad():
        for idx, batch in enumerate(val_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            offsets = batch['offset_mapping']
            texts = batch['texts']  # 'texts' should now be included

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits
            eval_loss += loss.item()
            nb_eval_steps += 1

            preds = torch.argmax(logits, dim=2)

            for i in range(input_ids.size(0)):
                # Move tensors to CPU
                input_ids_i = input_ids[i].cpu()
                pred_labels_i = preds[i].cpu()
                true_labels_i = labels[i].cpu()
                attention_mask_i = attention_mask[i].cpu()
                offset_mapping_i = offsets[i]
                text = texts[i]

                active_tokens = attention_mask_i == 1
                input_ids_i = input_ids_i[active_tokens].numpy()
                pred_labels_i = pred_labels_i[active_tokens].numpy()
                true_labels_i = true_labels_i[active_tokens].numpy()
                offset_mapping_i = offset_mapping_i[active_tokens].numpy()

                tokens = tokenizer.convert_ids_to_tokens(input_ids_i)
                pred_tags = [id2label[label_id] if label_id != -100 else 'O' for label_id in pred_labels_i]
                true_tags = [id2label[label_id] if label_id != -100 else 'O' for label_id in true_labels_i]

                pred_entities = reconstruct_entities(tokens, pred_tags, offset_mapping_i)
                true_entities = reconstruct_entities(tokens, true_tags, offset_mapping_i)

                pred_entities_list.append(pred_entities)
                true_entities_list.append(true_entities)

    avg_eval_loss = eval_loss / nb_eval_steps
    f1_scores = compute_f1_scores(true_entities_list, pred_entities_list)
    micro_f1 = f1_scores['micro avg']['f1-score']
    wandb.log({'validation_loss': avg_eval_loss, 'validation_f1': micro_f1, 'epoch': epoch + 1})
    print(f"Validation Loss: {avg_eval_loss}")
    print(f"Validation F1 Score (Entity Level): {micro_f1}")

    return avg_eval_loss, micro_f1


In [15]:
### TRAIN ### 

def train(config=None):
   """
   Train NER model with given configuration.
   
   Args:
       config (dict, optional): Training configuration parameters.
           If None, uses default_parameters
           
   Returns:
       tuple: Best validation F1 score and learning rate used
   """
   # setup configuration
   if config is None:
       config = default_parameters
   else:
       config = dict(config)
       config.update(default_parameters)
       
   # initialize model and optimizer
   model = prepare_model(config)
   optimizer = torch.optim.Adam(
       params=model.parameters(), 
       lr=config['learning_rate']
   )
   
   # track best model
   best_f1 = 0.0
   best_epoch = 0
   best_model_state = None
   
   # training loop
   for epoch in range(config['EPOCHS']):
       print(f"Training epoch: {epoch + 1}/{config['EPOCHS']}")
       
       # train and validate
       train_epoch(model, optimizer, training_loader, epoch, config)
       print("Running validation...")
       val_loss, val_f1 = validate(model, validation_loader, epoch, config)
       
       # save best model
       if val_f1 > best_f1:
           best_f1 = val_f1
           best_epoch = epoch + 1
           best_model_state = model.state_dict()
           
           # save model with learning rate in filename
           lr_str = f"{config['learning_rate']:.10e}"
           model_filename = f'best_model_lr_{lr_str}.pt'
           torch.save(best_model_state, model_filename)
           print(f"Best model saved at epoch {best_epoch} with F1 score: {best_f1} as {model_filename}")
   
   # log final results
   print(f"Training completed. Best validation F1 score: {best_f1} at epoch {best_epoch}")
   wandb.log({
       'best_epoch': best_epoch, 
       'best_validation_f1': best_f1
   })
   
   return best_f1, config['learning_rate']

In [16]:
# W&B SWEEP CONFIGS 
sweep_config = {
    'method': 'grid',  # Change to grid search
    'metric': {
        'name': 'best_validation_f1',
        'goal': 'maximize'
    },
    'parameters': {
        'learning_rate': {
            'values': [2e-05, 3e-05, 4e-05, 5e-05, 6e-05]  # Specify exact values to test
        }
    }
}


# INITIALIZE SWEEP 
sweep_id = wandb.sweep(sweep_config, project="gbpatentdata_grid_xlm_title")

def sweep_run():
    with wandb.init():
        config = wandb.config
        # Train the model with the current hyperparameters
        best_f1, lr = train(config)

        # Log the best validation F1 score and learning rate
        wandb.log({'best_validation_f1': best_f1, 'learning_rate': lr})

Create sweep with ID: bksuaru8
Sweep URL: https://wandb.ai/matthewleechen/gbpatentdata_grid_xlm_title/sweeps/bksuaru8


In [18]:
# RUN HYPERPARAM SWEEP
wandb.agent(sweep_id, function=sweep_run)

[34m[1mwandb[0m: Agent Starting Run: ukrvwiw8 with config:
[34m[1mwandb[0m: 	learning_rate: 2e-05
[34m[1mwandb[0m: Currently logged in as: [33mmatthewleechen[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1/30
Epoch 1/30, Training Loss: 0.6016708438595136
Running validation...
Validation Loss: 0.21204044669866562
Validation F1 Score (Entity Level): 0
Training epoch: 2/30
Epoch 2/30, Training Loss: 0.16986056044697762
Running validation...
Validation Loss: 0.12579835206270218
Validation F1 Score (Entity Level): 0
Training epoch: 3/30
Epoch 3/30, Training Loss: 0.09591686104734738
Running validation...
Validation Loss: 0.05564243160188198
Validation F1 Score (Entity Level): 0.3404255319148936
Best model saved at epoch 3 with F1 score: 0.3404255319148936 as best_model_lr_2.0000000000e-05.pt
Training epoch: 4/30
Epoch 4/30, Training Loss: 0.04182384628802538
Running validation...
Validation Loss: 0.02374283759854734
Validation F1 Score (Entity Level): 0.5434782608695653
Best model saved at epoch 4 with F1 score: 0.5434782608695653 as best_model_lr_2.0000000000e-05.pt
Training epoch: 5/30
Epoch 5/30, Training Loss: 0.017668565657610696
Running validation...
Validation Loss: 0

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_epoch,▁
best_validation_f1,▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
learning_rate,▁
train_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_f1,▁▁▄▆▆▆▆▇▆▇▆▇▇▇▇▇▇▇▇████████▇█▇
validation_loss,█▅▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,20.0
best_validation_f1,0.83429
epoch,30.0
learning_rate,2e-05
train_loss,0.00103
validation_f1,0.75581
validation_loss,0.00751


[34m[1mwandb[0m: Agent Starting Run: k27ruqas with config:
[34m[1mwandb[0m: 	learning_rate: 3e-05


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1/30
Epoch 1/30, Training Loss: 0.4283423125743866
Running validation...
Validation Loss: 0.17397898063063622
Validation F1 Score (Entity Level): 0
Training epoch: 2/30
Epoch 2/30, Training Loss: 0.0941535122692585
Running validation...
Validation Loss: 0.05906441621482372
Validation F1 Score (Entity Level): 0.24896265560165975
Best model saved at epoch 2 with F1 score: 0.24896265560165975 as best_model_lr_3.0000000000e-05.pt
Training epoch: 3/30
Epoch 3/30, Training Loss: 0.03135848937866589
Running validation...
Validation Loss: 0.017101927427574992
Validation F1 Score (Entity Level): 0.738095238095238
Best model saved at epoch 3 with F1 score: 0.738095238095238 as best_model_lr_3.0000000000e-05.pt
Training epoch: 4/30
Epoch 4/30, Training Loss: 0.014719743941289684
Running validation...
Validation Loss: 0.0047556618228554726
Validation F1 Score (Entity Level): 0.8048780487804879
Best model saved at epoch 4 with F1 score: 0.8048780487804879 as best_model_lr_3.00000000

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_epoch,▁
best_validation_f1,▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
learning_rate,▁
train_loss,█▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_f1,▁▃▆▇▇▇▇▇██████████████████████
validation_loss,█▃▂▁▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,27.0
best_validation_f1,0.97468
epoch,30.0
learning_rate,3e-05
train_loss,0.00031
validation_f1,0.9375
validation_loss,0.00667


[34m[1mwandb[0m: Agent Starting Run: bq2ytqb2 with config:
[34m[1mwandb[0m: 	learning_rate: 4e-05


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1/30
Epoch 1/30, Training Loss: 0.2876355604579051
Running validation...
Validation Loss: 0.07656682282686234
Validation F1 Score (Entity Level): 0.12269938650306748
Best model saved at epoch 1 with F1 score: 0.12269938650306748 as best_model_lr_4.0000000000e-05.pt
Training epoch: 2/30
Epoch 2/30, Training Loss: 0.0544097195379436
Running validation...
Validation Loss: 0.0294593321159482
Validation F1 Score (Entity Level): 0.4433497536945813
Best model saved at epoch 2 with F1 score: 0.4433497536945813 as best_model_lr_4.0000000000e-05.pt
Training epoch: 3/30
Epoch 3/30, Training Loss: 0.026865689277959365
Running validation...
Validation Loss: 0.013947198167443275
Validation F1 Score (Entity Level): 0.5513513513513514
Best model saved at epoch 3 with F1 score: 0.5513513513513514 as best_model_lr_4.0000000000e-05.pt
Training epoch: 4/30
Epoch 4/30, Training Loss: 0.009920233472560843
Running validation...
Validation Loss: 0.011154122301377356
Validation F1 Score (Entity

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_epoch,▁
best_validation_f1,▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
learning_rate,▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁
validation_f1,▁▄▅▅▇████████▇▇████████████▇▇▇
validation_loss,█▄▂▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▃▁▂

0,1
best_epoch,25.0
best_validation_f1,0.97468
epoch,30.0
learning_rate,4e-05
train_loss,0.00296
validation_f1,0.84848
validation_loss,0.01


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xs3tkt6i with config:
[34m[1mwandb[0m: 	learning_rate: 5e-05


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1/30
Epoch 1/30, Training Loss: 0.30421949674685794
Running validation...
Validation Loss: 0.06956760119646788
Validation F1 Score (Entity Level): 0.09815950920245398
Best model saved at epoch 1 with F1 score: 0.09815950920245398 as best_model_lr_5.0000000000e-05.pt
Training epoch: 2/30
Epoch 2/30, Training Loss: 0.06370623114829262
Running validation...
Validation Loss: 0.05183085985481739
Validation F1 Score (Entity Level): 0.136986301369863
Best model saved at epoch 2 with F1 score: 0.136986301369863 as best_model_lr_5.0000000000e-05.pt
Training epoch: 3/30
Epoch 3/30, Training Loss: 0.026605913570771616
Running validation...
Validation Loss: 0.01923153386451304
Validation F1 Score (Entity Level): 0.3070539419087137
Best model saved at epoch 3 with F1 score: 0.3070539419087137 as best_model_lr_5.0000000000e-05.pt
Training epoch: 4/30
Epoch 4/30, Training Loss: 0.01355737952205042
Running validation...
Validation Loss: 0.01808786834590137
Validation F1 Score (Entity L

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_epoch,▁
best_validation_f1,▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
learning_rate,▁
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_f1,▁▁▃▃▆▆▆▇▆█▆▅▅▅▆▆▇▆▇▇█▇▇▅▅▆▆▆▆▆
validation_loss,█▆▃▂▁▂▂▁▂▂▁█▇▃▂▁▁▁▁▁▁▂▂▂▂▁▁▁▁▁

0,1
best_epoch,21.0
best_validation_f1,0.83832
epoch,30.0
learning_rate,5e-05
train_loss,0.00224
validation_f1,0.63043
validation_loss,0.00496


[34m[1mwandb[0m: Agent Starting Run: lnzpusl1 with config:
[34m[1mwandb[0m: 	learning_rate: 6e-05


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training epoch: 1/30
Epoch 1/30, Training Loss: 0.17031084342549244
Running validation...
Validation Loss: 0.09834115579724312
Validation F1 Score (Entity Level): 0
Training epoch: 2/30
Epoch 2/30, Training Loss: 0.03463570478682717
Running validation...
Validation Loss: 0.015824561938643456
Validation F1 Score (Entity Level): 0.4479166666666667
Best model saved at epoch 2 with F1 score: 0.4479166666666667 as best_model_lr_6.0000000000e-05.pt
Training epoch: 3/30
Epoch 3/30, Training Loss: 0.009205982011432448
Running validation...
Validation Loss: 0.007575953844934702
Validation F1 Score (Entity Level): 0.5978260869565218
Best model saved at epoch 3 with F1 score: 0.5978260869565218 as best_model_lr_6.0000000000e-05.pt
Training epoch: 4/30
Epoch 4/30, Training Loss: 0.004529690893832594
Running validation...
Validation Loss: 0.006565174437128007
Validation F1 Score (Entity Level): 0.5604395604395604
Training epoch: 5/30
Epoch 5/30, Training Loss: 0.0030462495051324368
Running validati

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_epoch,▁
best_validation_f1,▁▁
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
learning_rate,▁
train_loss,█▂▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
validation_f1,▁▄▅▅▆▅▃▄▄▃▆██████████▇▇███████
validation_loss,█▂▁▁▁▁▄▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_epoch,15.0
best_validation_f1,0.96855
epoch,30.0
learning_rate,6e-05
train_loss,7e-05
validation_f1,0.9441
validation_loss,0.00321


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


In [None]:
## THEN USE BEST MODEL 

In [17]:
def print_classification_report(f1_scores):
    print("Classification Report (Entity Level):")
    print("{:>15} {:>10} {:>10} {:>10} {:>10}".format("Entity", "Precision", "Recall", "F1-Score", "Support"))
    for entity, scores in f1_scores.items():
        print("{:>15} {:>10.4f} {:>10.4f} {:>10.4f} {:>10}".format(
            entity,
            scores["precision"],
            scores["recall"],
            scores["f1-score"],
            scores["support"]
        ))

def save_classification_report(f1_scores, filename="classification_report.csv"):
    with open(filename, 'w', newline='') as csvfile:
        fieldnames = ['Entity', 'Precision', 'Recall', 'F1-Score', 'Support']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entity, scores in f1_scores.items():
            writer.writerow({
                'Entity': entity,
                'Precision': f"{scores['precision']:.4f}",
                'Recall': f"{scores['recall']:.4f}",
                'F1-Score': f"{scores['f1-score']:.4f}",
                'Support': scores['support']
            })
    print(f"Classification report saved to {filename}")


In [18]:
# Fetch the best run from W&B
api = wandb.Api()
sweep_runs = api.runs(path="matthewleechen/gbpatentdata_grid_xlm_title")  # Replace with your W&B username/project

# Find the run with the highest best_validation_f1
best_run = None
best_f1 = -1

for run in sweep_runs:
    if run.state == 'finished':
        val_f1 = run.summary.get('best_validation_f1')
        if val_f1 is not None and val_f1 > best_f1:
            best_f1 = val_f1
            best_run = run

if best_run is not None:
    best_lr = best_run.config['learning_rate']
    print(f"Best learning rate from grid search: {best_lr}")
else:
    print("No completed runs found.")
    best_lr = default_parameters['LEARNING_RATE']

# Load the best model
lr_str = f"6.5000000000e-05"
best_model_path = f'best_model_lr_{lr_str}.pt'
print(f"Loading best model from {best_model_path}")
model = prepare_model({'model_name': default_parameters['model_name']})
model.load_state_dict(torch.load(best_model_path))


# Evaluate on the test set
def predict(model, dataloader):
    model.eval()
    output = []

    with torch.no_grad():
        for idx, batch in enumerate(dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)  # Ground truth labels
            offsets = batch['offset_mapping']
            texts = batch['texts']

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            preds = torch.argmax(logits, dim=2)

            for i in range(input_ids.size(0)):
                # Move tensors to CPU
                input_ids_i = input_ids[i].cpu()
                pred_labels_i = preds[i].cpu()
                true_labels_i = labels[i].cpu()
                attention_mask_i = attention_mask[i].cpu()
                offset_mapping_i = offsets[i]
                text = texts[i]

                active_tokens = attention_mask_i == 1
                input_ids_i = input_ids_i[active_tokens].numpy()
                pred_labels_i = pred_labels_i[active_tokens].numpy()
                true_labels_i = true_labels_i[active_tokens].numpy()
                offset_mapping_i = offset_mapping_i[active_tokens].numpy()

                tokens = tokenizer.convert_ids_to_tokens(input_ids_i)
                pred_tags = [id2label[label_id] if label_id != -100 else 'O' for label_id in pred_labels_i]
                true_tags = [id2label[label_id] if label_id != -100 else 'O' for label_id in true_labels_i]

                pred_entities = reconstruct_entities(tokens, pred_tags, offset_mapping_i)
                true_entities = reconstruct_entities(tokens, true_tags, offset_mapping_i)

                # Ensure that 'ground_truth_entities' is included
                output.append({
                    "id": len(output),
                    "sentence": text,
                    "predicted_entities": pred_entities,
                    "ground_truth_entities": true_entities
                })

    with open("test_set_predictions.json", "w") as f:
        json.dump(output, f, indent=4)
    print("Predictions saved to test_set_predictions.json")

# Run prediction on test set
predict(model, testing_loader)

# Load predictions and compute metrics
with open("test_set_predictions.json", "r") as f:
    data = json.load(f)

true_entities_list = [item['ground_truth_entities'] for item in data]
pred_entities_list = [item['predicted_entities'] for item in data]

f1_scores = compute_f1_scores(true_entities_list, pred_entities_list)
print_classification_report(f1_scores)

# Optionally, save the classification report
save_classification_report(f1_scores, filename=f"classification_report_lr_{lr_str}.csv")

Best learning rate from grid search: 4.5e-05
Loading best model from best_model_lr_6.5000000000e-05.pt


Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 47.54 GiB total capacity; 1.22 GiB already allocated; 5.94 MiB free; 1.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [21]:
# Push model to hub
model_name = "matthewleechen/patent_entities_ner"

# Upload files to the hub
tokenizer.push_to_hub(model_name)
model.push_to_hub(model_name)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/gbpatentdata/patent_data_ner_xlmroberta_20241110/commit/95c4820e160bbaaaa5f9fe94e7a2a49a9e1f42ae', commit_message='Upload XLMRobertaForTokenClassification', commit_description='', oid='95c4820e160bbaaaa5f9fe94e7a2a49a9e1f42ae', pr_url=None, pr_revision=None, pr_num=None)