# Libraries

In [1]:
import sys
import os
import time
import csv
import ast
import importlib
import collections
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
from sklearn.metrics import precision_recall_fscore_support, classification_report

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.autograd as autograd
from torch.utils.data.distributed import DistributedSampler
from torch.utils import data

from transformers import AutoTokenizer, AutoModel
!pip install pytorch-pretrained-bert
from pytorch_pretrained_bert.optimization import BertAdam

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl.metadata (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.7/86.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting botocore<1.30.0,>=1.29.100 (from boto3->pytorch-pretrained-bert)
  Downloading botocore-1.29.165-py3-none-any.whl.metadata (5.9 kB)
Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.8/123.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.29.165-py3-none-any.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m85.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: botocore, pytorch-pretrained-bert
  Attempting uninstall: botocore
    Found existing installation: botocore 1.34.131
    Uninstalling botocore-1.34.131:
      Successfully uninstalled botocore-1.34.131


# All settings

In [2]:
# Files directory
data_dir = '/kaggle/input/l-ner-data/' # Directory containing the train, dev, and test set files.
output_dir = '/kaggle/working/'        # Directory where model checkpoints are written and read back.

load_checkpoint = True                 # If True, resume from an existing checkpoint in output_dir before training.
do_train = True                        # Set to True to run the training procedure.
use_local_trained_model = True         # True: evaluate/predict with the checkpoint saved in output_dir.
                                       # False: download the trained checkpoint from Hugging Face instead.
                                       # If do_train == False, this will be set to False.

# Model settings
language_model_name = 'lexlms/legal-longformer-base' # Hugging Face model id used for both tokenizer and encoder.
do_lower_case = False                  # Forwarded to AutoTokenizer.from_pretrained.
max_seq_length = 256                   # Max sub-word tokens per example (start/end markers included); longer inputs are truncated.

# Training settings
batch_size = 16                        # Batch size of the train/dev/test DataLoaders.
learning_rate0 = 5e-5                  # Base learning rate handed to the optimizer and the manual LR schedule.
lr0_crf_fc = 8e-5                      # Learning rate configured for the CRF transitions and the hidden2label layer.
weight_decay_finetune = 1e-5           # Weight decay for language-model parameters (except bias/LayerNorm).
weight_decay_crf_fc = 5e-6             # Weight decay for the CRF transitions and hidden2label weights.
total_train_epochs = 30
gradient_accumulation_steps = 1        # Number of batches whose gradients are accumulated per optimizer step.
warmup_proportion = 0.1                # Fraction of total training steps used for learning-rate warm-up.

# CUDA settings
cuda_yes = torch.cuda.is_available()
# cuda_yes = False  (uncomment to force CPU execution)
print('Cuda is available?', cuda_yes)
device = torch.device("cuda:0" if cuda_yes else "cpu")
print('Device:', device)


Cuda is available? True
Device: cuda:0


# Functions and classes for reading and organizing the dataset

In [3]:
class InputExample(object):
    """One NER sentence: its identifier, its words and one label per word."""

    def __init__(self, guid, words, labels):
        """Store the raw fields of a single example.

        Args:
          guid: Unique id of the example.
          words: List of the words of the sentence,
            e.g. [EU, rejects, German, call, to, boycott, British, lamb, .].
          labels: List holding the label of each word,
            e.g. [B-ORG, O, B-MISC, O, O, O, B-MISC, O, O].
        """
        self.guid, self.words, self.labels = guid, words, labels

In [4]:
class InputFeatures(object):
    """Numeric encoding of one example, as produced by example2feature.

    Attributes are stored exactly as given:
      input_ids, input_mask, segment_ids, predict_mask, label_ids.
    """

    def __init__(self, input_ids, input_mask, segment_ids,  predict_mask, label_ids):
        # Token-level inputs for the language model.
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
        # Which positions are scored, and their target label ids.
        self.predict_mask, self.label_ids = predict_mask, label_ids

In [5]:
class NERLensDataProcessor(object):
    """
    Processor class for preparing and handling NER data for the LegalLensNER dataset.

    It owns the label vocabulary (including the helper labels 'X', '[CLS]' and
    '[SEP]') and reads the train/dev CSV files and the unlabeled Excel test file
    into lists of InputExample.
    """

    def __init__(self):
        # Index 0 ('X') doubles as the padding / sub-word continuation label.
        self._label_types = [ 'X', '[CLS]', '[SEP]', 'O', 'B-LAW', 'I-LAW', 'B-VIOLATION', 'I-VIOLATION', 'B-VIOLATED BY', 'I-VIOLATED BY', 'B-VIOLATED ON', 'I-VIOLATED ON']
        self._num_labels = len(self._label_types)
        self._label_map = {label: i for i, label in enumerate(self._label_types)}

    def get_train_examples(self, data_dir):
        """Read the training CSV located in `data_dir`."""
        return self._create_examples(
            self._read_data(os.path.join(data_dir, "trainset_NER_LegalLens.csv")))

    def get_dev_examples(self, data_dir):
        """Read the development CSV located in `data_dir`."""
        return self._create_examples(
            self._read_data(os.path.join(data_dir, "devset_NER_LegalLens.csv")))

    def get_test_examples(self, data_dir):
        """Read the Excel test file (no gold labels) located in `data_dir`."""
        # The provided Excel test file without labels needs to be handled differently
        return self._create_examples(
            self._read_data(os.path.join(data_dir, "testset_NER_LegalLens.xlsx"), is_test_file=True))

    def get_labels(self):
        """Return the list of label names; the position is the label id."""
        return self._label_types

    def get_num_labels(self):
        """Return the number of labels (an int)."""
        # Fixed: previously returned the bound method itself instead of the count.
        return self._num_labels

    def get_label_map(self):
        """Return the dict mapping label name -> label id."""
        return self._label_map

    def get_start_label_id(self):
        """Return the id of the '[CLS]' label."""
        return self._label_map['[CLS]']

    def get_stop_label_id(self):
        """Return the id of the '[SEP]' label."""
        return self._label_map['[SEP]']

    @staticmethod
    def _parse_literal(x):
        """Turn a stringified Python literal (e.g. "['a', 'b']") into the object.

        Values that cannot be parsed (including values that are not strings)
        are returned unchanged.
        """
        try:
            return ast.literal_eval(x)
        except (ValueError, SyntaxError):
            return x

    def _read_data(self, file_path, is_test_file=False):
        """Load one dataset file into a list of record dicts.

        Args:
          file_path: Path of the CSV (train/dev) or Excel (test) file.
          is_test_file: When True the file is read with `pd.read_excel` and
            every record receives dummy 'X' labels, one per token.

        Returns:
          A list of dicts (one per row) whose 'tokens' and 'ner_tags' entries
          are Python lists.
        """
        if is_test_file: # Read the Excel test file
            read_df = pd.read_excel(file_path)
        else: # Read CSV train and dev sets files
            read_df = pd.read_csv(file_path)

        records = read_df.to_dict(orient='records')

        for record in records:
            record['tokens'] = self._parse_literal(record['tokens'])
            if is_test_file:
                # Dummy labels are built AFTER parsing so there is exactly one
                # label per token (not one per character of the raw string).
                record['ner_tags'] = ['X'] * len(record['tokens'])
            else:
                record['ner_tags'] = self._parse_literal(record['ner_tags'])
        return records

    def _create_examples(self, data):
        """Wrap every record dict ('id', 'tokens', 'ner_tags') in an InputExample."""
        return [
            InputExample(guid=item['id'], words=item['tokens'], labels=item['ner_tags'])
            for item in data
        ]

def example2feature(example, tokenizer, label_map, max_seq_length):
    """Encode one InputExample as an InputFeatures object.

    Each word is split into sub-words by `tokenizer.tokenize`. The first
    sub-word of a word carries the word's label and is marked for prediction;
    the remaining sub-words get the 'X' label and are masked out. The sequence
    is wrapped in '[CLS]' ... '[SEP]' and cut so that it never exceeds
    `max_seq_length` tokens.

    NOTE(review): the literal '[CLS]' / '[SEP]' / '[UNK]' strings are BERT-style
    markers; confirm that the configured tokenizer maps them to the intended ids.

    Args:
      example: Object exposing `guid`, `words` and `labels`.
      tokenizer: Object exposing `tokenize` and `convert_tokens_to_ids`.
      label_map: Dict mapping label name -> label id.
      max_seq_length: Maximum number of tokens, including both markers.

    Returns:
      InputFeatures with input_ids, input_mask, segment_ids, predict_mask and
      label_ids, all plain Python lists of equal length.
    """
    add_label = 'X'
    tokens, predict_mask, label_ids = ['[CLS]'], [0], [label_map['[CLS]']]

    for word_idx, word in enumerate(example.words):
        # use Tokenizer to split words
        # 1996-08-22 => 1996 - 08 - 22
        # sheepmeat => sheep ##me ##at
        pieces = tokenizer.tokenize(word)
        if not pieces:
            pieces = ['[UNK]']
        tokens.extend(pieces)

        n_continuations = len(pieces) - 1
        # Head sub-word: predicted, labelled with the word's own tag.
        predict_mask.append(1)
        label_ids.append(label_map[example.labels[word_idx]])
        # '##xxx' -> 'X': continuation sub-words are never predicted.
        predict_mask.extend([0] * n_continuations)
        label_ids.extend(label_map[add_label] for _ in range(n_continuations))

    # truncate, leaving room for the closing '[SEP]'
    limit = max_seq_length - 1
    if len(tokens) > limit:
        print('Example No.{} is too long, length is {}, truncated to {}!'.format(example.guid, len(tokens), max_seq_length))
        tokens, predict_mask, label_ids = tokens[:limit], predict_mask[:limit], label_ids[:limit]

    tokens.append('[SEP]')
    predict_mask.append(0)
    label_ids.append(label_map['[SEP]'])

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    n_tokens = len(input_ids)

    return InputFeatures(
        input_ids=input_ids,
        input_mask=[1] * n_tokens,
        segment_ids=[0] * n_tokens,
        predict_mask=predict_mask,
        label_ids=label_ids)

In [6]:
class NerDataset(data.Dataset):
    """
    Custom Dataset class for NER task, which converts examples into features that can be used by a LM.

    Examples are encoded lazily, one at a time, in `__getitem__`; batches are
    assembled and padded by the `pad` collate function.
    """
    def __init__(self, examples, tokenizer, label_map, max_seq_length):
        """
        Args:
          examples: Sequence of InputExample objects.
          tokenizer: Tokenizer forwarded to example2feature.
          label_map: Dict mapping label name -> label id.
          max_seq_length: Maximum encoded length of one example.
        """
        self.examples = examples
        self.tokenizer = tokenizer
        self.label_map = label_map
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        """Encode example `idx`; returns five equally long Python lists."""
        # Fixed: use the instance's own limit, not the notebook-level global of
        # the same name (which made the constructor argument a no-op).
        feat = example2feature(self.examples[idx], self.tokenizer, self.label_map, self.max_seq_length)
        return feat.input_ids, feat.input_mask, feat.segment_ids, feat.predict_mask, feat.label_ids

    @classmethod
    def pad(cls, batch):
        """Collate function: right-pad every field with 0 to the longest sample.

        Args:
          batch: List of 5-tuples as returned by `__getitem__`.

        Returns:
          Tuple of tensors (input_ids, input_mask, segment_ids, predict_mask,
          label_ids), each of shape (len(batch), longest sample length);
          predict_mask is boolean, the others are int64.
        """
        maxlen = max(len(sample[0]) for sample in batch)

        def padded(field):
            # 0 is the padding value for every field (label id 0 is 'X').
            return [sample[field] + [0] * (maxlen - len(sample[field])) for sample in batch]

        input_ids_list = torch.tensor(padded(0), dtype=torch.long)
        input_mask_list = torch.tensor(padded(1), dtype=torch.long)
        segment_ids_list = torch.tensor(padded(2), dtype=torch.long)
        predict_mask_list = torch.tensor(padded(3), dtype=torch.bool)
        label_ids_list = torch.tensor(padded(4), dtype=torch.long)

        return input_ids_list, input_mask_list, segment_ids_list, predict_mask_list, label_ids_list

# Prepare datasets

In [7]:
# Seed NumPy and PyTorch (CPU and, when available, every CUDA device) for reproducibility.
np.random.seed(44)
torch.manual_seed(44)
if cuda_yes:
    torch.cuda.manual_seed_all(44)

# Build the label vocabulary and read the train / dev / test examples from data_dir.
nerLensProcessor = NERLensDataProcessor()
label_list = nerLensProcessor.get_labels()
label_map = nerLensProcessor.get_label_map()
train_examples = nerLensProcessor.get_train_examples(data_dir)
dev_examples = nerLensProcessor.get_dev_examples(data_dir)
test_examples = nerLensProcessor.get_test_examples(data_dir)

# Planned number of optimizer steps over the whole training run.
# NOTE(review): int() rounds down here; if len(train_examples) is not a multiple of
# batch_size the DataLoader presumably yields one extra (partial) batch per epoch, so the
# real step count would exceed this value — confirm against the schedule in the training cell.
total_train_steps = int(len(train_examples) / batch_size / gradient_accumulation_steps * total_train_epochs)
print("***** Running training *****")
print("  Num examples = %d"% len(train_examples))
print("  Batch size = %d"% batch_size)
print("  Num steps = %d"% total_train_steps)

# Load pre-trained model tokenizer (vocabulary)
tokenizer = AutoTokenizer.from_pretrained(language_model_name, do_lower_case=do_lower_case)

# Datasets encode examples lazily; NerDataset.pad collates and pads each batch.
train_dataset = NerDataset(train_examples,tokenizer,label_map,max_seq_length)
dev_dataset = NerDataset(dev_examples,tokenizer,label_map,max_seq_length)
test_dataset = NerDataset(test_examples,tokenizer,label_map,max_seq_length)

# Only the training loader shuffles; dev/test keep file order so predictions stay aligned with examples.
train_dataloader = data.DataLoader(dataset=train_dataset,
                                batch_size=batch_size,
                                shuffle=True,
                                num_workers=4,
                                collate_fn=NerDataset.pad)

dev_dataloader = data.DataLoader(dataset=dev_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=NerDataset.pad)

test_dataloader = data.DataLoader(dataset=test_dataset,
                                batch_size=batch_size,
                                shuffle=False,
                                num_workers=4,
                                collate_fn=NerDataset.pad)

***** Running training *****
  Num examples = 976
  Batch size = 16
  Num steps = 1830


tokenizer_config.json:   0%|          | 0.00/377 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.16M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

# Model Definition

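- **Use Language Model + CRF:**
  - **CRF (Conditional Random Field):** Learns the label-to-label transition scores; the whole model is trained by maximum likelihood (minimizing the negative log-likelihood of the gold label sequence).
  - **Language Model:** Encodes each token in context; a linear layer on top of its hidden states produces the per-token emission scores over the labels.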


In [8]:
def log_sum_exp_1vec(vec):  # shape(1,m)
    """Numerically stable log(sum(exp(vec))) over all entries of `vec`.

    The maximum is taken with torch (instead of NumPy's argmax on a torch
    tensor), so the function also works for CUDA tensors and tensors that
    require grad.

    Args:
      vec: Tensor of shape (1, m).

    Returns:
      0-dim tensor holding the log-sum-exp of the entries.
    """
    max_score = vec.max()
    # Subtracting the maximum keeps exp() from overflowing.
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score)))

def log_sum_exp_mat(log_M, axis=-1):  # shape(n,m)
    """Numerically stable log-sum-exp of a matrix along `axis`.

    The maximum is computed once with keepdim=True so that it broadcasts
    correctly for any `axis` (the previous `[:, None]` reshaping was only
    valid for the last axis).

    Args:
      log_M: Tensor of shape (n, m).
      axis: Dimension to reduce; defaults to the last one.

    Returns:
      Tensor with `axis` removed.
    """
    max_vals = torch.max(log_M, axis, keepdim=True)[0]
    return max_vals.squeeze(axis) + torch.log(torch.exp(log_M - max_vals).sum(axis))

def log_sum_exp_batch(log_Tensor, axis=-1): # shape (batch_size,n,m)
    """Numerically stable log-sum-exp of a batched tensor along `axis`.

    The maximum is computed once with keepdim=True so that it broadcasts
    correctly for any `axis` (the previous `.view(batch, -1, 1)` reshaping was
    only valid for the last axis). For axis=-1 the result is unchanged.

    Args:
      log_Tensor: Tensor of shape (batch_size, n, m).
      axis: Dimension to reduce; defaults to the last one.

    Returns:
      Tensor with `axis` removed, e.g. (batch_size, n) for axis=-1.
    """
    max_vals = torch.max(log_Tensor, axis, keepdim=True)[0]
    return max_vals.squeeze(axis) + torch.log(torch.exp(log_Tensor - max_vals).sum(axis))


class LM_CRF_NER(nn.Module):
    """Sequence tagger: pretrained language model + linear emission layer + CRF.

    The language model encodes the tokens, `hidden2label` turns every hidden
    state into per-label emission scores, and `transitions` holds the CRF
    label-to-label scores. `neg_log_likelihood` is the training loss;
    `forward` performs Viterbi decoding for prediction.

    NOTE(review): the CRF recursions run over every position of the (padded)
    batch; padded positions are not masked out. Padding uses label id 0.
    """

    def __init__(self, language_model, start_label_id, stop_label_id, num_labels, max_seq_length, batch_size, device):
        """
        Args:
          language_model: Encoder module whose output exposes `last_hidden_state`.
          start_label_id: Id of the label sitting at position 0 of every sequence.
          stop_label_id: Id of the label closing every sequence.
          num_labels: Size of the label vocabulary.
          max_seq_length: Stored for reference; lengths are taken from the inputs.
          batch_size: Stored for reference; batch sizes are taken from the inputs.
          device: Stored as `self.device`; work tensors are created on the
            device of the emission scores they are combined with.
        """
        super(LM_CRF_NER, self).__init__()
        # Use the encoder's configured hidden width when it is exposed; fall
        # back to 768 (the previously hard-coded value) otherwise.
        self.hidden_size = getattr(getattr(language_model, 'config', None), 'hidden_size', 768)
        self.start_label_id = start_label_id
        self.stop_label_id = stop_label_id
        self.num_labels = num_labels
        self.max_seq_length = max_seq_length
        self.batch_size = batch_size
        self.device = device

        # use pretrained LM
        self.language_model = language_model
        self.dropout = torch.nn.Dropout(0.2)
        # Maps the output of the LM into label space.
        self.hidden2label = nn.Linear(self.hidden_size, self.num_labels)

        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.num_labels, self.num_labels))

        # These two statements enforce the constraint that we never transfer *to* the start tag(or label),
        # and we never transfer *from* the stop label (the model would probably learn this anyway,
        # so this enforcement is likely unimportant)
        self.transitions.data[start_label_id, :] = -10000
        self.transitions.data[:, stop_label_id] = -10000

        nn.init.xavier_uniform_(self.hidden2label.weight)
        nn.init.constant_(self.hidden2label.bias, 0.0)

    def _forward_alg(self, feats):
        '''
        Forward (alpha) recursion: log of the summed scores of all label paths.

        feats: emission scores, shape (batch, T, num_labels).
        Returns a tensor of shape (batch, 1).
        '''
        T = feats.shape[1]
        batch_size = feats.shape[0]

        # In log space: the start label holds all the mass (log 1 = 0), every
        # other label is effectively impossible.
        log_alpha = feats.new_full((batch_size, 1, self.num_labels), -10000.)
        log_alpha[:, 0, self.start_label_id] = 0

        # Position 0 is the start marker, so the recursion begins at t = 1.
        for t in range(1, T):
            log_alpha = (log_sum_exp_batch(self.transitions + log_alpha, axis=-1) + feats[:, t]).unsqueeze(1)

        # log-sum over the final label gives the log partition value.
        log_prob_all_barX = log_sum_exp_batch(log_alpha)
        return log_prob_all_barX

    def _get_lm_features(self, input_ids, segment_ids, input_mask):
        '''
        sentences -> word embeddings -> LM -> feats

        Returns emission scores of shape (batch, T, num_labels).
        '''
        lm_seq_out = self.language_model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, output_hidden_states=False).last_hidden_state
        lm_seq_out = self.dropout(lm_seq_out)
        lm_feats = self.hidden2label(lm_seq_out)
        return lm_feats

    def _score_sentence(self, feats, label_ids):
        '''
        Gives the score of a provided label sequence
        p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...

        feats: emission scores, shape (batch, T, num_labels).
        label_ids: gold label ids, shape (batch, T).
        Returns a tensor of shape (batch, 1).
        '''
        T = feats.shape[1]

        # Created on the same device as `feats` (previously relied on a
        # notebook-level global named `device`).
        score = feats.new_zeros((feats.shape[0], 1))
        # the 0th node is start_label->start_word,the probability of them=1. so t begin with 1.
        for t in range(1, T):
            current, previous = label_ids[:, t], label_ids[:, t - 1]
            # transitions[i, j] scores moving *to* i *from* j.
            transition_score = self.transitions[current, previous].unsqueeze(1)
            emission_score = feats[:, t].gather(-1, current.view(-1, 1))
            score = score + transition_score + emission_score
        return score

    def _viterbi_decode(self, feats):
        '''
        Max-Product Algorithm or viterbi algorithm, argmax(p(z_0:t|x_0:t))

        feats: emission scores, shape (batch, T, num_labels).
        Returns (best path score of shape (batch,), best path of shape (batch, T)).
        '''
        T = feats.shape[1]
        batch_size = feats.shape[0]

        log_delta = feats.new_full((batch_size, 1, self.num_labels), -10000.)
        log_delta[:, 0, self.start_label_id] = 0

        # psi[:, t, k] is the best previous label when position t carries label k
        # (psi[:, 0] stays 0 and is never read).
        psi = torch.zeros((batch_size, T, self.num_labels), dtype=torch.long, device=feats.device)
        for t in range(1, T):
            # delta[t][k]=max_z1:t-1( p(x1,x2,...,xt,z1,z2,...,zt-1,zt=k|theta) )
            best_previous = torch.max(self.transitions + log_delta, -1)
            psi[:, t] = best_previous.indices
            log_delta = (best_previous.values + feats[:, t]).unsqueeze(1)

        # trace back
        path = torch.zeros((batch_size, T), dtype=torch.long, device=feats.device)

        # max p(z1:t,all_x|theta)
        # squeeze(1) removes only the singleton step dimension, so a batch of
        # size 1 keeps its batch dimension (a bare squeeze() dropped it too).
        max_logLL_allz_allx, path[:, -1] = torch.max(log_delta.squeeze(1), -1)

        for t in range(T-2, -1, -1):
            # choose the state of z_t according the state choosed of z_t+1.
            path[:, t] = psi[:, t+1].gather(-1, path[:, t+1].view(-1, 1)).squeeze(1)

        return max_logLL_allz_allx, path

    def neg_log_likelihood(self, input_ids, segment_ids, input_mask, label_ids):
        '''
        Training loss: batch mean of (log partition value - gold path score).
        '''
        lm_feats = self._get_lm_features(input_ids, segment_ids, input_mask)
        forward_score = self._forward_alg(lm_feats)
        # p(X=w1:t,Zt=tag1:t)=...p(Zt=tag_t|Zt-1=tag_t-1)p(xt|Zt=tag_t)...
        gold_score = self._score_sentence(lm_feats, label_ids)
        # - log[ p(X=w1:t,Zt=tag1:t)/p(X=w1:t) ] = - log[ p(Zt=tag1:t|X=w1:t) ]
        return torch.mean(forward_score - gold_score)

    # this forward is just for predict, not for train
    # dont confuse this with _forward_alg above.
    def forward(self, input_ids, segment_ids, input_mask):
        '''
        Predict the best label sequence for every input sequence.

        Returns (score of shape (batch,), label ids of shape (batch, T)).
        '''
        # Get the emission scores from the LM
        lm_feats = self._get_lm_features(input_ids, segment_ids, input_mask)

        # Find the best path, given the features.
        score, label_seq_ids = self._viterbi_decode(lm_feats)
        return score, label_seq_ids

In [9]:
# Initialize the custom model
start_label_id = nerLensProcessor.get_start_label_id()
stop_label_id = nerLensProcessor.get_stop_label_id()

language_model = AutoModel.from_pretrained(language_model_name)
model = LM_CRF_NER(language_model, start_label_id, stop_label_id, len(label_list), max_seq_length, batch_size, device)

#%%
# Optionally resume from a checkpoint previously written to output_dir.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
if load_checkpoint and os.path.exists(output_dir+'/ner_lm_crf_checkpoint.pt'):
    checkpoint = torch.load(output_dir+'/ner_lm_crf_checkpoint.pt', map_location='cpu')
    # Training continues from the epoch after the one stored in the checkpoint.
    start_epoch = checkpoint['epoch']+1
    valid_acc_prev = checkpoint['valid_acc']
    valid_f1_prev = checkpoint['valid_f1']
    pretrained_dict=checkpoint['model_state']
    # Copy over only the entries whose names exist in the current model; anything else
    # in the checkpoint is ignored and parameters missing from it keep their fresh values.
    net_state_dict = model.state_dict()
    pretrained_dict_selected = {k: v for k, v in pretrained_dict.items() if k in net_state_dict}
    net_state_dict.update(pretrained_dict_selected)
    model.load_state_dict(net_state_dict)
    print('Loaded the pretrain NER_LM_CRF model, epoch:',checkpoint['epoch'],'valid acc:',
            checkpoint['valid_acc'], 'valid f1:', checkpoint['valid_f1'])
else:
    # Fresh run: start at epoch 0 with no previous best score.
    start_epoch = 0
    valid_acc_prev = 0
    valid_f1_prev = 0

model.to(device)

# Prepare optimizer
param_optimizer = list(model.named_parameters())

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# Newly initialised task head (CRF transitions + emission layer), handled in its own groups.
new_param = ['transitions', 'hidden2label.weight', 'hidden2label.bias']
# Four groups: (1) LM weights with decay, (2) LM bias/LayerNorm without decay,
# (3) transitions + hidden2label.weight with their own lr and decay,
# (4) hidden2label.bias with its own lr and no decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': weight_decay_finetune},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) \
        and not any(nd in n for nd in new_param)], 'weight_decay': 0.0},
    {'params': [p for n, p in param_optimizer if n in ('transitions','hidden2label.weight')] \
        , 'lr':lr0_crf_fc, 'weight_decay': weight_decay_crf_fc},
    {'params': [p for n, p in param_optimizer if n == 'hidden2label.bias'] \
        , 'lr':lr0_crf_fc, 'weight_decay': 0.0}
]
# The optimizer is always created fresh; no optimizer state is read from the checkpoint above.
optimizer = BertAdam(optimizer_grouped_parameters, lr=learning_rate0, warmup=warmup_proportion, t_total=total_train_steps)

def warmup_linear(x, warmup=0.002):
    """Learning-rate multiplier: linear ramp-up followed by linear decay.

    Args:
      x: Training progress, i.e. current step divided by total steps.
      warmup: Progress value at which the ramp-up ends.

    Returns:
      x / warmup while x is below `warmup`, otherwise 1.0 - x.
    """
    return x / warmup if x < warmup else 1.0 - x

# Map label ids back to their tag names
def revert_tags(mapped_list, label_list, label_map):
    """Translate a list of label ids into the corresponding tag strings.

    Args:
      mapped_list: Iterable of label ids.
      label_list: Not used; kept for call compatibility.
      label_map: Dict mapping tag name -> label id.

    Returns:
      List of tag names, in the order of `mapped_list`.
    """
    # Invert name -> id into id -> name.
    id_to_tag = dict(zip(label_map.values(), label_map.keys()))
    return list(map(id_to_tag.__getitem__, mapped_list))

# Replace every 'X' tag (sub-word marker) by a tag derived from the tag before it
def process_X_tags(ner_tags):
    """Resolve 'X' placeholders using the preceding (already resolved) tag.

    An 'X' following a "B-..." tag becomes the matching "I-..." tag; after any
    other tag it simply repeats that tag. An 'X' with nothing usable before it
    is left as "X".

    Args:
      ner_tags: Iterable of tag strings.

    Returns:
      New list of tags of the same length.
    """
    resolved = []
    for tag in ner_tags:
        if tag != "X":
            # Regular tags pass through untouched.
            resolved.append(tag)
            continue

        previous = resolved[-1] if resolved else None
        if not previous:
            # Nothing to inherit from (should not normally happen): keep "X".
            resolved.append("X")
        elif previous.startswith("B-"):
            # Continue the entity that the previous tag opened.
            resolved.append("I-" + previous[2:])
        else:
            resolved.append(previous)

    return resolved

# Evaluate the model on a labelled dataset
def evaluate(model, predict_dataloader, batch_size, epoch_th, dataset_name, print_classification_report=False):
    """Decode every batch and report accuracy and macro precision/recall/F1.

    Only positions selected by `predict_mask` are compared. Accuracy is
    computed on the raw label ids; predicted 'X' tags are resolved with
    process_X_tags before the precision/recall/F1 computation.

    Args:
      model: Model returning (score, label id sequences) when called.
      predict_dataloader: Iterable of batches
        (input_ids, input_mask, segment_ids, predict_mask, label_ids).
      batch_size: Not used; kept for call compatibility.
      epoch_th: Epoch number shown in the printed summary.
      dataset_name: Dataset name shown in the printed summary.
      print_classification_report: Also print the per-label report when True.

    Returns:
      Tuple (accuracy, macro F1).
    """
    model.eval()
    predicted_ids, gold_ids = [], []
    n_total = 0
    n_correct = 0
    started_at = time.time()

    with torch.no_grad():
        for batch in predict_dataloader:
            input_ids, input_mask, segment_ids, predict_mask, label_ids = (t.to(device) for t in batch)
            _, decoded_ids = model(input_ids, segment_ids, input_mask)

            # Keep only the positions that are actually scored.
            kept_predicted = decoded_ids.masked_select(predict_mask)
            kept_gold = label_ids.masked_select(predict_mask)

            predicted_ids += kept_predicted.tolist()
            gold_ids += kept_gold.tolist()

            n_total += len(kept_gold)
            n_correct += (kept_predicted == kept_gold).sum().item()

    test_acc = n_correct / n_total

    all_preds = process_X_tags(revert_tags(predicted_ids, label_list, label_map))
    all_labels = revert_tags(gold_ids, label_list, label_map)

    if print_classification_report:
        print(classification_report(all_labels, all_preds))

    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='macro')

    elapsed_minutes = (time.time() - started_at) / 60.0
    print('Epoch:%d, Macro Acc:%.2f, Macro Precision: %.2f, Macro Recall: %.2f, Macro F1: %.2f on %s, Spend:%.3f minutes for evaluation' \
        % (epoch_th, 100.*test_acc, 100.*precision, 100.*recall, 100.*f1, dataset_name, elapsed_minutes))
    print('--------------------------------------------------------------')
    return test_acc, f1

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/594M [00:00<?, ?B/s]

Some weights of LongformerModel were not initialized from the model checkpoint at lexlms/legal-longformer-base and are newly initialized: ['longformer.pooler.dense.bias', 'longformer.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train procedure

In [10]:
# Fine-tune the model; after every epoch evaluate on the dev set and keep the best checkpoint.
if do_train:
    
    # Optimizer steps already taken in earlier epochs (non-zero only when resuming).
    global_step_th = int(len(train_examples) / batch_size / gradient_accumulation_steps * start_epoch)

    for epoch in range(start_epoch, total_train_epochs):
        tr_loss = 0
        train_start = time.time()
        model.train()
        # Also discards gradients left over from an incomplete accumulation window of the previous epoch.
        optimizer.zero_grad()
        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, predict_mask, label_ids = batch

            neg_log_likelihood = model.neg_log_likelihood(input_ids, segment_ids, input_mask, label_ids)

            # Scale the loss so that accumulated gradients average over the window.
            if gradient_accumulation_steps > 1:
                neg_log_likelihood = neg_log_likelihood / gradient_accumulation_steps

            neg_log_likelihood.backward()

            tr_loss += neg_log_likelihood.item()

            if (step + 1) % gradient_accumulation_steps == 0:
                # modify learning rate with warm up 
                # NOTE(review): this writes the same value into EVERY param group, so the separate
                # 'lr' (lr0_crf_fc) configured for the CRF / hidden2label groups is overwritten.
                # NOTE(review): the optimizer was built with warmup= and t_total=, so it presumably
                # applies its own warm-up schedule as well — confirm whether the schedule is applied twice.
                lr_this_step = learning_rate0 * warmup_linear(global_step_th/total_train_steps, warmup_proportion)
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr_this_step

                optimizer.step()
                optimizer.zero_grad()
                global_step_th += 1

    #         print("Epoch:{}-{}/{}, Negative loglikelihood: {} ".format(epoch, step, len(train_dataloader), neg_log_likelihood.item()))

        print('--------------------------------------------------------------')
        # tr_loss is the sum of the per-batch losses of this epoch (not an average).
        print("Epoch:{} completed, Total training's Loss: {}, Spend: {}m".format(epoch, tr_loss, (time.time() - train_start)/60.0))
        valid_acc, valid_f1 = evaluate(model, dev_dataloader, batch_size, epoch, 'Valid_set')

        # Save a checkpoint
        # Written only when the dev macro F1 improves; it stores model weights and metadata,
        # not the optimizer state.
        if valid_f1 > valid_f1_prev:
            torch.save({'epoch': epoch, 'model_state': model.state_dict(), 'valid_acc': valid_acc,
                'valid_f1': valid_f1, 'max_seq_length': max_seq_length, 'lower_case': do_lower_case},
                        os.path.join(output_dir, 'ner_lm_crf_checkpoint.pt'))
            valid_f1_prev = valid_f1


Input ids are automatically padded to be a multiple of `config.attention_window`: 512
	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at /usr/local/src/pytorch/torch/csrc/utils/python_arg_parser.cpp:1519.)
  next_m.mul_(beta1).add_(1 - beta1, grad)


--------------------------------------------------------------
Epoch:0 completed, Total training's Loss: 579078.123046875, Spend: 1.8228780627250671m


  _warn_prf(average, modifier, msg_start, len(result))


Epoch:0, Macro Acc:86.55, Macro Precision: 23.72, Macro Recall: 18.43, Macro F1: 18.37 on Valid_set, Spend:0.160 minutes for evaluation
--------------------------------------------------------------
--------------------------------------------------------------
Epoch:1 completed, Total training's Loss: 571984.94140625, Spend: 1.804467511177063m
Epoch:1, Macro Acc:93.87, Macro Precision: 75.95, Macro Recall: 76.59, Macro F1: 75.90 on Valid_set, Spend:0.160 minutes for evaluation
--------------------------------------------------------------
--------------------------------------------------------------
Epoch:2 completed, Total training's Loss: 567630.7543945312, Spend: 1.8063796997070312m
Epoch:2, Macro Acc:95.48, Macro Precision: 83.61, Macro Recall: 86.19, Macro F1: 84.37 on Valid_set, Spend:0.159 minutes for evaluation
--------------------------------------------------------------
--------------------------------------------------------------
Epoch:3 completed, Total training's Loss:

# Load the trained model 

In [11]:
from huggingface_hub import hf_hub_download

# If no training occurs, there would be no local trained model
if not do_train:
    use_local_trained_model = False

if use_local_trained_model: # Reload the local model that has just been trained
    checkpoint = torch.load(os.path.join(output_dir, 'ner_lm_crf_checkpoint.pt'), map_location='cpu')
    print("From the local training procedure:")
else: # Load the trained model from Hugging Face
    repo_id = "lxbach10012004/ner-lm-crf"
    filename = "ner_lm_crf_checkpoint.pt"
    # SECURITY: access tokens must never be hard-coded in a notebook. The token that used to
    # be committed here should be revoked. If the repository needs authentication, export
    # HF_TOKEN in the environment (or use a secrets manager); None means anonymous access.
    hf_token = os.environ.get("HF_TOKEN")

    # Download the file
    checkpoint_path = hf_hub_download(repo_id=repo_id, filename=filename, token=hf_token)
    # NOTE(review): torch.load unpickles the downloaded file — only load checkpoints you trust.
    checkpoint = torch.load(checkpoint_path, map_location='cpu')
    print("From HuggingFace:")

epoch = checkpoint['epoch']
valid_acc_prev = checkpoint['valid_acc']
valid_f1_prev = checkpoint['valid_f1']
pretrained_dict = checkpoint['model_state']
# Copy over only the checkpoint entries whose names exist in the current model.
net_state_dict = model.state_dict()
pretrained_dict_selected = {k: v for k, v in pretrained_dict.items() if k in net_state_dict}
net_state_dict.update(pretrained_dict_selected)
model.load_state_dict(net_state_dict)
print('Loaded the pretrained  NER_LM_CRF  model, epoch:',checkpoint['epoch'],'valid acc:',
      checkpoint['valid_acc'], 'valid f1:', checkpoint['valid_f1'])

model.to(device)
model.eval()

print("Previous result of the dev set with the best epoch:")
evaluate(model, dev_dataloader, batch_size, epoch, 'Valid_set', print_classification_report=True)

From the local training procedure:
Loaded the pretrained  NER_LM_CRF  model, epoch: 18 valid acc: 0.9692135460397425 valid f1: 0.9237636777976719
Previous result of the dev set with the best epoch:
               precision    recall  f1-score   support

        B-LAW       0.95      0.95      0.95        82
B-VIOLATED BY       0.97      0.94      0.96        82
B-VIOLATED ON       0.81      0.76      0.78        82
  B-VIOLATION       0.89      0.92      0.90       351
        I-LAW       0.99      0.99      0.99       289
I-VIOLATED BY       0.95      0.92      0.93       172
I-VIOLATED ON       0.89      0.90      0.89       179
  I-VIOLATION       0.90      0.95      0.92      4062
            O       0.99      0.98      0.98     19712

     accuracy                           0.97     25011
    macro avg       0.93      0.92      0.92     25011
 weighted avg       0.97      0.97      0.97     25011

Epoch:18, Macro Acc:96.92, Macro Precision: 92.64, Macro Recall: 92.17, Macro F1: 92

(0.9692135460397425, 0.9237636777976719)

# Test set Inference

In [12]:
# Test_set prediction using the best epoch of NER_LM_CRF model
predictions = []

model.eval()  # inference must run with dropout disabled, whatever cell ran before
with torch.no_grad():
    demon_dataloader = data.DataLoader(dataset=test_dataset, batch_size=10, shuffle=False, num_workers=4, collate_fn=NerDataset.pad)
    for batch in demon_dataloader:
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, predict_mask, label_ids = batch
        _, predicted_label_seq_ids = model(input_ids, segment_ids, input_mask)

        # Keep one prediction per word: the positions flagged in predict_mask
        # (first sub-word of every word).
        for sequence_ids, sequence_mask in zip(predicted_label_seq_ids, predict_mask):
            word_label_ids = sequence_ids[sequence_mask].tolist()
            predictions.append([label_list[label_id] for label_id in word_label_ids])

# The loader is not shuffled, so predictions line up with test_examples one-to-one.
assert len(predictions) == len(test_examples), "number of predictions does not match number of test examples"

# Post process the 'X' tags and add predicted tags to the test dataset records
for example, raw_tags in zip(test_examples, predictions):
    tags = process_X_tags(raw_tags)
    n_missing = len(example.words) - len(tags)
    if n_missing > 0:
        # Words cut off by the max_seq_length truncation received no prediction;
        # fill them with 'O' so that every token has exactly one tag.
        print('Example No.{}: {} truncated token(s) tagged as O'.format(example.guid, n_missing))
        tags = tags + ['O'] * n_missing
    example.predicted_tags = tags

# Save the results to a new CSV file
output_file = 'predictions_NERLens.csv'
with open(output_file, 'w', newline='', encoding='utf-8') as f:
    writer = csv.DictWriter(f, fieldnames=['id', 'tokens', 'ner_tags'])
    writer.writeheader()
    for example in test_examples:
        writer.writerow({
            'id': example.guid,
            'tokens': example.words,
            'ner_tags': example.predicted_tags,
        })

print(f'Predictions saved to {output_file}')

Predictions saved to predictions_NERLens.csv
