In [1]:
from datasets import load_dataset
import string
from transformers import (LukeTokenizer, LukeModel, LukeForEntityPairClassification, 
                          AutoModel, AutoTokenizer, LukeForEntitySpanClassification,
                          BertForTokenClassification, get_linear_schedule_with_warmup, AdamW, get_scheduler)
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter

import seaborn as sns
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score


from ray import tune, train

In [2]:


dataset_name = "Babelscape/multinerd"

# Load data & Remove non-english items

In [3]:



#Loads data, only keeps english, removes language column
dataset_eng = load_dataset(dataset_name).filter(lambda x: x["lang"] == "en").remove_columns("lang")

Found cached dataset json (C:/Users/NicHer/.cache/huggingface/datasets/Babelscape___json/Babelscape--multinerd-f822e910a4f604c0/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\NicHer\.cache\huggingface\datasets\Babelscape___json\Babelscape--multinerd-f822e910a4f604c0\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-eb9ab4cbc9b9233f.arrow
Loading cached processed dataset at C:\Users\NicHer\.cache\huggingface\datasets\Babelscape___json\Babelscape--multinerd-f822e910a4f604c0\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-eed54e3e6216fb0e.arrow
Loading cached processed dataset at C:\Users\NicHer\.cache\huggingface\datasets\Babelscape___json\Babelscape--multinerd-f822e910a4f604c0\0.0.0\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\cache-fe66576485de1b0f.arrow


# Data exploration

In [4]:

#Uneven class distribution
#nercounts = dict(Counter([item for sublist in dataset_eng["train"]["ner_tags"] for item in sublist]))
#nercounts

In [5]:
#most texts are short, can get away with low max token length
#plt.hist([len(x) for x in dataset_eng["train"]["tokens"]], bins=50)

## Itos & Stoi

In [6]:
#Mapping from integer labels to strings (from HF dataset repo) and vice versa

stoi = {
    "O": 0,
    "B-PER": 1,
    "I-PER": 2,
    "B-ORG": 3,
    "I-ORG": 4,
    "B-LOC": 5,
    "I-LOC": 6,
    "B-ANIM": 7,
    "I-ANIM": 8,
    "B-BIO": 9,
    "I-BIO": 10,
    "B-CEL": 11,
    "I-CEL": 12,
    "B-DIS": 13,
    "I-DIS": 14,
    "B-EVE": 15,
    "I-EVE": 16,
    "B-FOOD": 17,
    "I-FOOD": 18,
    "B-INST": 19,
    "I-INST": 20,
    "B-MEDIA": 21,
    "I-MEDIA": 22,
    "B-MYTH": 23,
    "I-MYTH": 24,
    "B-PLANT": 25,
    "I-PLANT": 26,
    "B-TIME": 27,
    "I-TIME": 28,
    "B-VEHI": 29,
    "I-VEHI": 30,
  }



itos = {value:key for key,value in stoi.items()}

# Preprocessing

In [7]:
#Basic bert model - cased since the text is cased and it probably helps label names like "Jessica Alba"

model_name = "bert-base-cased"

# Tokenizer is to be initialized here, used for processing
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [8]:


#Modularize this, dont hardocode a bunch of stuff


#Taken from HF, to align the ner labels with tokenized words
def align_labels_with_tokens(labels, word_ids, max_length):
    new_labels = []
    current_word = None
    
    for word_id in word_ids:
        if word_id != current_word:
            # New word
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]   #-100 to not take into account during loss function
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)                                #-100 for special tokens
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels[:max_length]    ##truncates labels match



def to_tensor_dataset(data, labels):
    # Effectively zips the data and labels
    
    inp_ids = data["input_ids"]
    atmask = data["attention_mask"]
    return TensorDataset(inp_ids, atmask, labels)



def label_fix(labels):
    # Input is a nested list of labels
    
    return [[x if x in [1,2,3,4,5,6,7,8,13,14] else 0 for x in sublist] for sublist in labels]
    



def prepare_data(dataset, splitname, all_labels, max_length=60):
    ## dataset, split, all_labels or subset => Returns torch tensors
    
    splitname = splitname
    sents = dataset[splitname]["tokens"]
    labels = dataset[splitname]["ner_tags"]
    
    # Change labels if all_labels = False
    # A bit slow, would be faster with tensor operations
    if not all_labels:
        labels = label_fix(labels) 
    
    assert len(sents) == len(labels)
    
    # Tokenize and align labels, currently pad everything to the same length
    tokenized_sents = tokenizer(sents, is_split_into_words=True, add_special_tokens=True, padding="max_length", truncation=True, max_length=max_length,  return_tensors="pt")
    
    # Stack to turn list of torch tensors into one tensor
    aligned_labels = torch.stack([torch.tensor(align_labels_with_tokens(labels[i], tokenized_sents[i].word_ids, max_length)) for i in range(len(labels))])
    
    dataset = to_tensor_dataset(tokenized_sents, aligned_labels)
    
    return dataset




all_labels = True

training_dataset = prepare_data(dataset_eng, "train", all_labels)
evaluation_dataset = prepare_data(dataset_eng, "validation", all_labels)
testing_dataset = prepare_data(dataset_eng, "test", all_labels)

In [9]:
len(training_dataset)

262560


for i in range(0,3):
    print(tokenizer.tokenize(" ".join(dataset_eng["train"]["tokens"][i])))
    print(dataset_eng["train"]["tokens"][i])
    print(dataset_eng["train"]["ner_tags"][i])
    print([itos[int(x)] for x in train_labels[i] if x != -100])
    print("-------------------------------------------")

In [10]:
# If not all labels, num labels = 11, 2*5 for B and I, + 1 for O
# Else its the length of the label dataset

if not all_labels:
    num_labels = 11
else:
    num_labels = len(itos)
    
num_labels

31

In [11]:

device = "cuda"
#batch_size = 256


#test_loader = DataLoader(testing_dataset, batch_size=batch_size, shuffle=False) #False for reproducibility for now

# Training

In [12]:

# The model probably doesnt need to be more complicated than this, the context is usually very short
model = BertForTokenClassification.from_pretrained("bert-base-cased", 
                                                   num_labels=num_labels, 
                                                   vocab_size=tokenizer.vocab_size, 
                                                   ignore_mismatched_sizes=True).to(device)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
### Hyperparameters ###

#epochs = 1
#lr = 3e-5,


#num_batches = train_data["input_ids"].shape[0] // batch_size
#total_steps = num_batches * epochs

#num_warmup_steps = total_steps // 10
max_grad_norm = 1.0


### Hyperparameters ###




# Keep track for plotting &  Check difference between training and eval loss to prevent overfitting
t_lossi, e_lossi, total_training_loss, total_eval_loss = [], [], [], []



In [14]:
train_loader = DataLoader(training_dataset, batch_size=32, shuffle=False)

for j,batch, in enumerate(train_loader):
    print(j)
    _ids, at, lab = batch
    print(_ids.shape)
    print(at.shape)
    print(lab.shape)
    break

0
torch.Size([32, 60])
torch.Size([32, 60])
torch.Size([32, 60])


In [None]:


def train_model(params, model, training_dataset, evaluation_dataset):

    #Reset loaders with batch size parameter
    train_loader = DataLoader(training_dataset, batch_size=params["batch_size"], shuffle=False) #False for reproducibility for now
    eval_loader = DataLoader(evaluation_dataset, batch_size=params["batch_size"], shuffle=False) #False for reproducibility for now
    
    
    optimizer = AdamW(params = model.parameters(), lr=params["lr"])
    
    total_steps = (len(training_dataset) // params["batch_size"]) * params["epochs"]
    warmup_steps = int(total_steps * 0.1) #standard 10th
    
    scheduler = get_scheduler("linear", 
                              optimizer=optimizer, 
                              num_warmup_steps= warmup_steps, ##set warmup steps to 0.1 * total num steps 
                              num_training_steps=total_steps)
    
    
    for i in range(1, params["epochs"] + 1):
        train_loss = 0
        model.train()
        for j,batch in enumerate(train_loader):
            _ids, at, lab = batch #_ids, attention_mask, labels = [b.to(model.device) for b in batch] - looks nicer
            
            out = model(input_ids=_ids.to(device), attention_mask=at.to(device), labels=lab.to(device)) #   logits = [256, 40, 31] B,T,C, loss is NLL
            train_loss += out.loss.item()
            
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm) #gradient clipping - safety net

            optimizer.zero_grad() # Zero gradients between each update
            out.loss.backward()   # Calculate gradients
            optimizer.step()      # Step

            if scheduler:  # Update learning rate
                scheduler.step()

            if j % 25 == 0 and j > 0:
                print("Average T loss at step {}: {}".format(j, train_loss / j ))
                train.report({"step": j, "epoch": i, "training_loss": train_loss / j, "loss": None}) ##loss needs to be 0 here otherwise error
                t_lossi.append(train_loss / j)

        model.eval()
        eval_loss = 0
        with torch.no_grad():
            for j,batch in enumerate(eval_loader):
                _ids, at, lab = batch
                out = model(input_ids=_ids.to(device), attention_mask=at.to(device), labels=lab.to(device))

                eval_loss += out.loss.item()

                if j % 25 == 0 and j > 0:
                    print("Average E loss at step {}: {}".format(j, eval_loss / j ))
                    train.report({"step": j, "epoch": i, "evaluation_loss": train_loss / j, "loss": None }) ##loss needs to be 0 here otherwise error
                    e_lossi.append(eval_loss / j )
        
        
        #report average evaluation loss to tune
        avg_eval_loss = eval_loss / len(eval_loader)
        train.report({"loss" : avg_eval_loss})

        print("epoch: ", i)
    
    
    

    
## Random search of hyperparameters, not optimal but its something
analysis = tune.run(
    tune.with_parameters(
        train_model,                           #name of function - normally trainer.train
        model=model,                          #model input
        training_dataset=training_dataset,    #this otherwise errors when running
        evaluation_dataset=evaluation_dataset  #this otherwise errors when running
    ),
    resources_per_trial={
        "gpu": 0.2  # only gpu, set to 0.2 for 5 simultaneous jobs
    },
    config={
        "lr": tune.choice([2e-5, 3e-5, 5e-5]),    #per original bert paper
        "batch_size": tune.choice([16, 32, 64]),    #batch sizes to test
        "epochs": tune.choice([1]),           #epoch choices
    }, 
    num_samples=10,                                   #total number of combinations
    metric="loss",                                 # The metric to optimize
    mode="min"    
)



2023-12-12 21:01:26,988	INFO tune.py:586 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2023-12-12 21:52:17
Running for:,00:50:50.75
Memory:,24.3/255.7 GiB

Trial name,status,loc,batch_size,epochs,lr,iter,total time (s),step,epoch,training_loss
train_model_3bbc1_00000,RUNNING,127.0.0.1:9260,64,1,5e-05,144.0,3025.31,3600.0,1.0,0.123966
train_model_3bbc1_00001,RUNNING,127.0.0.1:14048,16,1,3e-05,361.0,3036.93,9025.0,1.0,0.13892
train_model_3bbc1_00002,RUNNING,127.0.0.1:16260,64,1,5e-05,144.0,3025.09,3600.0,1.0,0.12388
train_model_3bbc1_00003,RUNNING,127.0.0.1:13576,64,1,3e-05,144.0,3025.25,3600.0,1.0,0.142874
train_model_3bbc1_00004,RUNNING,127.0.0.1:4436,64,1,5e-05,144.0,3025.41,3600.0,1.0,0.123451
train_model_3bbc1_00005,PENDING,,16,1,2e-05,,,,,
train_model_3bbc1_00006,PENDING,,64,1,5e-05,,,,,
train_model_3bbc1_00007,PENDING,,64,1,2e-05,,,,,
train_model_3bbc1_00008,PENDING,,16,1,3e-05,,,,,
train_model_3bbc1_00009,PENDING,,64,1,3e-05,,,,,




Trial name,epoch,loss,step,training_loss
train_model_3bbc1_00000,1,,3625,0.123434
train_model_3bbc1_00001,1,,9050,0.138649
train_model_3bbc1_00002,1,,3625,0.123358
train_model_3bbc1_00003,1,,3625,0.142264
train_model_3bbc1_00004,1,,3625,0.122919


[36m(train_model pid=14048)[0m Average T loss at step 25: 3.5628770446777343
[36m(train_model pid=14048)[0m Average T loss at step 50: 3.450376024246216
[36m(train_model pid=16260)[0m Average T loss at step 25: 3.4625025844573973
[36m(train_model pid=14048)[0m Average T loss at step 100: 3.1990654945373533[32m [repeated 5x across cluster][0m
[36m(train_model pid=16260)[0m Average T loss at step 50: 2.9885331797599792
[36m(train_model pid=14048)[0m Average T loss at step 125: 3.0087327051162718
[36m(train_model pid=14048)[0m Average T loss at step 150: 2.750977282524109[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 175: 2.5171081709861753
[36m(train_model pid=16260)[0m Average T loss at step 75: 2.345032615661621
[36m(train_model pid=14048)[0m Average T loss at step 200: 2.3262782740592955[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 225: 2.152107840379079
[36m(train_mod

[36m(train_model pid=16260)[0m Average T loss at step 750: 0.38390364866331217
[36m(train_model pid=14048)[0m Average T loss at step 1900: 0.4149329481933168[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 1925: 0.4104510240360804
[36m(train_model pid=16260)[0m Average T loss at step 775: 0.37412310683318684
[36m(train_model pid=14048)[0m Average T loss at step 1950: 0.406065712909047[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 1975: 0.4026529175097335
[36m(train_model pid=16260)[0m Average T loss at step 800: 0.364622739237966
[36m(train_model pid=14048)[0m Average T loss at step 2025: 0.3948349411141725[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 2050: 0.39016939612167956
[36m(train_model pid=16260)[0m Average T loss at step 825: 0.3556865559484471
[36m(train_model pid=14048)[0m Average T loss at step 2075: 0.386523603289163

[36m(train_model pid=14048)[0m Average T loss at step 3700: 0.24335722851044675[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 3725: 0.24210437796616163
[36m(train_model pid=16260)[0m Average T loss at step 1500: 0.22576009724661708
[36m(train_model pid=14048)[0m Average T loss at step 3775: 0.2397706780149077[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 3800: 0.23857288632005771
[36m(train_model pid=16260)[0m Average T loss at step 1525: 0.22332248643284938
[36m(train_model pid=14048)[0m Average T loss at step 3825: 0.23749897281033613[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 3850: 0.23643100856026247
[36m(train_model pid=16260)[0m Average T loss at step 1550: 0.22092994966574253
[36m(train_model pid=14048)[0m Average T loss at step 3900: 0.23443155304179528[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)

[36m(train_model pid=14048)[0m Average T loss at step 5525: 0.18336805814429402[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 5550: 0.1828138553466845
[36m(train_model pid=16260)[0m Average T loss at step 2225: 0.17561509629774294
[36m(train_model pid=14048)[0m Average T loss at step 5575: 0.18242184732801536[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 5600: 0.18190602649472046
[36m(train_model pid=16260)[0m Average T loss at step 2250: 0.1741513715961741
[36m(train_model pid=14048)[0m Average T loss at step 5650: 0.18101667471933672[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 5675: 0.18057091642871687
[36m(train_model pid=16260)[0m Average T loss at step 2275: 0.17276965843820638
[36m(train_model pid=14048)[0m Average T loss at step 5700: 0.18015533045721607[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[

[36m(train_model pid=14048)[0m Average T loss at step 7325: 0.15591548642019246[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 7350: 0.15559518377015882
[36m(train_model pid=16260)[0m Average T loss at step 2950: 0.14286229991573066
[36m(train_model pid=14048)[0m Average T loss at step 7400: 0.1549882614740754[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 7425: 0.1547170813282856
[36m(train_model pid=16260)[0m Average T loss at step 2975: 0.14190616570833772
[36m(train_model pid=14048)[0m Average T loss at step 7450: 0.15445646807150962[32m [repeated 4x across cluster][0m
[36m(train_model pid=14048)[0m Average T loss at step 7475: 0.15425030280782587
[36m(train_model pid=16260)[0m Average T loss at step 3000: 0.14103750680198815
[36m(train_model pid=14048)[0m Average T loss at step 7525: 0.15373769295064227[32m [repeated 5x across cluster][0m
[36m(train_model pid=14048)[

In [None]:
best_config = analysis.get_best_config(metric="loss", mode="min")
print("Best hyperparameters found were: ", best_config)

In [None]:
#check for overfitting by comparing performance on training data and evaluation data
plt.plot(t_lossi)


In [None]:
plt.plot(e_lossi)

# Evaluation

In [None]:



def test(test_loader):
    model.eval()
    predictions, labels = [], []

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            _ids, at, lab = batch
            out = model(input_ids=_ids, attention_mask=at)

            #print(out.logits.shape)  #dim 2 are predictions for each input
            preds = torch.argmax(out.logits, dim=2)


            # Only compare indicies where there is not padding or special tokens
            mask = (lab != -100) #& (lab != 0)

            # Also test with mask = lab != 0, since the class is so large

            # Retrieve correct indices
            preds = preds[mask]
            labs = lab[mask]

            # Tolist for comparison
            predictions.extend(preds.tolist())
            labels.extend(labs.tolist())
    
    return predictions, labels


#accuracy is high, because most are just 0
sum([1 for x,y in zip(predictions, labels) if x==y]) / len(labels)

preds, labels = test(test_loader)


In [None]:


cm = metrics.confusion_matrix([itos[x] for x in labels], [itos[x] for x in predictions])

# Plot the confusion matrix
plt.figure(figsize=(30, 30))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
plt.ylabel("True Label")
plt.xlabel("Predicted Label")
plt.title("Confusion Matrix")
plt.show()