## Prep

In [2]:
#import sys
import pandas as pd
import os
import numpy as np
from collections import Counter
from datasets import Dataset, DatasetDict
cwd = os.getcwd()
pol_dir = cwd+"/src/d01_data"
#os.environ["CUDA_VISIBLE_DEVICES"]="0"

## Exploring original dataset

In [3]:
# Load the preprocessed dataframe and keep only the four columns used in this notebook.
pol_df = pd.read_pickle(pol_dir+"/preprocessed_dataframe.pkl")[["Policy","Text","Tokens","Curation"]]

# Flatten the curated spans of every article into one list, then read off
# the layer / feature / tag of each span.
curated_spans = [span for art in pol_df.index for span in pol_df.loc[art, "Curation"]]
all_layers = [span.layer for span in curated_spans]
all_features = [span.feature for span in curated_spans]
all_tags = [span.tag for span in curated_spans]
span_count = len(curated_spans)

# how many spans for different features and layers
for span_labels in (all_layers, all_features, all_tags):
    print(Counter(span_labels))
print(span_count)
# distinct feature and tag names, with how many there are
for span_labels in (all_features, all_tags):
    distinct = list(set(span_labels))
    print(distinct, len(distinct))

Counter({'Policydesigncharacteristics': 11380, 'Technologyandapplicationspecificity': 5989, 'Instrumenttypes': 3487})
Counter({'Actor': 6021, 'InstrumentType': 3388, 'TechnologySpecificity': 2526, 'EnergySpecificity': 1908, 'Compliance': 1829, 'ApplicationSpecificity': 1550, 'Reference': 1181, 'Objective': 992, 'Time': 798, 'Resource': 511, 'InstrumentType_2': 98, 'Reversibility': 40, 'end': 14})
Counter({'Addressee_default': 2081, 'Form_monitoring': 1781, 'Tech_LowCarbon': 1468, 'Energy_LowCarbon': 1358, 'Addressee_sector': 1356, 'RegulatoryInstr': 1231, 'Unspecified': 1100, 'Tech_Other': 1058, 'Authority_default': 939, 'App_Other': 786, 'App_LowCarbon': 764, 'Ref_OtherPolicy': 737, 'Authority_monitoring': 659, 'Objective_QualIntention': 556, 'Energy_Other': 550, 'Addressee_monitored': 483, 'Resource_Other': 362, 'Authority_legislative': 330, 'Edu_Outreach': 329, 'Time_Monitoring': 323, 'Ref_Strategy_Agreement': 319, 'FrameworkPolicy': 318, 'Time_Compliance': 310, 'Objective_QualInten

In [4]:
# Feature names, stored in alphabetical order.
all_feature_names = sorted([
    'Objective', 'Compliance', 'Time', 'Resource', 'InstrumentType',
    'ApplicationSpecificity', 'TechnologySpecificity', 'EnergySpecificity',
    'Reference', 'Reversibility', 'Actor',
])
#all_feature_names

In [5]:
# Tag names, stored in alphabetical order.
all_tag_names = sorted([
    'Authority_default', 'Objective_QualIntention', 'TaxIncentives', 'Unspecified',
    'Addressee_default', 'Energy_LowCarbon', 'Ref_PolicyAmended', 'Ref_Strategy_Agreement',
    'FrameworkPolicy', 'Reversibility_policy', 'Time_Monitoring', 'Energy_Other',
    'Ref_OtherPolicy', 'Addressee_sector', 'Resource_MonSpending', 'Form_monitoring',
    'Authority_legislative', 'Subsidies_Incentives', 'Tech_Other', 'Addressee_resource',
    'Time_Resources', 'Authority_established', 'Objective_QualIntention_noCCM', 'RD_D',
    'Resource_MonRevenues', 'Addressee_monitored', 'Time_InEffect', 'Tech_LowCarbon',
    'RegulatoryInstr', 'PublicInvt', 'Objective_QuantTarget', 'Objective_QuantTarget_noCCM',
    'VoluntaryAgrmt', 'TradablePermit', 'Time_PolDuration', 'Time_Compliance',
    'App_Other', 'Resource_Other', 'App_LowCarbon', 'Authority_monitoring',
    'Form_sanctioning', 'Edu_Outreach',
])
#all_tag_names

In [6]:
# Number of curated spans reported for each token, over every article.
toks_cts = [
    tokensp.get_span_count(annotators ='Curation')
    for art in pol_df.index
    for tokensp in pol_df.loc[art, "Tokens"]
]
print(len(toks_cts))
print(Counter(toks_cts))
#52298/(45356+7062+708+70+6)

263752
Counter({0: 210550, 1: 45356, 2: 7062, 3: 708, 4: 70, 5: 6})


In [7]:
# For every token that has at least one curated span: how many of its spans belong
# to one of the five features below. Tokens with exactly four such spans are printed
# together with all of their spans.
counted_features = ("Actor","InstrumentType","Objective","Resource","Time")
toks_cts = []
for art in pol_df.index:
    for tokensp in pol_df.loc[art, "Tokens"]:
        token_spans = tokensp.get_token_spans(annotators ='Curation')
        if not token_spans:
            continue  # tokens without spans are not counted at all
        count = sum(1 for s in token_spans if s.feature in counted_features)
        toks_cts.append(count)
        if count != 4:
            continue
        print("\n",tokensp.token_id, tokensp.text)
        for s in token_spans:
            print(s.span_id, s.text, s.feature)
print(Counter(toks_cts))


 T15813 information
CUR653 provide adequate and targeted information and advice InstrumentType
CUR654 information and advice InstrumentType
CUR676 information and advice Resource
CUR677 information Resource

 T15815 advice
CUR653 provide adequate and targeted information and advice InstrumentType
CUR654 information and advice InstrumentType
CUR676 information and advice Resource
CUR678 advice Resource

 T81467 market
CUR6566 right to switch supplier or market participants InstrumentType
CUR6601 right to switch supplier or market participants engaged in aggregation is granted to customers in a non-discriminatory manner as regards cost, effort and time Objective
CUR6603 market participants engaged in aggregation Actor
CUR6604 market participants Actor

 T81468 participants
CUR6566 right to switch supplier or market participants InstrumentType
CUR6601 right to switch supplier or market participants engaged in aggregation is granted to customers in a non-discriminatory manner as regards c

In [8]:
# For each layer, feature and tag, collect the length of every span carrying it.
# A later cell turns these lists into mean+sd, median, mode, and range.
# Keys appear in order of first occurrence in all_layers / all_features / all_tags.
layers_dct = {key: [] for key in Counter(all_layers)}
features_dct = {key: [] for key in Counter(all_features)}
tags_dct = {key: [] for key in Counter(all_tags)}

for art in pol_df.index:
    for span in pol_df.loc[art, "Curation"]:
        # length is measured as the number of pieces after splitting the span text on single spaces
        n_pieces = len(span.text.split(" "))
        layers_dct[span.layer].append(n_pieces)
        features_dct[span.feature].append(n_pieces)
        tags_dct[span.tag].append(n_pieces)
print(len(layers_dct['Policydesigncharacteristics']))

11380


In [11]:
from create_datasets import get_label_set, deduped_df_fxn

# Dataframe produced by deduped_df_fxn for mode "a".
pol_a = deduped_df_fxn(pol_df, "a")

# Collect the article ids whose last "_"-separated part is "front" or "Whereas" ...
removal = []
for code in pol_a.index:
    suffix = code.split("_")[-1]
    if suffix == "front" or suffix == "Whereas":
        removal.append(code)
print(len(removal))

# ... drop those rows, then show how many articles remain.
pol_a = pol_a.drop(removal, axis=0)
len(list(pol_a.index))

36


412

In [18]:
from datasets import load_from_disk
mode = "a"
sghead_ds = load_from_disk(cwd+f"/inputs/{mode}/sghead_ds")
# For rows whose id ends in "front" or "Whereas", print the distinct NER tags they contain.
for row in sghead_ds:
    id_suffix = row["id"].split("_")[-1]
    if id_suffix not in ["front","Whereas"]:
        continue
    print(list(set(row["ner_tags"])))

In [16]:
# Total number of curated spans in the original dataframe ...
sp_cnt = sum(1 for art in pol_df.index for _ in pol_df.loc[art,"Curation"])
print(sp_cnt)
# ... and in the dataframe returned by deduped_df_fxn for each mode letter.
for ltr in ["a","b","c","d","e"]:
    #x = identify_dup_spans(pol_df, ltr)
    #print(ltr, len(x))
    new_df = deduped_df_fxn(pol_df, ltr)
    sp_cnt = sum(1 for art in new_df.index for _ in new_df.loc[art,"Curation"])
    print(sp_cnt)

20856
20007
20856
18068
19507
20856


In [33]:
def _print_length_stats(name, values):
    """Print total, mean/SD, median, mode and range of the span lengths in ``values``."""
    print("\n", name)
    print("Total:", np.sum(values))
    print("Mean:", np.mean(values), "\tSD:", np.std(values))
    print("Median:", np.median(values))
    # bincount + argmax returns the smallest value among the most frequent ones
    print("Mode:", np.argmax(np.bincount(values)))
    print("Range:", np.min(values), "-", np.max(values))

# Same report for every layer, then for every feature.
for layer, values in layers_dct.items():
    _print_length_stats(layer, values)

for feature, values in features_dct.items():
    _print_length_stats(feature, values)


 Instrumenttypes
Total: 8102
Mean: 2.323487238313737 	SD: 2.067065017042484
Median: 2.0
Mode: 1
Range: 1 - 52

 Policydesigncharacteristics
Total: 32441
Mean: 2.8507029876977152 	SD: 3.752086055706703
Median: 2.0
Mode: 1
Range: 1 - 79

 Technologyandapplicationspecificity
Total: 15561
Mean: 2.5982634830522624 	SD: 2.5907931939638527
Median: 2.0
Mode: 1
Range: 1 - 56

 InstrumentType
Total: 7838
Mean: 2.3134592680047223 	SD: 2.0796638783524948
Median: 2.0
Mode: 1
Range: 1 - 52

 Actor
Total: 12253
Mean: 2.035044012622488 	SD: 1.5666346355393967
Median: 2.0
Mode: 2
Range: 1 - 27

 Time
Total: 2878
Mean: 3.606516290726817 	SD: 3.752433815934359
Median: 2.0
Mode: 1
Range: 1 - 26

 Compliance
Total: 4422
Mean: 2.417714598141061 	SD: 3.4569021967239286
Median: 1.0
Mode: 1
Range: 1 - 53

 Reversibility
Total: 418
Mean: 10.45 	SD: 15.9952336650641
Median: 3.0
Mode: 1
Range: 1 - 79

 Reference
Total: 3326
Mean: 2.8162574089754444 	SD: 2.6355107472792207
Median: 2.0
Mode: 2
Range: 1 - 37

 Obje

## Validating our datasets and dataset dicts

In [32]:
# Tag distribution of the single-head dataset saved for each mode.
for mode in ["a", "b", "c", "d"]:
    print(mode)
    ds = Dataset.load_from_disk(f"{cwd}/inputs/{mode}/sghead_ds")
    tag_counts = Counter()
    for tag_seq in ds['ner_tags']:
        tag_counts.update(tag_seq)
    print(tag_counts)

a
Counter({'O': 232246, 'I-Actor': 6668, 'I-Objective': 6626, 'B-Actor': 5807, 'I-InstrumentType': 3904, 'B-InstrumentType': 3090, 'I-Time': 2467, 'B-Objective': 952, 'B-Time': 781, 'I-Resource': 710, 'B-Resource': 501})
b
Counter({'O': 210550, 'I-Policydesigncharacteristics': 22731, 'B-Policydesigncharacteristics': 10781, 'I-Technologyandapplicationspecificity': 7767, 'B-Technologyandapplicationspecificity': 5548, 'I-Instrumenttypes': 3515, 'B-Instrumenttypes': 2860})
c
Counter({'O': 211241, 'I-Actor': 6204, 'B-Actor': 5641, 'I-Objective': 5445, 'I-Reference': 5322, 'I-TechnologySpecificity': 3406, 'I-InstrumentType': 3384, 'B-InstrumentType': 2779, 'I-Time': 2449, 'I-Compliance': 2282, 'B-TechnologySpecificity': 2258, 'I-ApplicationSpecificity': 2182, 'I-EnergySpecificity': 2177, 'B-EnergySpecificity': 1881, 'B-Compliance': 1767, 'B-ApplicationSpecificity': 1405, 'B-Reference': 1166, 'B-Objective': 894, 'B-Time': 780, 'I-Resource': 604, 'B-Resource': 485})
d
Counter({'O': 236344, 'I-

In [None]:
# Label distribution of every label column of the multi-head dataset saved for each mode.
for mode in ["a", "b", "c", "d"]:
    print(mode)
    ds = Dataset.load_from_disk(f"{cwd}/inputs/{mode}/mhead_ds")
    # every column except the identifying/text ones is treated as a label column
    label_columns = [cat for cat in ds.features if cat not in ['id', 'text', 'tokens']]
    for cat in label_columns:
        label_counts = Counter()
        for label_seq in ds[cat]:
            label_counts.update(label_seq)
        print(label_counts)


In [None]:
# Print the split summary of the single-head and multi-head DatasetDicts for split index r.
r = 0
for mode in ["a", "b", "c", "d"]:
    for head_kind in ("sghead", "mhead"):
        dsdct = DatasetDict.load_from_disk(f"{cwd}/inputs/{mode}/{head_kind}_dsdcts/dsdct_r{r}")
        print(dsdct)

## Start

In [None]:
import json
import sys
import pandas as pd
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import numpy as np
sys.path.insert(0, '..')
from collections import Counter
from transformers import pipeline, AutoModel, PreTrainedTokenizerBase, AutoConfig, PreTrainedModel, PretrainedConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
from transformers.modeling_outputs import TokenClassifierOutput
from datasets import Dataset, DatasetDict
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import evaluate
from typing import Any, Dict, List
from sklearn.metrics import f1_score

cwd = os.getcwd()
pol_dir = cwd+"/../src/d01_data"

There are many things to try even just for NER, especially given how long some of these token sequences are.

Trying multiple classification heads:
- different base models
- using last hidden state vs using last few hidden states (average or concatenation)
- weighted vs unweighted loss
- evaluation metrics (token micro F1 or overlap instead of seqeval)

## Pytorch

In [None]:
def sghead_tokenize_and_align_labels(tokens, ner_tags, tokenizer, label2id, max_length=512):
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True, max_length=max_length, return_tensors=None)
    word_ids = tokenized_inputs.word_ids()
    previous_word_idx = None
    label_ids = []
    for word_idx in word_ids:
        if word_idx is None:
            label_ids.append(-100)
        elif word_idx != previous_word_idx:
            label_ids.append(label2id[ner_tags[word_idx]])
        else:
            label_ids.append(-100)
        previous_word_idx = word_idx
    tokenized_inputs["labels"] = label_ids
    return tokenized_inputs

In [None]:
from torch.utils.data import Dataset as TorchDataset

class SgheadDataset(TorchDataset):
    """Torch dataset that tokenizes single-head NER samples on access.

    Each item of ``hf_dataset`` is read through its ``"tokens"`` and ``"ner_tags"``
    entries and converted with ``sghead_tokenize_and_align_labels``.

    :param hf_dataset: indexable collection of samples (supports ``len`` and ``[idx]``)
    :param tokenizer: tokenizer passed through to ``sghead_tokenize_and_align_labels``
    :param max_length: truncation length passed to the tokenizer
    :param label_list: optional tag inventory; position in the list is the label id.
        Defaults to the Actor/InstrumentType/Objective/Resource/Time BIO tags. Datasets
        built with other tag sets need their own list, otherwise the tag lookup
        raises ``KeyError``.
    """

    # Default BIO inventory, used when no label_list is supplied.
    DEFAULT_LABEL_LIST = ['O', 'B-Actor', 'I-Actor', 'B-InstrumentType', 'I-InstrumentType', 'B-Objective', 'I-Objective', 'B-Resource', 'I-Resource', 'B-Time', 'I-Time']

    def __init__(self, hf_dataset, tokenizer, max_length=512, label_list=None):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        # copy so that later edits to the caller's list do not change this dataset
        self.label_list = list(self.DEFAULT_LABEL_LIST if label_list is None else label_list)
        self.label2id = {l: i for i, l in enumerate(self.label_list)}
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors for sample ``idx``."""
        sample = self.dataset[idx]
        tokens = sample["tokens"]
        ner_tags = sample["ner_tags"]

        encoding = sghead_tokenize_and_align_labels(
            tokens,
            ner_tags,
            self.tokenizer,
            self.label2id,
            self.max_length
        )

        return {
            "input_ids": torch.tensor(encoding["input_ids"]),
            "attention_mask": torch.tensor(encoding["attention_mask"]),
            "labels": torch.tensor(encoding["labels"])
        }


In [None]:
from torch.nn.utils.rnn import pad_sequence

def sghead_collate_fn(batch, pad_token_id):
    """Pad a list of per-sample tensor dicts into one batch dict.

    :param batch: list of dicts with 1-D tensors under "input_ids", "attention_mask" and "labels"
    :param pad_token_id: fill value for the padded "input_ids" positions
    :return: dict with the same three keys, each padded to the longest sample
        (batch dimension first); "attention_mask" is filled with 0 and "labels" with -100
    """
    fill_values = {
        "input_ids": pad_token_id,
        "attention_mask": 0,
        "labels": -100,
    }
    return {
        key: pad_sequence([sample[key] for sample in batch], batch_first=True, padding_value=fill)
        for key, fill in fill_values.items()
    }


In [None]:
from torch.utils.data import DataLoader

r=0
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): the validation cell earlier reads f"{cwd}/inputs/{mode}/sghead_dsdcts/dsdct_r{r}";
# this path has no mode directory — confirm which location is current.
dataset_dict = DatasetDict.load_from_disk(cwd+f"/inputs/sghead_dsdcts/dsdct_r{r}")

train_dataset = SgheadDataset(dataset_dict["train"], tokenizer)
dev_dataset = SgheadDataset(dataset_dict["dev"], tokenizer)

def _pad_batch(samples):
    """Collate ``samples`` using the tokenizer's pad id, looked up when the batch is built."""
    return sghead_collate_fn(samples, tokenizer.pad_token_id)

# Only the training loader shuffles; both use batches of 16.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=_pad_batch)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False, collate_fn=_pad_batch)


In [None]:
import torch
from transformers import AutoModelForTokenClassification, get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# BIO tag inventory for the single-head model; position in the list is the label id.
label_list = ['O', 'B-Actor', 'I-Actor', 'B-InstrumentType', 'I-InstrumentType', 'B-Objective', 'I-Objective', 'B-Resource', 'I-Resource', 'B-Time', 'I-Time']
label2id = {l: i for i, l in enumerate(label_list)}
id2label = {i: l for i, l in enumerate(label_list)}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
    # Previously written as `model_name == model_name`, which is always True;
    # the literal keeps the same behaviour without the misleading comparison.
    ignore_mismatched_sizes=True
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)

num_epochs = 10
# one scheduler step is taken per training batch, so the schedule spans every batch of every epoch
num_training_steps = num_epochs * len(train_loader)

# linear decay of the learning rate with no warm-up steps
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


In [None]:
# Plain training loop: one optimizer step and one scheduler step per batch,
# reporting the mean batch loss after every epoch.
model.train()
for epoch in range(num_epochs):
    batch_losses = []

    for batch in train_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # the batch includes "labels", so the model output carries the loss
        loss = model(**batch).loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        batch_losses.append(loss.item())

    avg_loss = sum(batch_losses) / len(train_loader)
    print(f"Epoch {epoch+1} | Train loss: {avg_loss:.4f}")

In [None]:
import numpy as np
import evaluate

seqeval = evaluate.load("seqeval")

def evaluate_model(model, dataloader):
    """Run ``model`` over ``dataloader`` and score the predictions with seqeval.

    Positions whose gold label is -100 are left out. The remaining predicted and gold
    ids are turned into tag strings through the notebook-level ``id2label`` mapping.
    Uses the notebook-level ``device`` and ``seqeval`` objects.

    :param model: model whose output exposes ``.logits``
    :param dataloader: iterable of batches (dicts of tensors that include "labels")
    :return: the result of ``seqeval.compute`` on the collected tag sequences
    """
    model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            # keep the gold labels from the batch as delivered by the loader
            gold = batch["labels"]
            inputs = {k: v.to(device) for k, v in batch.items()}

            predicted = torch.argmax(model(**inputs).logits, dim=-1).cpu().numpy()
            gold = gold.numpy()

            for pred_row, gold_row in zip(predicted, gold):
                scored = [(p, l) for p, l in zip(pred_row, gold_row) if l != -100]
                all_preds.append([id2label[p] for p, _ in scored])
                all_labels.append([id2label[l] for _, l in scored])

    return seqeval.compute(predictions=all_preds, references=all_labels)

metrics = evaluate_model(model, dev_loader)
print(metrics)

In [None]:
# Save the model and its tokenizer into the same directory, named after the part of
# model_name that follows the last "/" plus the split index r.
# NOTE(review): model_save_addr is not assigned in any of the preceding cells shown
# in this notebook — confirm it is defined before this cell runs.
save_path = f"{model_save_addr}/{model_name.split('/')[-1]}_{r}"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


## Messin 

In [None]:
class SgheadDataset(torch.utils.data.Dataset):
    """Unfinished scratch dataset that wraps ``df`` and reports its length.

    NOTE(review): this class reuses the name of the tokenizing ``SgheadDataset``
    defined earlier in the notebook, so running this cell replaces that class.
    """
    def __init__(self, df):
        self.df = df
    def __len__(self) -> int:
        return len(self.df)
    def __getitem__(self, idx: int):
        # The cell originally ended right after this signature, which is a SyntaxError.
        # Item access was never written, so fail loudly instead of guessing a layout.
        raise NotImplementedError("SgheadDataset.__getitem__ is not implemented in this scratch cell")


In [None]:
# Write the dataset currently bound to `ds` to <cwd>/inputs/sghead_ds and report the location.
# NOTE(review): `ds` is whatever an earlier cell last assigned — confirm it is the
# single-head dataset before running this cell.
dir_addr = cwd+"/inputs/sghead_ds"
ds.save_to_disk(dir_addr)
print(f"Created dataset in {dir_addr}")

In [None]:
def span_to_mhead_lbls(feature_name, tokens, spans):
    '''
    Helper function to df_to_mhead_ds(df)
    For one article and one feature type, builds a list with one "O"/"B"/"I" string per token.
    The first token overlapping a matching span gets "B", the other overlapping tokens get "I".
    Spans are applied in order, so a later span overwrites labels written by an earlier one.

    :param feature_name: Name of feature whose BIO list is being created for the article
    :param tokens: list of token objects (read through .start and .stop)
    :param spans: list of span objects (read through .feature, .start and .stop)
    :return: list of "O"/"B"/"I" strings, one per token
    '''
    token_labels = ["O" for _ in tokens]
    matching_spans = (spn for spn in spans if spn.feature == feature_name)
    for spn in matching_spans:
        # a token overlaps the span when it ends after the span starts and starts before the span ends
        covered = [
            i for i, tok in enumerate(tokens)
            if tok.stop > spn.start and tok.start < spn.stop
        ]
        for position, tok_idx in enumerate(covered):
            token_labels[tok_idx] = "B" if position == 0 else "I"
    return token_labels

def df_to_mhead_ds(df):
    '''
    Converts pandas dataframe to huggingface dataset with one BIO label column per feature
    ***Currently ignores any articles longer than 512 tokens

    :param df: POLIANNA dataframe (read through its "Tokens", "Text" and "Curation" columns)
    :return: Dataset with id, text, tokens and a labels_<feature> column for each of the five features
    '''
    feature_names = ["Actor", "InstrumentType", "Objective", "Resource", "Time"]
    datapoints = []
    for artid in df.index:
        tokens = df.loc[artid,"Tokens"]
        if len(tokens) > 512: # we'll change this eventually
            continue
        spans = df.loc[artid,"Curation"]
        datapoint = {
            'id': artid,
            "text": df.loc[artid,"Text"],
            "tokens": [t.text for t in tokens],
        }
        for ftr in feature_names:
            datapoint[f"labels_{ftr}"] = span_to_mhead_lbls(ftr, tokens, spans)
        datapoints.append(datapoint)
    return Dataset.from_list(datapoints)

# Build the multi-head dataset from the full dataframe and write it to disk.
dir_addr = cwd+"/inputs/mhead_ds"
ds = df_to_mhead_ds(pol_df)
ds.save_to_disk(dir_addr)

In [None]:
# Print (token, tag) pairs of the first sample.
# NOTE(review): df_to_mhead_ds above only creates labels_<feature> columns, so the
# 'ner_tags' column presumably belongs to a single-head dataset — confirm what `ds` holds here.
for token, tag in zip(ds['tokens'][0], ds['ner_tags'][0]):
    print((token, tag))

https://medium.com/@shahrukhx01/multi-task-learning-with-transformers-part-1-multi-prediction-heads-b7001cf014bf

In [None]:
def span_to_bio_tok_lbls(feature_name, tokens, spans, label2id):
    """Build integer BIO labels, one per token, for the spans of a single feature.

    :param feature_name: only spans whose ``.feature`` equals this name are used
    :param tokens: list of token objects (read through .start and .stop)
    :param spans: list of span objects (read through .feature, .start and .stop)
    :param label2id: dictionary mapping "O", "B" and "I" to integers
    :return: list of integers, one per token
    """
    bio = ["O"] * len(tokens)
    for spn in spans:
        if spn.feature != feature_name:
            continue
        first_hit = True
        for i, tok in enumerate(tokens):
            if tok.stop <= spn.start or tok.start >= spn.stop:
                continue  # token lies entirely before or after the span
            bio[i] = "B" if first_hit else "I"
            first_hit = False
    return [label2id[tag] for tag in bio]

def df_to_dataset(df):
    """Convert the dataframe into a column-wise Hugging Face dataset with integer BIO labels.

    Articles with more than 512 tokens are left out.

    :param df: dataframe read through its "Tokens", "Text" and "Curation" columns
    :return: tuple of (Dataset, list of label names in id order)
    """
    label2id = {"O": 0, "B": 1, "I": 2}
    feature_names = ["Actor", "InstrumentType", "Objective", "Resource", "Time"]

    # one list per output column, filled in parallel
    dataset = {"id": [], "text": [], "tokens": []}
    for ftr in feature_names:
        dataset[f"labels_{ftr}"] = []

    for artid in df.index:
        tokens = df.loc[artid,"Tokens"]
        if len(tokens) > 512: # we'll change this eventually
            continue
        spans = df.loc[artid,"Curation"]
        dataset['id'].append(artid)
        dataset["text"].append(df.loc[artid,"Text"])
        dataset["tokens"].append([t.text for t in tokens])
        for ftr in feature_names:
            dataset[f"labels_{ftr}"].append(span_to_bio_tok_lbls(ftr, tokens, spans, label2id))
    return Dataset.from_dict(dataset), list(label2id)

In [None]:
dataset, label_list = df_to_dataset(pol_df)
# Two-way lookup between label names and their positions in label_list.
id2label = dict(enumerate(label_list))
label2id = {lbl: i for i, lbl in id2label.items()}
# do the datasets need to differ by model used for tokenization of results too??

In [None]:
'''
for r in [0,1,2]:
    td_test = dataset.train_test_split(test_size=0.2, seed=r)
    train_dev = td_test['train'].train_test_split(test_size=0.25, seed=r)
    ds_dct = DatasetDict({"train":train_dev['train'], "dev":train_dev['test'], "test":td_test['test']})
    print(ds_dct)
    ds_dct.save_to_disk(cwd+f"/inputs/sep/dsdct_r{r}")
'''

#### Tokenize

In [None]:
# BIO label ids shared by every head, and the reverse lookup derived from them.
label2id = {"O": 0, "B": 1, "I": 2}
id2label = {idx: tag for tag, idx in label2id.items()}

In [None]:
model_name = "microsoft/deberta-v3-base" # suggested lr of 3e-5
#model_name = "dslim/bert-base-NER-uncased"
#model_name = "FacebookAI/xlm-roberta-base"

have to adapt the tokenizing and aligning script to account for the separate label lists

In [None]:
# all feature/label types
label_cols = [
    "labels_Actor",
    "labels_InstrumentType",
    "labels_Objective",
    "labels_Resource",
    "labels_Time"
]

tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split samples and realign every label column to the sub-words.

    Adapted for multi-head from https://huggingface.co/docs/transformers/en/tasks/token_classification
    The token lists are already split into words, but the tokenizer breaks them into
    sub-words, so each label sequence has to be re-aligned to the new token sequence.
    Uses the notebook-level ``tokenizer`` and ``label_cols``.

    :param examples: batch with a "tokens" entry and one entry per name in ``label_cols``
    :return: the tokenizer's batch encoding with one aligned label list per label column
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True, padding=True, return_attention_mask=True)

    def first_subword_labels(word_ids, word_labels):
        # word_ids looks like [None, 0, 1, 2, ..., 19, 19, 20, None]:
        # None positions and repeated word ids get -100, the first sub-word keeps the word's label
        aligned = []
        previous = None
        for word_idx in word_ids:
            is_first_subword = word_idx is not None and word_idx != previous
            aligned.append(word_labels[word_idx] if is_first_subword else -100)
            previous = word_idx
        return aligned

    # for each label type, realign that column's sequence in every sample
    for col in label_cols:
        tokenized_inputs[col] = [
            first_subword_labels(tokenized_inputs.word_ids(batch_index=sample_idx), labels)
            for sample_idx, labels in enumerate(examples[col])
        ]
    return tokenized_inputs

In [None]:
# Load the saved train/dev/test splits for split index r and tokenize every split in batches.
r=0
splits_path = cwd+f"/inputs/sep/dsdct_r{r}"
dataset_dict = DatasetDict.load_from_disk(splits_path)
tokenized_dsdct = dataset_dict.map(tokenize_and_align_labels, batched=True)
#tokenized_dsdct.set_format(type="torch", columns=["input_ids", "attention_mask"] + label_cols)

In [None]:
# sanity checking: print the word-id alignment of the first five training samples
first_samples = dataset_dict['train'][0:5]
tokenized_inputs = tokenizer(first_samples["tokens"], truncation=True, is_split_into_words=True)
for sample_idx in range(len(first_samples['labels_Actor'])):
    print(tokenized_inputs.word_ids(batch_index=sample_idx))

new custom data collator for multi-heads

We need a new data collator because we have multiple label lists.

In [None]:
class MultiHeadDataCollator:
    '''
    Collator for samples that carry several label sequences (one per head).
    Pads input ids, attention masks and every label column to the longest sample
    in the batch with pad_sequence.

    :param tokenizer: tokenizer whose pad_token_id fills the padded input ids
    :param label_columns: names of the label sequences to batch
    :param padding: stored on the instance; not read by __call__
    :param max_length: stored on the instance; not read by __call__
    '''
    def __init__(self, tokenizer: PreTrainedTokenizerBase, label_columns: List[str], padding=True, max_length=None):
        self.tokenizer, self.label_columns = tokenizer, label_columns
        self.padding, self.max_length = padding, max_length

    def __call__(self, features):
        def as_long(values):
            return torch.tensor(values, dtype=torch.long)

        def pad_to_longest(sequences, fill):
            return pad_sequence(sequences, batch_first=True, padding_value=fill)

        # a sample without an attention mask is treated as fully attended
        masks = [as_long(f.get("attention_mask", [1]*len(f["input_ids"]))) for f in features]
        batch = {
            "input_ids": pad_to_longest([as_long(f["input_ids"]) for f in features], self.tokenizer.pad_token_id),
            "attention_mask": pad_to_longest(masks, 0),
        }
        for col in self.label_columns:
            # a sample lacking this label column contributes an all -100 sequence of its own length
            col_labels = [as_long(f.get(col, [-100]*len(f["input_ids"]))) for f in features]
            batch[col] = pad_to_longest(col_labels, -100)
        return batch

In [None]:
# sanity checking: collate the first 16 training samples and show the shape of every batch entry
data_collator = MultiHeadDataCollator(tokenizer=tokenizer, label_columns=label_cols, max_length=512)
train_split = tokenized_dsdct['train']
sample_batch = [train_split[i] for i in range(16)]
batch = data_collator(sample_batch)
for name in batch:
    print(name, batch[name].shape)

In [None]:
#sanity checking: first 50 sub-word tokens of the first batch sample next to three of their label ids
tokens = tokenizer.convert_ids_to_tokens(batch["input_ids"][0])
for idx, tok in enumerate(tokens[:50]):
    actor_lbl = batch['labels_Actor'][0][idx].item()
    time_lbl = batch['labels_Time'][0][idx].item()
    instr_lbl = batch['labels_InstrumentType'][0][idx].item()
    print(f"{idx:03} | {tok:15} | A:{actor_lbl:2}  T:{time_lbl:2}  I:{instr_lbl:2}")


new model structure

microsoft/deberta-v3-base

#### Weight calculations (currently only using train split)

In [None]:
def get_weights(label_cols, tokenized_dsdct):
    """Compute per-class and per-head loss weights from the train split.

    For every label column, the -100 entries are dropped. Each of the three
    classes (ids 0, 1, 2) then gets the inverse-frequency weight
    ``n_labels / (num_classes * class_count)``. Each head gets the analogous weight
    computed from its number of non-zero labels (B or I tokens). A class or a head
    that never occurs gets weight 1.0 instead of triggering a division by zero.

    :param label_cols: label column names; the "labels_" prefix is stripped to form the keys
    :param tokenized_dsdct: mapping whose ``['train'][column]`` yields a list of label sequences
    :return: dict with
        "class_weights"      - {head: float tensor of 3 class weights}
        "class_weights_norm" - same, divided by their mean
        "head_weights"       - {head: float}
        "head_weights_norm"  - {head: 0-dim tensor}, head weights divided by their mean
    """
    # lets create class (BIO) weights for each feature type
    num_classes = 3
    class_weights = {}
    class_weights_norm = {}
    head_counts = {}
    for lname in label_cols:
        name = lname.replace("labels_", "")
        # flatten all train sequences of this column once and drop the ignored positions
        labels = np.concatenate([np.array(l) for l in tokenized_dsdct['train'][lname]])
        labels = labels[labels != -100]
        # inverse frequency weighting; classes that never occur fall back to 1.0
        counts = [int(np.count_nonzero(labels == i)) for i in range(num_classes)]
        weights = [len(labels) / (num_classes * c) if c else 1.0 for c in counts]
        w = torch.tensor(weights, dtype=torch.float)
        class_weights[name] = w
        # then normalize so the weights of one head average to 1
        class_weights_norm[name] = w / w.mean()
        # only tokens B or I count towards the head's size
        head_counts[name] = int(np.count_nonzero(labels != 0))
    # weights for each head; a head without any B/I token falls back to 1.0,
    # mirroring the handling of absent classes above
    total_tokens = sum(head_counts.values())
    head_weights = {
        head: (total_tokens / (len(head_counts) * count) if count else 1.0)
        for head, count in head_counts.items()
    }
    w = torch.tensor(list(head_weights.values()), dtype=torch.float)
    w = w / w.mean()
    head_weights_norm = {head: w[i] for i, head in enumerate(head_weights.keys())}
    return {"class_weights": class_weights, "class_weights_norm": class_weights_norm, "head_weights": head_weights, "head_weights_norm": head_weights_norm}

WEIGHTS = get_weights(label_cols, tokenized_dsdct)
WEIGHTS

##### spoiler
# Hard-coded weight values in the same four-key layout that get_weights returns
# (note the name is `weights`, not the `WEIGHTS` assigned above).
# NOTE(review): every '*_norm' entry below equals the natural log of the matching raw
# weight (e.g. ln(0.3560) ≈ -1.0328, ln(3.8644) ≈ 1.3518), whereas get_weights normalises
# by dividing by the mean and so cannot produce negative values — presumably these numbers
# come from an earlier log-based version; confirm before using them.
weights = {'class_weights': {'Actor': torch.tensor([ 0.3560,  9.8114, 11.2143], dtype=torch.float),
  'InstrumentType': torch.tensor([ 0.3483, 16.4053, 14.7887], dtype=torch.float),
  'Objective': torch.tensor([ 0.3513, 53.5521,  7.4079], dtype=torch.float),
  'Resource': torch.tensor([ 0.3367, 92.7964, 53.0844], dtype=torch.float),
  'Time': torch.tensor([ 0.3412, 59.8834, 19.0240], dtype=torch.float)},
 'class_weights_norm': {'Actor': torch.tensor([-1.0328,  2.2835,  2.4172], dtype=torch.float),
  'InstrumentType': torch.tensor([-1.0548,  2.7976,  2.6939], dtype=torch.float),
  'Objective': torch.tensor([-1.0460,  3.9807,  2.0025], dtype=torch.float),
  'Resource': torch.tensor([-1.0887,  4.5304,  3.9719], dtype=torch.float),
  'Time': torch.tensor([-1.0753,  4.0924,  2.9457], dtype=torch.float)},
 'head_weights': {'Actor': 0.5988807576409815,
  'InstrumentType': 0.8900831733845169,
  'Objective': 0.7447537473233404,
  'Resource': 3.8644444444444446,
  'Time': 1.6522565320665084},
 'head_weights_norm': {'Actor': -0.5126927697303358,
  'InstrumentType': -0.11644036738140337,
  'Objective': -0.2947016557487147,
  'Resource': 1.3518179315899177,
  'Time': 0.5021419487977465}}

since loss is computed per token instead of per span, we'll look at the All instead of the Ents results

## Training components

In [None]:
class MultiHeadTokenConfig(PretrainedConfig):
    """Configuration for a token classifier with one classification head per feature.

    Stores the name of the base encoder, the number of labels per head, the list of
    head names, and the id<->label mappings. Missing arguments fall back to the five
    Actor/InstrumentType/Objective/Resource/Time heads and the O/B/I label set.
    """
    model_type = "deberta-multihead"
    def __init__(
            self,
            base_model_name="microsoft/deberta-v3-base",
            n_labels=3,
            heads=None,
            hidden_size=None,
            id2label=None,
            label2id=None,
            **kwargs):
        # remaining keyword arguments are handled by PretrainedConfig
        super().__init__(**kwargs)
        self.base_model_name = base_model_name
        # NOTE(review): the model class below sizes its heads from config.num_labels, not
        # from n_labels — presumably num_labels is derived from id2label by the base class; confirm.
        self.n_labels = n_labels
        self.heads = heads or ["Actor", "InstrumentType", "Objective", "Resource", "Time"]
        self.hidden_size = hidden_size 
        # assigned after super().__init__, so these values replace whatever the base class set
        self.id2label = id2label or {0: "O", 1: "B", 2: "I"}
        # when no label2id is given it is built as the inverse of id2label
        self.label2id = label2id or {v: k for k, v in self.id2label.items()}

class DebertaForMultiHeadTokClass(PreTrainedModel):
    """Pretrained encoder shared by one linear token-classification head per feature.

    The encoder named by ``config.base_model_name`` is loaded in ``__init__``;
    every name in ``config.heads`` gets its own ``nn.Linear`` over the encoder's
    last hidden state.
    """
    config_class = MultiHeadTokenConfig
    def __init__(self, config):
        super().__init__(config)
        self.encoder_config = AutoConfig.from_pretrained(config.base_model_name)
        self.encoder = AutoModel.from_pretrained(config.base_model_name, config=self.encoder_config)
        # Size the heads from the encoder that actually produces the hidden
        # states, rather than relying on config.hidden_size (which defaults to None).
        hidden_size = self.encoder.config.hidden_size
        # separate linear head for each feature type classification
        self.classifiers = nn.ModuleDict({
            head: nn.Linear(hidden_size, config.num_labels) for head in config.heads
        })
        # defined but currently not applied in forward() (see the commented-out line there)
        self.dropout = nn.Dropout(config.hidden_dropout_prob if hasattr(config, 'hidden_dropout_prob') else 0.1)
        self.init_weights()
    def forward(self, input_ids, attention_mask=None, **labels):
        """Encode a batch and return per-head logits plus the summed weighted loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional mask; only positions equal to 1 contribute
                to the loss. When ``None`` every position is used.
            **labels: Label tensors keyed ``labels_<head>`` (the ``labels_``
                part is stripped to find the head). ``None`` values are skipped.
                Every extra keyword is treated as a label, so callers must not
                pass other tensors here.

        Returns:
            TokenClassifierOutput whose ``logits`` is a dict head -> logits and
            whose ``loss`` is ``None`` when no labels were passed.
        """
        # batch of inputs encoded by base model
        outputs = self.encoder(input_ids, attention_mask=attention_mask)
        # only uses last hidden state... for now
        # will look into averaging/concatenating last few hidden states
        sequence_output = outputs.last_hidden_state
        #sequence_output = self.dropout(sequence_output)
        # passes encoded input sequence to each classifier to get logits
        logits = {name: self.classifiers[name](sequence_output) for name in self.classifiers}
        loss = None
        if labels:
            loss = 0
            # The mask of scored positions is the same for every head, so build it once.
            if attention_mask is None:
                active_loss = torch.ones(input_ids.numel(), dtype=torch.bool, device=input_ids.device)
            else:
                active_loss = attention_mask.view(-1) == 1
            # for labels_Feature, tensor(batch_sz,seq_ln)
            for lname, label in labels.items():
                if label is None:
                    continue
                name = lname.replace("labels_", "")
                head_logits = logits[name]
                # flatten to (num_tokens, num_classes) using the head's own
                # class count, then keep only the active positions
                active_logits = head_logits.view(-1, head_logits.size(-1))[active_loss]
                active_labels = label.view(-1)[active_loss]
                # weighting BIO classes for this feature
                # NOTE(review): if WEIGHTS mirrors the `weights` literal in this
                # notebook, the '*_norm' entries are log-scaled and partly
                # negative; a negative class or head weight flips the sign of
                # that loss term. Confirm this is intended.
                #weight = WEIGHTS['class_weights'][name].to(active_logits.device)
                weight = WEIGHTS['class_weights_norm'][name].to(active_logits.device)
                loss_fct = nn.CrossEntropyLoss(weight=weight)
                # computing loss for this head, then scaling it by the head weight
                head_loss = loss_fct(active_logits, active_labels)
                head_loss = head_loss * WEIGHTS['head_weights_norm'][name]
                # sum loss across heads for single update to train simultaneously
                loss += head_loss
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
        )

In [None]:
# TODO: work on a model that concatenates or averages the last few hidden states for the encoded representation

New `compute_metrics`: token-level micro-F1 instead of seqeval.

In [None]:
def compute_metrics_multihead(p):
    """Compute a token-level micro-F1 score for every head.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` maps head name to a
            logits array; ``labels`` is indexed by each head's position in
            that mapping.

    Returns:
        Dict mapping ``"<head>_f1"`` to the micro-F1 over all positions whose
        label is not -100.
    """
    head_logits, head_labels = p
    scores = {}
    for position, (head, logit_arr) in enumerate(head_logits.items()):
        # labels arrive in the same order as the prediction heads
        gold = head_labels[position].flatten()
        predicted = np.argmax(logit_arr, axis=-1).flatten()
        # drop positions marked -100 before scoring
        keep = gold != -100
        scores[f"{head}_f1"] = f1_score(gold[keep], predicted[keep], average='micro')
    return scores

In [None]:
# Collator that batches the inputs together with every per-head label column.
data_collator = MultiHeadDataCollator(
    tokenizer=tokenizer,
    label_columns=label_cols,
    max_length=512,
)

In [None]:
class MultiHeadTrainer(Trainer):
    """Trainer whose loss step hands the per-head ``labels_*`` tensors to the model as keywords."""
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run one forward pass and return the model's own summed loss.

        Every ``labels_*`` entry is popped out of ``inputs`` and passed back to
        the model as a keyword argument alongside the remaining inputs.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        label_keys = [key for key in inputs.keys() if key.startswith("labels_")]
        head_labels = {}
        for key in label_keys:
            head_labels[key] = inputs.pop(key)
        # the model computes the loss itself and exposes it on its output object
        outputs = model(**inputs, **head_labels)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

## Where the magic happens

In [None]:
# Fine-tune with the multi-head trainer: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir=model_name.split("/")[-1],
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    label_names=label_cols,
)

# MultiHeadTrainer is used in place of the stock Trainer.
trainer = MultiHeadTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=tokenized_dsdct["train"],
    eval_dataset=tokenized_dsdct["dev"],
    compute_metrics=compute_metrics_multihead,
)
trainer.train()

# Save under models/sep/<model>_<r>, then drop the references to model and trainer.
save_dir = cwd+f"/models/sep/{model_name.split('/')[-1]}_{r}"
trainer.save_model(save_dir)
del model, trainer

**TODO:** add early stopping to the trainer?

In [None]:
# Show the per-head label column names that are passed to TrainingArguments(label_names=...)
# and to the data collator.
label_cols

## Test

Old metrics (seqeval)

In [None]:
# Evaluate every saved (model, run) checkpoint on the test split with seqeval and
# collect the scores per head in results_dict.
from transformers import AutoConfig
mode = "sep"
seqeval = evaluate.load("seqeval")
id2label={0: "O", 1: "B", 2: "I"}
label2id={"O":0, "B":1, "I":2}
eval_heads = ["Actor", "InstrumentType", "Objective", "Resource", "Time"]
results_dict = {
    "microsoft/deberta-v3-base":{},
    "FacebookAI/xlm-roberta-base":{},
    "dslim/bert-base-NER-uncased":{}
}
for model_name in list(results_dict):
    results_dict[model_name]["Overall"] = {"precision":[], "recall":[], "f1":[], "accuracy":[]}
    for ftr in eval_heads:
        results_dict[model_name][ftr] = {"precision":[], "recall":[], "f1":[], "number":[]}
for model_name in results_dict:
    # The tokenizer and the label config depend only on the base model, so
    # build them once per model instead of once per run.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    config = MultiHeadTokenConfig(
        base_model_name=model_name,   # e.g., "microsoft/deberta-v3-base"
        num_labels=3,                 # per head
        heads=eval_heads,
        id2label=id2label,
        label2id=label2id,
        hidden_size=768                # or get from base model config
    )
    label_list = [id2label[i] for i in range(len(id2label))]
    head_label_lists = {
        head: [config.id2label[i] for i in range(len(config.id2label))]
        for head in config.heads
    }
    for r in [0, 1, 2]:
        dataset_dict = DatasetDict.load_from_disk(cwd + f"/inputs/{mode}/dsdct_r{r}")

        # Load the fine-tuned multi-head checkpoint for this model/run
        model_tt = DebertaForMultiHeadTokClass.from_pretrained(cwd+f"/models/{mode}/{model_name.split('/')[-1]}_{r}")

        model_tt.to("cuda")
        model_tt.eval()

        test_split = dataset_dict['test']
        texts = list(test_split['text'])
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True).to("cuda")

        # Get logits per head
        with torch.no_grad():
            # forward() treats every extra keyword as a label tensor, so only
            # these two tokenizer outputs are passed on.
            model_inputs = {
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"]
            }
            outputs = model_tt(**model_inputs)
            logits_dict  = outputs.logits  # dict of (batch, seq_len, n_classes)

        for head_name, logit_tensor in logits_dict.items():
            preds = torch.argmax(logit_tensor, dim=-1).cpu().numpy()

            # Fetch the label column once; indexing the column again for every
            # row re-reads the whole column each time.
            # NOTE(review): assumes the stored labels_<head> sequences are
            # already aligned position-by-position with this tokenizer's
            # output; zip() silently truncates to the shorter sequence. Confirm.
            labels = list(test_split[f"labels_{head_name}"])

            true_predictions = []
            true_labels = []

            for pred_seq, label_seq in zip(preds, labels):
                pred_labels = []
                gold_labels = []
                for p, l in zip(pred_seq, label_seq):
                    if l != -100:  # ignore padding
                        pred_labels.append(head_label_lists[head_name][p])
                        gold_labels.append(head_label_lists[head_name][l])
                true_predictions.append(pred_labels)
                true_labels.append(gold_labels)

            results = seqeval.compute(predictions=true_predictions, references=true_labels)
            print(f"Head: {head_name}", results)

            for k in list(results):
                if k[:4]=="over":
                    # "overall_<metric>" entries are pooled under 'Overall'
                    # (one value per head and run)
                    x, metric = k.split("_")
                    results_dict[model_name]['Overall'][metric].append(float(results[k]))
                else:
                    # every other key holds a dict of metrics stored under this head
                    for mtr in list(results[k]):
                        results_dict[model_name][head_name][mtr].append(float(results[k][mtr]))

In [None]:
# Dump the raw per-run scores, grouped by model and then by result key.
for model_key, per_head in results_dict.items():
    print(f"\n{model_key}")
    for head_key, scores in per_head.items():
        print(f"{head_key}")
        print(scores)

In [None]:
# Persist the collected scores as JSON under outputs/.
fn = "2nd_results_separateheads_seqeval"
out_path = cwd+f"/outputs/{fn}.json"
with open(out_path, "w", encoding="utf-8") as out_file:
    json.dump(results_dict, out_file, indent=4)

In [None]:
# Per model and result key: mean over the collected runs, shown as a percentage.
for model_key, per_head in results_dict.items():
    print(f"\n{model_key}")
    for head_key, scores in per_head.items():
        print(f"\n{head_key}")
        df = pd.DataFrame(scores)
        # append the column means as an extra row labelled 'mean'
        df.loc['mean'] = df.mean()
        mean_pct = df.loc['mean'] * 100
        print(round(mean_pct, 2))

New metrics: token-level micro-F1

In [None]:
def compute_metrics_multihead_res(p):
    """Compute token-level micro-F1 per head from tensors, as a nested dict.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` maps head name to a
            logits tensor; ``labels`` maps ``"labels_<head>"`` to the matching
            label tensor.

    Returns:
        Dict of the form ``{head: {"f1": score}}``, scored over all positions
        whose label is not -100.
    """
    prediction_dct, label_dct = p
    metrics = {head.replace('label_', ""): {} for head in prediction_dct}
    for head_name, logits in prediction_dct.items():
        predicted = logits.argmax(dim=-1).flatten().cpu().numpy()
        gold = label_dct["labels_" + head_name].flatten().cpu().numpy()
        # drop positions marked -100 before scoring
        valid = gold != -100
        metrics[head_name]["f1"] = f1_score(gold[valid], predicted[valid], average='micro')
    return metrics

In [None]:
# Evaluate every saved (model, run) checkpoint on the test split with
# token-level micro-F1 and collect the scores per head in results_dict.
mode = "sep"
label_cols = ["labels_Actor","labels_InstrumentType","labels_Objective","labels_Resource","labels_Time"]
results_dict = {
    "microsoft/deberta-v3-base":{},
    "FacebookAI/xlm-roberta-base":{},
    "dslim/bert-base-NER-uncased":{}
}
for model_name in list(results_dict):
    #results_dict[model_name]["Overall"] = {"precision":[], "recall":[], "f1":[], "accuracy":[]}
    for ftr in ["Actor", "InstrumentType", "Objective", "Resource", "Time"]:
        results_dict[model_name][ftr] = {"f1":[]}
for model_name in list(results_dict):
    # The tokenizer depends only on the base model, so load it once per model.
    # It is assigned before the map() call below, as in the original cell.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    for r in [0,1,2]:
        model_dir = cwd + f"/models/{mode}/{model_name.split('/')[-1]}_{r}"
        dataset_dict = DatasetDict.load_from_disk(cwd+f"/inputs/{mode}/dsdct_r{r}")
        config = MultiHeadTokenConfig.from_pretrained(model_dir)
        # Load the fine-tuned weights from the checkpoint. Constructing the
        # model from the config alone would leave the classification heads
        # freshly initialised, so the scores would not reflect training.
        model_tt = DebertaForMultiHeadTokClass.from_pretrained(model_dir, config=config)
        tokenized_dsdct = dataset_dict.map(tokenize_and_align_labels, batched=True)
        model_tt.to('cuda')
        model_tt.eval()
        texts = list(tokenized_dsdct['test']['text'])
        # tokenize batch
        inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, is_split_into_words=False)
        inputs = {k: v.to('cuda') for k, v in inputs.items()}
        # get labels in batch form
        # NOTE(review): torch.tensor() needs every label row to have the same
        # length, and the rows must line up with the padded input_ids above. Confirm.
        labels_batch = {col: torch.tensor(tokenized_dsdct['test'][col]).to('cuda') for col in label_cols}
        with torch.no_grad():
            # inputs already live on the GPU; pass only what forward() expects
            model_inputs = {
                "input_ids": inputs["input_ids"],
                "attention_mask": inputs["attention_mask"]
            }
            outputs = model_tt(**model_inputs, **labels_batch)
            # outputs is TokenClassifierOutput
            logits = outputs.logits  # dict of logits per head
        results = compute_metrics_multihead_res((logits, labels_batch))
        print(results)
        for k, v in results.items():
            for mtr in v:
                results_dict[model_name][k][mtr].append(float(v[mtr]))

In [None]:
# Dump the raw per-run token-F1 scores, grouped by model and then by head.
for model_key, per_head in results_dict.items():
    print(f"\n{model_key}")
    for head_key, scores in per_head.items():
        print(f"{head_key}")
        print(scores)

In [None]:
# Persist the collected token-F1 scores as JSON under outputs/.
fn = "2nd_results_separateheads_tokenmicrof1"
out_path = cwd+f"/outputs/{fn}.json"
with open(out_path, "w", encoding="utf-8") as out_file:
    json.dump(results_dict, out_file, indent=4)

In [None]:
# Per model and head: mean token-F1 over the collected runs, shown as a percentage.
for model_key, per_head in results_dict.items():
    print(f"\n{model_key}")
    for head_key, scores in per_head.items():
        print(f"\n{head_key}")
        df = pd.DataFrame(scores)
        # append the column means as an extra row labelled 'mean'
        df.loc['mean'] = df.mean()
        mean_pct = df.loc['mean'] * 100
        print(round(mean_pct, 2))