In [1]:
from datasets import load_dataset, concatenate_datasets, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
import evaluate
import numpy as np

In [2]:
#Function to compute various metrics. Can be adapted as needed later. 
def compute_metrics(p):
    """Score token-classification predictions with the seqeval metric.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` is arg-maxed over
            axis 2 to obtain one label id per position; ``labels`` holds the
            reference label ids, with -100 marking positions to ignore.

    Returns:
        dict: ``precision``, ``recall``, ``f1`` and ``accuracy``, read from
        seqeval's ``overall_*`` results.

    Note:
        Label ids are mapped to tag names through the notebook-level
        ``label_list``, which must be defined before this function is called.
    """
    # Load the metric once and keep it on the function object: this function is
    # handed to Trainer as `compute_metrics` and runs on every evaluation, so
    # re-loading seqeval each time is wasted work.
    seqeval = getattr(compute_metrics, "_seqeval", None)
    if seqeval is None:
        seqeval = evaluate.load("seqeval")
        compute_metrics._seqeval = seqeval

    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Drop every position whose reference label is -100, then convert ids to
    # tag names. Distinct loop names avoid shadowing the `p` parameter.
    true_predictions = [
        [label_list[pred_id] for (pred_id, gold_id) in zip(prediction, label) if gold_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[gold_id] for gold_id in label if gold_id != -100]
        for label in labels
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [3]:
#tokenization
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and attach per-token label ids.

    Args:
        examples: A batch exposing ``"tokens"`` (passed to the tokenizer with
            ``is_split_into_words=True``) and ``"ner_tags"`` (one sequence of
            word-level label ids per example).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. Within each
        example, the first token of every word carries that word's label;
        tokens with no source word and the remaining tokens of a word are
        set to -100.

    Note:
        Uses the notebook-level ``tokenizer``.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        # word_ids maps each produced token back to the index of its word.
        for word in encoded.word_ids(batch_index=batch_idx):
            # Only the first token of a word gets a real label; everything
            # else (tokens without a word, later pieces of a word) is -100.
            starts_new_word = word is not None and word != last_word
            token_labels.append(word_tags[word] if starts_new_word else -100)
            last_word = word
        aligned.append(token_labels)

    encoded["labels"] = aligned
    return encoded

In [4]:
#Training the model for the first step. 
def train_one_model(tokenized_dataset, newtest_dataset, model, tokenizer, save_path):
    """Fine-tune ``model`` for one epoch, report test scores and save the result.

    Args:
        tokenized_dataset: Mapping with ``"train"``, ``"validation"`` and
            ``"test"`` splits; the first two drive training/evaluation, the
            last is scored after training.
        newtest_dataset: Extra evaluation set, scored after training and
            reported as the zero-shot target-language result.
        model: Model instance handed to ``Trainer``.
        tokenizer: Tokenizer handed to ``Trainer``.
        save_path: Destination passed to ``trainer.save_model``.

    Returns:
        The model held by the trainer once training has finished, so callers
        that bind the result (``train_model = train_one_model(...)``) receive
        a usable object instead of ``None``.

    Note:
        Relies on the notebook-level ``data_collator`` and ``compute_metrics``.
        Both evaluation dicts are printed, not returned.
    """
    training_args = TrainingArguments(
        output_dir="tmp",
        report_to="none",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("training starts")

    trainer.train()

    print("Same language test set performance")
    results = trainer.evaluate(tokenized_dataset["test"])
    print(results)

    print("Zero-shot performance on target language")
    results = trainer.evaluate(newtest_dataset)
    print(results)

    trainer.save_model(save_path)

    return trainer.model

    #n = int(0.8*len(tokenized_dataset['train']))
    #small_dataset = tokenized_dataset["train"].select(range(1,n))

In [5]:
"""train a couple of models for step 2, taking a proportion of training data each time, 
keeping the rest of the settings constant
"""
def train_many_models(tokenized_dataset, trained_model, tokenizer):
    """Fine-tune the given model on growing fractions of the training split.

    For each proportion in 0.2, 0.4, 0.6, 0.8, 1.0 the first
    ``int(proportion * len(train))`` training examples are selected, a model
    is trained on them for one epoch with otherwise identical settings, and
    the test split is evaluated and printed.

    Args:
        tokenized_dataset: Mapping with ``"train"``, ``"validation"`` and
            ``"test"`` splits.
        trained_model: Starting model. It is deep-copied for every proportion
            and is not itself handed to any ``Trainer``.
        tokenizer: Tokenizer handed to each ``Trainer``.

    Returns:
        dict: Maps each proportion to the dict returned by
        ``trainer.evaluate`` on the test split.

    Note:
        Relies on the notebook-level ``data_collator`` and ``compute_metrics``.
    """
    import copy  # local import: only needed for the per-run model copy

    training_args = TrainingArguments(
        output_dir="tmp",
        report_to="none",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=1,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
    )

    prop = [0.2, 0.4, 0.6, 0.8, 1.0]
    results_by_prop = {}
    for temp in prop:
        print("For %s training data" %temp)
        n = int(temp*len(tokenized_dataset['train']))
        # range(n) starts at index 0: range(1, n) would silently drop the first
        # example and leave every subset (including the "1.0" one) one short.
        small_dataset = tokenized_dataset["train"].select(range(n))
        trainer = Trainer(
            # Train a fresh copy each time so every proportion starts from the
            # same weights; sharing one model object would let each run build
            # on the updates made by the smaller subsets before it.
            model=copy.deepcopy(trained_model),
            args=training_args,
            train_dataset=small_dataset,
            eval_dataset=tokenized_dataset["validation"],
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics,
        )

        print("training starts")

        trainer.train()
        print("Test set performance")
        results = trainer.evaluate(tokenized_dataset["test"])
        print(results)
        results_by_prop[temp] = results

    return results_by_prop

    #n = int(0.8*len(tokenized_dataset['train']))
    #small_dataset = tokenized_dataset["train"].select(range(1,n))

In [7]:
# Step 0: settings that change from one experiment to the next.
dataset_name = "conll2003"  # Step-1 dataset (further datasets are merged in below)
model_name = "distilbert-base-uncased"  # pre-trained language model
save_path = "tmp/conll03enmodel"  # where the Step-1 model is saved

# Load the Step-1 corpora and drop the extra tag columns; only id, tokens and
# ner_tags are needed.
mydata1 = load_dataset(dataset_name).remove_columns(["pos_tags", "chunk_tags"])
mydata2 = load_dataset("conll2002", "nl").remove_columns(["pos_tags"])

# Concatenate the two corpora split by split.
mydata_train, mydata_valid, mydata_test = (
    concatenate_datasets([mydata1[split], mydata2[split]])
    for split in ("train", "validation", "test")
)

mydata = DatasetDict(
    {
        "train": mydata_train,
        "validation": mydata_valid,
        "test": mydata_test,
    }
)

# Dataset used in Step 2.
cldataset = load_dataset("conll2002", "es")

Found cached dataset conll2003 (/Users/Vajjalas/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/9a4d16a94f8674ba3466315300359b0acd891b68b6c8743ddf60b9c702adce98)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset conll2002 (/Users/Vajjalas/.cache/huggingface/datasets/conll2002/nl/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset conll2002 (/Users/Vajjalas/.cache/huggingface/datasets/conll2002/es/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5)


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
# Step 1: fine-tune the first-level NER model (which can be mono- or multilingual).
label_list = mydata["train"].features["ner_tags"].feature.names
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize both the Step-1 data and the Step-2 data with the same function.
tokenized_dataset = mydata.map(tokenize_and_align_labels, batched=True)
cl_tokenized_dataset = cldataset.map(tokenize_and_align_labels, batched=True)

# Collator used for dynamic padding.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

train_model = train_one_model(
    tokenized_dataset,
    cl_tokenized_dataset["test"],
    model,
    tokenizer,
    save_path,
)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream

  0%|          | 0/30 [00:00<?, ?ba/s]

  0%|          | 0/7 [00:00<?, ?ba/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

Loading cached processed dataset at /Users/Vajjalas/.cache/huggingface/datasets/conll2002/es/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5/cache-359eb48310ad7d56.arrow
Loading cached processed dataset at /Users/Vajjalas/.cache/huggingface/datasets/conll2002/es/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5/cache-7f7012fd28dd0e95.arrow
Loading cached processed dataset at /Users/Vajjalas/.cache/huggingface/datasets/conll2002/es/1.0.0/a3a8a8612caf57271f5b35c5ae1dd25f99ddb9efb9c1667abaa70ede33e863e5/cache-af3d67afac984d69.arrow
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


training starts


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1024,0.098128,0.86129,0.84237,0.851725,0.975025


Same language test set performance


{'eval_loss': 0.12056293338537216, 'eval_precision': 0.817501626545218, 'eval_recall': 0.7863773860436007, 'eval_f1': 0.8016375139560848, 'eval_accuracy': 0.9706140008500083, 'eval_runtime': 101.1634, 'eval_samples_per_second': 85.495, 'eval_steps_per_second': 5.348, 'epoch': 1.0}
Zero-shot performance on target language
{'eval_loss': 0.7953947186470032, 'eval_precision': 0.1622243127417163, 'eval_recall': 0.43607754987356, 'eval_f1': 0.2364772207831784, 'eval_accuracy': 0.7255544990588555, 'eval_runtime': 29.5763, 'eval_samples_per_second': 51.325, 'eval_steps_per_second': 3.212, 'epoch': 1.0}


In [None]:
# Step 2: cross-lingual transfer experiment. Reload the model saved in Step 1
# and train it on the new dataset.
trained_model = AutoModelForTokenClassification.from_pretrained(save_path)
train_many_models(
    tokenized_dataset=cl_tokenized_dataset,
    trained_model=trained_model,
    tokenizer=tokenizer,
)

For 0.2 training data
training starts




Epoch,Training Loss,Validation Loss


In [None]:
#for custom datasets: https://huggingface.co/transformers/v3.2.0/custom_datasets.html - search for .conll here. 