In [1]:

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import datasets
import torch
from datasets import Dataset, load_dataset, Sequence, ClassLabel, Features, Value
import evaluate
from preprocessing import preprocessing
from transformers import DataCollatorForTokenClassification
import numpy as np
import random
%load_ext autoreload
%autoreload 2


  from .autonotebook import tqdm as notebook_tqdm
  metric = load_metric("seqeval")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [2]:
df = preprocessing('en_ewt-up-train.conllu')
label_list = list(df['label'].unique())

In [3]:
sent_df = df.groupby(['sentence_id']).agg(lambda x: x.tolist()).reset_index()

In [4]:
features = Features({
    'token_id': Sequence(feature=Value('float32')),
    'sentence_num': Sequence(feature=Value('int32')),
    'token': Sequence(feature=Value('string')),
    'lemma': Sequence(feature=Value('string')),
    'upos': Sequence(feature=Value('string')),
    'POS': Sequence(feature=Value('string')),
    'feats': Sequence(feature=Value('string')),
    'head': Sequence(feature=Value('string')),
    'deprel': Sequence(feature=Value('string')),
    'deps': Sequence(feature=Value('string')),
    'misc': Sequence(feature=Value('string')),
    'predicate': Sequence(feature=Value('string')),
    'predicate_token': Sequence(feature=Value('string')),
    'predicate_token_id': Sequence(feature=Value('int32')),
    'sentence_id': Value('int32'),
    'label': Sequence(feature=ClassLabel(names=label_list)),

})

ds = Dataset.from_pandas(sent_df[list(features.keys())], features=features)


In [5]:
PER_DS = 0.5

40482

In [6]:
ds = ds.select(random.sample(range(len(ds)), int(len(ds)*PER_DS)))
len(ds)

4048

In [7]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
SEP_TOKEN_ID = tokenizer.all_special_ids[tokenizer.all_special_tokens.index('[SEP]')]

In [8]:
from utils import tokenize_and_align_labels
tokenized_datasets = ds.map(lambda x: tokenize_and_align_labels(tokenizer, x))

Map: 100%|██████████| 4048/4048 [00:02<00:00, 1500.69 examples/s]


In [9]:
tokenized_datasets['labels'][0]

[-100, 0, 0, 0, -100, 3, 2, 0, 8, 0, 0, 0, 0, 0, 0, -100]

In [10]:
from transformers import DataCollatorForTokenClassification

task = 'SRL'
batch_size = 32
model_name = 'bert-base-uncased'

args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=False,
)

model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list))

data_collator = DataCollatorForTokenClassification(tokenizer)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import warnings
metric = evaluate.load("seqeval")


labels = [label_list[j] for l in ds["label"] for j in l]
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    m = metric.compute(predictions=[labels], references=[labels])

In [12]:
from utils import compute_metrics
td = tokenized_datasets.remove_columns(ds.column_names)
trainer = Trainer(
    model,
    args,
    train_dataset=td,
    eval_dataset=td,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [13]:
trainer.train()

  0%|          | 1/381 [01:09<7:22:14, 69.83s/it]