In [46]:
# !git clone https://github.com/panyutsriwirote/NLPSysProject.git
# %cd NLPSysProject

# Uncomment this if you want to re-split the dataset
# !python preprocess.py

In [47]:
!pip install datasets transformers==4.9.2 seqeval -q
!pip install sentencepiece -q

In [48]:
LIMIT = 10
encoder = "airesearch/wangchanberta-base-att-spm-uncased"
batch_size = 16
num_epoch = 50
limit_n_tokens = 200
save="no"
#save=("no","epoch", or "step")

In [85]:
import pandas as pd
import numpy as np
import json

from transformers import (
    AutoModel, AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments, Trainer,
    DataCollatorForTokenClassification
)

from datasets import (
    load_dataset, load_metric,
    Dataset,
    DatasetDict,
    Features, Sequence, ClassLabel, Value
)
from sklearn.metrics import classification_report
from ast import literal_eval

In [51]:
tokenizer = AutoTokenizer.from_pretrained(encoder)

loading configuration file https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/616a9e2dfc52e9d019b75d219ed800a27158ed299bd4fad91363110fe93dfce1.27c4f6581fbedf3d12e9fae96d4fbb8bc3064cd88ae545414e7cffc7c5bbc52f
Model config CamembertConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading weights file htt

In [52]:
train_df = pd.read_csv("our_model/train.tsv", sep='\t')
dev_df = pd.read_csv("our_model/dev.tsv", sep='\t')
test_df = pd.read_csv("our_model/test.tsv", sep='\t')

In [53]:
train_df["tokens"] = train_df["tokens"].apply(literal_eval)
dev_df["tokens"] = dev_df["tokens"].apply(literal_eval)
test_df["tokens"] = test_df["tokens"].apply(literal_eval)

labelize = lambda x: [h if h == "OUT_OF_RANGE" or -LIMIT <= int(h) <= LIMIT else "OUT_OF_RANGE" for h in literal_eval(x)]

train_df["rel_heads"] = train_df["rel_heads"].apply(labelize)
dev_df["rel_heads"] = dev_df["rel_heads"].apply(labelize)
test_df["rel_heads"] = test_df["rel_heads"].apply(labelize)

In [54]:
train = train_df[["tokens","rel_heads"]].to_dict(orient="series")
dev = dev_df[["tokens","rel_heads"]].to_dict(orient="series")
test = test_df[["tokens","rel_heads"]].to_dict(orient="series")

In [55]:
rel_heads = [str(l) for l in range(-LIMIT, LIMIT+1)] + ["OUT_OF_RANGE"]
features = Features({
    "tokens": Sequence(Value('string')),
    "rel_heads": Sequence(feature=ClassLabel(names=rel_heads))
})

In [56]:
data = DatasetDict({
    'train': Dataset.from_dict(train,features=features), 
    'dev': Dataset.from_dict(dev,features=features),
    'test': Dataset.from_dict(test,features=features)
})

In [57]:
label_all_tokens = True
#True: all tokens have same label as original word
#False: only the first token have same label as original word

In [58]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    tokens = [tokenizer.convert_ids_to_tokens(x) for x in tokenized_inputs["input_ids"] ]
    ids=[]
    labels = []
    for i, label in enumerate(examples["rel_heads"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # Set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)
        ids.append(word_ids)

    tokenized_inputs["labels"] = labels
    tokenized_inputs["word_ids"] = ids
    tokenized_inputs["tokens_bert"] = tokens
    return tokenized_inputs


In [59]:
tokenized_datasets = data.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [60]:
label_list = data['train'].features["rel_heads"].feature.names

In [61]:
model = AutoModelForTokenClassification.from_pretrained(encoder, num_labels=len(label_list))
model_name = encoder.split("/")[-1]

args = TrainingArguments(
    "dep_parsing_as_token_class",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epoch,
    save_strategy=save,
    # save_total_limit = 2,
    # load_best_model_at_end=True,
    weight_decay=0.01,
    push_to_hub=False,
)
data_collator = DataCollatorForTokenClassification(tokenizer)
metric = load_metric("seqeval")

loading configuration file https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/616a9e2dfc52e9d019b75d219ed800a27158ed299bd4fad91363110fe93dfce1.27c4f6581fbedf3d12e9fae96d4fbb8bc3064cd88ae545414e7cffc7c5bbc52f
Model config CamembertConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19":

In [62]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [63]:
HEAD_CLASSIFIER = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['dev'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [64]:
HEAD_CLASSIFIER.train()

The following columns in the training set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: rel_heads, tokens, word_ids, tokens_bert.
***** Running training *****
  Num examples = 735
  Num Epochs = 50
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 2300


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,1.882928,0.19555,0.189917,0.192692,0.469753
2,No log,1.426266,0.389183,0.417362,0.40278,0.584937
3,No log,1.178168,0.473236,0.506065,0.489101,0.638863
4,No log,1.038746,0.536551,0.573161,0.554252,0.672472
5,No log,0.948393,0.581042,0.608795,0.594595,0.697525
6,No log,0.897234,0.600144,0.632676,0.615981,0.709441
7,No log,0.841248,0.620267,0.652009,0.635742,0.727314
8,No log,0.807757,0.637026,0.662623,0.649573,0.735105
9,No log,0.791494,0.654366,0.678923,0.666419,0.745952
10,No log,0.776321,0.663623,0.688779,0.675967,0.753284


The following columns in the evaluation set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: rel_heads, tokens, word_ids, tokens_bert.
***** Running Evaluation *****
  Num examples = 157
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
The following columns in the evaluation set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: rel_heads, tokens, word_ids, tokens_bert.
***** Running Evaluation *****
  Num examples = 157
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: rel_heads, tokens, word_ids, tokens_bert.
***** Running Evaluation *****
  Num examples = 157
  Batch size = 16
The following columns in the evaluation set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: rel_heads, to

TrainOutput(global_step=2300, training_loss=0.3766581328018852, metrics={'train_runtime': 633.8146, 'train_samples_per_second': 57.982, 'train_steps_per_second': 3.629, 'total_flos': 1474600875111168.0, 'train_loss': 0.3766581328018852, 'epoch': 50.0})

In [82]:
def evaluate(head_classifier, label_classifier, test_set):
    predictions, labels, _ = head_classifier.predict(test_set)
    predictions = np.argmax(predictions, axis=2)

    bert_prediction = [
    [(id, token, prediction) for (id, token, prediction) in zip(row_id, row_token,row_predictions) ]
        for row_id, row_token, row_predictions in zip(tokenized_datasets['test']['word_ids'],tokenized_datasets['test']['tokens_bert'],predictions)
    ]

    gold_label = [
        [label for label in row_label ] for row_label in tokenized_datasets['test']['rel_heads']
    ]

    wrap_predict = [] 
    for row in bert_prediction:
        predict_row = []
        previous_word=None
        for item in row:
            if(item[0] is not None):
                #skip special token
                if(item[0] != previous_word):
                    predict_row.append(item[-1])
                    previous_word = item[0]
        wrap_predict.append(predict_row)
    
    def flatten(t):
        return [item for sublist in t for item in sublist]
    flat_true_predictions = flatten(wrap_predict)
    flat_true_labels = flatten(gold_label)

    # result = classification_report(flat_true_labels,flat_true_predictions,output_dict=True,labels=label_list)
    # result_df = pd.DataFrame(result).transpose()
    # display(result_df)

    correct, total = 0, 0
    for pred, gold in zip(flat_true_predictions, flat_true_labels):
        if pred != "OUT_OF_RANGE" and pred == gold:
            correct += 1
        total += 1
    UAS = (correct / total) * 100

    predictions, labels, _ = label_classifier.predict(test_set)
    predictions = np.argmax(predictions, axis=2)

    bert_prediction = [
    [(id, token, prediction) for (id, token, prediction) in zip(row_id, row_token,row_predictions) ]
        for row_id, row_token, row_predictions in zip(tokenized_datasets['test']['word_ids'],tokenized_datasets['test']['tokens_bert'],predictions)
    ]

    gold_label = [
        [label for label in row_label ] for row_label in tokenized_datasets['test']['rel_heads']
    ]

    wrap_predict = [] 
    for row in bert_prediction:
        predict_row = []
        previous_word=None
        for item in row:
            if(item[0] is not None):
                #skip special token
                if(item[0] != previous_word):
                    predict_row.append(item[-1])
                    previous_word = item[0]
        wrap_predict.append(predict_row)
    flat_true_predictions = flatten(wrap_predict)
    flat_true_labels = flatten(gold_label)

    # result = classification_report(flat_true_labels,flat_true_predictions,output_dict=True,labels=label_list)
    # result_df = pd.DataFrame(result).transpose()
    # display(result_df)

    correct, total = 0, 0
    for pred, gold in zip(flat_true_predictions, flat_true_labels):
        if pred == gold:
            correct += 1
        total += 1
    LAS = correct / total

    return UAS, LAS

In [86]:
evaluate(HEAD_CLASSIFIER, LABEL_CLASSIFIER, tokenized_datasets["test"])

The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: rel_heads, tokens, word_ids, tokens_bert.
***** Running Prediction *****
  Num examples = 158
  Batch size = 16




(78.69051227644741, 0)