In [None]:
# Uncomment this if you want to re-split the dataset
# !python preprocess.py

In [47]:
!pip install datasets transformers==4.9.2 seqeval -q
!pip install sentencepiece -q

In [48]:
LIMIT = 10
encoder = "airesearch/wangchanberta-base-att-spm-uncased"
batch_size = 16
num_epoch = 50
save = "no"

In [1]:
import pandas as pd
import numpy as np

from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer,
    DataCollatorForTokenClassification
)

from datasets import (
    load_metric,
    Dataset, DatasetDict,
    Features, Sequence, ClassLabel, Value
)
from sklearn.metrics import classification_report
from ast import literal_eval
from train_wangchan import train_classifier

In [51]:
tokenizer = AutoTokenizer.from_pretrained(encoder)

loading configuration file https://huggingface.co/airesearch/wangchanberta-base-att-spm-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/616a9e2dfc52e9d019b75d219ed800a27158ed299bd4fad91363110fe93dfce1.27c4f6581fbedf3d12e9fae96d4fbb8bc3064cd88ae545414e7cffc7c5bbc52f
Model config CamembertConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.9.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading weights file htt

In [52]:
train_df = pd.read_csv("our_model/train.tsv", sep='\t')
dev_df = pd.read_csv("our_model/dev.tsv", sep='\t')
test_df = pd.read_csv("our_model/test.tsv", sep='\t')

In [53]:
train_df["tokens"] = train_df["tokens"].apply(literal_eval)
dev_df["tokens"] = dev_df["tokens"].apply(literal_eval)
test_df["tokens"] = test_df["tokens"].apply(literal_eval)

labelize = lambda x: [h if h == "OUT_OF_RANGE" or -LIMIT <= int(h) <= LIMIT else "OUT_OF_RANGE" for h in literal_eval(x)]

train_df["rel_heads"] = train_df["rel_heads"].apply(labelize)
dev_df["rel_heads"] = dev_df["rel_heads"].apply(labelize)
test_df["rel_heads"] = test_df["rel_heads"].apply(labelize)

train_df["dep_types"] = train_df["dep_types"].apply(literal_eval)
dev_df["dep_types"] = dev_df["dep_types"].apply(literal_eval)
test_df["dep_types"] = test_df["dep_types"].apply(literal_eval)

In [54]:
train_classifier(
    train=train_df,
    dev=dev_df,
    test=test_df,
    label_name="rel_heads",
    label_list=[str(i) for i in range(-LIMIT, LIMIT+1)] + ["OUT_OF_RANGE"],
    limit=LIMIT,
    tokenizer=tokenizer,
    batch_size=batch_size,
    num_epoch=num_epoch,
    sace_strategy=save
)

In [82]:
def evaluate(head_classifier, label_classifier, test_set):
    predictions, labels, _ = head_classifier.predict(test_set)
    predictions = np.argmax(predictions, axis=2)

    bert_prediction = [
    [(id, token, prediction) for (id, token, prediction) in zip(row_id, row_token,row_predictions) ]
        for row_id, row_token, row_predictions in zip(tokenized_datasets['test']['word_ids'],tokenized_datasets['test']['tokens_bert'],predictions)
    ]

    gold_label = [
        [label for label in row_label ] for row_label in tokenized_datasets['test']['rel_heads']
    ]

    wrap_predict = [] 
    for row in bert_prediction:
        predict_row = []
        previous_word=None
        for item in row:
            if(item[0] is not None):
                #skip special token
                if(item[0] != previous_word):
                    predict_row.append(item[-1])
                    previous_word = item[0]
        wrap_predict.append(predict_row)
    
    def flatten(t):
        return [item for sublist in t for item in sublist]
    flat_true_predictions = flatten(wrap_predict)
    flat_true_labels = flatten(gold_label)

    # result = classification_report(flat_true_labels,flat_true_predictions,output_dict=True,labels=label_list)
    # result_df = pd.DataFrame(result).transpose()
    # display(result_df)

    correct, total = 0, 0
    for pred, gold in zip(flat_true_predictions, flat_true_labels):
        if pred != "OUT_OF_RANGE" and pred == gold:
            correct += 1
        total += 1
    UAS = (correct / total) * 100

    predictions, labels, _ = label_classifier.predict(test_set)
    predictions = np.argmax(predictions, axis=2)

    bert_prediction = [
    [(id, token, prediction) for (id, token, prediction) in zip(row_id, row_token,row_predictions) ]
        for row_id, row_token, row_predictions in zip(tokenized_datasets['test']['word_ids'],tokenized_datasets['test']['tokens_bert'],predictions)
    ]

    gold_label = [
        [label for label in row_label ] for row_label in tokenized_datasets['test']['rel_heads']
    ]

    wrap_predict = [] 
    for row in bert_prediction:
        predict_row = []
        previous_word=None
        for item in row:
            if(item[0] is not None):
                #skip special token
                if(item[0] != previous_word):
                    predict_row.append(item[-1])
                    previous_word = item[0]
        wrap_predict.append(predict_row)
    flat_true_predictions = flatten(wrap_predict)
    flat_true_labels = flatten(gold_label)

    # result = classification_report(flat_true_labels,flat_true_predictions,output_dict=True,labels=label_list)
    # result_df = pd.DataFrame(result).transpose()
    # display(result_df)

    correct, total = 0, 0
    for pred, gold in zip(flat_true_predictions, flat_true_labels):
        if pred == gold:
            correct += 1
        total += 1
    LAS = correct / total

    return UAS, LAS

In [86]:
evaluate(HEAD_CLASSIFIER, LABEL_CLASSIFIER, tokenized_datasets["test"])

The following columns in the test set  don't have a corresponding argument in `CamembertForTokenClassification.forward` and have been ignored: rel_heads, tokens, word_ids, tokens_bert.
***** Running Prediction *****
  Num examples = 158
  Batch size = 16




(78.69051227644741, 0)