In [1]:
import transformers
import datasets

In [2]:
import pickle
from itertools import chain
from pathlib import Path

import numpy as np
from datasets import Dataset

In [3]:
# Name of the annotation backup folder; it is also embedded in the pickle file names.
TARGET_DIR = "20210412"
result_dir = Path(f"../../data/annot_data/annotated_data_bkup/{TARGET_DIR}")

seq_pairs_path = result_dir / f"seq_pairs_{TARGET_DIR}.pkl"
noise_pairs_path = result_dir / f"noise_pairs_{TARGET_DIR}.pkl"

# NOTE(review): pickle.load executes arbitrary code from the file; only load
# files that were produced by this project.
with seq_pairs_path.open("rb") as fin:
    seq_pairs_list = pickle.load(fin)
with noise_pairs_path.open("rb") as fin:
    noise_pairs_list = pickle.load(fin)

In [4]:
# Fraction of each list that goes into the training set; the remainder is
# held out for evaluation. Both cut indices are derived from this one value.
sr = 0.9
n_seq = len(seq_pairs_list)
n_noise = len(noise_pairs_list)
si = int(n_seq * sr)    # first held-out index in seq_pairs_list
ni = int(n_noise * sr)  # first held-out index in noise_pairs_list

# The split is positional (no shuffling): the leading part of each list is
# used for training and the trailing part for testing.
# Only element 0 of every entry is used as the input text
# (presumably the entries are pairs whose first item is the text — TODO confirm).
# Entries from seq_pairs_list are labelled 1, entries from noise_pairs_list 0.
train_data = {
    "text": [x[0] for x in chain(seq_pairs_list[:si], noise_pairs_list[:ni])],
    "label": [1] * si + [0] * ni,
}

test_data = {
    "text": [x[0] for x in chain(seq_pairs_list[si:], noise_pairs_list[ni:])],
    "label": [1] * (n_seq - si) + [0] * (n_noise - ni),
}


In [5]:
# Wrap both column dictionaries as `datasets.Dataset` objects.
train_ds, test_ds = map(Dataset.from_dict, (train_data, test_data))

In [6]:
from transformers import (
    AutoConfig, AutoTokenizer, 
    AutoModelForSequenceClassification,
    default_data_collator,
    EvalPrediction,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

In [9]:
# Checkpoint to fine-tune, number of target classes, and where outputs and
# downloaded files are stored.
model_name = "distilbert-base-multilingual-cased"
num_label = 2
out_dir = "../../data/model/absa-seq"
cache_dir = "../../data/model/absa-seq/cache"
model_version = "main"

# Arguments shared by every `from_pretrained` call below, so config, tokenizer
# and model are all resolved from the same cache directory and revision.
hub_kwargs = {"cache_dir": cache_dir, "revision": model_version}

config = AutoConfig.from_pretrained(
    model_name,
    num_labels=num_label,
    finetuning_task="absa-seq",
    use_auth_token=None,
    **hub_kwargs,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=True,
    use_auth_token=None,
    **hub_kwargs,
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    from_tf=False,
    config=config,
    **hub_kwargs,
)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'pre_cla

In [10]:
def preprocess_function(examples):
    """Tokenize a batch of examples and carry the labels through.

    Args:
        examples: Mapping with a "text" entry (passed to the module-level
            `tokenizer`) and a "label" entry.

    Returns:
        The tokenizer output with an added "label" entry copied from
        `examples`. Inputs are truncated and padded to a fixed length of
        500 tokens.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=500,
        padding="max_length",
    )
    encoded["label"] = examples["label"]
    return encoded

In [11]:
# Apply the tokenization step to both splits in batched mode.
train_ds, test_ds = [
    split.map(preprocess_function, batched=True)
    for split in (train_ds, test_ds)
]

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [12]:
def compute_metrics(p: "EvalPrediction"):
    """Compute classification accuracy from model predictions.

    The predicted class of each example is the argmax over axis 1 of the
    prediction scores. This notebook trains a classifier only, so there is
    no regression branch.

    Args:
        p: Object exposing `predictions` (an array of per-class scores, or a
            tuple whose first element is that array) and `label_ids`
            (the reference class ids).

    Returns:
        A dict with a single "accuracy" entry: the fraction of examples
        whose predicted class equals the reference label, as a Python float.
    """
    # Some models return extra outputs alongside the scores; keep the first item.
    scores = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(scores, axis=1)
    return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

In [15]:
# Only the output directory and the CPU-only flag are set explicitly; every
# other training argument keeps its library default.
training_args = TrainingArguments(no_cuda=True, output_dir=out_dir)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [16]:
# No checkpoint to resume from: training starts from the freshly loaded model.
checkpoint = None

train_result = trainer.train(resume_from_checkpoint=checkpoint)
metrics = train_result.metrics
# The whole training split is used (no sample cap is configured in this
# notebook), so the sample count is simply the size of train_ds.
metrics["train_samples"] = len(train_ds)

trainer.save_model()  # Saves the tokenizer too for easy upload

trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

Step,Training Loss


KeyboardInterrupt: 