In [None]:
# %%capture
# !git clone https://github.com/nikitakapitan/nlphub.git
# !mkdir logs
# !pip install datasets transformers evaluate accelerate

# %cd nlphub
# !pip install .  # NOTE(review): the install target was missing; "." (the cloned repo) is presumed - confirm

In [None]:
import json
import os

import torch
import yaml
from datasets import load_dataset
from huggingface_hub import login
from transformers import pipeline
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer

from nlphub import FineTuner
# Authenticate with the Hugging Face Hub (training below is configured with
# push_to_hub=True). The token is read from the HF_TOKEN environment variable
# rather than being hardcoded, so it never ends up in the saved notebook.
# When the variable is unset, `token` is None and `login` falls back to its
# interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
# Load the fine-tuning configuration. The encoding is set explicitly so the
# file is decoded the same way regardless of the platform's locale default.
with open('train.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)

# Fail fast on an empty or incomplete config: these are the keys the cells
# below read, and a missing one would otherwise only surface after the model
# and dataset have already been prepared.
REQUIRED_CONFIG_KEYS = (
    'BASE_MODEL_NAME',
    'DATASET_NAME',
    'DATASET_CONFIG_NAME',
    'NUM_EPOCHS',
    'LEARNING_RATE',
    'BATCH_SIZE',
    'LOG_LEVEL',
)
if not isinstance(config, dict):
    raise ValueError(
        f"train.yaml must contain a top-level mapping, got {type(config).__name__}"
    )
missing_keys = [key for key in REQUIRED_CONFIG_KEYS if key not in config]
if missing_keys:
    raise KeyError(f"train.yaml is missing required keys: {missing_keys}")

# Display the loaded configuration as the cell output.
config

In [None]:
# Build the project's FineTuner from the loaded config. Later cells read its
# `tokenizer`, `dataset`, `model` and `compute_metrics_func` attributes.
finetuner = FineTuner(config)

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch.

    Every example is padded to the tokenizer's maximum length and truncated
    when longer, as requested by ``padding='max_length'`` and
    ``truncation=True``.
    """
    return finetuner.tokenizer(batch['text'], padding='max_length', truncation=True)


# Apply the tokenizer to the dataset in batches.
dataset_encoded = finetuner.dataset.map(tokenize, batched=True)

In [None]:
# Output directory name: <base model>-finetuned-<dataset>_<dataset config>.
output_dir = f'{config["BASE_MODEL_NAME"]}-finetuned-{config["DATASET_NAME"]}_{config["DATASET_CONFIG_NAME"]}'
# Training and evaluation share the same per-device batch size.
batch_size = config['BATCH_SIZE']

training_args = TrainingArguments(
    output_dir=output_dir,
    # Schedule and optimisation settings taken from the config file.
    num_train_epochs=config['NUM_EPOCHS'],
    learning_rate=config['LEARNING_RATE'],
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Fixed hyperparameters (not exposed through the config).
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate once per epoch.
    evaluation_strategy='epoch',
    # Logging / progress reporting.
    disable_tqdm=False,
    logging_dir='./logs',
    log_level=config['LOG_LEVEL'],
    # Upload results to the Hugging Face Hub.
    push_to_hub=True,
)

In [None]:
# Assemble the Trainer from the FineTuner's model and metric function, the
# training arguments, and the tokenized train/validation splits.
trainer = Trainer(
    model=finetuner.model,
    args=training_args,
    train_dataset=dataset_encoded['train'],
    eval_dataset=dataset_encoded['validation'],
    compute_metrics=finetuner.compute_metrics_func,
    # Hand the tokenizer to the Trainer so it is saved alongside the model
    # and uploaded by push_to_hub; without it the Hub repository would hold
    # the model weights but no tokenizer files.
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class` - confirm against the installed version.
    tokenizer=finetuner.tokenizer,
)

In [None]:
# Run an evaluation pass before training starts (this cell precedes
# `trainer.train()`), presumably to record baseline metrics on the
# validation split passed as `eval_dataset`.
trainer.evaluate()

In [None]:
# Fine-tune the model; `training_args` requests an evaluation at the end of
# every epoch.
trainer.train()

In [None]:
# Upload the training result to the Hugging Face Hub (relies on the `login`
# call made at the top of the notebook).
trainer.push_to_hub()