In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, default_data_collator, TrainingArguments, Trainer
from torch.utils.data import DataLoader
from datasets import load_dataset
import pandas as pd
import numpy as np
import evaluate
import logging
import torch
import ast
import sys

In [None]:
def init_logging():
    """Configure root logging to stdout and return this module's logger.

    The root logger is given a single stream handler writing to ``sys.stdout``
    with a timestamped format; the logger named after ``__name__`` is then set
    to the INFO level.

    Returns:
        logging.Logger: the logger named after ``__name__``.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        handlers=[stdout_handler],
        datefmt="%m/%d/%Y %H:%M:%S",
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    )

    notebook_logger = logging.getLogger(__name__)
    notebook_logger.setLevel(logging.INFO)
    return notebook_logger

In [None]:
# Start logging, then pick the GPU when one is visible to torch and fall back
# to the CPU otherwise.
logger = init_logging()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
logger.info(f'Device = {device}')

In [None]:
# Checkpoint to fine-tune and the tab-separated file holding the passages.
model_name = 'microsoft/deberta-v3-xsmall'
data_path = '../data/en.tsv'

# Two output classes, matching the 0/1 labels produced during preprocessing.
# NOTE(review): output_hidden_states=True makes the model return hidden states
# alongside the logits, which is presumably why compute_metrics accepts tuple
# predictions. Confirm the hidden states are still needed, since keeping them
# around during evaluation costs memory.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    output_hidden_states=True,
)

# `return_tensors` is an argument of the tokenizer *call*, not of
# `from_pretrained`, so it had no effect here and has been dropped.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

In [None]:
# Fixed seed so the train/test partition is identical on every
# Restart-and-Run-All; without it the split changes from run to run.
SPLIT_SEED = 42

# Everything read from the single TSV file is exposed under the 'train' key,
# which is then divided 80/20 into 'train' and 'test'.
raw_dataset = load_dataset('csv', data_files=data_path, sep='\t', converters={'sentences': ast.literal_eval})
dataset = raw_dataset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
logger.info("Dataset loaded!")

In [None]:
# Display the training portion produced by the train/test split.
dataset['train']

In [None]:
def preprocess_dataset(dataset, tokenizer, max_length=256,
                       sentence_columns=('sentence_0', 'sentence_1', 'sentence_2', 'sentence_3')):
    """Turn raw passages into fixed-length tokenized examples with binary labels.

    Args:
        dataset: Object exposing ``map`` and ``column_names`` (for instance one
            split of the loaded dataset). Every row must provide the columns
            named in ``sentence_columns`` as strings, plus a ``perturbation``
            column.
        tokenizer: Callable accepting a list of strings together with the
            ``padding``, ``max_length`` and ``truncation`` keyword arguments.
        max_length: Length every example is padded or truncated to.
            Defaults to 256, the value previously hard-coded.
        sentence_columns: Names of the columns whose text is joined, in order,
            with single spaces to form the passage. Defaults to the four
            ``sentence_N`` columns previously hard-coded.

    Returns:
        The result of the second ``map`` call: tokenizer outputs plus a
        ``label`` column, with all pre-existing columns requested for removal.
        ``label`` is 1 when ``perturbation`` is ``None`` and 0 otherwise.
    """
    # Materialise once so any iterable (list, generator, ...) can be reused
    # for every row.
    sentence_columns = tuple(sentence_columns)

    def concatenate_sentences(example):
        # Overwrites/creates the 'sentences' column with the joined passage.
        example['sentences'] = ' '.join(example[column] for column in sentence_columns)
        return example

    dataset = dataset.map(concatenate_sentences,
                          desc='Concatenatings passage sentences.')

    def preprocessing_function(examples):
        # Pad/truncate to a constant length so batches can be stacked as-is.
        result = tokenizer(examples['sentences'], padding='max_length',
                           max_length=max_length, truncation=True)
        # Rows without a perturbation are the positive class.
        result['label'] = [1 if perturb_type is None else 0
                           for perturb_type in examples['perturbation']]
        return result

    tokenized_dataset = dataset.map(
        preprocessing_function,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Running tokenizer on dataset",
    )

    return tokenized_dataset

In [None]:
# Tokenize both partitions with the same pipeline: 'train' feeds training,
# 'test' is used for evaluation.
train_dataset, eval_dataset = (
    preprocess_dataset(dataset[split_name], tokenizer)
    for split_name in ('train', 'test')
)

In [None]:
# Mark the end of preprocessing in the log.
logger.info("Dataset is ready!")

In [None]:
# Directory handed to TrainingArguments as its output_dir.
out_dir = 'prova'

In [None]:
# Fine-tuning hyperparameters. Anything not listed here (learning rate,
# warmup, weight decay, ...) is left at the TrainingArguments default.
# NOTE(review): no evaluation schedule is configured, so metrics are
# presumably only computed when evaluation is requested explicitly - confirm.
training_args = TrainingArguments(
    save_strategy="no",
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    num_train_epochs=5,
    output_dir=out_dir,
)

In [None]:
# Accuracy metric object used by compute_metrics; loaded without a custom cache_dir.
metric = evaluate.load('accuracy')

def compute_metrics(p):
    """Score a batch of model predictions with the module-level ``metric``.

    Args:
        p: Object with ``predictions`` and ``label_ids`` attributes.
            ``predictions`` is either an array of model outputs or a tuple
            whose first element is that array (extra outputs are ignored).

    Returns:
        dict: whatever ``metric.compute`` returns; when it reports more than
        one value, their mean is added under ``"combined_score"``.
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.asarray(preds)

    # The original code branched on a global `is_regression` that is never
    # defined, which raised NameError on the first call. The mode is inferred
    # from the output shape instead: a single output column (or an already
    # flat array) is one value per example, anything wider is per-class scores.
    if preds.ndim >= 2:
        if preds.shape[-1] == 1:
            # Squeeze only the last axis so a batch of one stays 1-D.
            preds = np.squeeze(preds, axis=-1)
        else:
            preds = np.argmax(preds, axis=-1)

    result = metric.compute(predictions=preds, references=p.label_ids)
    if len(result) > 1:
        result["combined_score"] = np.mean(list(result.values())).item()
    return result

In [None]:
# from torch.nn import CrossEntropyLoss

# def compute_metrics(p):
#     preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
#     preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    
#     # Compute the loss
#     loss_fct = CrossEntropyLoss()
#     loss = loss_fct(preds.view(-1, preds.shape[-1]), p.label_ids.view(-1))
    
#     result = metric.compute(predictions=preds, references=p.label_ids)
#     result["loss"] = loss.item()
    
#     if len(result) > 1:
#         result["combined_score"] = np.mean(list(result.values())).item()
#     return result

In [None]:
# Examples are already padded to a fixed max_length during tokenization, so the
# library's default collator is used rather than a dynamic-padding one.
data_collator = default_data_collator

In [None]:
# Inspect the tokenized evaluation split before handing it to the Trainer.
eval_dataset

In [None]:
# Bundle the model, tokenizer, both tokenized splits, the collator, the
# training configuration and the metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [None]:
# Launch fine-tuning with the Trainer configured in the preceding cell.
trainer.train()