In [1]:
%pip install -q datasets evaluate --upgrade
%pip install -q transformers==4.26.0

!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"
import pandas as pd
from datasets import DatasetDict, Dataset

def transform_dataset(original_dataset):
    en_sentences = original_dataset['en']
    ro_sentences = original_dataset['ro']

    new_sentences = en_sentences + ro_sentences
    new_labels = [0] * len(en_sentences) + [1] * len(ro_sentences)

    new_data_dict = {
        'text': new_sentences,
        'label': new_labels
    }
    new_df = pd.DataFrame(new_data_dict)
    new_dataset = Dataset.from_pandas(new_df)
    return new_dataset

from datasets import load_dataset, concatenate_datasets, DatasetDict
dataset_ag_mt = transform_dataset(load_dataset("mateiaassAI/AG-NEWS-MT")["train"])
dataset_ag_mt = dataset_ag_mt.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_ag_mt['train'].select(range(1000))
temp_dataset = dataset_ag_mt['test'].select(range(500))

validation_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset = validation_test_split['train']
test_dataset = validation_test_split['test']

dataset_ag_mt = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
})


print(dataset_ag_mt)
print(dataset_ag_mt['train'][0])

dataset_moroco = load_dataset("universityofbucharest/moroco", trust_remote_code=True)

columns_to_remove = ['id']
dataset_moroco = dataset_moroco.remove_columns(columns_to_remove)
dataset_moroco = dataset_moroco.rename_column('sample', 'text')
dataset_moroco = dataset_moroco.rename_column('category', 'label')

print(dataset_moroco)
print(dataset_moroco['train'][0])

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 250
    })
})
{'text': 'Washingtonul propune să coste costul parcului de balon la finanțarea de 630 de milioane de dolari pentru un parc de bal pentru expunerea legată de Washington ar fi plafonată la \\ 630 milioane dolari în cadrul unui amendament aprobat marți de Consiliul Districtului Columbia.', 'label': 1}
DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 21719
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 5924
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 5921
    })
})

In [2]:
from transformers import AutoTokenizer, BertTokenizer
import torch
from datasets import DatasetDict, Dataset

dataset = DatasetDict()
dataset["moroco"] = dataset_moroco
dataset["ag-mt"] = dataset_ag_mt
names = ["moroco", "ag-mt"]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "mateiaassAI/teacher_ag-news"
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [3]:
import re
%pip -q install clean-text
from cleantext import clean

def normalize(batch):
    sentence = batch['text']
    sentence = str(sentence)
    sentence = re.sub(r'ş', 'ș', sentence)
    sentence = re.sub(r'Ş', 'Ș', sentence)
    sentence = re.sub(r'ţ', 'ț', sentence)
    sentence = re.sub(r'Ţ', 'Ț', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    sentence = emoji_pattern.sub(r'', sentence)
    clean(sentence, no_emoji = True)
    return {'text2': sentence}

def label_to_float_dataset_ag_mt(batch):
    if batch['label'] == 0:
        batch['label2'] = [0.0, 1.0]
    elif batch['label'] == 1:
        batch['label2'] = [1.0, 0.0]
    return batch

def label_to_float(batch):
    if batch['label'] == 0:
        batch['label2'] = [0.0, 0.0, 0.0, 0.0, 0.0, 1.0]
    elif batch['label'] == 1:
        batch['label2'] = [0.0, 0.0, 0.0, 0.0, 1.0, 0.0]
    elif batch['label'] == 2:
        batch['label2'] = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    elif batch['label'] == 3:
        batch['label2'] = [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
    elif batch['label'] == 4:
        batch['label2'] = [0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
    elif batch['label'] == 5:
        batch['label2'] = [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    return batch

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


In [4]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding = "max_length", max_length=512, add_special_tokens = True)

features_dict = {}
for task_name, ds in dataset.items():
    features_dict[task_name] = {}

    if task_name == 'moroco':
        dataset[task_name] = dataset[task_name].map(lambda batch: label_to_float(batch), batched=False)
        dataset[task_name] = dataset[task_name].map(lambda batch: normalize(batch), batched=False)
        print(dataset[task_name])
        dataset[task_name] = dataset[task_name].remove_columns(['label', 'text'])
        dataset[task_name] = dataset[task_name].rename_column('label2', 'labels')
        dataset[task_name] = dataset[task_name].rename_column('text2', 'text')
    elif task_name == 'ag-mt':
        dataset[task_name] = dataset[task_name].map(lambda batch: label_to_float_dataset_ag_mt(batch), batched=False)
        dataset[task_name] = dataset[task_name].map(lambda batch: normalize(batch), batched=False)
        dataset[task_name] = dataset[task_name].remove_columns(['label', 'text'])
        dataset[task_name] = dataset[task_name].rename_column('label2', 'labels')
        dataset[task_name] = dataset[task_name].rename_column('text2', 'text')

    dataset[task_name] = dataset[task_name].map(preprocess_function, batched=True)
    dataset[task_name] = dataset[task_name].remove_columns(['text'])

    print(task_name)
    print("input_ids", len(dataset[task_name]["train"][0]["input_ids"]))
    print("attention_mask", len(dataset[task_name]["train"][0]["attention_mask"]))
    print()

    dataset[task_name].set_format(
            type="torch",
            columns=['input_ids', 'attention_mask', 'labels'],
    )


    for phase, phase_dataset in dataset[task_name].items():
        print(phase, phase_dataset)
        print(task_name, phase, len(phase_dataset), len(dataset[task_name][phase]))
        print(task_name, phase)
        print(dataset[task_name][phase])
        print(task_name, phase, len(phase_dataset), len(dataset[task_name][phase]))
        features_dict[task_name][phase] = dataset[task_name][phase]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'label2', 'text2'],
        num_rows: 21719
    })
    test: Dataset({
        features: ['label', 'text', 'label2', 'text2'],
        num_rows: 5924
    })
    validation: Dataset({
        features: ['label', 'text', 'label2', 'text2'],
        num_rows: 5921
    })
})
moroco
input_ids 512
attention_mask 512

train Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 21719
})
moroco train 21719 21719
moroco train
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 21719
})
moroco train 21719 21719
test Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5924
})
moroco test 5924 5924
moroco test
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5924
})
moroco test 5924 5924
validation Dataset({
    features: ['labels', 'input_ids

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

ag-mt
input_ids 512
attention_mask 512

train Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1500
})
ag-mt train 1500 1500
ag-mt train
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1500
})
ag-mt train 1500 1500
validation Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 250
})
ag-mt validation 250 250
ag-mt validation
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 250
})
ag-mt validation 250 250
test Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 250
})
ag-mt test 250 250
ag-mt test
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 250
})
ag-mt test 250 250


## Data Collator

In [5]:
import dataclasses
import transformers
from torch.utils.data.dataloader import DataLoader
from transformers.data.data_collator import InputDataClass
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from typing import List, Union, Dict

class NLPDataCollator:
    def __call__(
        self, features: List[Union[InputDataClass, Dict]]
    ) -> Dict[str, torch.Tensor]:
        first = features[0]
        if isinstance(first, dict):
            if "labels" in first and first["labels"] is not None:
                if first["labels"].dtype == torch.int64:
                    labels = torch.tensor(
                        [f["labels"] for f in features], dtype=torch.long
                    )
                else:
                    if len(first["labels"]) > 1: 
                          labels = torch.stack(
                            [f["labels"] for f in features])
                    else:
                          labels = torch.tensor(
                            [f["labels"] for f in features], dtype=torch.float
                          )
                batch = {"labels": labels}
            else:
                  batch = {}
            for k, v in first.items():
                if k != "labels" and v is not None and not isinstance(v, str):
                    batch[k] = torch.stack([f[k] for f in features])
            return batch
        else:
            return DefaultDataCollator().collate_batch(features)

class StrIgnoreDevice(str):
    def to(self, device):
        return self

class DataLoaderWithTaskname:
    def __init__(self, task_name, data_loader):
        self.task_name = task_name
        self.data_loader = data_loader

        self.batch_size = data_loader.batch_size
        self.dataset = data_loader.dataset

    def __len__(self):
        return len(self.data_loader)

    def __iter__(self):
        for batch in self.data_loader:
            batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield batch

class MultitaskDataloader:
    def __init__(self, dataloader_dict):
        self.dataloader_dict = dataloader_dict
        self.num_batches_dict = {
            task_name: len(dataloader)
            for task_name, dataloader in self.dataloader_dict.items()
        }

        self.task_name_list = list(self.dataloader_dict)
        self.dataset = [None] * sum(
            len(dataloader.dataset) for dataloader in self.dataloader_dict.values()
        )

    def __len__(self):
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        task_choice_list = []
        for i, task_name in enumerate(self.task_name_list):
            task_choice_list += [i] * self.num_batches_dict[task_name]
        task_choice_list = np.array(task_choice_list)
        np.random.shuffle(task_choice_list)
        dataloader_iter_dict = {
            task_name: iter(dataloader)
            for task_name, dataloader in self.dataloader_dict.items()
        }

        for task_choice in task_choice_list:
            task_name = self.task_name_list[task_choice]
            yield next(dataloader_iter_dict[task_name])

# Model


In [6]:
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

import transformers
from transformers import BertTokenizer
from transformers import models
from transformers.modeling_outputs import SequenceClassifierOutput

from transformers.models.bert.configuration_bert import BertConfig
from transformers.models.bert.modeling_bert import (
    BertPreTrainedModel,
    BERT_INPUTS_DOCSTRING,
    _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
    _CONFIG_FOR_DOC,
    BertModel,
)

from transformers.file_utils import (
    add_code_sample_docstrings,
    add_start_docstrings_to_model_forward,
)


class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, **kwargs):
        super().__init__(transformers.PretrainedConfig())
        self.num_labels = kwargs.get("task_labels_map", {})
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)

        ## add task specific output heads
        self.classifier1 = nn.Linear(
            config.hidden_size, list(self.num_labels.values())[0]
        )

        self.classifier2 = nn.Linear(
            config.hidden_size, list(self.num_labels.values())[1]
        )

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_name=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = None

        # For each task, forward to the corresponding head.
        if task_name == list(self.num_labels.keys())[0]:
            logits = self.classifier1(pooled_output)
        elif task_name == list(self.num_labels.keys())[1]:
            logits = self.classifier2(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels[task_name] == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels[task_name] > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels[task_name] == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":                
                if task_name == list(self.num_labels.keys())[0]:
                    loss_fct = BCEWithLogitsLoss()
                    loss = loss_fct(logits, labels)
                else:
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(
                        logits.view(-1, self.num_labels[task_name]), labels.view(-1)
                    )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

## Metrics

In [7]:
%pip install -q evaluate
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction
import torch
import evaluate
import numpy as np

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'weighted')
    
    precision = precision_score(labels, y_pred, average='weighted')
    recall = recall_score(labels, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy,
               "pr": precision,
               "recall": recall
               }
    return metrics

def compute_metrics_multi_label(p):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

  pid, fd = os.forkpty()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


## Trainer

In [8]:
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
import time
import math
import transformers

from transformers.trainer_utils import (
    PREFIX_CHECKPOINT_DIR,
    BestRun,
    EvalLoopOutput,
    EvalPrediction,
    FSDPOption,
    HPSearchBackend,
    HubStrategy,
    IntervalStrategy,
    PredictionOutput,
    RemoveColumnsCollator,
    ShardedDDPOption,
    TrainerMemoryTracker,
    TrainOutput,
    default_compute_objective,
    default_hp_space,
    denumpify_detensorize,
    enable_full_determinism,
    find_executable_batch_size,
    get_last_checkpoint,
    has_length,
    number_of_arguments,
    seed_worker,
    set_seed,
    speed_metrics,
)
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow\

class MultitaskTrainer(transformers.Trainer):
    def __init__(self, *args, compute_metrics_multi_label = None,**kwargs):
        super().__init__(*args, **kwargs)
        
        self.compute_metrics_multi_label = compute_metrics_multi_label;


    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Create a single-task data loader that also yields task names
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_sampler = (
            RandomSampler(train_dataset)
            if self.args.local_rank == -1
            else DistributedSampler(train_dataset)
        )

        data_loader = DataLoaderWithTaskname(
            task_name=task_name,
            data_loader=DataLoader(
                train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=train_sampler,
                collate_fn=self.data_collator,
            ),
        )
        return data_loader

    def get_train_dataloader(self):
        """
        Returns a MultitaskDataloader, which is not actually a Dataloader
        but an iterable that returns a generator that samples from each
        task Dataloader
        """
        return MultitaskDataloader(
            {
                task_name: self.get_single_train_dataloader(task_name, task_dataset)
                for task_name, task_dataset in self.train_dataset.items()
            }
        )
    
    def get_eval_dataloader(self, eval_dataset, task_name):
          return MultitaskDataloader({
              task_name: self.get_single_train_dataloader(task_name, eval_dataset)
          })


    def get_test_dataloader(self, test_dataset):
          return MultitaskDataloader(
              {
                  task_name: self.get_single_train_dataloader(task_name, task_dataset)
                  for task_name, task_dataset in test_dataset.items()
              }
          )

    def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval):
        if self.control.should_log:
            logs: Dict[str, float] = {}

            # all_gather + mean() to get average loss over all processes
            tr_loss_scalar = self._nested_gather(tr_loss).mean().item()

            # reset tr_loss to zero
            tr_loss -= tr_loss

            logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
            logs["learning_rate"] = self._get_learning_rate()

            self._total_loss_scalar += tr_loss_scalar
            self._globalstep_last_logged = self.state.global_step
            self.store_flos()

            self.log(logs)

        metrics = None
        if self.control.should_evaluate:
            if isinstance(self.eval_dataset, dict):
                for eval_dataset_name, eval_dataset in self.eval_dataset.items():
                    metrics = self.evaluate(
                        eval_dataset=eval_dataset,
                        ignore_keys=ignore_keys_for_eval,
                        eval_dataset_name = eval_dataset_name,
                        metric_key_prefix=f"eval_{eval_dataset_name}",
                    )
            else:
                metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
            self._report_to_hp_search(trial, self.state.global_step, metrics)

        if self.control.should_save:
            #self._save_checkpoint(model, trial, metrics=metrics)
            self.control = self.callback_handler.on_save(self.args, self.state, self.control)
    
    def predict(
        self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test", test_dataset_name = None
    ) -> PredictionOutput:

        self._memory_tracker.start()

        test_dataloader = self.get_test_dataloader(test_dataset)
        start_time = time.time()

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        
#         if test_dataset_name == "REDv2":
#               aux = self.compute_metrics_multi_label;
#               self.compute_metrics_multi_label = self.compute_metrics;
#               self.compute_metrics = aux;
        self.compute_metrics = self.compute_metrics_multi_label;

        output = eval_loop(
            test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
        )
        
        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        self.control = self.callback_handler.on_predict(self.args, self.state, self.control, output.metrics)
        self._memory_tracker.stop_and_update_metrics(output.metrics)

#         if test_dataset_name == "REDv2":
#           aux = self.compute_metrics_multi_label;
#           self.compute_metrics_multi_label = self.compute_metrics;
#           self.compute_metrics = aux;

        return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics)
    
    def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        eval_dataset_name : str = "none",
        metric_key_prefix: str = "eval",
    ) -> Dict[str, float]:

        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        eval_dataloader = self.get_eval_dataloader(eval_dataset, eval_dataset_name)
        start_time = time.time()

#         if eval_dataset_name == "REDv2":
#           aux = self.compute_metrics_multi_label;
#           self.compute_metrics_multi_label = self.compute_metrics;
#           self.compute_metrics = aux;
        self.compute_metrics = self.compute_metrics_multi_label;

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        output = eval_loop(
            eval_dataloader,
            description="Evaluation",
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        self.log(output.metrics)

        if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            xm.master_print(met.metrics_report())

        # Print results.
        print(output.metrics, " = ", eval_dataset_name)

        self._memory_tracker.stop_and_update_metrics(output.metrics)

#         if eval_dataset_name == "REDv2":
#           aux = self.compute_metrics_multi_label;
#           self.compute_metrics_multi_label = self.compute_metrics;
#           self.compute_metrics = aux;

        return output.metrics

## Trainer args

In [9]:
import logging
import torch
import nltk
import numpy as np
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from tqdm import tqdm as tqdm1

import transformers
from filelock import FileLock
from transformers import set_seed
from transformers.file_utils import is_offline_mode
from pathlib import Path
from huggingface_hub import HfFolder

dataset_dict = features_dict;
multitask_model = BertForSequenceClassification.from_pretrained(
    model_name,
    task_labels_map={"moroco": 6, "ag-mt": 2},
    classifier_dropout = 0.1
)

train_dataset = {
    task_name: dataset["train"] for task_name, dataset in features_dict.items()
}

test_dataset = {
    task_name: dataset["test"] for task_name, dataset in features_dict.items()
}

eval_dataset = {
    task_name: dataset["test"] for task_name, dataset in features_dict.items()
}

training_args = transformers.TrainingArguments(
    output_dir="teacherMoRoCo-MTL-cl",
    overwrite_output_dir=True,
    learning_rate=2e-05,
    do_train=True,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
    fp16 = True,
    hub_strategy = "every_save",
    hub_model_id = "teacherMoRoCo-mtl-cl",
    hub_token = HfFolder.get_token(),
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="none"
)


trainer = MultitaskTrainer(
    model = multitask_model,
    args = training_args,
    data_collator = NLPDataCollator(),
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics = compute_metrics_multi_label,
    compute_metrics_multi_label = compute_metrics_multi_label
)

Some weights of the model checkpoint at mateiaassAI/teacher_ag-news were not used when initializing BertForSequenceClassification: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mateiaassAI/teacher_ag-news and are newly initialized: ['classifier2.weight', 'classifier2.bias', 'classifier1.weight', 'classifier1.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and

In [10]:
trainer.train()

***** Running training *****
  Num examples = 23219
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4356
  Number of trainable parameters = 124447496
  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.14312659204006195, 'eval_moroco_f1': 0.8317883139521676, 'eval_moroco_roc_auc': 0.8883977515821091, 'eval_moroco_accuracy': 0.8046927751519244, 'eval_moroco_pr': 0.8551474865910267, 'eval_moroco_recall': 0.8176907494935854, 'eval_moroco_runtime': 110.8596, 'eval_moroco_samples_per_second': 53.437, 'eval_moroco_steps_per_second': 3.347, 'epoch': 0.34}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.05963155999779701, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7298, 'eval_ag-mt_samples_per_second': 52.856, 'eval_ag-mt_steps_per_second': 3.383, 'epoch': 0.34}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.11572142690420151, 'eval_moroco_f1': 0.8642664802970978, 'eval_moroco_roc_auc': 0.9091626065234507, 'eval_moroco_accuracy': 0.837778528021607, 'eval_moroco_pr': 0.8782221723858197, 'eval_moroco_recall': 0.8521269412559082, 'eval_moroco_runtime': 110.908, 'eval_moroco_samples_per_second': 53.414, 'eval_moroco_steps_per_second': 3.345, 'epoch': 0.69}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.019827067852020264, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7265, 'eval_ag-mt_samples_per_second': 52.893, 'eval_ag-mt_steps_per_second': 3.385, 'epoch': 0.69}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.11850282549858093, 'eval_moroco_f1': 0.8648590459007676, 'eval_moroco_roc_auc': 0.9117865688818253, 'eval_moroco_accuracy': 0.849257258609048, 'eval_moroco_pr': 0.874702591554073, 'eval_moroco_recall': 0.8593855503038488, 'eval_moroco_runtime': 110.8862, 'eval_moroco_samples_per_second': 53.424, 'eval_moroco_steps_per_second': 3.346, 'epoch': 1.03}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.009501912631094456, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7259, 'eval_ag-mt_samples_per_second': 52.899, 'eval_ag-mt_steps_per_second': 3.386, 'epoch': 1.03}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.10898949950933456, 'eval_moroco_f1': 0.8688597932719233, 'eval_moroco_roc_auc': 0.9142098135527184, 'eval_moroco_accuracy': 0.8531397704253882, 'eval_moroco_pr': 0.8781453983998442, 'eval_moroco_recall': 0.8605671843349089, 'eval_moroco_runtime': 110.835, 'eval_moroco_samples_per_second': 53.449, 'eval_moroco_steps_per_second': 3.347, 'epoch': 1.38}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.01060398854315281, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7266, 'eval_ag-mt_samples_per_second': 52.892, 'eval_ag-mt_steps_per_second': 3.385, 'epoch': 1.38}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.10409459471702576, 'eval_moroco_f1': 0.8699572762487533, 'eval_moroco_roc_auc': 0.9139036725993313, 'eval_moroco_accuracy': 0.8539837947332883, 'eval_moroco_pr': 0.8794084110593661, 'eval_moroco_recall': 0.8636056718433491, 'eval_moroco_runtime': 110.8178, 'eval_moroco_samples_per_second': 53.457, 'eval_moroco_steps_per_second': 3.348, 'epoch': 1.72}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.008675593882799149, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7186, 'eval_ag-mt_samples_per_second': 52.981, 'eval_ag-mt_steps_per_second': 3.391, 'epoch': 1.72}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.10600028187036514, 'eval_moroco_f1': 0.8668770004353988, 'eval_moroco_roc_auc': 0.9129367549470058, 'eval_moroco_accuracy': 0.8489196488858879, 'eval_moroco_pr': 0.8736275620454766, 'eval_moroco_recall': 0.8622552329507089, 'eval_moroco_runtime': 110.7844, 'eval_moroco_samples_per_second': 53.473, 'eval_moroco_steps_per_second': 3.349, 'epoch': 2.07}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.006880397442728281, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7244, 'eval_ag-mt_samples_per_second': 52.916, 'eval_ag-mt_steps_per_second': 3.387, 'epoch': 2.07}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.11320546269416809, 'eval_moroco_f1': 0.8671125391252917, 'eval_moroco_roc_auc': 0.9138591421820207, 'eval_moroco_accuracy': 0.849426063470628, 'eval_moroco_pr': 0.868968187624889, 'eval_moroco_recall': 0.8654625253207292, 'eval_moroco_runtime': 110.7581, 'eval_moroco_samples_per_second': 53.486, 'eval_moroco_steps_per_second': 3.35, 'epoch': 2.41}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.011943168006837368, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7246, 'eval_ag-mt_samples_per_second': 52.915, 'eval_ag-mt_steps_per_second': 3.387, 'epoch': 2.41}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)
***** Running Evaluation *****
  Num examples = 5924
  Batch size = 16
***** Running Evaluation *****
  Num examples = 250
  Batch size = 16


{'eval_moroco_loss': 0.10754852741956711, 'eval_moroco_f1': 0.8701283491806527, 'eval_moroco_roc_auc': 0.9149804217984965, 'eval_moroco_accuracy': 0.8576975016880486, 'eval_moroco_pr': 0.8767546514218422, 'eval_moroco_recall': 0.8666441593517893, 'eval_moroco_runtime': 110.7711, 'eval_moroco_samples_per_second': 53.48, 'eval_moroco_steps_per_second': 3.349, 'epoch': 2.75}  =  moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'eval_ag-mt_loss': 0.01254546269774437, 'eval_ag-mt_f1': 0.9959979850176311, 'eval_ag-mt_roc_auc': 0.9954545454545455, 'eval_ag-mt_accuracy': 0.996, 'eval_ag-mt_pr': 0.9960283687943262, 'eval_ag-mt_recall': 0.996, 'eval_ag-mt_runtime': 4.7215, 'eval_ag-mt_samples_per_second': 52.949, 'eval_ag-mt_steps_per_second': 3.389, 'epoch': 2.75}  =  ag-mt


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4356, training_loss=0.09897281271914367, metrics={'train_runtime': 4696.2627, 'train_samples_per_second': 14.832, 'train_steps_per_second': 0.928, 'total_flos': 1.8328514116018176e+16, 'train_loss': 0.09897281271914367, 'epoch': 3.0})

In [11]:
multitask_model.push_to_hub("mateiaassAI/teacherMoRoCo-mtl-cl")
tokenizer.push_to_hub("mateiaassAI/teacherMoRoCo-mtl-cl")

Configuration saved in /tmp/tmp34c0czhf/config.json
Model weights saved in /tmp/tmp34c0czhf/pytorch_model.bin
Uploading the following files to mateiaassAI/teacherMoRoCo-mtl-cl: pytorch_model.bin,config.json


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

tokenizer config file saved in /tmp/tmpkeqmdjld/tokenizer_config.json
Special tokens file saved in /tmp/tmpkeqmdjld/special_tokens_map.json
Uploading the following files to mateiaassAI/teacherMoRoCo-mtl-cl: vocab.txt,special_tokens_map.json,tokenizer.json,tokenizer_config.json
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/mateiaassAI/teacherMoRoCo-mtl-cl/commit/d732389c2a8dbbc71b7192a246989d6673f5d4a2', commit_message='Upload tokenizer', commit_description='', oid='d732389c2a8dbbc71b7192a246989d6673f5d4a2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mateiaassAI/teacherMoRoCo-mtl-cl', endpoint='https://huggingface.co', repo_type='model', repo_id='mateiaassAI/teacherMoRoCo-mtl-cl'), pr_revision=None, pr_num=None)

In [12]:
print(test_dataset)

test_sst = DatasetDict()
test_sst['ag-mt'] = test_dataset["ag-mt"]
test_laroseda = DatasetDict()
test_laroseda['moroco'] = test_dataset["moroco"]

print("test_moroco")
print(trainer.predict(test_laroseda, test_dataset_name = "moroco").metrics)
print()
print("test_ag")
print(trainer.predict(test_sst, test_dataset_name = "ag-mt").metrics)

***** Running Prediction *****
  Num examples = 5924
  Batch size = 16


{'moroco': Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5924
}), 'ag-mt': Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 250
})}
test_moroco


***** Running Prediction *****
  Num examples = 250
  Batch size = 16


{'test_loss': 0.10773372650146484, 'test_f1': 0.8689361359096289, 'test_roc_auc': 0.9141801869834584, 'test_accuracy': 0.8528021607022283, 'test_pr': 0.873046672822144, 'test_recall': 0.8654625253207292, 'test_runtime': 110.7629, 'test_samples_per_second': 53.484, 'test_steps_per_second': 3.349}

test_ag


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'test_loss': 0.011247623711824417, 'test_f1': 0.9959979850176311, 'test_roc_auc': 0.9954545454545455, 'test_accuracy': 0.996, 'test_pr': 0.9960283687943262, 'test_recall': 0.996, 'test_runtime': 4.7172, 'test_samples_per_second': 52.998, 'test_steps_per_second': 3.392}


In [13]:
print(eval_dataset)

eval_sst = DatasetDict()
eval_sst['ag-mt'] = test_dataset["ag-mt"]
eval_laroseda = DatasetDict()
eval_laroseda['moroco'] = test_dataset["moroco"]

print("test_moroco")
print(trainer.predict(eval_laroseda, test_dataset_name = "moroco").metrics)
print()
print("test_ag")
print(trainer.predict(eval_sst, test_dataset_name = "ag-mt").metrics)

***** Running Prediction *****
  Num examples = 5924
  Batch size = 16


{'moroco': Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 5924
}), 'ag-mt': Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 250
})}
test_moroco


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


***** Running Prediction *****
  Num examples = 250
  Batch size = 16


{'test_loss': 0.1077335998415947, 'test_f1': 0.8689361359096289, 'test_roc_auc': 0.9141801869834584, 'test_accuracy': 0.8528021607022283, 'test_pr': 0.873046672822144, 'test_recall': 0.8654625253207292, 'test_runtime': 110.8257, 'test_samples_per_second': 53.453, 'test_steps_per_second': 3.348}

test_ag


  else torch.cuda.amp.autocast(cache_enabled=cache_enabled, dtype=self.amp_dtype)


{'test_loss': 0.011247623711824417, 'test_f1': 0.9959979850176311, 'test_roc_auc': 0.9954545454545455, 'test_accuracy': 0.996, 'test_pr': 0.9960283687943262, 'test_recall': 0.996, 'test_runtime': 4.7215, 'test_samples_per_second': 52.95, 'test_steps_per_second': 3.389}
