In [1]:
%pip install -q datasets evaluate --upgrade
%pip install -q transformers==4.26.0

!python -c "from huggingface_hub.hf_api import HfFolder; HfFolder.save_token('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')"
import pandas as pd
from datasets import DatasetDict, Dataset, concatenate_datasets

def transform_dataset(original_dataset):
    en_sentences = original_dataset['en']
    ro_sentences = original_dataset['ro']
    
    new_sentences = en_sentences + ro_sentences
    new_labels = [0] * len(en_sentences) + [1] * len(ro_sentences)
    
    new_data_dict = {
        'text': new_sentences,
        'label': new_labels
    }
    new_df = pd.DataFrame(new_data_dict)
    new_dataset = Dataset.from_pandas(new_df)
    
    return new_dataset

from datasets import load_dataset, concatenate_datasets, DatasetDict

dataset_redv2 = load_dataset("mateiaassAI/Redv2", data_files={
    'train': 'train.json',
    'validation': 'valid.json',
    'test': 'test.json'
})
#train_datasets_redv2 = [dataset_redv2['train']] * 7
#expanded_train_dataset = concatenate_datasets(train_datasets_redv2)
#dataset_redv2["train"] = expanded_train_dataset

dataset_emo_mt = transform_dataset(load_dataset("mateiaassAI/EMO-MT")["train"])
# dataset_emo_mt = dataset_emo_mt['train']

dataset_emo_mt = dataset_emo_mt.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset_emo_mt['train'].select(range(1000))
temp_dataset = dataset_emo_mt['test']

validation_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
validation_dataset = validation_test_split['train']
test_dataset = validation_test_split['test']

dataset_emo_mt = DatasetDict({
    'train': train_dataset,
    'validation': validation_dataset.select(range(100)),
    'test': test_dataset.select(range(100))
})

columns_to_remove = ['procentual_labels']
dataset_redv2 = dataset_redv2.remove_columns(columns_to_remove)
dataset_redv2 = dataset_redv2.rename_column('agreed_labels', 'label')

print(dataset_redv2)
print(dataset_emo_mt)

from transformers import AutoTokenizer, BertTokenizer
import torch

dataset = DatasetDict()
dataset["emo-mt"] = dataset_emo_mt

dataset["redv2"] = dataset_redv2

names = ["emo-mt", "redv2"]

teacher_emo = "mateiaassAI/teacher_emo"
teacher_redv2 = "mateiaassAI/teacher_redv2"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = "mateiaassAI/teacher_emo"
tokenizer = AutoTokenizer.from_pretrained(model_name)

import re
%pip -q install clean-text
from cleantext import clean

def normalize(batch):
    sentence = batch['text']
    sentence = str(sentence)
    sentence = re.sub(r'ş', 'ș', sentence)
    sentence = re.sub(r'Ş', 'Ș', sentence)
    sentence = re.sub(r'ţ', 'ț', sentence)
    sentence = re.sub(r'Ţ', 'Ț', sentence)
    sentence = re.sub(r'\s+', ' ', sentence)
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"
                           u"\U0001F300-\U0001F5FF"
                           u"\U0001F680-\U0001F6FF"
                           u"\U0001F1E0-\U0001F1FF"
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    
    sentence = emoji_pattern.sub(r'', sentence)
    clean(sentence, no_emoji = True)
    return {'text2': sentence}

import random

def label_to_float_dataset_emo_random(batch):
    # Generează o listă cu două valori aleatorii între 0 și 1
    random_value = random.random()
    if random_value > 0.5:
        batch['label2'] = [0.0, 1.0]
    else:
        batch['label2'] = [1.0, 0.0]
    return batch

def label_to_float_dataset_emo(batch):
    if batch['label'] == 0:
        batch['label2'] = [0.0, 1.0]
    elif batch['label'] == 1:
        batch['label2'] = [1.0, 0.0]
#     elif batch['label'] == 4:
#         batch['label2'] = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0]
#     elif batch['label'] == 3:
#         batch['label2'] = [0.0, 0.0, 1.0, 0.0, 0.0, 0.0]
#     elif batch['label'] == 2:
#         batch['label2'] = [0.0, 1.0, 0.0, 0.0, 0.0, 0.0]
#     elif batch['label'] == 1:
#         batch['label2'] = [1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    return batch

def label_to_float_dataset_redv2(batch):
    label = []
    for x in batch['label']:
        label.append(float(x))
    batch['label2'] = label
    return batch

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding = "max_length", max_length=512, add_special_tokens = True)

features_dict = {}
for task_name, ds in dataset.items():
    features_dict[task_name] = {}
    
    if task_name == 'emo-mt':
        dataset[task_name] = dataset[task_name].map(lambda batch: label_to_float_dataset_emo(batch), batched=False)
        dataset[task_name] = dataset[task_name].map(lambda batch: normalize(batch), batched=False)
        print(dataset[task_name])
        dataset[task_name] = dataset[task_name].remove_columns(['label', 'text'])
        dataset[task_name] = dataset[task_name].rename_column('label2', 'labels')
        dataset[task_name] = dataset[task_name].rename_column('text2', 'text')
    elif task_name == 'redv2':
        dataset[task_name] = dataset[task_name].map(lambda batch: label_to_float_dataset_redv2(batch), batched=False)
        dataset[task_name] = dataset[task_name].map(lambda batch: normalize(batch), batched=False)
        dataset[task_name] = dataset[task_name].remove_columns(['label', 'text'])
        dataset[task_name] = dataset[task_name].rename_column('label2', 'labels')
        dataset[task_name] = dataset[task_name].rename_column('text2', 'text')
    
    dataset[task_name] = dataset[task_name].map(preprocess_function, batched=True)
    dataset[task_name] = dataset[task_name].remove_columns(['text'])

    print(task_name)
    print("input_ids", len(dataset[task_name]["train"][0]["input_ids"]))
    print("attention_mask", len(dataset[task_name]["train"][0]["attention_mask"]))
    print()
    
    dataset[task_name].set_format(
            type="torch",
            columns=['input_ids', 'attention_mask', 'labels'],
    )
    
    for phase, phase_dataset in dataset[task_name].items():
        print(phase, phase_dataset)

        print(task_name, phase, len(phase_dataset), len(dataset[task_name][phase]))
        
        print(task_name, phase)
        print(dataset[task_name][phase])

        print(task_name, phase, len(phase_dataset), len(dataset[task_name][phase]))

        features_dict[task_name][phase] = dataset[task_name][phase]
    
# print(features_dict)
# features_dict['emo-mt']['test'] = features_dict['redv2']['test']
# features_dict['emo-mt']['validation'] = features_dict['redv2']['validation']
# print("WOWO")
# print(features_dict)

Note: you may need to restart the kernel to use updated packages.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kaggle-environments 1.14.15 requires transformers>=4.33.1, but you have transformers 4.26.0 which is incompatible.[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


train.json:   0%|          | 0.00/1.93M [00:00<?, ?B/s]

valid.json:   0%|          | 0.00/257k [00:00<?, ?B/s]

test.json:   0%|          | 0.00/389k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

1.jsonl:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

2.jsonl:   0%|          | 0.00/2.38M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 4088
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 543
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 818
    })
})
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]



tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/397k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.19M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

  pid, fd = os.forkpty()


Note: you may need to restart the kernel to use updated packages.


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label2', 'text2'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['text', 'label', 'label2', 'text2'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label', 'label2', 'text2'],
        num_rows: 100
    })
})


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

emo-mt
input_ids 512
attention_mask 512

train Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})
emo-mt train 1000 1000
emo-mt train
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})
emo-mt train 1000 1000
validation Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
emo-mt validation 100 100
emo-mt validation
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
emo-mt validation 100 100
test Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
emo-mt test 100 100
emo-mt test
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 100
})
emo-mt test 100 100


Map:   0%|          | 0/4088 [00:00<?, ? examples/s]

Map:   0%|          | 0/543 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/4088 [00:00<?, ? examples/s]

Map:   0%|          | 0/543 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/4088 [00:00<?, ? examples/s]

Map:   0%|          | 0/543 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

redv2
input_ids 512
attention_mask 512

train Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4088
})
redv2 train 4088 4088
redv2 train
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 4088
})
redv2 train 4088 4088
validation Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 543
})
redv2 validation 543 543
redv2 validation
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 543
})
redv2 validation 543 543
test Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 818
})
redv2 test 818 818
redv2 test
Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 818
})
redv2 test 818 818


## Data Collator

In [2]:
import dataclasses
import transformers
from torch.utils.data.dataloader import DataLoader
from transformers.data.data_collator import InputDataClass
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data.sampler import RandomSampler
from typing import List, Union, Dict

class NLPDataCollator:
    """
    Extending the existing DataCollator to work with NLP dataset batches
    """
    def __call__(
        self, features: List[Union[InputDataClass, Dict]]
    ) -> Dict[str, torch.Tensor]:
        first = features[0]
        if isinstance(first, dict):
            # NLP data sets current works presents features as lists of dictionary
            # (one per example), so we  will adapt the collate_batch logic for that
            if "labels" in first and first["labels"] is not None:
                if first["labels"].dtype == torch.int64:
                    labels = torch.tensor(
                        [f["labels"] for f in features], dtype=torch.long
                    )
                else:
                    if len(first["labels"]) > 1: 
                          labels = torch.stack(
                            [f["labels"] for f in features])
                    else:
                          labels = torch.tensor(
                            [f["labels"] for f in features], dtype=torch.float
                          )
                batch = {"labels": labels}
            else:
                  batch = {}
            for k, v in first.items():
                if k != "labels" and v is not None and not isinstance(v, str):
                    batch[k] = torch.stack([f[k] for f in features])
            return batch
        else:
            # otherwise, revert to using the default collate_batch
            return DefaultDataCollator().collate_batch(features)


class StrIgnoreDevice(str):
    """
    This is a hack. The Trainer is going call .to(device) on every input
    value, but we need to pass in an additional `task_name` string.
    This prevents it from throwing an error
    """

    def to(self, device):
        return self


class DataLoaderWithTaskname:
    """
    Wrapper around a DataLoader to also yield a task name
    """

    def __init__(self, task_name, data_loader):
        self.task_name = task_name
        self.data_loader = data_loader

        self.batch_size = data_loader.batch_size
        self.dataset = data_loader.dataset

    def __len__(self):
        return len(self.data_loader)

    def __iter__(self):
        for batch in self.data_loader:
            batch["task_name"] = StrIgnoreDevice(self.task_name)
            yield batch


class MultitaskDataloader:
    """
    Data loader that combines and samples from multiple single-task
    data loaders.
    """

    def __init__(self, dataloader_dict):
        self.dataloader_dict = dataloader_dict
        self.num_batches_dict = {
            task_name: len(dataloader)
            for task_name, dataloader in self.dataloader_dict.items()
        }
        self.task_name_list = list(self.dataloader_dict)
        self.dataset = [None] * sum(
            len(dataloader.dataset) for dataloader in self.dataloader_dict.values()
        )

    def __len__(self):
        return sum(self.num_batches_dict.values())

    def __iter__(self):
        """
        For each batch, sample a task, and yield a batch from the respective
        task Dataloader.
        We use size-proportional sampling, but you could easily modify this
        to sample from some-other distribution.
        """
        task_choice_list = []
        for i, task_name in enumerate(self.task_name_list):
            task_choice_list += [i] * self.num_batches_dict[task_name]
        task_choice_list = np.array(task_choice_list)
        np.random.shuffle(task_choice_list)
        dataloader_iter_dict = {
            task_name: iter(dataloader)
            for task_name, dataloader in self.dataloader_dict.items()
        }
        for task_choice in task_choice_list:
            task_name = self.task_name_list[task_choice]
            yield next(dataloader_iter_dict[task_name])

# Model


In [3]:
import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

import transformers
from transformers import BertTokenizer
from transformers import models
from transformers.modeling_outputs import SequenceClassifierOutput

from transformers.models.bert.configuration_bert import BertConfig
from transformers.models.bert.modeling_bert import (
    BertPreTrainedModel,
    BERT_INPUTS_DOCSTRING,
    _CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
    _CONFIG_FOR_DOC,
    BertModel,
)

from transformers.file_utils import (
    add_code_sample_docstrings,
    add_start_docstrings_to_model_forward,
)


class BertForSequenceClassification(BertPreTrainedModel):
    def __init__(self, config, **kwargs):
        super().__init__(transformers.PretrainedConfig())
        self.num_labels = kwargs.get("task_labels_map", {})
        self.config = config

        self.bert = BertModel(config)
        classifier_dropout = (
            config.classifier_dropout
            if config.classifier_dropout is not None
            else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)

        ## add task specific output heads
        self.classifier1 = nn.Linear(
            config.hidden_size, list(self.num_labels.values())[0]
        )

        self.classifier2 = nn.Linear(
            config.hidden_size, list(self.num_labels.values())[1]
        )

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")
    )
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_name=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = None

        # For each task, forward to the corresponding head.
        if task_name == list(self.num_labels.keys())[0]:
            logits = self.classifier1(pooled_output)
        elif task_name == list(self.num_labels.keys())[1]:
            logits = self.classifier2(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels[task_name] == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels[task_name] > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels[task_name] == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":                
                if task_name == list(self.num_labels.keys())[0]:
                    loss_fct = BCEWithLogitsLoss()
                    loss = loss_fct(logits, labels)
                else:
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(
                        logits.view(-1, self.num_labels[task_name]), labels.view(-1)
                    )
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        if task_name == list(self.num_labels.keys())[1]:
            loss = -loss;
        
        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [4]:
%pip install -q evaluate
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction
import torch
import evaluate
import numpy as np

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'weighted')
    
    precision = precision_score(labels, y_pred, average='weighted')
    recall = recall_score(labels, y_pred, average='weighted')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy,
               "pr": precision,
               "recall": recall
               }
    return metrics

def compute_metrics_multi_label(p):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

  pid, fd = os.forkpty()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Note: you may need to restart the kernel to use updated packages.


## Trainer

In [5]:
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
import time
import math
import transformers

from transformers.trainer_utils import (
    PREFIX_CHECKPOINT_DIR,
    BestRun,
    EvalLoopOutput,
    EvalPrediction,
    FSDPOption,
    HPSearchBackend,
    HubStrategy,
    IntervalStrategy,
    PredictionOutput,
    RemoveColumnsCollator,
    ShardedDDPOption,
    TrainerMemoryTracker,
    TrainOutput,
    default_compute_objective,
    default_hp_space,
    denumpify_detensorize,
    enable_full_determinism,
    find_executable_batch_size,
    get_last_checkpoint,
    has_length,
    number_of_arguments,
    seed_worker,
    set_seed,
    speed_metrics,
)
from transformers.debug_utils import DebugOption, DebugUnderflowOverflow\

class MultitaskTrainer(transformers.Trainer):
    def __init__(self, *args, compute_metrics_multi_label = None,**kwargs):
        super().__init__(*args, **kwargs)
        
        self.compute_metrics_multi_label = compute_metrics_multi_label;


    def get_single_train_dataloader(self, task_name, train_dataset):
        """
        Create a single-task data loader that also yields task names
        """
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_sampler = (
            RandomSampler(train_dataset)
            if self.args.local_rank == -1
            else DistributedSampler(train_dataset)
        )

        data_loader = DataLoaderWithTaskname(
            task_name=task_name,
            data_loader=DataLoader(
                train_dataset,
                batch_size=self.args.train_batch_size,
                sampler=train_sampler,
                collate_fn=self.data_collator,
            ),
        )
        return data_loader

    def get_train_dataloader(self):
        """
        Returns a MultitaskDataloader, which is not actually a Dataloader
        but an iterable that returns a generator that samples from each
        task Dataloader
        """
        return MultitaskDataloader(
            {
                task_name: self.get_single_train_dataloader(task_name, task_dataset)
                for task_name, task_dataset in self.train_dataset.items()
            }
        )
    
    def get_eval_dataloader(self, eval_dataset, task_name):
          return MultitaskDataloader({
              task_name: self.get_single_train_dataloader(task_name, eval_dataset)
          })


    def get_test_dataloader(self, test_dataset):
          return MultitaskDataloader(
              {
                  task_name: self.get_single_train_dataloader(task_name, task_dataset)
                  for task_name, task_dataset in test_dataset.items()
              }
          )

    def _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch, ignore_keys_for_eval):
        if self.control.should_log:
            logs: Dict[str, float] = {}

            # all_gather + mean() to get average loss over all processes
            tr_loss_scalar = self._nested_gather(tr_loss).mean().item()

            # reset tr_loss to zero
            tr_loss -= tr_loss

            logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4)
            logs["learning_rate"] = self._get_learning_rate()

            self._total_loss_scalar += tr_loss_scalar
            self._globalstep_last_logged = self.state.global_step
            self.store_flos()

            self.log(logs)

        metrics = None
        if self.control.should_evaluate:
            if isinstance(self.eval_dataset, dict):
                for eval_dataset_name, eval_dataset in self.eval_dataset.items():
                    metrics = self.evaluate(
                        eval_dataset=eval_dataset,
                        ignore_keys=ignore_keys_for_eval,
                        eval_dataset_name = eval_dataset_name,
                        metric_key_prefix=f"eval_{eval_dataset_name}",
                    )
            else:
                metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
            self._report_to_hp_search(trial, self.state.global_step, metrics)

        if self.control.should_save:
            #self._save_checkpoint(model, trial, metrics=metrics)
            self.control = self.callback_handler.on_save(self.args, self.state, self.control)
    
    def predict(
        self, test_dataset: Dataset, ignore_keys: Optional[List[str]] = None, metric_key_prefix: str = "test", test_dataset_name = None
    ) -> PredictionOutput:

        self._memory_tracker.start()

        test_dataloader = self.get_test_dataloader(test_dataset)
        start_time = time.time()

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        
#         if test_dataset_name == "REDv2":
#               aux = self.compute_metrics_multi_label;
#               self.compute_metrics_multi_label = self.compute_metrics;
#               self.compute_metrics = aux;
        self.compute_metrics = self.compute_metrics_multi_label;

        output = eval_loop(
            test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix
        )
        
        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        self.control = self.callback_handler.on_predict(self.args, self.state, self.control, output.metrics)
        self._memory_tracker.stop_and_update_metrics(output.metrics)

#         if test_dataset_name == "REDv2":
#           aux = self.compute_metrics_multi_label;
#           self.compute_metrics_multi_label = self.compute_metrics;
#           self.compute_metrics = aux;

        return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics)
    
    def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        eval_dataset_name : str = "none",
        metric_key_prefix: str = "eval",
    ) -> Dict[str, float]:

        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        eval_dataloader = self.get_eval_dataloader(eval_dataset, eval_dataset_name)
        start_time = time.time()

#         if eval_dataset_name == "REDv2":
#           aux = self.compute_metrics_multi_label;
#           self.compute_metrics_multi_label = self.compute_metrics;
#           self.compute_metrics = aux;
        self.compute_metrics = self.compute_metrics_multi_label;

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        output = eval_loop(
            eval_dataloader,
            description="Evaluation",
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in output.metrics:
            start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=output.num_samples,
                num_steps=math.ceil(output.num_samples / total_batch_size),
            )
        )

        self.log(output.metrics)

        if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            xm.master_print(met.metrics_report())

        # Print results.
        print(output.metrics, " = ", eval_dataset_name)

        self._memory_tracker.stop_and_update_metrics(output.metrics)

#         if eval_dataset_name == "REDv2":
#           aux = self.compute_metrics_multi_label;
#           self.compute_metrics_multi_label = self.compute_metrics;
#           self.compute_metrics = aux;

        return output.metrics

## Trainer args

In [6]:
import logging
import torch
import nltk
import numpy as np
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from tqdm import tqdm as tqdm1

import transformers
from filelock import FileLock
from transformers import set_seed
from transformers.file_utils import is_offline_mode
from pathlib import Path
from huggingface_hub import HfFolder

# BackUp dataset_dict from features_dict.
dataset_dict = features_dict;

# for task_name, dataset in dataset_dict.items():
#     print(task_name)
#     print(dataset_dict[task_name]["train"][0])
#     print()

# Create MultiTask Model.
multitask_model = BertForSequenceClassification.from_pretrained(
    "mateiaassAI/teacher_emo",
    task_labels_map={"redv2": 7, "emo-mt": 2},#, "CaRoSeOf": 2, "offense": 4},
    classifier_dropout = 0.1,
    problem_type="multi_label_classification", ignore_mismatched_sizes=True
)

# print(multitask_model.bert.embeddings.word_embeddings.weight.data_ptr())

# Classify data according to the phase - "train | test | validation"
train_dataset = {
    task_name: dataset["train"] for task_name, dataset in features_dict.items()
}

test_dataset = {
    task_name: dataset["test"] for task_name, dataset in features_dict.items()
}

eval_dataset = {
    task_name: dataset["validation"] for task_name, dataset in features_dict.items()
}

training_args = transformers.TrainingArguments(
    output_dir="./kaggle/working/results",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    do_train=True,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 16,
#     hub_strategy = "every_save",
#     hub_model_id = "teacherRedV2-MTL-CL",
    hub_token = HfFolder.get_token(),
    evaluation_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=1500,
#     load_best_model_at_end=True,
#     hub_private_repo=True,
    push_to_hub=False,
    report_to="none"
)

trainer = MultitaskTrainer(
    model = multitask_model,
    args = training_args,
    data_collator = NLPDataCollator(),
    train_dataset = train_dataset,
    eval_dataset = test_dataset,
    compute_metrics = compute_metrics_multi_label,
    compute_metrics_multi_label = compute_metrics_multi_label
)



config.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at mateiaassAI/teacher_emo were not used when initializing BertForSequenceClassification: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at mateiaassAI/teacher_emo and are newly initialized: ['classifier2.bias', 'classifier2.weight', 'classifier1.bias', 'classifier1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferen

In [7]:
trainer.train()

***** Running training *****
  Num examples = 5088
  Num Epochs = 4
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1276
  Number of trainable parameters = 124448265


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 818
  Batch size = 16


{'eval_emo-mt_loss': -4.361631393432617, 'eval_emo-mt_f1': 0.0, 'eval_emo-mt_roc_auc': 0.0, 'eval_emo-mt_accuracy': 0.0, 'eval_emo-mt_pr': 0.0, 'eval_emo-mt_recall': 0.0, 'eval_emo-mt_runtime': 1.7911, 'eval_emo-mt_samples_per_second': 55.831, 'eval_emo-mt_steps_per_second': 3.908, 'epoch': 0.63}  =  emo-mt
{'eval_redv2_loss': 0.2905075252056122, 'eval_redv2_f1': 0.46179232568341677, 'eval_redv2_roc_auc': 0.6769312581938807, 'eval_redv2_accuracy': 0.38264058679706603, 'eval_redv2_pr': 0.7831329870690837, 'eval_redv2_recall': 0.3883495145631068, 'eval_redv2_runtime': 14.3704, 'eval_redv2_samples_per_second': 56.923, 'eval_redv2_steps_per_second': 3.619, 'epoch': 0.63}  =  redv2


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 818
  Batch size = 16


{'eval_emo-mt_loss': -7.155379772186279, 'eval_emo-mt_f1': 0.0, 'eval_emo-mt_roc_auc': 0.0, 'eval_emo-mt_accuracy': 0.0, 'eval_emo-mt_pr': 0.0, 'eval_emo-mt_recall': 0.0, 'eval_emo-mt_runtime': 1.7704, 'eval_emo-mt_samples_per_second': 56.483, 'eval_emo-mt_steps_per_second': 3.954, 'epoch': 1.25}  =  emo-mt
{'eval_redv2_loss': 0.25834429264068604, 'eval_redv2_f1': 0.6514394347631797, 'eval_redv2_roc_auc': 0.7692577786993342, 'eval_redv2_accuracy': 0.539119804400978, 'eval_redv2_pr': 0.7523517826016092, 'eval_redv2_recall': 0.5868392664509169, 'eval_redv2_runtime': 14.3696, 'eval_redv2_samples_per_second': 56.926, 'eval_redv2_steps_per_second': 3.619, 'epoch': 1.25}  =  redv2


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 818
  Batch size = 16


{'eval_emo-mt_loss': -8.16134262084961, 'eval_emo-mt_f1': 0.0, 'eval_emo-mt_roc_auc': 0.0, 'eval_emo-mt_accuracy': 0.0, 'eval_emo-mt_pr': 0.0, 'eval_emo-mt_recall': 0.0, 'eval_emo-mt_runtime': 1.7758, 'eval_emo-mt_samples_per_second': 56.311, 'eval_emo-mt_steps_per_second': 3.942, 'epoch': 1.88}  =  emo-mt
{'eval_redv2_loss': 0.24512356519699097, 'eval_redv2_f1': 0.65932098849756, 'eval_redv2_roc_auc': 0.7734578313753154, 'eval_redv2_accuracy': 0.5513447432762836, 'eval_redv2_pr': 0.7594665294862285, 'eval_redv2_recall': 0.5879180151024811, 'eval_redv2_runtime': 14.3663, 'eval_redv2_samples_per_second': 56.939, 'eval_redv2_steps_per_second': 3.62, 'epoch': 1.88}  =  redv2


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 818
  Batch size = 16


{'eval_emo-mt_loss': -8.682581901550293, 'eval_emo-mt_f1': 0.0, 'eval_emo-mt_roc_auc': 0.0, 'eval_emo-mt_accuracy': 0.0, 'eval_emo-mt_pr': 0.0, 'eval_emo-mt_recall': 0.0, 'eval_emo-mt_runtime': 1.771, 'eval_emo-mt_samples_per_second': 56.466, 'eval_emo-mt_steps_per_second': 3.953, 'epoch': 2.51}  =  emo-mt
{'eval_redv2_loss': 0.24884635210037231, 'eval_redv2_f1': 0.6707791721343211, 'eval_redv2_roc_auc': 0.7858224245120204, 'eval_redv2_accuracy': 0.5806845965770171, 'eval_redv2_pr': 0.7391809529101737, 'eval_redv2_recall': 0.622437971952535, 'eval_redv2_runtime': 14.3648, 'eval_redv2_samples_per_second': 56.945, 'eval_redv2_steps_per_second': 3.62, 'epoch': 2.51}  =  redv2


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 818
  Batch size = 16


{'eval_emo-mt_loss': -9.030858993530273, 'eval_emo-mt_f1': 0.0, 'eval_emo-mt_roc_auc': 0.0, 'eval_emo-mt_accuracy': 0.0, 'eval_emo-mt_pr': 0.0, 'eval_emo-mt_recall': 0.0, 'eval_emo-mt_runtime': 1.7774, 'eval_emo-mt_samples_per_second': 56.263, 'eval_emo-mt_steps_per_second': 3.938, 'epoch': 3.13}  =  emo-mt
{'eval_redv2_loss': 0.24867890775203705, 'eval_redv2_f1': 0.6791339487106555, 'eval_redv2_roc_auc': 0.7926807426017702, 'eval_redv2_accuracy': 0.5855745721271394, 'eval_redv2_pr': 0.7305917473521882, 'eval_redv2_recall': 0.6386192017259978, 'eval_redv2_runtime': 14.3655, 'eval_redv2_samples_per_second': 56.942, 'eval_redv2_steps_per_second': 3.62, 'epoch': 3.13}  =  redv2


***** Running Evaluation *****
  Num examples = 100
  Batch size = 16
***** Running Evaluation *****
  Num examples = 818
  Batch size = 16


{'eval_emo-mt_loss': -9.185649871826172, 'eval_emo-mt_f1': 0.0, 'eval_emo-mt_roc_auc': 0.0, 'eval_emo-mt_accuracy': 0.0, 'eval_emo-mt_pr': 0.0, 'eval_emo-mt_recall': 0.0, 'eval_emo-mt_runtime': 1.7704, 'eval_emo-mt_samples_per_second': 56.483, 'eval_emo-mt_steps_per_second': 3.954, 'epoch': 3.76}  =  emo-mt
{'eval_redv2_loss': 0.2522664964199066, 'eval_redv2_f1': 0.6844488962070354, 'eval_redv2_roc_auc': 0.7965059752179936, 'eval_redv2_accuracy': 0.5855745721271394, 'eval_redv2_pr': 0.7260806906741372, 'eval_redv2_recall': 0.6483279395900755, 'eval_redv2_runtime': 14.3855, 'eval_redv2_samples_per_second': 56.863, 'eval_redv2_steps_per_second': 3.615, 'epoch': 3.76}  =  redv2




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=1276, training_loss=-1.2303237675873089, metrics={'train_runtime': 1161.8677, 'train_samples_per_second': 17.517, 'train_steps_per_second': 1.098, 'total_flos': 5355172751081472.0, 'train_loss': -1.2303237675873089, 'epoch': 4.0})

In [8]:
multitask_model.push_to_hub("mateiaassAI/teacherRedV2-MTL-CL-LAST2")
tokenizer.push_to_hub("mateiaassAI/teacherRedV2-MTL-CL-LAST2")

Configuration saved in /tmp/tmprsb975rn/config.json
Model weights saved in /tmp/tmprsb975rn/pytorch_model.bin
Uploading the following files to mateiaassAI/teacherRedV2-MTL-CL-LAST2: pytorch_model.bin,config.json


pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

tokenizer config file saved in /tmp/tmp0tjwft3x/tokenizer_config.json
Special tokens file saved in /tmp/tmp0tjwft3x/special_tokens_map.json
Uploading the following files to mateiaassAI/teacherRedV2-MTL-CL-LAST2: vocab.txt,special_tokens_map.json,tokenizer.json,tokenizer_config.json
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/mateiaassAI/teacherRedV2-MTL-CL-LAST2/commit/a668dd71f2bc7653172f8569c2e3802dde6de57c', commit_message='Upload tokenizer', commit_description='', oid='a668dd71f2bc7653172f8569c2e3802dde6de57c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/mateiaassAI/teacherRedV2-MTL-CL-LAST2', endpoint='https://huggingface.co', repo_type='model', repo_id='mateiaassAI/teacherRedV2-MTL-CL-LAST2'), pr_revision=None, pr_num=None)

In [9]:
###### print(test_dataset)

test_redv2 = DatasetDict()
test_redv2['redv2'] = test_dataset["redv2"]
test_emo = DatasetDict()
test_emo['emo-mt'] = test_dataset["emo-mt"]

print("test_redv2")
print(trainer.predict(test_redv2, test_dataset_name = "redv2").metrics)
print()
print("test_emo")
print(trainer.predict(test_emo, test_dataset_name = "emo-mt").metrics)

***** Running Prediction *****
  Num examples = 818
  Batch size = 16


test_redv2


***** Running Prediction *****
  Num examples = 100
  Batch size = 16


{'test_loss': 0.25379678606987, 'test_f1': 0.681617905238172, 'test_roc_auc': 0.7945754869245414, 'test_accuracy': 0.5819070904645477, 'test_pr': 0.7242297666413006, 'test_recall': 0.645091693635383, 'test_runtime': 14.3716, 'test_samples_per_second': 56.918, 'test_steps_per_second': 3.618}

test_emo


{'test_loss': -9.198747634887695, 'test_f1': 0.0, 'test_roc_auc': 0.0, 'test_accuracy': 0.0, 'test_pr': 0.0, 'test_recall': 0.0, 'test_runtime': 1.7712, 'test_samples_per_second': 56.459, 'test_steps_per_second': 3.952}
