# Active Learning

## Preliminaries

Code is adapted from [here.](https://github.com/huggingface/transformers/blob/main/examples/pytorch/text-classification/run_glue.py) and [here](https://github.com/adapter-hub/adapter-transformers/blob/cffdf3974ea19f49e1febe6e3f5b74be4e2d496a/examples/pytorch/text-classification/run_glue.py)

In [1]:
! mkdir results

In [2]:
!pip install --quiet --upgrade gdown
!pip install --quiet -U transformers[torch]
!pip install --quiet datasets
!pip install --quiet scikit-learn
!pip install --quiet evaluate
!pip install --quiet sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import logging
import random
import sys
import os

from scipy.stats import entropy
import torch
from datasets import load_dataset

random.seed("42")

In [4]:
import gdown

url = "https://drive.google.com/drive/folders/1m8LKDVNj0LCcztrdk-y7dMTY0bs7EQYJ"
gdown.download_folder(url, output="./", quiet=True, use_cookies=False)

['./dataset/test.csv', './dataset/train.csv', './dataset/validation.csv']

In [5]:
# Creating an object
logger = logging.getLogger()

# Setting the threshold of logger to DEBUG
logger.setLevel(logging.INFO)

In [6]:
task_to_keys = {
    "politics": ("sentence", None),
}

base_dir = './dataset/'

In [7]:
data_files = {"train": base_dir + "train.csv", "validation": base_dir + "validation.csv", "test": base_dir + "test.csv"}

In [8]:
# Active learning variables
BUDGET = 750
INITIAL_DATASET_SIZE = 150
ACQUISITION_SIZE = 100
ITERATIONS = int((BUDGET - INITIAL_DATASET_SIZE) / ACQUISITION_SIZE)
print(f"Budget: {BUDGET}")
print(f"Initial Dataset Size: {INITIAL_DATASET_SIZE}")
print(f"Acquisition size: {ACQUISITION_SIZE}")
print(f"Iterations: {ITERATIONS}")

# General variables
MAX_SEQ_LEN = 128
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
LEARNING_RATE = 2e-5
LOGGING_STEPS = 50
EVAL_STEPS = 50
# epoch * (budget/batch size)
MAX_STEPS = 500
CANDIDATE_TO_SAMPLE_RATIO=5
BASE_MODEL = 'xlm-roberta-base'

print(MAX_STEPS)

Budget: 750
Initial Dataset Size: 150
Acquisition size: 100
Iterations: 6
500


## Train Model

In [9]:
from dataclasses import dataclass, field
from typing import Optional

In [10]:
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    max_seq_length: int = field(
        default=128,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to pad all samples to `max_seq_length`. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )

In [11]:
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": (
                "Will use the token generated when running `huggingface-cli login` (necessary to use this script "
                "with private models)."
            )
        },
    )
    ignore_mismatched_sizes: bool = field(
        default=False,
        metadata={"help": "Will enable to load a pretrained model whose head dimensions are different."},
    )

In [12]:
import datasets
from datasets import concatenate_datasets, load_dataset, load_metric
import numpy as np
from datasets import load_dataset

import evaluate

import transformers
from transformers.trainer_utils import get_last_checkpoint
from transformers import (
    AutoModelForSequenceClassification,
    AutoConfig,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)

INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpl_760ae6
INFO:torch.distributed.nn.jit.instantiator:Writing /tmp/tmpl_760ae6/_remote_module_non_scriptable.py


In [13]:
def get_label_predictions(predicted, raw_datasets):
  tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=True)
  t = transform_to_embedding_dataset(tokenizer, raw_datasets["test"], is_bert=False)
  labels = t[:][2]
  predicted_tensor = torch.argmax(torch.tensor(predicted), dim=1).flatten()
  return predicted_tensor.cpu().numpy(), labels.cpu().numpy()

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

def report(preds, labels):
    print("Metrics Report:\n")
    print(f"Accuracy: {accuracy_score(labels, preds)}")
    print(f"Confusion Matrix:\n{confusion_matrix(labels, preds)}")
    print(f"Precision: {precision_score(labels, preds,average='weighted')}")
    print(f"Recall: {recall_score(labels, preds,average='weighted')}")
    print(f"F1-score: {f1_score(labels, preds, average='weighted')}")

In [15]:
def experiment(raw_datasets, args_dict=None):
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if args_dict is not None:
        model_args, data_args, training_args = parser.parse_dict(args_dict)
    elif len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
        ignore_mismatched_sizes=model_args.ignore_mismatched_sizes,
    )

    # Preprocessing the raw_datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) > 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = "sentence1", None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (
        model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
        and data_args.task_name is not None
        and not is_regression
    ):
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
        else:
            logger.warning(
                f"""Your model seems to have been trained with labels, but they don't match the dataset:\n
                model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}.\n
                Ignoring the model labels as a result.""",
            )
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if label_to_id is not None:
        model.config.label2id = label_to_id
        model.config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        model.config.label2id = {l: i for i, l in enumerate(label_list)}
        model.config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        return result

    with training_args.main_process_first(desc="dataset map pre-processing"):
        raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
            desc="Running tokenizer on dataset",
        )
    if training_args.do_train:
        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            max_train_samples = min(len(train_dataset), data_args.max_train_samples)
            train_dataset = train_dataset.select(range(max_train_samples))

    if training_args.do_eval:
        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
        if data_args.max_eval_samples is not None:
            max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
            eval_dataset = eval_dataset.select(range(max_eval_samples))

    if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
        predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
        if data_args.max_predict_samples is not None:
            max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
            predict_dataset = predict_dataset.select(range(max_predict_samples))

    # Log a few random samples from the training set:
    if training_args.do_train:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = evaluate.load("glue", data_args.task_name)
    else:
        metric = evaluate.load("accuracy")

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
        else:
            return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

    # Data collator will default to DataCollatorWithPadding when the tokenizer is passed to Trainer, so we change it if
    # we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model(training_args.output_dir)  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    evaluation_metrics = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(raw_datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):
            metrics = trainer.evaluate(eval_dataset=eval_dataset)

            max_eval_samples = (
                data_args.max_eval_samples
                if data_args.max_eval_samples is not None
                else len(eval_dataset)
            )
            metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))

            trainer.log_metrics("eval", metrics)
            trainer.save_metrics("eval", metrics)

            evaluation_metrics = metrics

    test_predictions = None
    if training_args.do_predict:
        logger.info("*** Predict ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        predict_datasets = [predict_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            predict_datasets.append(raw_datasets["test_mismatched"])

        for predict_dataset, task in zip(predict_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
            predict_dataset = predict_dataset.remove_columns("label")
            test_predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions

    return evaluation_metrics, test_predictions

In [16]:
def annotate(unlabled_samples):
    return unlabled_samples

## Contranstive Sampling

In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [18]:
from torch.utils.data import SequentialSampler, DataLoader
from torch.nn.functional import normalize

def calculate_bert_representations(model, dataset):
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=4)

    dataset_mean_bert_representation = torch.Tensor().to(device)
    dataset_cls_representation = torch.Tensor().to(device)
    dataset_mean_input_embeddings = torch.Tensor().to(device)
    dataset_logits = torch.Tensor().to(device)

    for batch in tqdm(dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "token_type_ids": batch[2]}
            bert_outputs = model.bert(**inputs)
            mean_bert_representation = torch.mean(bert_outputs.last_hidden_state, dim=1)
            dataset_mean_bert_representation = torch.cat((dataset_mean_bert_representation, mean_bert_representation), dim=0)
            cls_representation = bert_outputs.pooler_output
            dataset_cls_representation = torch.cat((dataset_cls_representation, cls_representation), dim=0)
            mean_input_embeddings = torch.mean(model.bert.embeddings(inputs['input_ids']), dim=1)
            dataset_mean_input_embeddings = torch.cat((dataset_mean_input_embeddings, mean_input_embeddings), dim=0)
            logits = model(**inputs).logits
            dataset_logits = torch.cat((dataset_logits, logits), dim=0)


    # TODO: Normalize representations
    representations = {
        "mean_bert_representation": normalize(dataset_mean_bert_representation).detach().cpu(),
        "cls_representation": normalize(dataset_cls_representation).detach().cpu(),
        "mean_input_embeddings": normalize(dataset_mean_input_embeddings).detach().cpu(),
        "logits": dataset_logits
    }

    return representations

In [19]:
from torch.utils.data import SequentialSampler, DataLoader
from torch.nn.functional import normalize

def calculate_roberta_representations(model, dataset):
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=4)

    dataset_mean_bert_representation = torch.Tensor().to(device)
    dataset_cls_representation = torch.Tensor().to(device)
    dataset_mean_input_embeddings = torch.Tensor().to(device)
    dataset_logits = torch.Tensor().to(device)

    for batch in tqdm(dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1]}
            bert_outputs = model.roberta(**inputs)
            mean_bert_representation = torch.mean(bert_outputs.last_hidden_state, dim=1)
            dataset_mean_bert_representation = torch.cat((dataset_mean_bert_representation, mean_bert_representation), dim=0)
            cls_representation = bert_outputs.last_hidden_state[:, 0, :]
            dataset_cls_representation = torch.cat((dataset_cls_representation, cls_representation), dim=0)
            mean_input_embeddings = torch.mean(model.roberta.embeddings(inputs['input_ids']), dim=1)
            dataset_mean_input_embeddings = torch.cat((dataset_mean_input_embeddings, mean_input_embeddings), dim=0)
            logits = model(**inputs).logits
            dataset_logits = torch.cat((dataset_logits, logits), dim=0)


    # TODO: Normalize representations
    representations = {
        "mean_bert_representation": normalize(dataset_mean_bert_representation).detach().cpu(),
        "cls_representation": normalize(dataset_cls_representation).detach().cpu(),
        "mean_input_embeddings": normalize(dataset_mean_input_embeddings).detach().cpu(),
        "logits": dataset_logits
    }

    return representations

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from tqdm import tqdm

import torch.nn.functional as F
from torch import nn


def calculate_contrastive_scores(model, labeled_dataset, unlabeled_dataset, k_neighbours=10):
    # Labeled_dataset labels
    labels = labeled_dataset[:][2]
    # Calculate Unlabeled dataset representations
    unlabeled_dataset_representation = calculate_roberta_representations(model=model, dataset=unlabeled_dataset)
    unlabeled_embeddings = unlabeled_dataset_representation["cls_representation"]
    unlabeled_logits = unlabeled_dataset_representation["logits"]

    # Calculate Labeled dataset representations
    labeled_dataset_representation = calculate_roberta_representations(model=model, dataset=labeled_dataset)
    labeled_embeddings = labeled_dataset_representation["cls_representation"]
    labeled_logits = labeled_dataset_representation["logits"]

    # Create KNN Clusters for labeled data
    neigh = KNeighborsClassifier(n_neighbors=k_neighbours)
    neigh.fit(X=labeled_embeddings.cpu(), y=labels)

    scores = []
    criterion = nn.KLDivLoss(reduction='none')
    for index, item in enumerate(tqdm(zip(unlabeled_embeddings, unlabeled_logits), desc="Find nearest neighbours for each unlabeled item.")):
        unlabeled_embeddings, unlabeled_item_logits = item
        # Step1-Find K neighbours of item
        distances, neighbours = neigh.kneighbors(X=unlabeled_embeddings.reshape(1, -1).cpu(), return_distance=True)
        neighbours = neighbours[0]
        # Step2-For each neighbour of item calculate KL-divergence and calculate the mean as score
        item_log_prob = F.log_softmax(unlabeled_item_logits, dim=-1)
        neighbours_prob = F.softmax(labeled_logits[neighbours], dim=-1)
        neighbours_pairs_kl = np.array([torch.sum(criterion(item_log_prob, neighbour)).cpu().numpy() for neighbour in neighbours_prob])
        item_score = neighbours_pairs_kl.mean()
        scores.append(item_score)

    # return score of each index
    return torch.Tensor(scores)

In [21]:
from datasets import ClassLabel

raw_datasets = load_dataset("csv", data_files=data_files)
label_list = raw_datasets["train"].unique("label")
label_list.sort()
c2l = ClassLabel(num_classes=4, names=label_list)

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [22]:
from torch.utils.data import TensorDataset

def transform_to_embedding_dataset(tokenizer, dataset, is_bert=True):
    input_ids = []
    attention_masks = []
    token_type_ids = []
    labels = []

    for item in dataset:
        # `encode_plus` will:
        #   (1) Tokenize the sentence.
        #   (2) Prepend the `[CLS]` token to the start.
        #   (3) Append the `[SEP]` token to the end.
        #   (4) Map tokens to their IDs.
        #   (5) Pad or truncate the sentence to `max_length`
        #   (6) Create attention masks for [PAD] tokens.
        encoded_dict = tokenizer.encode_plus(
                            item["sentence1"],                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = 256,           # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                    )

        input_ids.append(encoded_dict["input_ids"])
        attention_masks.append(encoded_dict["attention_mask"])
        if is_bert:
            token_type_ids.append(encoded_dict["token_type_ids"])

        labels.append(c2l.str2int(item["label"]))


    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    if is_bert:
        token_type_ids = torch.cat(token_type_ids, dim=0)
    labels = torch.tensor(labels)



    # Combine the training inputs into a TensorDataset.
    if is_bert:
        dataset = TensorDataset(input_ids, attention_masks, token_type_ids, labels)
    else:
        dataset = TensorDataset(input_ids, attention_masks, labels)

    return dataset

In [23]:
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer


def contrastive_active_learning(
        hf_args,
        raw_datasets,
        initial_labeled_dataset_size,
        iteration_count,
        iteration_sample_count,
        candidate_to_selected_samples_ratio=10
    ):

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, do_lower_case=True)

    original_train_dataset = raw_datasets["train"]
    active_learning_data = raw_datasets

    # select initial train dataset from raw dataset
    train_dataset = original_train_dataset.select(
        random.sample(
            range(original_train_dataset.num_rows),
            initial_labeled_dataset_size,
        )
    )

    unlabeled_dataset = original_train_dataset.filter(
        lambda s: s["idx"] not in train_dataset["idx"]
    )

    raw_datasets["train"] = train_dataset
    # Train Initial Model
    logger.info(f'Initial Training with {raw_datasets["train"].num_rows} samples.')
    evaluation_metrics, test_predictions = experiment(raw_datasets, args_dict=hf_args)

    head_preds, labels = get_label_predictions(test_predictions, raw_datasets)
    report(head_preds, labels)
    current_iteration = 1
    while  current_iteration <= iteration_count:
        print(f'Current Active Learning Iteration: {current_iteration}')

        if unlabeled_dataset.num_rows <= 0:
            logger.info(f'Not enough unlabeled data to continue. Stoped at iteration {current_iteration}')

        # Sample candidate_to_selected_samples_ratio larger than iteration_sample_count sample for acquisition function
        candidate_count = int(candidate_to_selected_samples_ratio * iteration_sample_count)
        logger.info(f"Candidate samples count for active learning : {candidate_count}")
        candidate_samples = unlabeled_dataset.select(
            random.sample(
                range(unlabeled_dataset.num_rows),
                candidate_count,
            )
        )


        # Acquisition Function - Contranstive Strategy
        model = AutoModelForSequenceClassification.from_pretrained(hf_args["output_dir"]).to(device)
        tlabeled = transform_to_embedding_dataset(tokenizer, raw_datasets["train"], is_bert=False)
        tpool = transform_to_embedding_dataset(tokenizer, candidate_samples, is_bert=False)
        sample_contrastive_scores = calculate_contrastive_scores(
            model=model,
            labeled_dataset=tlabeled,
            unlabeled_dataset=tpool)
        chosen_samples = torch.topk(sample_contrastive_scores, iteration_sample_count)


        # Annotate new samples
        new_train_samples = unlabeled_dataset.select(chosen_samples.indices.tolist())
        new_train_samples = annotate(new_train_samples)


        # Add new samples to labeled dataset
        extended_train_dataset = concatenate_datasets(
            [raw_datasets["train"], new_train_samples],
            info=original_train_dataset.info,
        )

        # Remove selected samples from unlabeled dataset
        unlabeled_dataset = original_train_dataset.filter(
            lambda s: s["idx"] not in new_train_samples["idx"]
        )

        # Train new model with new dataset
        raw_datasets["train"] = extended_train_dataset
        hf_args["do_train"] = True
        hf_args["do_eval"] = True
        hf_args["evaluation_strategy"] = "steps"
        hf_args["load_best_model_at_end"] = True
        _, candidate_test_predictions = experiment(raw_datasets, args_dict=hf_args)

        head_preds, labels = get_label_predictions(candidate_test_predictions, raw_datasets)
        report(head_preds, labels)

        current_iteration += 1

In [24]:
def run_contrastive_sampling():
    hf_args = {
        "model_name_or_path": BASE_MODEL,
        "do_train": True,
        "do_eval": True,
        "do_predict": True,
        "max_seq_length": MAX_SEQ_LEN,
        "per_device_train_batch_size": TRAIN_BATCH_SIZE,
        "per_device_eval_batch_size": EVAL_BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "overwrite_output_dir": True,
        "output_dir": f"./results/contrastive/",
        "logging_strategy": "steps",
        "logging_steps": LOGGING_STEPS,
        "evaluation_strategy": "steps",
        "eval_steps": EVAL_STEPS,
        "seed": 12,
        "max_steps": MAX_STEPS,
        "load_best_model_at_end": True
    }

    raw_datasets = load_dataset("csv", data_files=data_files)
    contrastive_active_learning(
        hf_args,
        raw_datasets,
        initial_labeled_dataset_size=INITIAL_DATASET_SIZE,
        iteration_count=ITERATIONS,
        iteration_sample_count=ACQUISITION_SIZE,
        candidate_to_selected_samples_ratio=5
    )

In [25]:
run_contrastive_sampling()

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]

INFO:root:Initial Training with 150 samples.


Flattening the indices:   0%|          | 0/150 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1200 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/1800 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



Step,Training Loss,Validation Loss,Accuracy
50,0.9561,0.648434,0.696667
100,0.2659,0.683539,0.829167
150,0.0408,1.057234,0.818333
200,0.004,1.174963,0.8175
250,0.0025,1.215822,0.815833
300,0.0018,1.240061,0.813333
350,0.0014,1.25594,0.82
400,0.0012,1.279297,0.8175
450,0.0012,1.291155,0.816667
500,0.0036,1.294052,0.815833


***** train metrics *****
  epoch                    =       50.0
  total_flos               =   459460GF
  train_loss               =     0.1279
  train_runtime            = 0:05:14.14
  train_samples            =        150
  train_samples_per_second =     25.466
  train_steps_per_second   =      1.592


***** eval metrics *****
  epoch                   =       50.0
  eval_accuracy           =     0.8158
  eval_loss               =     1.2941
  eval_runtime            = 0:00:08.37
  eval_samples            =       1200
  eval_samples_per_second =    143.297
  eval_steps_per_second   =      8.956




Metrics Report:

Accuracy: 0.8161111111111111
Confusion Matrix:
[[386  49   9  24]
 [ 74 284   6  84]
 [  6   4 422  21]
 [ 37   3  14 377]]
Precision: 0.821302704468258
Recall: 0.8161111111111111
F1-score: 0.8137599450131726
Current Active Learning Iteration: 1


Evaluating: 100%|██████████| 125/125 [00:13<00:00,  9.02it/s]
Evaluating: 100%|██████████| 38/38 [00:04<00:00,  9.06it/s]
Find nearest neighbours for each unlabeled item.: 500it [00:01, 431.34it/s]


Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]



Flattening the indices:   0%|          | 0/250 [00:00<?, ? examples/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on dataset:   0%|          | 0/250 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy
50,0.9781,0.600204,0.800833
100,0.3347,0.575207,0.841667
150,0.0814,0.779686,0.829167
200,0.0204,0.951852,0.846667
250,0.0032,1.038394,0.836667
300,0.0021,1.077353,0.841667
350,0.0017,1.092448,0.836667
400,0.0014,1.105392,0.84
450,0.0013,1.116424,0.8375
500,0.0013,1.117135,0.839167


***** train metrics *****
  epoch                    =      31.25
  total_flos               =   478696GF
  train_loss               =     0.1426
  train_runtime            = 0:05:22.59
  train_samples            =        250
  train_samples_per_second =     24.799
  train_steps_per_second   =       1.55


***** eval metrics *****
  epoch                   =      31.25
  eval_accuracy           =     0.8392
  eval_loss               =     1.1171
  eval_runtime            = 0:00:08.54
  eval_samples            =       1200
  eval_samples_per_second =    140.396
  eval_steps_per_second   =      8.775




Metrics Report:

Accuracy: 0.84
Confusion Matrix:
[[391  48  11  18]
 [ 61 317   6  64]
 [  8   5 429  11]
 [ 32   8  16 375]]
Precision: 0.8409033078985654
Recall: 0.84
F1-score: 0.8385755107211801
Current Active Learning Iteration: 2


Evaluating: 100%|██████████| 125/125 [00:14<00:00,  8.86it/s]
Evaluating: 100%|██████████| 63/63 [00:07<00:00,  8.89it/s]
Find nearest neighbours for each unlabeled item.: 500it [00:01, 416.60it/s]


Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]



Flattening the indices:   0%|          | 0/350 [00:00<?, ? examples/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on dataset:   0%|          | 0/350 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy
50,1.0191,0.540525,0.8325
100,0.3271,0.574178,0.8375
150,0.1241,0.783769,0.836667
200,0.0329,0.940299,0.836667
250,0.0231,0.99959,0.838333
300,0.0132,0.987337,0.846667
350,0.0122,1.009066,0.8475
400,0.0061,0.990356,0.853333
450,0.0062,1.003332,0.854167
500,0.0021,1.014345,0.854167


***** train metrics *****
  epoch                    =      22.73
  total_flos               =   487395GF
  train_loss               =     0.1566
  train_runtime            = 0:05:15.74
  train_samples            =        350
  train_samples_per_second =     25.337
  train_steps_per_second   =      1.584


***** eval metrics *****
  epoch                   =      22.73
  eval_accuracy           =     0.8542
  eval_loss               =     1.0143
  eval_runtime            = 0:00:08.60
  eval_samples            =       1200
  eval_samples_per_second =    139.386
  eval_steps_per_second   =      8.712




Metrics Report:

Accuracy: 0.85
Confusion Matrix:
[[376  64  11  17]
 [ 40 356   6  46]
 [  7   9 425  12]
 [ 32  10  16 373]]
Precision: 0.8495819482007879
Recall: 0.85
F1-score: 0.8496452608757931
Current Active Learning Iteration: 3


Evaluating: 100%|██████████| 125/125 [00:14<00:00,  8.82it/s]
Evaluating: 100%|██████████| 88/88 [00:10<00:00,  8.79it/s]
Find nearest neighbours for each unlabeled item.: 500it [00:01, 419.61it/s]


Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]



Flattening the indices:   0%|          | 0/450 [00:00<?, ? examples/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on dataset:   0%|          | 0/450 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy
50,1.0087,0.560763,0.8625
100,0.4049,0.482459,0.8525
150,0.1917,0.609132,0.843333
200,0.0838,0.781956,0.841667
250,0.025,0.83224,0.858333
300,0.0057,0.905328,0.8475
350,0.0024,0.927608,0.855833
400,0.0019,0.942093,0.856667
450,0.0016,0.953008,0.856667
500,0.0058,0.95756,0.8575


***** train metrics *****
  epoch                    =      17.24
  total_flos               =   475511GF
  train_loss               =     0.1732
  train_runtime            = 0:05:21.15
  train_samples            =        450
  train_samples_per_second =      24.91
  train_steps_per_second   =      1.557


***** eval metrics *****
  epoch                   =      17.24
  eval_accuracy           =     0.8575
  eval_loss               =     0.9576
  eval_runtime            = 0:00:08.79
  eval_samples            =       1200
  eval_samples_per_second =    136.465
  eval_steps_per_second   =      8.529




Metrics Report:

Accuracy: 0.8538888888888889
Confusion Matrix:
[[365  73   9  21]
 [ 31 364   5  48]
 [  6   5 431  11]
 [ 26  16  12 377]]
Precision: 0.8544130803971576
Recall: 0.8538888888888889
F1-score: 0.8535245494612218
Current Active Learning Iteration: 4


Evaluating: 100%|██████████| 125/125 [00:14<00:00,  8.86it/s]
Evaluating: 100%|██████████| 113/113 [00:12<00:00,  8.92it/s]
Find nearest neighbours for each unlabeled item.: 500it [00:01, 260.55it/s]


Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]



Flattening the indices:   0%|          | 0/550 [00:00<?, ? examples/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on dataset:   0%|          | 0/550 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy
50,0.9949,0.547916,0.860833
100,0.4274,0.439416,0.8675
150,0.2209,0.490618,0.8725
200,0.1492,0.564748,0.869167
250,0.0942,0.71584,0.858333
300,0.0526,0.800543,0.8525
350,0.0326,0.851516,0.850833
400,0.0195,0.878179,0.8575
450,0.0164,0.886598,0.8525
500,0.0107,0.903649,0.853333


***** train metrics *****
  epoch                    =      14.29
  total_flos               =   481514GF
  train_loss               =     0.2018
  train_runtime            = 0:05:13.97
  train_samples            =        550
  train_samples_per_second =      25.48
  train_steps_per_second   =      1.592


***** eval metrics *****
  epoch                   =      14.29
  eval_accuracy           =     0.8533
  eval_loss               =     0.9036
  eval_runtime            = 0:00:08.59
  eval_samples            =       1200
  eval_samples_per_second =    139.634
  eval_steps_per_second   =      8.727




Metrics Report:

Accuracy: 0.8572222222222222
Confusion Matrix:
[[381  54   6  27]
 [ 37 347   2  62]
 [  8   5 427  13]
 [ 19  11  13 388]]
Precision: 0.859186419752259
Recall: 0.8572222222222222
F1-score: 0.8569710356805043
Current Active Learning Iteration: 5


Evaluating: 100%|██████████| 125/125 [00:14<00:00,  8.88it/s]
Evaluating: 100%|██████████| 138/138 [00:15<00:00,  8.87it/s]
Find nearest neighbours for each unlabeled item.: 500it [00:01, 360.40it/s]


Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]



Flattening the indices:   0%|          | 0/650 [00:00<?, ? examples/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on dataset:   0%|          | 0/650 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy
50,0.9622,0.480698,0.859167
100,0.3815,0.605303,0.818333
150,0.2454,0.539732,0.865
200,0.1382,0.647919,0.86
250,0.0856,0.74046,0.861667
300,0.0683,0.779798,0.864167
350,0.0597,0.773919,0.866667
400,0.0446,0.831236,0.859167
450,0.0155,0.868827,0.855833
500,0.0158,0.84734,0.856667


***** train metrics *****
  epoch                    =       12.2
  total_flos               =   485680GF
  train_loss               =     0.2017
  train_runtime            = 0:05:20.73
  train_samples            =        650
  train_samples_per_second =     24.943
  train_steps_per_second   =      1.559


***** eval metrics *****
  epoch                   =       12.2
  eval_accuracy           =     0.8567
  eval_loss               =     0.8473
  eval_runtime            = 0:00:08.59
  eval_samples            =       1200
  eval_samples_per_second =    139.609
  eval_steps_per_second   =      8.726




Metrics Report:

Accuracy: 0.8688888888888889
Confusion Matrix:
[[377  66   8  17]
 [ 34 381   4  29]
 [  7   5 435   6]
 [ 33  13  14 371]]
Precision: 0.8687496401552122
Recall: 0.8688888888888889
F1-score: 0.8686375887259399
Current Active Learning Iteration: 6


Evaluating: 100%|██████████| 125/125 [00:14<00:00,  8.88it/s]
Evaluating: 100%|██████████| 163/163 [00:18<00:00,  8.84it/s]
Find nearest neighbours for each unlabeled item.: 500it [00:01, 265.61it/s]


Filter:   0%|          | 0/9000 [00:00<?, ? examples/s]



Flattening the indices:   0%|          | 0/750 [00:00<?, ? examples/s]

You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Running tokenizer on dataset:   0%|          | 0/750 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss,Accuracy
50,1.0866,0.596044,0.819167
100,0.4492,0.421497,0.865
150,0.2816,0.52414,0.8625
200,0.2052,0.58263,0.850833
250,0.1129,0.697955,0.853333
300,0.0648,0.686687,0.866667
350,0.064,0.737741,0.864167
400,0.0242,0.766946,0.8625
450,0.0217,0.816501,0.855
500,0.0183,0.803447,0.86


***** train metrics *****
  epoch                    =      10.64
  total_flos               =   488866GF
  train_loss               =     0.2328
  train_runtime            = 0:05:30.39
  train_samples            =        750
  train_samples_per_second =     24.213
  train_steps_per_second   =      1.513


***** eval metrics *****
  epoch                   =      10.64
  eval_accuracy           =       0.86
  eval_loss               =     0.8034
  eval_runtime            = 0:00:08.60
  eval_samples            =       1200
  eval_samples_per_second =    139.425
  eval_steps_per_second   =      8.714




Metrics Report:

Accuracy: 0.8605555555555555
Confusion Matrix:
[[367  70   8  23]
 [ 31 374   5  38]
 [  8   6 427  12]
 [ 22  16  12 381]]
Precision: 0.8613864675778775
Recall: 0.8605555555555555
F1-score: 0.8603270167581955


In [26]:
raw_datasets = load_dataset("csv", data_files=data_files)
label_list = raw_datasets["train"].unique("label")
label_list.sort()

In [27]:
label_list

['bussines', 'sci/tech', 'sports', 'world']

In [28]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: ignored

In [None]:
! mv ./results/contrastive/ ./drive/MyDrive/Thesis/Data/experiments/