# Package Installation

In [None]:
!pip install transformers
!pip install datasets # huggingface dataset

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 38.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [None]:
# Print the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Fri Apr  1 13:23:31 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Preparation

## Import necessary packages

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import pickle
import random
from numpy import ndarray
from torch import Tensor
from typing import Union, List, Dict
from multiprocessing import cpu_count
from tqdm.notebook import trange, tqdm
from torch import nn
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset
from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPoolingAndCrossAttentions


## Some useful functions

In [None]:
# Utils

def save_parameter(save_object, save_file):
    """Pickle ``save_object`` into the file at ``save_file``.

    The target is opened in binary write mode, so an existing file is
    overwritten. The highest pickle protocol available is used.

    Args:
        save_object: Any picklable Python object.
        save_file: Path of the file to write.
    """
    with open(save_file, mode='wb') as handle:
        writer = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
        writer.dump(save_object)

def load_parameter(load_file):
    """Read back one pickled object from the file at ``load_file``.

    Args:
        load_file: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(load_file, mode='rb') as handle:
        return pickle.Unpickler(handle).load()

def sim_matrix(a, b, eps=1e-8):
    """
    Pairwise cosine similarity between the rows of two 2-D tensors.

    Every row is divided by its L2 norm, with the norm clamped to at least
    ``eps`` so that all-zero rows do not trigger a division by zero.

    Args:
        a: Tensor whose rows are the first set of vectors.
        b: Tensor whose rows are the second set of vectors.
        eps: Lower bound applied to each row norm.

    Returns:
        Tensor where entry (i, j) is the cosine similarity of a[i] and b[j].
    """
    def _unit_rows(matrix):
        row_norms = matrix.norm(dim=1, keepdim=True)
        return matrix / row_norms.clamp(min=eps)

    return torch.mm(_unit_rows(a), _unit_rows(b).t())

def batch2device(batch, device):
    """Move every value of a batch mapping to ``device``, in place.

    Args:
        batch: Mutable mapping whose values expose ``.to(device)``.
        device: Target device passed to each value's ``.to``.

    Returns:
        The same ``batch`` object, with each value replaced by its moved copy.
    """
    for name in list(batch.keys()):
        batch[name] = batch[name].to(device)
    return batch

def count_parameters(model):
    """Return the total number of elements in the trainable parameters of ``model``.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# GPU accelerator: use CUDA when it is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data preparation

## Build set of paired samples

In [None]:
# Working directory; every dataset and checkpoint path below is built from it.
work_path = "/content/drive/MyDrive/PaperRecommendation/"
# Sub-folder holding the preprocessed data and the saved models
checkpoint_path = f"{work_path}checkpoint/"

In [None]:
# Load paper submission data and Journal's aims
data_train = pd.read_csv(checkpoint_path + "preprocessed_data/01_train.csv", encoding = "ISO-8859-1")
data_aims = pd.read_csv(checkpoint_path + "preprocessed_data/01_aims.csv", encoding = "ISO-8859-1")

# Replace missing cells with empty strings so the string concatenation below
# never produces NaN.
data_train.fillna("", inplace=True)
data_aims.fillna("", inplace=True)

# Attach to every paper the aims of its journal: the paper's 'Label' value is
# matched against the row index of data_aims.
merged_df = pd.merge(data_train[['Title', 'Abstract', 'Keywords', 'Label']], data_aims['Aims'], right_index=True, left_on='Label')

# construct set of pairs for contrastive fine-tuning
# Both columns must come from merged_df. The DataFrame constructor aligns
# Series by index label, and data_aims has one row per journal while merged_df
# has one row per paper, so taking 'Aims' from data_aims would not pair each
# paper with the aims of its own journal.
train_pairs = pd.DataFrame({'TAK': merged_df['Title'] + ' ' + merged_df['Abstract'] + ' ' + merged_df['Keywords'],
                            'Aims': merged_df['Aims']})
train_pairs.to_csv(checkpoint_path + "preprocessed_data/train_pairs.csv", index=False)

## Load saved pairs for training

In [None]:
# Location of the paired training file and the number of worker processes
# forwarded to the dataset preprocessing step.
data_args = {
    "train_file": checkpoint_path + "preprocessed_data/train_pairs.csv",
    "preprocessing_num_workers": None
}
# Split name -> file path mapping expected by load_dataset
data_files = {
    "train": data_args["train_file"]
}
# Tokenizer checkpoint and tokenization options
tokenizer_kwargs = {
    "pretrained_path": "distilroberta-base",
    "use_fast": True,
    "max_seq_length": 300,
    "pad_to_max_length": True,
    "truncation": True,
    "return_tensors": None
}

# Read the CSV of pairs as a dataset with a single "train" split
datasets = load_dataset("csv", data_files=data_files)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_kwargs["pretrained_path"], 
    use_fast=tokenizer_kwargs["use_fast"]
)
# Column names of the training split; the feature preparation step indexes
# them by position (first column, second column).
column_names = datasets["train"].column_names

def prepare_features(examples):
    """Tokenize a batch of text pairs into paired token features.

    Reads the first two columns named in the module-level ``column_names``
    from ``examples``, replaces missing (``None``) texts with a single space,
    and tokenizes both columns in one tokenizer call using the options in
    ``tokenizer_kwargs``.

    Args:
        examples: Batch mapping each column name to a list of texts.

    Returns:
        A dict mapping every tokenizer output key to a list with one entry
        per example; each entry is ``[features_of_first_text,
        features_of_second_text]``.
    """
    # Build cleaned copies instead of writing into `examples`: in-place edits
    # only work if every column access returns the very same list object.
    first_texts = [" " if text is None else text for text in examples[column_names[0]]]
    second_texts = [" " if text is None else text for text in examples[column_names[1]]]
    total = len(first_texts)

    # One tokenizer call for both columns: outputs [0, total) belong to the
    # first column and [total, 2 * total) to the second.
    sent_features = tokenizer(
        first_texts + second_texts,
        max_length=tokenizer_kwargs["max_seq_length"],
        truncation=tokenizer_kwargs["truncation"],
        padding="max_length" if tokenizer_kwargs["pad_to_max_length"] else False,
        return_tensors=tokenizer_kwargs["return_tensors"]
    )

    # Re-pair the outputs so example i carries [first_i, second_i] per key.
    features = {}
    for key in sent_features:
        values = sent_features[key]
        features[key] = [[values[i], values[i + total]] for i in range(total)]
    return features

# Tokenize the training split batch by batch. The original text columns are
# removed, so only the outputs of prepare_features remain in train_dataset.
train_dataset = datasets["train"].map(
    prepare_features, 
    batched=True,
    num_proc=data_args["preprocessing_num_workers"],
    remove_columns=column_names
)

Using custom data configuration default-5f6bc9f5310a6fec


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-5f6bc9f5310a6fec/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-5f6bc9f5310a6fec/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/299 [00:00<?, ?ba/s]

## Data loader

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that converts each field of a row to a tensor on access.

    ``dataset`` must support ``len()`` and integer indexing, and each indexed
    row must provide ``.items()`` yielding ``(name, values)`` pairs.
    """
    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset[idx]
        tensors = {}
        for name, values in row.items():
            tensors[name] = torch.tensor(values)
        return tensors

In [None]:
# Wrap the tokenized pairs so each item is returned as a dict of tensors
dataset = Dataset(train_dataset)
# Shuffled mini-batches of 32 pairs
data_loader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True)

# Model definition

## Pooler layer

In [None]:
class Pooler(nn.Module):
    """
    Parameter-free pooling of encoder outputs into one vector per sequence.

    Supported ``pooler_type`` values:
    'cls' / 'cls_before_pooler': hidden state of the first token taken from
        ``last_hidden_state``. Both names are handled identically here; no
        MLP pooler is applied.
    'avg': attention-mask-weighted mean of ``last_hidden_state`` over tokens.
    'avg_top2': masked mean of the average of the last two entries of
        ``hidden_states``.
    'avg_first_last': masked mean of the average of the first and the last
        entries of ``hidden_states``.
    """
    def __init__(self, pooler_type):
        super().__init__()
        self.pooler_type = pooler_type
        assert self.pooler_type in ["cls", "cls_before_pooler", "avg", "avg_top2", "avg_first_last"], "unrecognized pooling type %s" % self.pooler_type

    @staticmethod
    def _masked_mean(token_states, attention_mask):
        """Average ``token_states`` over dim 1, weighting each token by its mask value."""
        weighted = token_states * attention_mask.unsqueeze(-1)
        return weighted.sum(1) / attention_mask.sum(-1).unsqueeze(-1)

    def forward(self, attention_mask, outputs):
        """Pool ``outputs`` according to ``self.pooler_type``.

        Args:
            attention_mask: Mask with one value per token; used as the
                weights of the averaging modes.
            outputs: Object exposing ``last_hidden_state`` and
                ``hidden_states`` (the latter is only indexed by the
                'avg_top2' and 'avg_first_last' modes).

        Returns:
            One pooled vector per sequence.

        Raises:
            NotImplementedError: If ``self.pooler_type`` is not supported.
        """
        last_hidden = outputs.last_hidden_state
        hidden_states = outputs.hidden_states
        kind = self.pooler_type

        if kind == 'cls_before_pooler' or kind == 'cls':
            return last_hidden[:, 0]
        if kind == "avg":
            return self._masked_mean(last_hidden, attention_mask)
        if kind == "avg_first_last":
            blended = (hidden_states[0] + hidden_states[-1]) / 2.0
            return self._masked_mean(blended, attention_mask)
        if kind == "avg_top2":
            blended = (hidden_states[-1] + hidden_states[-2]) / 2.0
            return self._masked_mean(blended, attention_mask)
        raise NotImplementedError


# Model for contrastive learning training

In [None]:
class ModelForCL(nn.Module):
    """Pretrained encoder followed by a parameter-free pooler, for contrastive training.

    Each training instance holds several sentences (2 for a pair, 3 for a
    pair with a hard negative). All sentences are encoded in a single pass
    and the pooled vectors are returned grouped per instance.
    """
    def __init__(self, model_name_or_path, pooler_type):
        """
        Args:
            model_name_or_path: Checkpoint name or path given to
                ``AutoModel.from_pretrained``.
            pooler_type: Pooling mode understood by ``Pooler``.
        """
        super(ModelForCL, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name_or_path)
        self.pooler_type = pooler_type
        self.pooler = Pooler(self.pooler_type)
        
    def forward(self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        mlm_input_ids=None,
        mlm_labels=None,
    ):
        """Encode every sentence of every instance and pool the result.

        Args:
            input_ids: Token ids of shape (bs, num_sent, len).
            attention_mask: Mask with the same shape as ``input_ids``.
            head_mask, inputs_embeds, output_attentions: Forwarded to the
                encoder unchanged.
            token_type_ids, position_ids, labels, output_hidden_states,
            return_dict, mlm_input_ids, mlm_labels: Accepted so the call
                signature stays stable, but not used by this method.

        Returns:
            BaseModelOutputWithPoolingAndCrossAttentions whose
            ``pooler_output`` has shape (bs, num_sent, hidden).
            ``last_hidden_state`` and ``hidden_states`` are the encoder's
            outputs for the flattened (bs * num_sent) batch.
        """
        batch_size = input_ids.size(0)
        # Number of sentences in one instance
        # 2: pair instance; 3: pair instance with a hard negative
        num_sent = input_ids.size(1)

        # Flatten input for encoding
        input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len)
        attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent, len)

        # Get raw embeddings
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            # All hidden states are only needed by the multi-layer averaging poolers
            output_hidden_states=self.pooler_type in ['avg_top2', 'avg_first_last'],
            # The pooler and the return value below read attributes of the
            # output object, so a tuple output (return_dict=False) would
            # break them; always request the structured output.
            return_dict=True,
        )

        # Pooling (the pooler itself rejects unsupported pooler types)
        pooler_output = self.pooler(attention_mask, outputs)
        pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1))) # (bs, num_sent, hidden)
        
        return BaseModelOutputWithPoolingAndCrossAttentions(
            pooler_output=pooler_output,
            last_hidden_state=outputs.last_hidden_state,
            hidden_states=outputs.hidden_states,
        )

## Contrastive Loss

In [None]:
class SupervisedContrastiveLoss(nn.Module):
    """Contrastive loss over paired embeddings with in-batch negatives.

    For row i, ``z2[i]`` is the positive of ``z1[i]`` and every other row of
    ``z2`` is a negative. With ``s_ij = cos(z1[i], z2[j]) / temperature`` the
    loss is ``mean_i( logsumexp_j(s_ij) - s_ii )``, which equals
    ``-mean_i log( exp(s_ii) / sum_j exp(s_ij) )``.

    The loss is evaluated in log space: exponentiating ``s_ij`` directly
    overflows float32 for small temperatures and turns the loss into NaN.
    """
    def __init__(self, temperature=0.1):
        """
        Args:
            temperature: Positive scaling factor dividing the cosine similarities.
        """
        super(SupervisedContrastiveLoss, self).__init__()
        self.temperature = temperature
        # Kept as a public attribute for callers that use it; the loss itself
        # uses the vectorised similarity matrix below.
        self.sim = nn.CosineSimilarity()

    def _scaled_cosine(self, z1, z2, eps=1e-8):
        """Return the matrix of cos(z1[i], z2[j]) / temperature for 2-D inputs.

        Row norms are clamped to at least ``eps`` so all-zero rows do not
        cause a division by zero.
        """
        z1_unit = z1 / z1.norm(dim=1, keepdim=True).clamp(min=eps)
        z2_unit = z2 / z2.norm(dim=1, keepdim=True).clamp(min=eps)
        return torch.mm(z1_unit, z2_unit.t()) / self.temperature

    def _eval_denom(self, z1, z2):
        """Return sum_j exp(s_ij) for every row i.

        Retained for compatibility; the loss does not call it because this
        quantity can overflow for small temperatures.
        """
        return torch.exp(self._scaled_cosine(z1, z2)).sum(dim=1)

    def _contrastive_loss(self, z1, z2):
        """Compute the mean contrastive loss; ``z1`` and ``z2`` must have the same number of rows."""
        logits = self._scaled_cosine(z1, z2)
        positive_logits = torch.diagonal(logits)
        log_denominator = torch.logsumexp(logits, dim=1)
        return torch.mean(log_denominator - positive_logits)

    def forward(self, z1, z2):
        """
        Args:
            z1: Tensor of shape (batch, hidden) with the anchor embeddings.
            z2: Tensor of shape (batch, hidden) with the matching positives.

        Returns:
            Scalar tensor holding the loss averaged over the batch.
        """
        return self._contrastive_loss(z1, z2)

## Model declaration

In [None]:
# Encoder checkpoint and pooling mode passed to ModelForCL
model_args = {
    'model_name_or_path': 'distilroberta-base',
    'pooler_type': 'cls_before_pooler'
}
model = ModelForCL(**model_args)
# Move the model to the selected device; as the last expression of the cell
# this also displays the module tree.
model.to(device)

Downloading:   0%|          | 0.00/316M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ModelForCL(
  (bert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), e

# Training

## Optimizer and configuration

In [None]:
# Multiplicative learning-rate decay factor (used as StepLR's gamma)
decayRate = 0.86
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)
# The learning rate is multiplied by decayRate once every `step_size` calls
# to lr_scheduler.step().
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer=optimizer, step_size=2, gamma=decayRate)

# Contrastive loss with temperature 0.1
loss_fn = SupervisedContrastiveLoss(0.1)

In [None]:
# Fine-tune the encoder and save a checkpoint whenever the mean epoch loss
# reaches a new minimum.
min_loss = np.inf
max_epochs = 10

# Models loaded with from_pretrained start in evaluation mode (dropout off);
# switch to training mode before optimising.
model.train()

for epoch in range(max_epochs):
    loop = tqdm(data_loader, leave=True)
    train_loss = 0.0

    for batch in loop:
        optimizer.zero_grad()

        # Move the batch to the model's device. This is done unconditionally:
        # on a CPU-only runtime it leaves the tensors where they are, and
        # `inputs` is always defined (it used to be assigned only under CUDA).
        inputs = batch2device(batch, device)

        # forward 
        outputs = model(**inputs) 
        # Separate representation: first and second sentence of every pair
        z1, z2 = outputs.pooler_output[:, 0], outputs.pooler_output[:, 1]

        # backward 
        loss = loss_fn(z1, z2)
        loss.backward()
        batch_loss = loss.item()
        train_loss += batch_loss
        # Update Weights
        optimizer.step()

        loop.set_description('Epoch: {} - lr:{}'.format(epoch, optimizer.param_groups[0]['lr']))
        loop.set_postfix(loss=batch_loss)

    # Mean loss over the batches of this epoch
    train_loss = train_loss / len(data_loader)
    # The scheduler is stepped once per epoch
    lr_scheduler.step()

    if min_loss > train_loss:
        print(f">> Loss Decreased({min_loss:.6f}--->{train_loss:.6f})")
        min_loss = train_loss
        save_path = checkpoint_path + 'saved_model/'
        os.makedirs(save_path, exist_ok=True)
        torch.save({
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "min_loss": min_loss,
            "epoch": epoch
        }, save_path + "Epoch:{:0>2} SupCL-DistilRoBERTa.pth".format(epoch))