In [1]:
import os
import torch
import numpy as np
import time 
import torch.nn.functional as F
import easydict
import pickle
from torch.utils.data import TensorDataset, DataLoader
from tqdm import tqdm
from datasets import load_dataset
import json

from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM, RobertaForMaskedLM
from transformers import RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm
In Transformers v4.0.0, the default path to cache downloaded models changed from '~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to '~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should only see this message once.


In [2]:
from datasets import load_dataset

In [3]:
r2_dataset = load_dataset("dynabench/dynasent", "dynabench.dynasent.r2.all")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.
Repo card metadata block was not found. Setting CardData to empty.


In [4]:
all_sent = r2_dataset['train']['sentence']

In [5]:
n_samples = len(r2_dataset['train'])

In [6]:
all_labels = r2_dataset['train']['label_distribution']
n_sample = len(all_labels)
all_annotation = torch.zeros(n_sample, 4)
label_str = ['negative', 'positive', 'neutral', 'mixed']

for i in range(n_sample):
    for j in range(len(label_str)):
        all_annotation[i, j] = len(all_labels[i][label_str[j]])

In [7]:
divide_anno = torch.zeros(5, n_samples).long()
for i in range(n_sample):
    idx = 0
    for j in range(4):
        for k in range(int(all_annotation[i, j])):
            if j == 3:
                selected_cls = torch.randint(0, 2, (1,))
                divide_anno[idx, i] = selected_cls
            else:
                divide_anno[idx, i] = j
            idx += 1

In [8]:
divide_anno

tensor([[1, 1, 0,  ..., 0, 0, 1],
        [1, 1, 0,  ..., 0, 1, 1],
        [1, 1, 0,  ..., 0, 1, 1],
        [1, 1, 0,  ..., 2, 1, 1],
        [0, 1, 0,  ..., 2, 1, 1]])

### Hard Labels

In [9]:
n_samples = len(r2_dataset['train']['gold_label'])
hard_labels = []
gold_labels = r2_dataset['train']['gold_label']
for i in range(n_samples):
    if gold_labels[i] == 'negative':
        gold_label = 0
    elif gold_labels[i] == 'positive':
        gold_label = 1
    else:
        gold_label = 2
    hard_labels.append(gold_label)
hard_labels_t = torch.LongTensor(hard_labels)

In [10]:
hard_labels_t

tensor([1, 1, 0,  ..., 0, 1, 1])

### Soft Labels

In [11]:
soft_labels = torch.zeros(n_samples, 3)
all_labels = r2_dataset['train']['label_distribution']
for i in range(n_samples):
    soft_labels[i, 0] += len(all_labels[i]['negative'])
    soft_labels[i, 1] += len(all_labels[i]['positive'])
    soft_labels[i, 2] += len(all_labels[i]['neutral'])
    
    soft_labels[i, 0] += (0.5 * len(all_labels[i]['mixed']))
    soft_labels[i, 1] += (0.5 * len(all_labels[i]['mixed']))
soft_labels = soft_labels / soft_labels.sum(dim=-1, keepdim=True)

In [12]:
np.save('./pre_gen/dynasent2_soft_label.npy', soft_labels.numpy())

### Human Preference (at least)

In [12]:
label_str = ['negative', 'positive', 'neutral']
indices_all = {}
indices_all[label_str[0]] = list((hard_labels_t == 0).float().nonzero()[:, 0].numpy())
indices_all[label_str[1]] = list((hard_labels_t == 1).float().nonzero()[:, 0].numpy())
indices_all[label_str[2]] = list((hard_labels_t == 2).float().nonzero()[:, 0].numpy())
prefers = soft_labels.max(dim=1)[0]

	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  indices_all[label_str[0]] = list((hard_labels_t == 0).float().nonzero()[:, 0].numpy())


In [13]:
human_preference_all = {}
for label in label_str:
    pref = prefers[indices_all[label]]
    mat = pref.unsqueeze(1) - pref.unsqueeze(0)
    convert = 2 * (mat == 0) + 1 * (mat > 0) + torch.eye(len(indices_all[label]))
    
    human_preference_all[label] = mat

In [14]:
human_preference_all

{'negative': tensor([[ 0.0000, -0.1000, -0.1000,  ...,  0.1000,  0.3000,  0.3000],
         [ 0.1000,  0.0000,  0.0000,  ...,  0.2000,  0.4000,  0.4000],
         [ 0.1000,  0.0000,  0.0000,  ...,  0.2000,  0.4000,  0.4000],
         ...,
         [-0.1000, -0.2000, -0.2000,  ...,  0.0000,  0.2000,  0.2000],
         [-0.3000, -0.4000, -0.4000,  ..., -0.2000,  0.0000,  0.0000],
         [-0.3000, -0.4000, -0.4000,  ..., -0.2000,  0.0000,  0.0000]]),
 'positive': tensor([[ 0.0000, -0.1000,  0.1000,  ...,  0.1000,  0.1000, -0.1000],
         [ 0.1000,  0.0000,  0.2000,  ...,  0.2000,  0.2000,  0.0000],
         [-0.1000, -0.2000,  0.0000,  ...,  0.0000,  0.0000, -0.2000],
         ...,
         [-0.1000, -0.2000,  0.0000,  ...,  0.0000,  0.0000, -0.2000],
         [-0.1000, -0.2000,  0.0000,  ...,  0.0000,  0.0000, -0.2000],
         [ 0.1000,  0.0000,  0.2000,  ...,  0.2000,  0.2000,  0.0000]]),
 'neutral': tensor([[ 0.0000,  0.0000,  0.2000,  ...,  0.2000,  0.4000,  0.4000],
         [

In [15]:
indices_all

{'negative': [2,
  6,
  16,
  19,
  20,
  22,
  25,
  33,
  34,
  41,
  52,
  53,
  54,
  58,
  61,
  62,
  63,
  64,
  66,
  68,
  73,
  74,
  78,
  79,
  80,
  85,
  89,
  91,
  92,
  94,
  95,
  98,
  99,
  100,
  102,
  104,
  106,
  111,
  114,
  115,
  116,
  117,
  120,
  123,
  126,
  127,
  129,
  130,
  131,
  133,
  135,
  136,
  139,
  140,
  148,
  149,
  155,
  160,
  162,
  163,
  165,
  175,
  176,
  178,
  180,
  183,
  185,
  186,
  188,
  192,
  194,
  195,
  196,
  203,
  204,
  208,
  210,
  214,
  219,
  228,
  230,
  233,
  236,
  239,
  240,
  246,
  247,
  249,
  254,
  260,
  264,
  265,
  269,
  270,
  271,
  274,
  275,
  277,
  280,
  282,
  291,
  293,
  297,
  305,
  308,
  311,
  314,
  321,
  322,
  324,
  332,
  337,
  338,
  342,
  343,
  344,
  345,
  351,
  353,
  354,
  357,
  358,
  363,
  369,
  374,
  375,
  377,
  378,
  380,
  382,
  383,
  384,
  385,
  386,
  391,
  396,
  397,
  398,
  401,
  402,
  408,
  410,
  411,
  412,
  414,
  417,
 

In [15]:
with open('./pre_gen/dynasent2_indices.pkl', 'wb') as f:
    pickle.dump(indices_all, f)
    
with open('./pre_gen/dynasent2_human_pref.pkl', 'wb') as f:
    pickle.dump(human_preference_all, f)    

In [16]:
rand_idx_pref = torch.zeros(20, n_samples, 2).long()

for i in range(n_samples):
    candidate_idx = indices_all[gold_labels[i]]
    i_loc = candidate_idx.index(i)

    for k in range(20):
        rand_idx = np.random.randint(0, len(candidate_idx))
        while i_loc == rand_idx:
            rand_idx = np.random.randint(0, len(candidate_idx))

        rand_idx_pref[k, i, 0] = candidate_idx[rand_idx]
        rand_idx_pref[k, i, 1] = human_preference_all[gold_labels[i]][i_loc, rand_idx]

In [17]:
np.save('./pre_gen/dynasent2_idx_pref_random20.npy', rand_idx_pref.numpy())