In [1]:
import os
import easydict
import json
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pickle

from matplotlib import pyplot as plt
from torch.utils.data import DataLoader, TensorDataset, ConcatDataset
from tqdm import tqdm
from datasets import load_dataset

import sys
sys.path.append('/home/jaehyung/workspace/infoverse/')

from src.models import load_backbone, Classifier
from src.training.common import AverageMeter

# Preliminary

Adding syntactic noise label to train dataset

In [2]:
_, tokenizer = load_backbone('roberta_large')

2023-06-12 21:17:38.779993: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/jaehyung/torch/install/lib:/home/jaehyung/torch/install/lib:/usr/local/cuda/lib64:/home/jaehyung/torch/install/lib:/home/jaehyung/torch/install/lib:/usr/local/cuda/lib64:
2023-06-12 21:17:38.780047: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
In Transformers v4.0.0, the default path to cache downloaded models changed from '~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to '~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should only see this message once.
Some weig

In [35]:
def convert_csv_inputs(csv_file, tokenizer, exc=False, n_samples = 1001):
    res_numpy_all = np.zeros((n_samples, 128))
    if exc:
        n_hit = len(csv_file) - 1
    else:
        n_hit = len(csv_file)
    
    id_idx = ['Input.id0', 'Input.id1', 'Input.id2', 'Input.id3', 'Input.id4', 
              'Input.id5', 'Input.id6', 'Input.id7', 'Input.id8', 'Input.id9']
    input_idx = ['Input.text0', 'Input.text1', 'Input.text2',  'Input.text3', 'Input.text4',
                 'Input.text5', 'Input.text6', 'Input.text7', 'Input.text8', 'Input.text9']
    
    for i in range(len(id_idx)):
        idx = csv_file[id_idx[i]]
        inputs = csv_file[input_idx[i]]
        
        for j in range(n_hit):
            input_j = tokenizer.encode(inputs[j], add_special_tokens=True, max_length=128, pad_to_max_length=True, return_tensors='pt')
            res_numpy_all[int(idx[j])] = input_j
    
    return res_numpy_all[1:].astype(np.int64)

In [34]:
def convert_csv_to_numpy(csv_file, data, n_samples = 1001):
    res_numpy = np.zeros(n_samples)
    res_numpy_all = np.zeros((n_samples, 7))
    
    n_counts = np.zeros(n_samples)
    n_hit = len(csv_file) - 1
    
    id_idx = ['Input.id0', 'Input.id1', 'Input.id2', 'Input.id3', 'Input.id4', 
              'Input.id5', 'Input.id6', 'Input.id7', 'Input.id8', 'Input.id9']
    if data == 'sst5':
        score_idx = ['Answer.howMuch0', 'Answer.howMuch1', 'Answer.howMuch2',
           'Answer.howMuch3', 'Answer.howMuch4', 'Answer.howMuch5',
           'Answer.howMuch6', 'Answer.howMuch7', 'Answer.howMuch8',
           'Answer.howMuch9']
    else:
        score_idx = ['Answer.insult0.insult0', 'Answer.insult1.insult1',
           'Answer.insult2.insult2', 'Answer.insult3.insult3',
           'Answer.insult4.insult4', 'Answer.insult5.insult5',
           'Answer.insult6.insult6', 'Answer.insult7.insult7',
           'Answer.insult8.insult8', 'Answer.insult9.insult9']
    
    for i in range(len(id_idx)):
        idx = csv_file[id_idx[i]]
        score = csv_file[score_idx[i]]
        
        for j in range(n_hit):
            res_numpy[int(idx[j])] += int(score[j])
            res_numpy_all[int(idx[j])][int(n_counts[int(idx[j])])] = int(score[j])
            n_counts[int(idx[j])] += 1
    
    return res_numpy[1:], n_counts[1:], res_numpy_all[1:]

In [4]:
def disagree(all_votes, num_votes):
    n_samples = len(all_votes)
    
    disagree = np.zeros(n_samples)
    for i in range(n_samples):
        disagree[i] = np.std(all_votes[i, :int(num_votes[i])])
    
    return disagree

In [5]:
def disagree_sst5(all_votes, num_votes):
    n_samples = len(all_votes)
    
    all_votes_hard = np.array(all_votes)
    for i in range(5):
        i_idx = (5 * i <= all_votes) * (all_votes < 5 * (i+1))
        all_votes_hard[i_idx] = i
    
    all_votes_hard /= 5
    
    disagree = np.zeros(n_samples)
    for i in range(n_samples):
        disagree[i] = np.std(all_votes_hard[i, :int(num_votes[i])])
    
    return disagree

# SST5

In [7]:
import pandas as pd

In [30]:
random_csv = pd.read_csv('./anno_files/sst5_random_anno.csv')
infoverse_csv = pd.read_csv('./anno_files/sst5_info_anno.csv')
uncertain_csv = pd.read_csv('./anno_files/sst5_uncertain_anno.csv')

In [15]:
def get_sst5_labels_from_csv(csv_file):
    sum_votes, num_votes, all_votes = convert_csv_to_numpy(csv_file, 'sst5')
    
    avg_votes = sum_votes / num_votes
    
    final_labels = np.zeros(len(avg_votes))
    n_labels = np.zeros(5)
    for i in range(5):
        i_idx = (5 * i <= avg_votes) * (avg_votes < 5 * (i+1))
        final_labels[i_idx] = i
        n_labels[i] = (i_idx).sum()
    
    # Get annotation disagreements
    disagreements = disagree_sst5(all_votes, num_votes)
    
    print(n_labels)
    return final_labels, disagreements

In [36]:
random_labels, random_disagree = get_sst5_labels_from_csv(random_csv)

[ 22. 142. 434. 273. 128.]


In [17]:
uncertain_labels, uncertain_disagree = get_sst5_labels_from_csv(uncertain_csv)

[ 33. 184. 376. 310.  96.]


In [18]:
infoverse_labels, infoverse_disagree = get_sst5_labels_from_csv(infoverse_csv)

[ 29. 164. 399. 268. 136.]


In [32]:
random_inputs = convert_csv_inputs(random_csv, tokenizer, True)

In [20]:
uncertain_inputs = convert_csv_inputs(uncertain_csv, tokenizer)

In [21]:
infoverse_inputs = convert_csv_inputs(infoverse_csv, tokenizer)

# IMP

In [38]:
random_csv = pd.read_csv('./anno_files/imp_random_anno.csv')
infoverse_csv = pd.read_csv('./anno_files/imp_info_anno.csv')
uncertain_csv = pd.read_csv('./anno_files/imp_uncertain_anno.csv')

In [39]:
def get_imp_labels_from_csv(csv_file):
    sum_votes, num_votes, all_votes = convert_csv_to_numpy(csv_file, 601)
    
    avg_votes = sum_votes / num_votes
    
    final_labels = np.zeros(len(avg_votes))
    n_labels = np.zeros(2)
    
    i_idx = (avg_votes < 0.5)
    final_labels[i_idx] = 0
    n_labels[0] = (i_idx).sum()
    
    i_idx2 = (avg_votes >= 0.5)
    final_labels[i_idx2] = 1
    n_labels[1] = (i_idx2).sum()
    
    # Get annotation disagreements
    disagreements = disagree(all_votes, num_votes)
    
    print(n_labels)
    return final_labels, disagreements

In [40]:
random_labels, random_disagree = get_imp_labels_from_csv(random_csv)

[562.  38.]


  after removing the cwd from sys.path.
  keepdims=keepdims, where=where)
  subok=False)
  ret = ret.dtype.type(ret / rcount)


In [41]:
uncertain_labels, uncertain_disagree = get_imp_labels_from_csv(uncertain_csv)

[363. 237.]


  after removing the cwd from sys.path.


In [42]:
infoverse_labels, infoverse_disagree = get_imp_labels_from_csv(infoverse_csv)

[373. 227.]


  after removing the cwd from sys.path.


In [46]:
random_inputs = convert_csv_inputs(random_csv, tokenizer, False, 601)

In [47]:
uncertain_inputs = convert_csv_inputs(uncertain_csv, tokenizer, False, 601)

In [48]:
infoverse_inputs = convert_csv_inputs(infoverse_csv, tokenizer, False, 601)

# Save Annoted Samples as Dataset

In [112]:
def create_tensor_dataset(inputs, labels):
    inputs = torch.LongTensor(inputs)
    labels = torch.LongTensor(labels).unsqueeze(1)  # (N, 1)
    index = torch.arange(len(inputs))

    dataset = TensorDataset(inputs, labels, index)

    return dataset

In [111]:
from torch.utils.data import TensorDataset

In [185]:
random_dataset = create_tensor_dataset(random_inputs, random_labels)

In [186]:
uncertain_dataset = create_tensor_dataset(uncertain_inputs, uncertain_labels)

In [187]:
infoverse_dataset = create_tensor_dataset(infoverse_inputs, infoverse_labels)

In [188]:
torch.save(random_dataset, './anno_files/imp_random.pt')

In [189]:
torch.save(uncertain_dataset, './anno_files/imp_uncertain.pt')

In [190]:
torch.save(infoverse_dataset, './anno_files/imp_infoverse.pt')