In [None]:
import os
import easydict
import json

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pickle
from matplotlib import pyplot as plt

from torch.utils.data import DataLoader, TensorDataset
from models import load_backbone, Classifier
from data import get_base_dataset
from utils import Logger, set_seed, set_model_path, save_model, load_augment, add_mislabel_dataset
from training.common import cut_input, get_embed, data_aug

In [None]:
import sys
sys.path.append('/home/jaehyung/workspace/infoverse/')

In [2]:
from src.scores_src import get_features, merge_multiple_models
from src.scores_src import avg_conf_variab, avg_forgetting, avg_aum
from src.scores_src import get_density_score, get_mlm_scores, masking_dataset, get_mlm_scores_jh, get_sentence_embedding, compute_nearest_neighbour_distances_cls
from src.scores_src import confidence, entropy, badge_grads_norm
from src.scores_src.ensembles import mc_dropout_models, el2n_score, ens_max_ent, ens_bald, ens_varR
from src.scores_src.dpp import gaussian_kernel, dpp_greedy, dpp_sampling
from src.scores_src.info import aggregate, get_infoverse

In [3]:
from sklearn.linear_model import LinearRegression, Lasso, SGDClassifier
from sklearn.model_selection import train_test_split
from MulticoreTSNE import MulticoreTSNE as TSNE
from matplotlib import pyplot as plt

# Preliminary

In [4]:
# Experiment configuration read by the cells below. EasyDict exposes every
# entry as an attribute (args.backbone, args.batch_size, ...).
args = easydict.EasyDict(dict(
    batch_size=2,
    backbone='roberta_large',
    dataset='imp',
    ood_dataset='trec',
    train_type='base',
    aug_type='none',
    seed=1234,
    name='0417_base_large',
    pre_ckpt='./logs/imp_R1.0_0417_base_large_S1234/imp_roberta-large_0417_base_large_epoch4.model',
    score_type='confidence',
    topK=True,
    data_ratio=1.0,
    n_classes=2,
))

Adding syntactic noise label to train dataset

In [5]:
# Load the backbone network and its matching tokenizer selected by args.backbone.
backbone, tokenizer = load_backbone(args.backbone)

In Transformers v4.0.0, the default path to cache downloaded models changed from '~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to '~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should only see this message once.
Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-large and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Build the dataset object plus train/val/test loaders for args.dataset (shuffling disabled).
dataset, train_loader, val_loader, test_loader = get_base_dataset(args.dataset, tokenizer, args.batch_size, args.data_ratio, args.seed, shuffle=False)

Initializing base dataset... (name: imp)


In [7]:
# Replace the train loader with one built directly on dataset.train_dataset:
# fixed sample order, last partial batch kept.
train_loader = DataLoader(dataset.train_dataset, shuffle=False, drop_last=False, batch_size=args.batch_size, num_workers=4)

In [8]:
# Column 0 of the second field of the full validation set, as a NumPy array.
# Presumably the class labels — TODO confirm against the dataset definition.
labels_v = dataset.val_dataset[:][1][:, 0].numpy()

In [9]:
# Column 0 of the second field of the full training set, as a NumPy array.
# Presumably the class labels — TODO confirm against the dataset definition.
labels_t = dataset.train_dataset[:][1][:, 0].numpy()

In [11]:
# Build the classifier around the loaded backbone and move it to the GPU.
model = Classifier(args.backbone, backbone, dataset.n_classes, args.train_type).cuda()
# Restore weights from the configured checkpoint, if one is given.
if args.pre_ckpt is not None:
    model.load_state_dict(torch.load(args.pre_ckpt))

## Load Dataset

In [12]:
import json

In [13]:
# Load `data` from imp_reduce_5K.json; the path is relative to the working directory.
with open('imp_reduce_5K.json') as json_file:
    data = json.load(json_file)

In [14]:
import advertools as adv

In [15]:
# Keep only the samples for which advertools reports no emoji
# (an empty 'emoji_flat' list in the extraction summary).
reduced_data = [
    sample
    for sample in data
    if len(adv.extract_emoji(sample)['emoji_flat']) == 0
]

In [20]:
def tokenization(sentences, tokenizer, max_length=128):
    """Encode sentences into one tensor of fixed-length token ids.

    Args:
        sentences: sequence of texts; each one is passed to ``tokenizer.encode``.
        tokenizer: object whose ``encode`` accepts ``add_special_tokens``,
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``.
        max_length: length every encoded sequence is padded or truncated to.

    Returns:
        The per-sentence encodings concatenated along dim 0. For an empty
        ``sentences`` an empty ``(0, max_length)`` long tensor is returned
        instead of failing inside ``torch.cat``.
    """
    if len(sentences) == 0:
        return torch.empty((0, max_length), dtype=torch.long)

    # Request padding and truncation explicitly: the deprecated
    # `pad_to_max_length` flag together with an implicit truncation strategy is
    # what triggers the "Truncation was not explicitly activated" warning.
    encoded = [
        tokenizer.encode(sentence, add_special_tokens=True, max_length=max_length,
                         padding='max_length', truncation=True, return_tensors='pt')
        for sentence in sentences
    ]
    return torch.cat(encoded, dim=0)

## Filtering with GPT

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AutoModelForCausalLM, RobertaTokenizer
# Load the 'gpt2' tokenizer and language-model head; the model is moved to the GPU.
gpt_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt_model = GPT2LMHeadModel.from_pretrained('gpt2').cuda()

In [None]:
def perplexity(gpt_model, gpt_tokenizer, text_data, stride=512, max_length=512):
    """Compute a sliding-window perplexity for every text in ``text_data``.

    Args:
        gpt_model: causal language model called as ``gpt_model(input_ids,
            labels=target_ids)``; element 0 of its output is used as the loss
            (assumed to be a mean per-token loss — confirm for other models).
            It must own at least one parameter, which determines the device.
        gpt_tokenizer: tokenizer whose ``encode(text, return_tensors='pt')``
            returns a ``(1, n_tokens)`` id tensor.
        text_data: iterable of strings.
        stride: number of new tokens scored per window.
        max_length: maximum number of tokens fed to the model per window.

    Returns:
        A list of Python floats, one per input text, in input order. A text
        that encodes to zero tokens gets ``float('inf')`` so it is rejected by
        any upper-bound perplexity filter instead of crashing the whole pass.
    """
    # Follow the model's device rather than assuming CUDA is available.
    device = next(gpt_model.parameters()).device

    ppls = []
    for text in text_data:
        sample_token = gpt_tokenizer.encode(text, return_tensors='pt').to(device)
        n_tokens = sample_token.size(1)

        if n_tokens == 0:
            # No window would be scored: nothing to stack and no length to
            # normalise by.
            ppls.append(float('inf'))
            continue

        nlls = []
        for i in range(0, n_tokens, stride):
            begin_loc = max(i + stride - max_length, 0)
            end_loc = min(i + stride, n_tokens)
            trg_len = end_loc - i  # may be different from stride on last loop
            input_ids = sample_token[:, begin_loc:end_loc]
            target_ids = input_ids.clone()
            # Tokens that only provide context are masked out of the loss.
            target_ids[:, :-trg_len] = -100

            with torch.no_grad():
                outputs = gpt_model(input_ids, labels=target_ids)
                # Scale the reported loss back to a per-window total.
                neg_log_likelihood = outputs[0] * trg_len

            nlls.append(neg_log_likelihood)

        # Normalise by this text's own token count (equals the final end_loc).
        ppl = torch.exp(torch.stack(nlls).sum() / n_tokens)
        ppls.append(float(ppl.cpu()))

    return ppls

In [None]:
# Score every emoji-free sample with the GPT-2 perplexity helper, then convert
# the resulting list to a NumPy array.
gpt_ppls = perplexity(gpt_model, gpt_tokenizer, reduced_data)
gpt_ppls = np.array(gpt_ppls)

In [None]:
# Positions whose perplexity is not >= 1000. The test is written as a negation
# so that NaN entries (which fail every comparison) are kept, as before.
trimmed_idx = [i for i, ppl in enumerate(gpt_ppls) if not ppl >= 1000]

In [None]:
import re
def repetitions(s):
    """Yield ``(unit, count)`` for each back-to-back repeated substring in ``s``.

    ``unit`` is the text captured by the lazy group and ``count`` is the length
    of the whole repeated run divided by the length of ``unit`` (a float).
    """
    for found in re.finditer(r"(.+?)\1+", s):
        unit, run = found.group(1), found.group(0)
        yield (unit, len(run) / len(unit))

In [None]:
def get_repetition(sent):
    """Return the largest repeat count that ``repetitions`` reports for ``sent``.

    Returns 0 when ``sent`` contains no repeated substring.
    """
    counts = [count for _, count in repetitions(sent)]
    # Seeding with 0 reproduces the "no repetition" result and the
    # strictly-greater update rule (the first maximum wins).
    return max([0] + counts)

In [None]:
# Keep samples whose strongest repetition is below 5 and which are shorter
# than 512 characters.
# NOTE(review): `trimmed_idx` (the perplexity filter) is not applied here, so
# high-perplexity samples remain in `trimmed_data` — confirm this is intended.
trimmed_data = [
    sent
    for sent in reduced_data
    if get_repetition(sent) < 5 and len(sent) < 512
]

## Construct Loader

In [21]:
# Encode the filtered pool sentences with the classifier's tokenizer.
tokens_data = tokenization(trimmed_data, tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
def get_pseudo(tokens, model, batch_size=16):
    """Run ``model`` over ``tokens`` in batches and collect pseudo-labels.

    Args:
        tokens: tensor of token ids, indexed along dim 0 by sample.
        model: module called as ``model(batch, get_penul=True)`` that returns
            ``(logits, penultimate_features)``. It is switched to eval mode.
        batch_size: number of samples per forward pass.

    Returns:
        ``(confidences, labels, penultimate_features)`` as CPU tensors, each
        concatenated over all batches along dim 0. Confidence is the maximum
        softmax probability over dim 1 and the label is its argmax.
    """
    model.eval()

    # Send each batch to wherever the model lives instead of assuming CUDA;
    # a parameter-free model leaves the batch where it already is.
    first_param = next(model.parameters(), None)

    all_confs, all_labels, all_penuls = [], [], []
    # Stepping by batch_size covers the trailing partial batch without a
    # hand-computed batch count.
    for start in range(0, len(tokens), batch_size):
        tokens_i = tokens[start:start + batch_size]
        if first_param is not None:
            tokens_i = tokens_i.to(first_param.device)

        with torch.no_grad():
            logits, penuls = model(tokens_i, get_penul=True)
            confs, pseudo_labels = torch.softmax(logits, dim=1).max(dim=1)

        all_confs.append(confs.cpu())
        all_labels.append(pseudo_labels.cpu())
        all_penuls.append(penuls.cpu())
    return torch.cat(all_confs, dim=0), torch.cat(all_labels, dim=0), torch.cat(all_penuls, dim=0)

In [None]:
# Confidence, pseudo-label and penultimate features for every pool sample,
# predicted by the loaded classifier.
confs, pseudo, penuls = get_pseudo(tokens_data, model)

In [22]:
# Running index 0 .. len(tokens_data) - 1, one per pool sample.
new_indices = torch.arange(len(tokens_data))

In [50]:
# Bundle pool tokens, pseudo-labels (with an extra dim inserted at position 1)
# and sample indices into a single dataset.
new_dataset = TensorDataset(tokens_data, pseudo.unsqueeze(1), new_indices)

In [51]:
# Fixed-order loader over the pool dataset.
# NOTE(review): `new_loader` is not referenced by the later visible cells
# (get_infoverse receives `new_dataset` directly) — confirm it is still needed.
new_loader = DataLoader(new_dataset, shuffle=False, drop_last=False, batch_size=args.batch_size, num_workers=4)

## InfoVerse

In [52]:
# Compute the infoverse measurements for the pool dataset, with the training
# set passed as the labelled dataset.
# NOTE(review): n_class=5 is passed here while args.n_classes is 2 — confirm
# which value matches the 'imp' dataset.
infoverse = get_infoverse(args, label_dataset=dataset.train_dataset, pool_dataset=new_dataset,
                     n_epochs=7, seeds_list=[1234, 2345, 3456], n_class=5)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-large and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


here1
here2


	nonzero()
Consider using one of the following signatures instead:
	nonzero(*, bool as_tuple) (Triggered internally at  /opt/conda/conda-bld/pytorch_1595629403081/work/torch/csrc/utils/python_arg_parser.cpp:766.)
  mask_idx = mask.nonzero()


here3


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [27]:
# Standardise along axis 0: subtract the mean and divide by the standard
# deviation; the 1e-8 term keeps a zero-variance column from dividing by zero.
measures_t2 = (infoverse - infoverse.mean(axis=0)) / (1e-8 + infoverse.std(axis=0))

In [31]:
# Select 1100 pool samples with DPP sampling on the standardised measurements,
# using the 'inv' score option.
# NOTE(review): the notebook-local `dpp_sampling` definition appears in a later
# cell; on a top-to-bottom run this call resolves to the version imported from
# src.scores_src.dpp — confirm that version accepts scores='inv'.
selected_idx = dpp_sampling(1100, measures_t2, pseudo.numpy(), scores='inv')

Now 0 % of processing has been done


In [32]:
# Cache the selection on disk (relative to the working directory).
np.save('imp_info_5k_selected_idx.npy', selected_idx)

In [13]:
# Reload the cached selection from the .npy file.
selected_idx = np.load('imp_info_5k_selected_idx.npy')

In [33]:
# Sentences picked by the DPP selection, keeping only the first occurrence of
# each one and preserving selection order. dict.fromkeys de-duplicates in a
# single pass instead of rescanning the result list for every candidate
# (sentences are strings, as the later tab-joined write shows, so hashable).
final_sent = list(dict.fromkeys(trimmed_data[idx] for idx in selected_idx))

In [35]:
# Write up to 1000 sentences as "<1-based index>\t<sentence>", one per line.
# The encoding is pinned so non-ASCII text does not depend on the platform default.
with open('/home/jaehyung/imp_unlabel_infoverse.txt', 'w', encoding='utf-8') as f:
    for temp, item in enumerate(final_sent[:1000], start=1):
        # Terminate the record only when the sentence does not already end in a
        # newline; without this, consecutive records run together on one line.
        line_end = '' if item.endswith('\n') else '\n'
        f.write(str(temp) + '\t' + item + line_end)

In [28]:
def dpp_sampling(n_query, measurements, labels, scores='density', reduce=False):
    """Greedily select ``n_query`` samples with a quality-weighted DPP kernel.

    Args:
        n_query: number of samples requested from ``dpp_greedy``.
        measurements: per-sample measurement vectors, one row per sample.
        labels: per-sample labels forwarded to the nearest-neighbour distance
            helper.
        scores: quality-score mode. ``'density'`` and ``'inv'`` are both derived
            from ``compute_nearest_neighbour_distances_cls`` with
            ``nearest_k=5``; any other value gives every sample the same
            quality.
        reduce: when true, the measurements are first passed through ``PPCA``.

    Returns:
        Whatever ``dpp_greedy`` returns for the assembled kernel (the selected
        indices).
    """
    n_sample = len(measurements)
    eps = 5e-4

    # Dimension reduction for removing redundant features
    if reduce:
        # NOTE(review): PPCA is neither defined nor imported in the visible
        # notebook cells; this branch needs it in scope — confirm its source.
        info_measures, _ = PPCA(measurements)
    else:
        info_measures = np.array(measurements)

    # Define similarity kernel phi(x_1, x_2) on row-normalised measurements.
    # All-zero rows are left unscaled instead of being divided by a zero norm,
    # which would inject NaNs into the kernel.
    norms = np.linalg.norm(info_measures, axis=-1).reshape(-1, 1)
    norms = np.where(norms == 0, 1.0, norms)
    similarity = gaussian_kernel(info_measures / norms)

    # Define score function q(x). A separate local is used so the string
    # argument `scores` is not rebound to an array.
    if scores == 'density':
        scores_bef = -1 * compute_nearest_neighbour_distances_cls(info_measures, labels, info_measures, labels, nearest_k=5)
        quality = (-1 / (1e-8 + scores_bef))
    elif scores == 'inv':
        quality = compute_nearest_neighbour_distances_cls(info_measures, labels, info_measures, labels, nearest_k=5)
    else:
        quality = np.ones(n_sample)

    low, high = quality.min(), quality.max()
    if high - low > 0 and high != 0:
        # Same shift-and-scale as before for any non-degenerate score vector.
        quality = (quality - low) / high
    else:
        # Constant scores (including the uniform mode) used to collapse to all
        # zeros, or to NaN when the maximum was 0, which wiped out the
        # similarity term and left only eps * I. Keep them uniform instead.
        quality = np.ones(n_sample)

    dpp_kernel = quality.reshape((n_sample, 1)) * similarity * quality.reshape((1, n_sample))
    # eps * I is added to the kernel before the greedy selection.
    selected_idx = dpp_greedy(dpp_kernel + eps * np.eye(n_sample), n_query)

    return selected_idx

## Baselines

In [36]:
# Random baseline: the first 1100 entries of a random permutation of the pool indices.
# NOTE(review): no explicit seeding is visible in this notebook, so the draw
# may differ between runs — confirm whether a seed is set elsewhere.
random_idx = list(torch.randperm(len(trimmed_data))[:1100].numpy())

In [37]:
# Sentences picked by the random baseline, keeping only the first occurrence of
# each one and preserving draw order. dict.fromkeys de-duplicates in a single
# pass instead of rescanning the result list for every candidate (sentences are
# strings, as the later tab-joined write shows, so hashable).
final_sent = list(dict.fromkeys(trimmed_data[idx] for idx in random_idx))

In [39]:
# Write up to 1000 sentences as "<1-based index>\t<sentence>", one per line.
# The encoding is pinned so non-ASCII text does not depend on the platform default.
with open('/home/jaehyung/imp_unlabel_random.txt', 'w', encoding='utf-8') as f:
    for temp, item in enumerate(final_sent[:1000], start=1):
        # Terminate the record only when the sentence does not already end in a
        # newline; without this, consecutive records run together on one line.
        line_end = '' if item.endswith('\n') else '\n'
        f.write(str(temp) + '\t' + item + line_end)

In [41]:
# Uncertainty baseline: indices of the 1100 largest values in column 6 of the
# raw (un-standardised) infoverse matrix.
# NOTE(review): which measure column 6 holds is not visible here — confirm.
uncertain_idx = torch.Tensor(infoverse[:, 6]).sort(descending=True)[1][:1100]

In [42]:
# Sentences picked by the uncertainty baseline, keeping only the first
# occurrence of each one and preserving ranking order. dict.fromkeys
# de-duplicates in a single pass instead of rescanning the result list for
# every candidate (sentences are strings, as the later tab-joined write shows,
# so hashable).
final_sent = list(dict.fromkeys(trimmed_data[idx] for idx in uncertain_idx))

In [43]:
# Write up to 1000 sentences as "<1-based index>\t<sentence>", one per line.
# The encoding is pinned so non-ASCII text does not depend on the platform default.
with open('/home/jaehyung/imp_unlabel_uncertain.txt', 'w', encoding='utf-8') as f:
    for temp, item in enumerate(final_sent[:1000], start=1):
        # Terminate the record only when the sentence does not already end in a
        # newline; without this, consecutive records run together on one line.
        line_end = '' if item.endswith('\n') else '\n'
        f.write(str(temp) + '\t' + item + line_end)