In [1]:
import sys
import os
sys.path.append("/home/jovyan/filtered-transformer/")

In [2]:
from examples.qa.data import get_tokenized_dataset
from transformers import AutoConfig, AutoTokenizer
import transformers
import tasks
from transformers.file_utils import PaddingStrategy
from transformers.tokenization_utils_base import TruncationStrategy
from transformers import RobertaTokenizer, RobertaModel


def adjust_tokenizer(tokenizer):
    """Give GPT-2 style tokenizers a pad token by reusing their EOS token.

    The tokenizer is modified in place, and only when it is a GPT-2 tokenizer
    class (slow or fast) whose ``name_or_path`` contains ``"gpt"``; any other
    tokenizer is left untouched. Nothing is returned.
    """
    gpt2_classes = (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast)
    if not isinstance(tokenizer, gpt2_classes):
        return
    if "gpt" not in tokenizer.name_or_path:
        return
    tokenizer.pad_token = tokenizer.eos_token


# Load the RoBERTa tokenizer for "roberta-base", caching downloads under cache_dir.
# NOTE(review): `use_fast` is normally an AutoTokenizer option; presumably it has no
# effect on the concrete RobertaTokenizer class — confirm, or use RobertaTokenizerFast.
tokenizer = RobertaTokenizer.from_pretrained(
    "roberta-base",
    cache_dir="/home/jovyan/cashe",
    use_fast=True,
    revision="main",
)

# Only changes the tokenizer when its name_or_path contains "gpt"; presumably a no-op
# for "roberta-base" — TODO confirm.
adjust_tokenizer(tokenizer)


# Build the project task object pointing at the local dataset directory and load its splits.
task = tasks.get_task(task_args=tasks.TaskArguments(task_name="custom", task_base_path="/home/jovyan/quality_mc/"))
dataset_dict = task.get_datasets()


# Tokenize every split in multiple-choice mode ("mc"): pad to max_seq_length=4096 and,
# when too long, truncate only the first sequence of each pair.
# NOTE(review): which text is the "first" sequence (context vs. question/option) is decided
# inside get_tokenized_dataset — verify there.
tokenized_dataset_dict = get_tokenized_dataset(
    task=task,
    dataset_dict=dataset_dict,
    tokenizer=tokenizer,
    max_seq_length=4096,
    padding_strategy=PaddingStrategy(PaddingStrategy.MAX_LENGTH),
    truncation_strategy=TruncationStrategy(TruncationStrategy.ONLY_FIRST),
    model_mode="mc",
)


Using custom data configuration default
Reusing dataset json (/home/jovyan/.cache/huggingface/datasets/json/default-b3b6f144a42f8309/0.0.0/fb88b12bd94767cb0cc7eedcd82ea1f402d2162addc03a37e81d4f8dc7313ad9)
Using custom data configuration default
Reusing dataset json (/home/jovyan/.cache/huggingface/datasets/json/default-ca7a0ff6209c9a75/0.0.0/fb88b12bd94767cb0cc7eedcd82ea1f402d2162addc03a37e81d4f8dc7313ad9)


HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




In [5]:
from typing import Dict

from metrics.accuracy import AccuracyMetric

from torch import Tensor, nn
from examples.qa.modules import DataFilter, MemUpMemoryImpl, Predictor, RobertaRT
from memup.base import DataCollectorAppend, DataCollectorReplace, MemoryRollout, State
import torch
from torch.utils.data import DataLoader

# Pick the splits by name; the "validation" split is what this notebook evaluates on.
# NOTE(review): `.get` presumably returns None for a missing split, in which case the
# len() calls below would raise — confirm the split names against the task.
train_data = tokenized_dataset_dict.get("train")
test_data = tokenized_dataset_dict.get("validation")

# Quick sanity check of the split sizes.
print("train", len(train_data), "test", len(test_data))

def collate_fn(batch):
    """Stack a list of tokenized examples into a dict of batched tensors.

    Args:
        batch: sequence of mapping-like examples; each must provide the keys
            'input_ids', 'attention_mask', 'label' and 'input_part_token_start_idx'.

    Returns:
        Dict with exactly those four keys. Every value is the per-example
        ``torch.tensor(...)`` of that field stacked along a new leading batch
        dimension. Any other keys present in the examples are dropped.
    """
    field_names = ('input_ids', 'attention_mask', "label", 'input_part_token_start_idx')
    return {
        name: torch.stack([torch.tensor(example[name]) for example in batch])
        for name in field_names
    }


# Only the evaluation loader is built in this cell; the training loader is left disabled.
# train_dataloader = DataLoader(train_data, shuffle=False, batch_size=64, num_workers=8, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=64, num_workers=8, collate_fn=collate_fn)

# Wrap the pretrained roberta-base backbone in the memory model and move it to the GPU.
model = RobertaRT(RobertaModel.from_pretrained(
    'roberta-base',
    cache_dir="/home/jovyan/cashe",
    revision="main",
)).cuda()

# The predictor is configured from the backbone's config.
predictor = Predictor(model.bert.config).cuda()

# Restore trained weights: the checkpoint is read on CPU and holds the memory model
# under "mem" and the predictor under "pred".
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
weights = torch.load("/home/jovyan/qa_1.1.pt", map_location="cpu")
model.load_state_dict(weights["mem"])
predictor.load_state_dict(weights["pred"])

# Segment filter constructed with size 300.
# NOTE(review): presumably 300 is the per-step segment length in tokens — confirm
# against examples.qa.modules.DataFilter.
data_filter = DataFilter(tokenizer, 300)

# Rollout driver tying the memory model and the segment filter together.
# NOTE(review): `steps=50` presumably caps the number of segments consumed per
# forward call — verify against memup.base.MemoryRollout.
memup_iter = MemoryRollout[Dict[str, Tensor]](
    steps=50,
    memory=MemUpMemoryImpl(model),
    data_filter=data_filter,
    info_update=[]
)



class DataCollectorTrain(DataCollectorReplace[Dict[str, Tensor], Tensor]):
    """Collector whose per-step value is the memory state itself.

    NOTE(review): despite the "Train" name, this variant is what the evaluation
    loop in this cell passes to the rollout. How collected values are stored
    (replace vs. append) is defined by the memup base class — confirm there.
    """
    def apply(self, data: Dict[str, Tensor], out: Tensor, state: State) -> Tensor:
        # `data` and `out` are ignored; the state is handed back unchanged.
        return state

# Evaluate the restored checkpoint on the validation loader.
# Inference mode is set once up front: the flag does not change between batches,
# so re-setting it inside the loop was redundant work.
model.eval()
predictor.eval()

# CrossEntropyLoss keeps no per-batch state, so one instance serves every batch.
# AccuracyMetric is still instantiated per call because its statefulness is not
# visible from this notebook.
criterion = nn.CrossEntropyLoss()

with torch.no_grad():

    all_pred = []
    all_labels = []

    for batch in test_dataloader:

        labels = batch["label"].cuda()
        print()

        # Fresh zero-initialised memory for every batch: one row per
        # (question, option) pair (4 options per question), 50 memory vectors
        # of width 768 to match the roberta-base hidden size.
        state = torch.zeros(labels.shape[0] * 4, 50, 768, device=torch.device("cuda"))
        # Rollout bookkeeping starts empty for each batch.
        info = {}

        # `done` is produced by the rollout itself; no pre-initialisation is needed.
        data_collector, state, info, done = memup_iter.forward(batch, state, info, DataCollectorTrain())
        states_seq = data_collector.result()
        # Predict from the last collected memory state.
        pred = predictor(states_seq[-1])
        loss = criterion(pred, labels)
        acc = AccuracyMetric()(pred, labels)

        print(loss.item(), "acc=", acc)

        # Keep CPU copies so the overall accuracy can be computed after the loop.
        all_pred.append(pred.cpu())
        all_labels.append(labels.cpu())

    # Accuracy over the whole split rather than the mean of per-batch values.
    acc = AccuracyMetric()(torch.cat(all_pred), torch.cat(all_labels))

    print("final acc", acc)



    


train 2523 test 2086


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



4.285730838775635 acc= 0.265625

3.6775429248809814 acc= 0.40625

4.105374813079834 acc= 0.3125

3.9322423934936523 acc= 0.421875

5.3573689460754395 acc= 0.328125

3.4823191165924072 acc= 0.296875

3.311509609222412 acc= 0.34375

3.1081905364990234 acc= 0.390625

2.629809856414795 acc= 0.453125

4.760799884796143 acc= 0.34375

4.753931522369385 acc= 0.1875

3.519364833831787 acc= 0.25

3.96626877784729 acc= 0.3125

4.008843421936035 acc= 0.3125

3.561490058898926 acc= 0.296875

3.288618564605713 acc= 0.375

3.7215704917907715 acc= 0.34375

3.5406551361083984 acc= 0.375

3.682309150695801 acc= 0.40625

3.9537923336029053 acc= 0.34375

3.9907119274139404 acc= 0.375

3.1293444633483887 acc= 0.390625

4.184239387512207 acc= 0.34375

3.757068395614624 acc= 0.3125

2.730419397354126 acc= 0.40625

3.64552903175354 acc= 0.359375

2.937997579574585 acc= 0.3125

3.7049295902252197 acc= 0.328125

3.7098772525787354 acc= 0.328125

3.432802438735962 acc= 0.359375

3.931098222732544 acc= 0.375

2.

In [6]:
from torch.utils.data import DataLoader, Dataset

# Shuffled training loader; uses a smaller batch (16) than the evaluation loader (64).
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16, num_workers=4, collate_fn=collate_fn)

In [7]:
# Pull a single training batch to experiment with interactively; because the loader
# shuffles, a different batch is drawn every time this cell is re-run.
batch = next(iter(train_dataloader))

In [15]:
from turtle import forward
from typing import Dict, Tuple
from memup.base import Done, Info, State
from torch import nn, Tensor
import torch.nn.functional as F


class DataFilter(nn.Module):
    """Cut long multiple-choice QA inputs into fixed-length segments, one window per call.

    For every (question, option) row, a call to ``forward`` produces a segment laid out as
    ``[cls] + context[start:end] + question/option tokens``, right-padded with the pad token
    to ``segment_size``. The per-row read offsets live in ``info["shift_batch"]`` so that
    successive calls walk forward through the context.
    """

    def __init__(self, tokenizer, size) -> None:
        """
        Args:
            tokenizer: object exposing ``pad_token_id``, ``cls_token_id`` and ``sep_token_id``.
            size: length, in tokens, of every produced segment.
        """
        super().__init__()
        self.pad_token_id = tokenizer.pad_token_id
        self.cls_token_id = tokenizer.cls_token_id
        self.sep_token_id = tokenizer.sep_token_id
        # Buffers so the special-token tensors follow the module across devices.
        # `sep_token` is not read inside this class but is kept for compatibility.
        self.register_buffer('cls_token', torch.tensor([tokenizer.cls_token_id]))
        self.register_buffer('sep_token', torch.tensor([tokenizer.sep_token_id]))
        self.segment_size = size

    def pad_add_special_tokens_for_qa(self, tensor, query_option):
        """Return ``[cls] + tensor + query_option`` right-padded to ``segment_size``.

        No truncation is performed: a result longer than ``segment_size`` is returned as is.
        """
        joined = torch.cat([self.cls_token, tensor, query_option])

        pad_size = self.segment_size - joined.shape[0]
        if pad_size > 0:
            joined = F.pad(joined, (0, pad_size), value=self.pad_token_id)
        return joined

    def get_attention_mask(self, tensor):
        """Return a mask shaped like ``tensor``: 0 at pad-token positions, 1 elsewhere."""
        mask = torch.ones_like(tensor)
        mask[tensor == self.pad_token_id] = 0
        return mask

    def get_token_type_ids(self, tensor):
        """Return all-zero token type ids shaped like ``tensor``."""
        return torch.zeros_like(tensor)

    def get_cut_input(self, input_ids, input_part_token_start_idx, shift_batch):
        """Build the next segment for every (question, option) row.

        Args:
            input_ids: integer tensor of shape (batch, n_options, seq_len).
            input_part_token_start_idx: tensor of shape (batch, n_options) holding, per row,
                the index at which the question/option part of the sequence starts.
            shift_batch: tensor with one context offset per row (batch * n_options entries).

        Returns:
            ``(segments, new_shift)``: ``segments`` has shape
            (batch * n_options, segment_size); ``new_shift`` holds the context offset
            reached by each row, to be passed back in on the next call.

        Raises:
            AssertionError: if a row's question/option part is too long for the segment
                to fit in ``segment_size`` tokens.
        """
        # The number of answer options is read from the input rather than fixed at 4,
        # so inputs with a different option count are reshaped correctly as well.
        B, C, T = input_ids.shape
        input_ids = input_ids.reshape(B * C, T)
        input_part_token_start_idx = input_part_token_start_idx.reshape(B * C)

        end_seq = []
        ss_batch = []

        for i, seq in enumerate(input_ids):
            # Per the original author's note: seq = cls + context + sep + query + option + sep
            spliter_inx = input_part_token_start_idx[i]
            # Question/option tail with pad and cls tokens removed (sep tokens are kept).
            query_option = seq[spliter_inx:]
            query_option = query_option[(query_option != self.pad_token_id) & (query_option != self.cls_token_id)]
            # Context head with every pad/cls/sep token removed.
            context = seq[:spliter_inx]
            context = context[(context != self.pad_token_id) & (context != self.cls_token_id) & (context != self.sep_token_id)]

            start = shift_batch[i].item()
            # Room left for context once the cls token and the question/option tail are
            # accounted for; never move backwards and never read past the context end.
            end = max(start, start + self.segment_size - len(query_option) - 1)
            end = min(end, len(context))

            input_segment = self.pad_add_special_tokens_for_qa(context[start:end], query_option)
            assert len(input_segment) == self.segment_size
            ss_batch.append(input_segment)
            end_seq.append(end)

        shift_batch = torch.tensor(end_seq)
        ss_batch = torch.stack(ss_batch)

        return ss_batch, shift_batch

    def forward(self, batch: Dict[str, Tensor], state: State = None, info: Info = None, *args) -> Tuple[Dict[str, Tensor], Done]:
        """Produce the next segment batch and report whether the contexts are exhausted.

        Args:
            batch: dict with 'input_ids', 'input_part_token_start_idx' and 'label'.
            state: unused here; accepted for interface compatibility.
            info: mutable bookkeeping dict. ``info["shift_batch"]`` is created on first use
                and updated in place, so pass the same dict on every step of a rollout.
                When omitted, a fresh dict is used for this call only.

        Returns:
            ``(data, done)`` where ``data`` holds CUDA tensors 'label', 'input_ids',
            'attention_mask' and 'token_type_ids', and ``done`` is True when no row's
            offset advanced during this call.
        """
        if info is None:
            # A `{}` default would be one dict shared by every call that omits `info`,
            # silently carrying offsets over from an unrelated earlier batch.
            info = {}

        if "shift_batch" not in info:
            n_rows = batch['input_ids'].shape[0] * batch['input_ids'].shape[1]
            info["shift_batch"] = torch.zeros(n_rows, dtype=torch.int32)

        input_ids, new_shift = self.get_cut_input(batch['input_ids'], batch['input_part_token_start_idx'], info["shift_batch"])
        # Nothing moved means every context has been fully consumed.
        done = (info["shift_batch"] - new_shift).abs().sum().item() == 0
        info["shift_batch"] = new_shift

        return {
            "label": batch["label"].cuda(),
            'input_ids': input_ids.cuda(),
            'attention_mask': self.get_attention_mask(input_ids).cuda(),
            'token_type_ids': self.get_token_type_ids(input_ids).cuda()
        }, done




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TypeError: Predictor.__init__() takes 1 positional argument but 2 were given

In [55]:
# Manual probe of DataFilter.get_cut_input on the sampled batch.
# NOTE(review): `shift_batch` is read here before it is assigned anywhere in the visible
# notebook, so this cell relies on leftover kernel state and would raise NameError after
# Restart & Run All. Re-running it also feeds the previous offsets back in, advancing the window.
ss_batch, shift_batch = data_filter.get_cut_input(batch["input_ids"], batch['input_part_token_start_idx'], shift_batch)
print(ss_batch.shape, shift_batch)

torch.Size([64, 512]) tensor([4059, 4065, 4063, 4063, 4075, 4075, 4075, 4075, 2589, 2589, 2589, 2589,
        4065, 4061, 4070, 4070, 2962, 2962, 2962, 2962, 4059, 4057, 4058, 4058,
        4051, 4068, 4061, 4065, 4053, 4054, 4054, 4057, 4065, 4067, 4067, 4065,
        4054, 4055, 4054, 4052, 4060, 4063, 4061, 4066, 4078, 4079, 4077, 4080,
        4064, 4070, 4061, 4066, 4078, 4076, 4079, 4079, 3962, 3962, 3962, 3962,
        4064, 4067, 4062, 4059])


In [9]:
from copy import deepcopy


class Predictor(nn.Module):
    """Answer-option classifier that reads the memory state of each (question, option) row.

    A small 4-layer, 4-head RoBERTa-style encoder is run over the memory vectors. The
    vector at the last memory position of each row is taken, the four rows belonging to
    one question are concatenated, and an MLP head maps them to 4 logits.
    """

    def __init__(self, bert_config):
        """
        Args:
            bert_config: configuration of the backbone model; it is deep-copied, so the
                caller's object is not modified. Its ``hidden_size`` fixes the width of
                the memory vectors this module accepts.
        """
        super().__init__()
        config2 = deepcopy(bert_config)
        config2.num_attention_heads = 4
        config2.num_hidden_layers = 4

        # Only the transformer stack is kept; embeddings and pooler are discarded.
        self.encoder = RobertaModel(config2).encoder
        self.encoder.train()
        self.config = config2

        self.head = nn.Sequential(
            nn.Dropout(0.0),
            nn.Linear(bert_config.hidden_size * 4, bert_config.hidden_size),
            nn.Dropout(0.1),
            nn.ReLU(),
            nn.Linear(bert_config.hidden_size, 4),
        )

    def forward(self, state):
        """Map memory states to per-question logits.

        Args:
            state: tensor of shape (n_questions * 4, n_memory, hidden); consecutive groups
                of four rows are the four options of one question.

        Returns:
            Tensor of shape (n_questions, 4) with unnormalised scores.
        """
        B, D = state.shape[0], state.shape[2]
        hidden = self.encoder(state)['last_hidden_state']
        # The feature width comes from the input rather than a literal 768, so the
        # reshape always agrees with the head, which is sized from the config.
        out = hidden[:, -1].reshape(B // 4, D * 4)
        return self.head(out)

In [11]:
from memup.base import MemUpMemory

class RobertaRT(nn.Module):
    """RoBERTa backbone followed by a small transformer that updates a memory state.

    The backbone encodes one text segment. Its token representations are concatenated
    with the incoming memory along the sequence axis and passed through a 4-layer,
    4-head encoder; the positions that correspond to the memory become the new state.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]

    def __init__(self, roberta: RobertaModel):
        """
        Args:
            roberta: backbone model; it is stored as ``self.bert`` and put in train mode.
        """
        super().__init__()

        roberta.train()
        self.bert = roberta

        # Same configuration as the backbone, but with a shallower, narrower-headed stack.
        mem_config = deepcopy(roberta.config)
        mem_config.num_attention_heads = 4
        mem_config.num_hidden_layers = 4

        mem_encoder = RobertaModel(mem_config).encoder
        mem_encoder.train()
        self.encoder = mem_encoder

    def forward(
        self,
        state,
        input_ids: Tensor,
        attention_mask: Tensor,
        token_type_ids: Tensor
    ):
        """Return the memory state after reading one segment.

        Rows whose ``attention_mask`` sums to zero keep their incoming ``state``.
        """
        token_states = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )[0]
        n_tokens = token_states.shape[1]

        # Tokens first, memory last: everything past `n_tokens` is the memory part.
        joint = torch.cat([token_states, state], dim=1)
        joint = self.encoder(joint)['last_hidden_state']
        new_state = joint[:, n_tokens:]

        # Rows with no attended token at all: carry the previous memory over unchanged.
        fully_masked = attention_mask.type(torch.int32).sum(1) == 0
        new_state[fully_masked] = state[fully_masked]

        return new_state
    
    
class MemUpMemoryImpl(MemUpMemory):
    """Adapter exposing a ``RobertaRT`` model through the ``MemUpMemory`` interface."""

    def __init__(self, mem_tr: RobertaRT):
        super().__init__()
        self.mem_tr = mem_tr

    def forward(self, data: Dict[str, Tensor], state: State) -> Tuple[Tensor, State]:
        """Run one memory update.

        Reads 'input_ids', 'attention_mask' and 'token_type_ids' from ``data`` and
        returns ``(None, new_state)``; no per-step output is produced, only the state.
        """
        updated_state = self.mem_tr.forward(
            state,
            input_ids=data["input_ids"],
            attention_mask=data["attention_mask"],
            token_type_ids=data["token_type_ids"],
        )
        return None, updated_state


In [18]:

from memup.base import DataCollectorAppend, MemoryRollout
from memup.loss import TS, LossModule, PredictorLossStateOnly
from metrics.accuracy import AccuracyMetric


# Rebuild the memory model from the pretrained roberta-base weights; unlike the evaluation
# cell, no fine-tuned checkpoint is loaded here, so the memory encoder starts untrained.
model = RobertaRT(RobertaModel.from_pretrained(
    'roberta-base',
    cache_dir="/home/jovyan/cashe",
    revision="main",
)).cuda()

# Fresh (randomly initialised) predictor configured from the backbone's config.
predictor = Predictor(model.bert.config).cuda()

# Uses 512-token segments here (the evaluation cell used 300).
data_filter = DataFilter(tokenizer, 512)

# NOTE(review): `steps=2` presumably limits each rollout call to two segments — verify
# against memup.base.MemoryRollout.
memup_iter = MemoryRollout[Dict[str, Tensor]](
    steps=2,
    memory=MemUpMemoryImpl(model),
    data_filter=data_filter,
    info_update=[]
)


# Loss bundle: cross-entropy with weight 1.0 plus accuracy with weight 0.0
# (presumably tracked for reporting only — confirm in memup.loss).
# NOTE(review): `predictor_loss` is not used by the loop that follows in this cell.
predictor_loss = PredictorLossStateOnly(predictor, [
        LossModule(nn.CrossEntropyLoss(), "CE", 1.0),
        LossModule(AccuracyMetric(), "Accuracy", 0.0)
])

class DataCollectorTrain(DataCollectorAppend[Dict[str, Tensor], Tensor]):
    """Collector whose per-step value is the memory state itself (append variant).

    NOTE(review): this redefines the `DataCollectorTrain` name declared in an earlier
    cell on top of DataCollectorReplace, so which class is active depends on cell
    execution order. The append semantics come from the memup base class — confirm there.
    """
    def apply(self, data: Dict[str, Tensor], out: Tensor, state: State) -> Tensor:
        # `data` and `out` are ignored; the state is handed back unchanged.
        return state


# Smoke test of the rollout on the single sampled training batch.
labels = batch["label"].cuda()

# Zero-initialised memory: one row per (question, option) pair, 100 memory vectors of
# width 768 (the evaluation cell used 50 memory vectors).
state = torch.zeros(labels.shape[0] * 4, 100, 768, device=torch.device("cuda"))
done = False
info = {}

# Modules are put in train mode although no gradients are computed below.
# NOTE(review): presumably dropout is therefore active and the printed accuracies vary
# from run to run — confirm whether eval mode was intended for this check.
model.train()
predictor.train()

with torch.no_grad():
    # Keep calling the rollout until the data filter reports the batch is exhausted;
    # state and info are carried over between calls.
    while not done:
        data_collector, state, info, done = memup_iter.forward(batch, state, info, DataCollectorTrain())
        # opt.zero_grad()
        states_seq = data_collector.result()
        # Predict from the last collected memory state of this rollout chunk.
        pred = predictor(states_seq[-1])
        print(pred.shape)
        # The loss is computed but only the accuracy is printed.
        loss = nn.CrossEntropyLoss()(pred, labels)
        acc = AccuracyMetric()(pred, labels)
        print(acc)

        # The optimisation step is disabled, so no weights change in this loop.
        # loss.backward()
        # opt.step()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([16, 4])
0.375
torch.Size([16, 4])
0.25
torch.Size([16, 4])
0.3125
torch.Size([16, 4])
0.3125
torch.Size([16, 4])
0.4375
torch.Size([16, 4])
0.375
