In [18]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m72.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m99.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Using cached huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1


In [29]:
import os
import sys
import argparse
import pandas as pd
from pathlib import Path
from time import time
from tqdm import tqdm
import pickle
import random
import numpy as np
from nltk.corpus import words
from nltk.tokenize import sent_tokenize
import torch
import torch.nn as nn
import torch.nn.functional as F
from cr.config import Config
import wandb
from cr.models.decoders import (
    Decoder,
)
from torch.utils.data import Dataset, DataLoader
from cr.utils import (
    get_model_class,
    get_optimizer_class,
    get_dataloader_class,
    args_factory,
    save_args,
    save_ckpt,
    load_ckpt,
    build_vib_path
)
import cr.utils
from transformers import BertTokenizerFast, RobertaTokenizerFast
config = Config()

# args

In [2]:
parser = argparse.ArgumentParser()

config = Config()
# experiment
parser.add_argument("--scale", type=str, default="normal", help="[small |normal]")
parser.add_argument("--dataset-name", type=str,help="[fever | multirc]")
parser.add_argument("--aspect", type=str, help="Look, Aroma,Palate for beer;Cleanliness,Location,Service for hotel")
parser.add_argument("--dataset-split", type=str, default="all", help="[all | train | dev | test]")
parser.add_argument("--max_length", type=int, default=120)
parser.add_argument("--encoder-type", type=str, default="bert-base-uncased")
parser.add_argument("--decoder-type", type=str, default="bert-base-uncased")
parser.add_argument("--cache_dir", type=str, default=config.CACHE_DIR)
parser.add_argument("--attack_path", type=str, default=None)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--batch_size", type=int, default=32)
parser.add_argument("--overwrite_cache", action="store_true")
parser.add_argument("--num_epoch", type=int, default=10)
parser.add_argument("--lr", type=float, default=5e-5)
parser.add_argument("--dropout_rate", type=float, default=0.2)
parser.add_argument("--no-shuffle", action="store_true")
parser.add_argument("--optimizer", type=str, default="adamw")
parser.add_argument("--grad_accumulation_steps", type=int, default=1)
parser.add_argument("--device_id", type=int, default=3)
parser.add_argument("--print-every", type=int, default=80)
parser.add_argument("--eval-interval", type=int, default=500)
args = parser.parse_args("")
args.dataset_name='beer'


dataloader_class = get_dataloader_class(args)

# dataloader

In [67]:
def get_special_token_map(encoder_type):
    if encoder_type.startswith('roberta'):
        special_token_map = {
            'bos_token': '<s>',
            'eos_token': '</s>',
            'sep_token': '</s>',
            'cls_token': '<s>',
            'unk_token': '<unk>',
            'pad_token': '<pad>',
            'mask_token': '<mask>',
        }
    elif encoder_type.startswith('bert') or encoder_type.startswith('distilbert'):
        special_token_map = {
            'sep_token': '[SEP]',
            'cls_token': '[CLS]',
            'unk_token': '[UNK]',
            'pad_token': '[PAD]',
            'mask_token': '[MASK]',
        }
    return special_token_map

class BaseDataLoader:
    def __init__(self, args):
        self.args = args
        self.tok_kwargs = config.TOK_KWARGS
        self.tok_kwargs['max_length'] = self.args.max_length
        if self.args.dataset_name=='ga':
            with open('ga_code.pkl','rb') as f:
              self.tokenizer=pickle.load(f)
        elif self.args.encoder_type.startswith('bert') or self.args.encoder_type.startswith('distilbert'):
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', cache_dir=self.args.cache_dir)
        elif self.args.encoder_type.startswith('roberta'):
            self.tokenizer = RobertaTokenizerFast.from_pretrained(self.args.encoder_type, cache_dir=self.args.cache_dir)
        
        self.dataset_name_to_dataset_class = {
            'beer': SentimentDataset
        }
        self._dataloaders = {}
        self.special_token_map = get_special_token_map(self.args.encoder_type)

    def _load_processed_data(self, mode):
        raise NotImplementedError

    def _build_dataloader(self, data, mode):
        dataset = self.dataset_name_to_dataset_class[self.args.dataset_name](
            self.args,
            data,
            self.tokenizer,
            self.tok_kwargs
        )
        collate_fn = dataset.collater
        batch_size = self.args.batch_size
        shuffle = True if mode == 'train' else False
        
        self._dataloaders[mode] = DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            collate_fn=collate_fn,
        )
        print(f'[{mode}] dataloader built => {len(dataset)} examples')
    
    def build(self, mode):
        data = self._load_processed_data(mode)
        self._build_dataloader(data, mode)

    def build_all(self):
        for mode in ['train', 'dev', 'test']:
            self.build(mode)
    
    def __getitem__(self, mode):
        return self._dataloaders[mode]

    @property
    def train(self):
        return self._dataloaders['train']

    @property
    def dev(self):
        return self._dataloaders['dev']
    
    @property
    def test(self):
        return self._dataloaders['test']


class BaseDataset(Dataset):
    def __init__(self, args, data, tokenizer, tok_kwargs):
        self.args = args
        self.data = data
        self.tokenizer = tokenizer
        self.tok_kwargs = tok_kwargs

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)
    
    @property
    def num_batches(self):
        return len(self.data) // self.args.batch_size


class SentiDataLoader(BaseDataLoader):
    def __init__(self, args):
        super(SentiDataLoader, self).__init__(args)
        if args.dataset_split == 'all':
            self.build_all()
        else:
            self.build(args.dataset_split)

    def _load_raw_data(self, mode):
            datapoints = []
            aspect=self.args.aspect
            #scale='normal'
            scale=self.args.scale
            print('aspect:',aspect)
            print('mode:',mode)
            print('scale:',scale)
            
            
            if scale=='normal':
                if self.args.dataset_name == 'beer' and mode in ('train', 'dev'):
                    path = config.DATA_DIR / f'sentiment/data/source/beer_{aspect}.{mode}_120'
                elif self.args.dataset_name == 'beer' and mode == 'test':
                    path = config.DATA_DIR / f'sentiment/data/target/beer_{aspect}.train'
                else:
                    raise ValueError('Dataset name not supported.')
                    
            if scale=='small':

                if self.args.dataset_name == 'beer' and mode in ('train', 'dev'):
                    path = config.DATA_DIR / f'sentiment/data/source/beer_{aspect}.{mode}_120'
                elif self.args.dataset_name == 'beer' and mode == 'test':
                    path = config.DATA_DIR / f'sentiment/data/target/beer_{aspect}.train'
                else:
                    raise ValueError('Dataset name not supported.')
                    
            if scale=='noise':
                if self.args.dataset_name == 'beer' and mode in ('train', 'dev'):
                    path = config.DATA_DIR / f'sentiment/data/source/beer_{aspect}.{mode}_noise'
                elif self.args.dataset_name == 'beer' and mode == 'test':
                    path = config.DATA_DIR / f'sentiment/data/target/beer_{aspect}.train'
            
     
              
                
            df = pd.read_csv(path, delimiter='\t')
            for index, row in df.iterrows():
                label = row['label']

                text = row['text']
                if 'rationale' in row:
                    rationale = [int(r) for r in row['rationale'].split()]
                else:
                    rationale = [-1] * len(row['text'].split())
                datapoints.append({
                    'label': label,
                    'text': text,
                    'rationale': rationale,
                })
            if self.args.debug:
              datapoints = datapoints[:200]
            return datapoints

    def _load_processed_data(self, mode):
        processed_datapoints = []
        datapoints = self._load_raw_data(mode)
        for datapoint in tqdm(datapoints, total=len(datapoints)):
            label = datapoint['label']
            # in this step token is correct
            input_tokens = ['[CLS]'] + datapoint['text'].split()
            rationale = datapoint['rationale']
            input_ids = []
            attention_mask = []
            rationale_ = []
            for input_token, r in zip(input_tokens, rationale):
                tokenized = self.tokenizer.encode_plus(input_token, add_special_tokens=False)
                input_ids += tokenized['input_ids']
                attention_mask += tokenized['attention_mask']
                ## make rationale cover subword
                rationale_ += [r] * len(tokenized['input_ids'])

                
            if  len(input_ids) >= self.args.max_length:
                input_ids = input_ids[:self.args.max_length - 1] + [102]
                attention_mask = attention_mask[:self.args.max_length - 1] + [1]
                rationale = rationale_[:self.args.max_length - 1] + [0]
            else:
                input_ids = input_ids + [102] #102 is [SEP]
                attention_mask = attention_mask + [1]
                rationale = rationale_ + [0]
                
            input_ids = self.pad(input_ids)
            attention_mask = self.pad(attention_mask)
            rationale = self.pad(rationale)

            assert len(input_ids) == self.args.max_length

            processed_datapoints.append({
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'label': label,
                'rationale': rationale,
            })
        return processed_datapoints

    def pad(self, seq):
        return seq + (self.args.max_length - len(seq)) * [0]


class SentimentDataset(BaseDataset):
    def __init__(self, args, data, tokenizer, tok_kwargs):
        super(SentimentDataset, self).__init__(args, data, tokenizer, tok_kwargs)

    def collater(self, batch):
        device = 'cuda' if self.args.use_cuda else 'cpu'
  
        return {
            'input_ids': torch.tensor([datapoint['input_ids'] for datapoint in batch]).long(),
            'attention_mask': torch.tensor([datapoint['attention_mask'] for datapoint in batch]).long(),
            'label': torch.tensor([datapoint['label'] for datapoint in batch]),
            'rationales': torch.tensor([datapoint['rationale'] for datapoint in batch]).long(),
        }
      

In [4]:
aspect='Aroma'
mode='train'
path = config.DATA_DIR / f'sentiment/data/source/beer_{aspect}.{mode}_120'
df= pd.read_csv(path, delimiter='\t')

In [5]:
df

Unnamed: 0,task,label,text,rationale,labels
0,beer1,0.8,"250ml screwtop bottle , poured into leffe chal...",0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 ...,1
1,beer1,0.9,appearance : deep brown color with a thin tan ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,1
2,beer1,0.9,purchased at the lake merritt whole foods in o...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,1
3,beer1,0.9,pours clear amber with a small white head . ar...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 ...,1
4,beer1,0.2,12oz . bottle poured into a snifter . no bottl...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0
...,...,...,...,...,...
14080,beer1,0.6,"a somewhat thin porter , black as hell with an...",0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 ...,1
14081,beer1,1.0,a- cloudy merky orange amber color . good head...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,1
14082,beer1,0.8,foamy head that retains its shape for a long t...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,1
14083,beer1,0.8,pours cloudy brownish red with no head . smell...,0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 ...,1


In [68]:
args.dataset_name='beer'


args.aspect='Look'


dl_look =SentiDataLoader(args)


aspect: Look
mode: train
scale: normal


100%|██████████| 15932/15932 [01:12<00:00, 219.54it/s]


[train] dataloader built => 15932 examples
aspect: Look
mode: dev
scale: normal


100%|██████████| 3757/3757 [00:17<00:00, 218.73it/s]


[dev] dataloader built => 3757 examples
aspect: Look
mode: test
scale: normal


100%|██████████| 200/200 [00:01<00:00, 159.08it/s]

[test] dataloader built => 200 examples





In [None]:
args.dataset_name='beer'


args.aspect='Look'

args.dataset_split='train'
dl_look_train =SentiDataLoaders(args)

args.dataset_split='dev'
dl_look_dev =SentiDataLoaders(args)

args.dataset_split='test'
dataloader_class = get_dataloader_class(args)
dl_look_test =SentiDataLoaders(args)

args.aspect='Aroma'

args.dataset_split='train'
dataloader_class = get_dataloader_class(args)
dl_aroma_train =SentiDataLoaders(args)

args.dataset_split='dev'
dataloader_class = get_dataloader_class(args)
dl_aroma_dev =SentiDataLoaders(args)

args.dataset_split='test'
dataloader_class = get_dataloader_class(args)
dl_aroma_test =SentiDataLoaders(args)

args.aspect='Palate'

args.dataset_split='train'
dataloader_class = get_dataloader_class(args)
dl_palate_train =SentiDataLoaders(args)

args.dataset_split='dev'
dataloader_class = get_dataloader_class(args)
dl_palate_dev =SentiDataLoaders(args)

args.dataset_split='test'
dataloader_class = get_dataloader_class(args)
dl_palate_test =SentiDataLoaders(args)

# Model

In [77]:
from transformers import AutoConfig,AutoModel,PreTrainedModel
class CustomModel(PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.backbone = AutoModel.from_config(config)

        self.output = nn.Linear(config.hidden_size, 1)
        self.output1 = nn.Linear(120, 1)
        self.sig= nn.Sigmoid()

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        labels=None,
    ):
        outputs = self.backbone(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
        )

        sequence_output = outputs.last_hidden_state
        outputs = self.output(sequence_output)
        print(outputs.shape)
        outputs = self.output1(outputs)
        outputs=self.sig(outputs)

            
        return {
            "outputs": outputs
        }

In [78]:
import copy
def train(model,dl_loader,args,device):
    num_batches = dl_loader.train.dataset.num_batches
    best_loss=100000
    
    #best_model=copy.deepcopy(model)
    global_step=0
    optimizer_class = get_optimizer_class(args)
    optimizer = optimizer_class(model.parameters(), lr=args.lr)
    

    # Number of training epochs (authors recommend between 2 and 4)
    epochs = args.num_epoch

    # trange is a tqdm wrapper around the normal python range
    for i in range(epochs):
       print(f'epoch {i}')
      # Training

      # Set our model to training mode (as opposed to evaluation mode)
       model.train()
       train_loss=0
      # Train the data for one epoch
       for batch_idx, batch in enumerate(dl_loader['train']):
                input_ids=batch['input_ids'].to(device)
                attention_mask=batch['attention_mask'].to(device)
                labels=batch['label'].to(device)
                output = model(input_ids=input_ids,attention_mask=attention_mask)
                loss_fn = nn.MSELoss()
                #loss = loss_fn(output['outputs'], labels.type(torch.cuda.FloatTensor))
                loss = loss_fn(output['outputs'], labels)
                train_loss=loss.item()
                global_step+=1
                
                loss.backward()

                optimizer.step()
                optimizer.zero_grad()
    
                if (global_step + 1) % args.print_every == 0:
                  #print(f'global step {global_step}, train_loss: {train_loss}')
                  with torch.no_grad():
                      # return acc is for binary classification on dev
                    model.eval()
                    total_loss = 0
                    m=0
                    for batch_idx, batch in enumerate(dl_loader['dev']):
                            input_ids=batch['input_ids'].to(device)
                            attention_mask=batch['attention_mask'].to(device)
                            labels=batch['labels'].to(device)
                            output = model(input_ids=input_ids,attention_mask=attention_mask)
                            loss_fn = nn.MSELoss()
                            loss = loss_fn(output['outputs'], labels.type(torch.cuda.FloatTensor))
                            total_loss += loss.item()
                            m+=1
                        
                    dev_loss = total_loss / m
                    print(f"[train] Epoch: {i} | "
            f"batch: {batch_idx} / {num_batches } (global step: {global_step})" | f"dev_loss: {dev_loss}")
                
                  if dev_loss < best_loss:
                    best_loss=dev_loss
                    print('dev_loss<best_loss, updated')
                    #best_model=copy.deepcopy(model)
                

                model.train()
    print(f"best loss: {best_loss}")
    return model
    

In [79]:

args.lr=5e-6

model_name = "bert-base-uncased"
device='cuda:3' if torch.cuda.is_available() else 'cpu'
#device='cpu'
cfg = AutoConfig.from_pretrained(model_name)
model_look= CustomModel(cfg)
model_look=model_look.to(device)

args.use_cuda=False
train(model_look,dl_look,args,device)

RuntimeError: CUDA error: invalid argument
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [56]:
example = next(iter(dl_look['train']))
example

tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
        1, 1, 0, 0, 1, 0, 1, 0])


{'input_ids': tensor([[  101,  2054,  1037,  ...,  5753,  4502,   102],
         [  101, 10364,  2015,  ...,     0,     0,     0],
         [  101,  2023,  2028,  ...,     0,     0,     0],
         ...,
         [  101,  2784,  2751,  ...,  2017,  4392,   102],
         [  101,  8542,  1037,  ...,  2232, 13028,   102],
         [  101,  1045,  2031,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'label': tensor([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
         1, 1, 0, 0, 1, 0, 1, 0]),
 'rationales': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [0, 1, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [0, 1, 1,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]])}

In [61]:
device

'cuda:3'

In [19]:
with torch.no_grad():
    torch.cuda.empty_cache()

In [76]:
from numba import cuda
cuda.select_device(3)
cuda.close()
cuda.select_device(3)


<weakproxy at 0x7f53cb89fb80 to Device at 0x7f53cb898e50>