In [1]:
import torch
import numpy as np

class Sampler(object):
    r"""Base class for all Samplers.

    Every Sampler subclass has to provide an __iter__ method, providing a way
    to iterate over indices of dataset elements, and a __len__ method that
    returns the length of the returned iterators.
    """

    def __init__(self, data_source):
        pass

    def __iter__(self):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

class WeightedRandomSampler(Sampler):
    
    def __init__(self, labels, stratify = None, weights = None):
        self.labels = labels
        self.label_counts = labels.value_counts().to_dict()
        self.num_labels = len(self.label_counts.keys())
        
        if weights is None:
            self.weights = {key: 1 for key, val in self.label_counts.items()}
        else:
            self.weights = weights
        
        if stratify == 'increase':
            self.samples_per_label = {key: round(max(self.label_counts.values()) * weight) for key, weight in self.weights.items()}
        elif stratify == 'decrease':
            self.samples_per_label = {key: round(min(self.label_counts.values()) * weight) for key, weight in self.weights.items()}
        else:
            self.samples_per_label = {key: round(self.label_counts[key] * self.weights[key]) for key in self.weights} 
        
        self.num_samples = sum(self.samples_per_label.values())
        
    def __iter__(self):
        indices = []
        for lbl, amount in self.samples_per_label.items():
            label_data = self.labels[self.labels == lbl]
            label_counts = len(label_data)
            for i in range(amount // label_counts):
                indices += label_data.index[torch.randperm(label_counts)].tolist()
            indices += label_data.index[torch.randperm(amount % label_counts)].tolist()
        return iter(np.array(indices)[torch.randperm(self.num_samples)].tolist())
        
    def __len__(self):
        return self.num_samples

In [2]:
import torch
import pandas as pd
from ast import literal_eval
from tqdm import tqdm

class LM_Dataset(torch.utils.data.Dataset):
    
    def __init__(self, data_path, encoding_map, split = [0,1]):
        
        dataset = pd.read_csv(f'{data_path}', header = 0, index_col = 0).reset_index()
        dataset = dataset[round(len(dataset)*split[0]): round(len(dataset)*split[1])]
        tqdm.pandas()
        
        self.encodings = pd.DataFrame({key: dataset[val].progress_map(literal_eval) for key, val in encoding_map.items()})
        self.others = dataset.drop(encoding_map.values(), axis=1)
        self.others['labels'] = dataset['labels'].apply(lambda lbl: int(lbl))

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        return self.encodings.loc[idx].to_dict('list'), self.others.loc[idx]

In [3]:
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel
from transformers import GPT2TokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm

class Trainer():
    
    def __init__(self,
                 data_path,
                 save_path,
                 split = [0,1],
                 model_path = 'gpt2',
                 num_labels = 2,
                 stratify = None,
                 sample_weights = None,
                 batch_size = 16,
                 learning_rate = 5e-5,
                 num_epochs = 3,
                 warmup_percent = 1,
                ):
        
        encodings =  {'attention_mask': 'gold_mask', 'input_ids': 'gold_ids', 'labels': 'gold_ids'}
        self.dataset = LM_Dataset(data_path, encodings, split)
        self.save_path = save_path
        
        self.model = GPT2LMHeadModel.from_pretrained(model_path)
        
        self.wrs = WeightedRandomSampler(self.dataset.others['labels'], stratify = stratify, weights = sample_weights)
        sampler = torch.utils.data.sampler.BatchSampler(self.wrs, batch_size=batch_size,drop_last=False)
        
        self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2', use_fast = True)

        dataCollator = DataCollatorForLanguageModeling(tokenizer = self.tokenizer, mlm = False)
        self.train_loader = DataLoader(dataset = self.dataset, 
                                       batch_size = None, 
                                       collate_fn = self.train_collator, 
                                       sampler = sampler)
        
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate)
        self.num_epochs = num_epochs
        self.num_warmup_steps = round(num_epochs * len(self.train_loader) * warmup_percent / (batch_size * 100)) * batch_size
        self.num_training_steps = num_epochs * len(self.train_loader)
        self.lr_scheduler = get_scheduler("linear", 
                                     optimizer = self.optimizer, 
                                     num_warmup_steps = self.num_warmup_steps, 
                                     num_training_steps = self.num_training_steps)
    
    def train_collator(self, batch):
        encs = batch[0]
        max_len = max([len(l) for l in encs['input_ids']])
        
        pad = {'input_ids': self.tokenizer.eos_token_id, 'labels': -100, 'attention_mask': 0}
        for key in encs:
            for sample in encs[key]:
                encs[key] = torch.tensor(sample + [pad[key]] * (max_len - len(sample)))

        return encs
    
    def train(self, save = True, evaluator = None):
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(device)
        
        progress_bar_train = tqdm(range(self.num_training_steps))
        
        losses = []
        for epoch in range(self.num_epochs):
            self.model.train()
            for i, batch in enumerate(self.train_loader):
                enc = {k: v.to(device) for k, v in batch.items()}
                outputs = self.model(**enc)
                loss = outputs.loss
                losses += [loss.detach().cpu()]
                loss.backward()

                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.zero_grad()
                progress_bar_train.update(1)
                
                if i % round(len(self.train_loader)/3) == 1:
                    print(f'Average loss: {sum(losses)/len(losses)}')
                    losses = []
            if evaluator is not None:
                self.model.eval()
                evaluator.evaluate(self.model)
        if save:
            self.model.save_pretrained(self.save_path)

In [4]:
from torch.utils.data import DataLoader
from transformers import GPT2LMHeadModel
from transformers import GPT2TokenizerFast
from transformers import DataCollatorForLanguageModeling
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
from datasets import load_metric
import json

class Evaluator():
    
    def __init__(self,
                 data_path,
                 split = [0,1],
                 stratify = None,
                 sample_weights = None,
                 batch_size = 16,
                 s1 = "",
                 s2 = "",
                 save_path = "evaluator_output",
                 gold_explanations = True
                ):
        
        encodings =  {'attention_mask': 'attention_mask', 'input_ids': 'input_ids'}
        self.dataset = LM_Dataset(data_path, encodings, split)
        
        self.wrs = WeightedRandomSampler(self.dataset.others['labels'], stratify = stratify, weights = sample_weights)
        sampler = torch.utils.data.sampler.BatchSampler(self.wrs, batch_size=batch_size,drop_last=False)
        
        self.tokenizer = GPT2TokenizerFast.from_pretrained('gpt2', use_fast = True)
        dataCollator = DataCollatorForLanguageModeling(tokenizer = self.tokenizer, mlm = False)
        self.test_loader = DataLoader(dataset = self.dataset, 
                                       batch_size = None, 
                                       collate_fn = self.test_collator, 
                                       sampler = sampler)
        
        self.bertscore = load_metric('bertscore')
        
        # UGLY CODE, FIX THIS!
        self.s1 = s1
        self.s2 = s2
        self.save_path = save_path
        self.gold_explanations = gold_explanations
        
    def test_collator(self, batch):
        max_len = max([len(l) for l in batch[0]['input_ids']])
        
        pad = {'input_ids': self.tokenizer.eos_token_id, 'attention_mask': 0}
        for key in batch[0]:
            batch[0][key] = torch.tensor([[pad[key]] * (max_len - len(sample)) + sample for sample in batch[0][key]])
                
        return batch[0], batch[1]
    
    def evaluate(self, model, save = True):
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        model.to(device)

        progress_bar = tqdm(range(len(self.test_loader)))
        
        inputs = []
        explanations = []
        gold_exps = []
        groups = []
        s1s = []
        cls_labels = []
        s2s = []
        model.eval()
        for i, batch in enumerate(self.test_loader):
            enc = {k: v.to(device) for k, v in batch[0].items()}
            with torch.no_grad():
                generation = model.generate(
                    input_ids = enc['input_ids'],
                    attention_mask = enc['attention_mask'],
                    do_sample = True,
                    max_length = 100,
                    temperature = 0.7,
                    top_k = 50,
                    top_p = 0.7
                )
                batch_input = self.tokenizer.batch_decode(enc['input_ids'].detach(), skip_special_tokens=True)
                groups += batch[1]['groups'].tolist() #[int(id) for id in batch['other']['id']]
                s1s += batch[1][self.s1].tolist()
                s2s += batch[1][self.s2].tolist()
                cls_labels += [int(cls) for cls in batch[1]['labels']]
                preds = []
                for i, e in zip(batch_input, self.tokenizer.batch_decode(generation.detach(), skip_special_tokens=True)):
                    preds += [e[len(i):].strip()]
                
                if self.gold_explanations: 
                    golds = batch[1]['explanations'].tolist()
                    self.bertscore.add_batch(predictions=preds, references=golds)
                    gold_exps += golds
                    
                inputs += batch_input
                explanations += preds

            progress_bar.update(1)
        if save:
            if self.gold_explanations:
                with open('../../data/generated/{}.json'.format(self.save_path), 'w') as f:
                    json.dump({"groups": groups, 
                               self.s1: s1s, 
                               self.s2: s2s, 
                               "generated": explanations, 
                               "gold_explanations": gold_exps,
                               "labels": cls_labels}, 
                              f)

                res = self.bertscore.compute(lang = 'en')
                print(f'{round(sum(res["precision"])/len(res["precision"]), 2)}\t\t|\t'+
                      f'{round(sum(res["recall"])/len(res["recall"]), 2)}\t\t|\t'+
                      f'{round(sum(res["f1"])/len(res["f1"]), 2)}')

                with open('../../data/generated/{}_bertscores.json'.format(self.save_path), 'w') as f:
                    json.dump({'Precision': round(sum(res["precision"])/len(res["precision"]), 2),
                               'Recall': round(sum(res["recall"])/len(res["recall"]), 2),
                               'F1-score': round(sum(res["f1"])/len(res["f1"]), 2)}, 
                              f)
            else:
                with open('../../data/generated/{}.json'.format(save_path), 'w') as f:
                    json.dump({"groups": groups, 
                               self.s1: s1s, 
                               self.s2: s2s, 
                               "generated": explanations, 
                               "labels": cls_labels}, 
                              f)

            with open('../../data/generated/{}.json'.format(self.save_path), 'r') as f:
                json_explanations = json.load(f)

In [6]:
trainer = Trainer(data_path = '../../data/tokenized/gpt2/comve/train.csv', 
                  save_path = 'ComVE/Generator_v2', 
                  #model_path = 'ECQA/Generator', 
                  #split = [0,0.01],
                  sample_weights = {0: 0, 1: 1},
                  num_epochs = 3, 
                  batch_size = 32,
                  stratify = 'decrease')
evaluator = Evaluator(data_path = '../../data/tokenized/gpt2/comve/test.csv', 
                      #split = [0,0.01], 
                      batch_size = 32,
                     s1 = "correct",
                     s2 = "incorrect",
                     save_path = "comve_test_v2",
                     gold_explanations = True)

  0%|          | 0/49997 [00:00<?, ?it/s]

  0%|          | 0/49997 [00:00<?, ?it/s]

  0%|          | 0/49997 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

  0%|          | 0/5000 [00:00<?, ?it/s]

In [7]:
trainer.train(True, evaluator)

  0%|          | 0/1875 [00:00<?, ?it/s]

Average loss: 4.623959541320801
Average loss: 2.877784252166748
Average loss: 2.6743533611297607


  0%|          | 0/157 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

0.87		|	0.87		|	0.87
Average loss: 2.5468804836273193
Average loss: 2.5258471965789795
Average loss: 2.4752109050750732


  0%|          | 0/157 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

0.87		|	0.87		|	0.87
Average loss: 2.399286985397339
Average loss: 2.37255597114563
Average loss: 2.3708667755126953


  0%|          | 0/157 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

0.88		|	0.87		|	0.87


In [2]:
from transformers import GPT2LMHeadModel
from transformers import GPT2TokenizerFast

model = GPT2LMHeadModel.from_pretrained('ECQA/GPT2SingleTask')
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2', use_fast = True)

In [46]:
enc = tokenizer('Statement: What profession does a man have?\nStatement: Engineer\nExplanation:', return_tensors = 'pt')
generation = model.generate(
                    input_ids = enc['input_ids'],
                    attention_mask = enc['attention_mask'],
                    do_sample = True,
                    max_length = 100,
                    temperature = 0.7,
                    top_k = 50,
                    top_p = 0.7
                )
tokenizer.batch_decode(generation.detach(), skip_special_tokens=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Statement: What profession does a man have?\nStatement: Engineer\nExplanation: Engineer is not a profession']