In [1]:
# Turn off Jedi-based tab completion for this IPython session.
%config Completer.use_jedi = False

In [3]:
from prompted_gpt2 import PromptedGPT2Generator
from evaluator import Evaluator
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_colwidth', 100)
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import time
import json
import os

In [4]:
# Fallback hyper-parameters recorded for any run that does not override them.
default_seed = 2
default_lr = 1e-4
default_top_k = 10
default_sample_size = 32
default_note = ''

# One entry per prompt to evaluate. Each entry names the model, the machine,
# the transfer task, the data split, and the prompt text itself; leading
# spaces inside prompts are significant and must be kept as-is.
runs = [
    # --- distilgpt2 ---
    dict(model='distilgpt2', machine='ec2-82', task='pos2neg', data='500-test',
         prompt='Errorbad WorseException BAD'),
    dict(model='distilgpt2', machine='petuum-42', task='neg2pos', data='500-test',
         prompt=' RatingPros GOOD GOOD GOOD'),
    dict(model='distilgpt2', machine='ec2-82', task='pos2neg', data='500-train-random',
         prompt=' ErrorsError Parameters BADBad'),
    dict(model='distilgpt2', machine='ec2-82', task='neg2pos', data='500-train-random',
         prompt=' Description575 praises Excellent GREAT'),
    dict(model='distilgpt2', machine='petuum-203', task='pos2neg', data='100-train-first',
         prompt=' problemErrorBadExceptionBad'),
    dict(model='distilgpt2', machine='petuum-203', task='neg2pos', data='100-train-first',
         prompt=' mediumExcellentExcellent GREAT GREAT'),

    # --- gpt2 ---
    dict(model='gpt2', machine='petuum-203', task='pos2neg', data='500-test',
         prompt='Contents ERROR Values ERROR Values'),
    dict(model='gpt2', machine='petuum-203', task='neg2pos', data='500-test',
         prompt=' attributes happiest Parameters Happiness=['),

    # --- gpt2-medium ---
    dict(model='gpt2-medium', machine='petuum-42', task='pos2neg', data='500-test',
         prompt='icultyException ConditionException Either'),
    dict(model='gpt2-medium', machine='ec2-82', task='neg2pos', data='500-test',
         prompt=' value MeaningHappy positives (%'),

    # --- gpt2-large ---
    dict(model='gpt2-large', machine='ec2-82', task='pos2neg', data='500-test',
         prompt='Problem objection discrepancyDERR contrasts'),
    dict(model='gpt2-large', machine='ec2-94', task='pos2neg', data='500-test',
         prompt='AvailabilityDisable Suppose contradictory probabilities'),
    # The only entry that overrides a default (lr).
    dict(model='gpt2-large', machine='ec2-82', task='neg2pos', data='500-test',
         lr=5e-5,
         prompt='White happiest preferences (− happy'),

    # --- gpt2-xl ---
    dict(model='gpt2-xl', machine='ec2-82', task='pos2neg', data='500-test',
         prompt='Error [-either [-Neither'),
]

# (key, fallback) pairs, in the order they are appended to an entry.
_DEFAULT_FIELDS = (
    ('seed', default_seed),
    ('lr', default_lr),
    ('top_k', default_top_k),
    ('sample_size', default_sample_size),
    ('note', default_note),
)

# Fill the gaps in place: setdefault leaves explicit overrides untouched.
new_runs = []
for r in runs:
    for field, fallback in _DEFAULT_FIELDS:
        r.setdefault(field, fallback)
    new_runs.append(r)
runs = new_runs

In [5]:
# Sanity check: how many runs are configured.
len(runs)

14

In [6]:
def generate_outputs(run, 
                     device=None, 
                     reward_device=None, 
                     generator_device=None,
                     evaluator_device=None,
                     raw_save_path='./raw',
                     summary_save_path='./summary'): 
    """Generate outputs for one run entry, evaluate them, and save the results.

    Args:
        run: dict describing the run. The keys read here are 'model', 'task',
            'sample_size', 'top_k', 'prompt', 'lr', 'seed', 'machine', 'data'
            and 'note'.
        device: if not None, overrides reward_device, generator_device and
            evaluator_device with this single value.
        reward_device: forwarded to PromptedGPT2Generator.
        generator_device: forwarded to PromptedGPT2Generator.
        evaluator_device: forwarded to Evaluator.
        raw_save_path: directory that receives the per-sample CSV.
        summary_save_path: directory that receives the summary JSON.

    Returns:
        (summary, output_df): the summary dict returned by
        Evaluator.evaluate_output, extended with the run's metadata, the
        generation time and a timestamp; and the second value returned by
        evaluate_output (presumably a pandas DataFrame -- it is saved with
        .to_csv).

    Side effects:
        Prints the summary and writes '<name>.json' under summary_save_path
        and '<name>.csv' under raw_save_path, where <name> is
        '{task}_{model}_{data}_{machine}_{timestamp}'.
    """
    if device is not None: 
        reward_device = device
        generator_device = device
        evaluator_device = device

    # Create the output directories up front so a missing folder fails (or is
    # fixed) before the expensive generation instead of after it. An empty
    # path means "current directory" to os.path.join, so it is skipped.
    for save_dir in (raw_save_path, summary_save_path):
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

    model = run['model']
    task = run['task']
    sample_size = run['sample_size']
    top_k = run['top_k']

    # Placeholder prompts for the constructor; the prompt under test is
    # supplied through single_prompt below.
    dummy_prompts = {'pos2neg': '', 'neg2pos': ''}
    generator = PromptedGPT2Generator(model, 
                                      dummy_prompts,
                                      reward_device=reward_device,
                                      generator_device=generator_device)
    evaluator = Evaluator(evaluator_device)

    # Only the generation call is timed; evaluation is excluded.
    start = time.time()
    output_list = generator.sample_generate(task, 
                                            sample_size, 
                                            top_k=top_k, 
                                            top_p=None, 
                                            single_prompt=run['prompt'])
    time_elapsed = time.time() - start
    # Drop the reference before evaluation starts.
    del generator

    summary, output_df = evaluator.evaluate_output(task, 
                                                   output_list)
    # NOTE(review): the timestamp contains ':' which is not a legal filename
    # character on Windows; the format is kept so output names stay compatible
    # with files already written.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    del evaluator

    summary.update({'model': model,
                    'task': task,
                    'sample_size': sample_size,
                    'top_k': top_k,
                    'lr': run['lr'],
                    'seed': run['seed'],
                    'machine': run['machine'],
                    'data': run['data'],
                    'time_elapsed': round(time_elapsed, 2),
                    'timestamp': timestamp,
                    'note': run['note']})
    print(summary)

    output_name = f"{task}_{model}_{run['data']}_{run['machine']}_{timestamp}"
    # Context manager guarantees the JSON is flushed and the handle closed.
    with open(os.path.join(summary_save_path, output_name + '.json'), 'w') as summary_file:
        json.dump(summary, summary_file)
    output_df.to_csv(os.path.join(raw_save_path, output_name + '.csv'), index=False)

    return summary, output_df

In [9]:
# Single-run check on the second-to-last configured entry.
# NOTE(review): the output recorded for this cell reports lr=0.0001, while the
# second-to-last entry of `runs` is configured with lr=5e-5 -- the recorded
# output looks stale relative to the configuration; re-run to confirm.
run = runs[-2]
device_kwargs = dict(reward_device=3, generator_device=2, evaluator_device=3)
summary, output_df = generate_outputs(run, **device_kwargs)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 500/500 [11:28<00:00,  1.38s/it]


Comparing with reference...


100%|██████████| 500/500 [00:00<00:00, 4799.50it/s]


Running test classifier


100%|██████████| 500/500 [00:04<00:00, 105.57it/s]


Computing perplexity...


100%|██████████| 500/500 [00:00<00:00, 12851.46it/s]
100%|██████████| 500/500 [00:05<00:00, 94.01it/s]

{'sum_reward': 81.6, 'recon': 65.09, 'self_bleu': 38.83, 'ref_bleu': 21.45, 'style_acc': 0.95, 'ppl': 46.77, 'model': 'gpt2-large', 'task': 'neg2pos', 'sample_size': 32, 'top_k': 10, 'lr': 0.0001, 'seed': 2, 'machine': 'ec2-82', 'data': '500-test', 'time_elapsed': 688.97, 'timestamp': '2022-04-26_14:12:34', 'note': ''}





In [7]:
# Evaluate every configured run and keep each result. Previously the two
# accumulator lists were created but never appended to, so only the last
# run's summary/output_df survived the loop.
summaries, output_dfs = [], []
for r in runs: 
    summary, output_df = generate_outputs(r, 
                                          reward_device=3, 
                                          generator_device=2, 
                                          evaluator_device=3)
    summaries.append(summary)
    output_dfs.append(output_df)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
500it [03:48,  2.19it/s]


Comparing with reference...


100%|██████████| 500/500 [00:00<00:00, 5388.26it/s]


Running test classifier


100%|██████████| 500/500 [00:03<00:00, 128.80it/s]


Computing perplexity...


100%|██████████| 500/500 [00:00<00:00, 13777.84it/s]
100%|██████████| 500/500 [00:04<00:00, 100.86it/s]


{'sum_reward': 75.34, 'recon': 52.16, 'self_bleu': 22.89, 'ref_bleu': 14, 'style_acc': 0.99, 'ppl': 34.68, 'model': 'distilgpt2', 'task': 'neg2pos', 'sample_size': 32, 'top_k': 10, 'lr': 0.0001, 'seed': 2, 'machine': 'petuum-42', 'data': '500-test', 'time_elapsed': 228.76, 'timestamp': '2022-04-25_22:24:27', 'note': ''}


In [32]:
from tqdm import tqdm

# NOTE(review): `generator` and `evaluator` are not defined at notebook scope
# in the cells visible here (generate_outputs only creates and deletes its own
# locals), so this cell presumably depends on objects built in a cell that is
# not shown -- confirm it survives Restart & Run All.
src_sentences = generator.sentence_dict['src_neg2pos']
ref_sentences = evaluator.sentence_dict['ref_neg2pos']

# Run _select_output once per (source, reference) pair, passing each as a
# one-element list, and collect the results in order.
neg2pos_ref_output = []
for i, (src, ref) in tqdm(enumerate(zip(src_sentences, ref_sentences))):
    output = generator._select_output([ref], [src], 'LABEL_1', sample_id=i)
    neg2pos_ref_output.append(output)

500it [00:13, 38.04it/s]
