In [1]:
import os
import json
import pandas as pd
import spacy
from tqdm import tqdm
from collections import defaultdict, Counter
from pprint import pprint

In [2]:
def get_result(eval_type, evaluate_id):
    evaluated_dpath = "/data/group1/z44383r/dev/rl-nlg/experiments/evaluate_model/outputs"
    evaluated_result_fpath = os.path.join(evaluated_dpath, eval_type, evaluate_id, "result.json")
    result = json.load(open(evaluated_result_fpath))
    return result

In [3]:
def compute_acc_by_da(result):
    def update_acc(acc, key, i, d, s):
        ids = f"{i}-{d}-{s}"
        if ids not in acc:
            acc[ids] = {'tp': 0, 'fn': 0, 'fp': 0}
        acc[ids][key] += 1

    acc = {}
    for r in tqdm(result):
        for i,d,s,v in r["tp_acts"]:
            update_acc(acc, 'tp', i, d, s)
        for i,d,s,v in r["fn_acts"]:
            update_acc(acc, 'fn', i, d, s)
        for i,d,s,v in r["fp_acts"]:
            update_acc(acc, 'fp', i, d, s)

    for ids in acc:
        acc[ids]['tp_ratio'] = acc[ids]['tp'] / sum(acc[ids].values())
        acc[ids]['fn_ratio'] = acc[ids]['fn'] / sum(acc[ids].values())
        acc[ids]['fp_ratio'] = acc[ids]['fp'] / sum(acc[ids].values())
        acc[ids]['num_acts'] = sum(acc[ids][key] for key in ['tp', 'fn', 'fp'])

    return acc

In [4]:
def compute_diff(gpt2_acc, ppo_acc):
    diff = {}
    for ids in gpt2_acc:
        if ids not in ppo_acc:
            continue
        diff[ids] = {
            'tp_ratio': ppo_acc[ids]['tp_ratio'] - gpt2_acc[ids]['tp_ratio'],
            'fn_ratio': ppo_acc[ids]['fn_ratio'] - gpt2_acc[ids]['fn_ratio'],
            'fp_ratio': ppo_acc[ids]['fp_ratio'] - gpt2_acc[ids]['fp_ratio'],
            'num_acts': gpt2_acc[ids]['num_acts'] + ppo_acc[ids]['num_acts']
        }
    return diff

In [15]:
def search_ppo_win_turns(ids, result_gpt2, result_ppo, is_win_wer=False):
    text_keys = ['gen_text', 'noised_text']
    acts_keys = ['tp_acts', 'fn_acts', 'fp_acts']
    def is_ids_in(acts):
        for i, d, s, v in acts:
            if ids == f'{i}-{d}-{s}':
                return True
        return False

    def tuple2strings(acts):
        return [f'{i}-{d}-{s}-{v}' for i,d,s,v in acts]

    ppo_win_turns = []
    for r_gpt2, r_ppo in zip(result_gpt2, result_ppo):
        if is_win_wer and r_gpt2['noised_wer'] < r_ppo['noised_wer']:
            continue
        if not is_ids_in(r_gpt2['tp_acts']) and is_ids_in(r_ppo['tp_acts']):
            ppo_win_turns.append({
                'action': tuple2strings(r_gpt2['action']),
                'gpt2': {
                    **{key: r_gpt2[key] for key in text_keys},
                    **{key: tuple2strings(r_gpt2[key]) for key in acts_keys}
                },
                'ppo': {
                    **{key: r_ppo[key] for key in text_keys},
                    **{key: tuple2strings(r_ppo[key]) for key in acts_keys}
                }
            })
    return ppo_win_turns

In [16]:
result_gpt2_bert_0 = get_result("baselines", "gpt2-bert-full-sys-background(0)")
acc_by_da_gpt2_bert_0 = compute_acc_by_da(result_gpt2_bert_0)

result_ppo_bert_0 = get_result("ppo", "bert-full-sys-noise-background(0)-seed12")
acc_by_da_ppo_bert_0 = compute_acc_by_da(result_ppo_bert_0)

diff_gpt2_ppo_bert_0 = compute_diff(acc_by_da_gpt2_bert_0, acc_by_da_ppo_bert_0)
pprint(sorted(diff_gpt2_ppo_bert_0.items(), key=lambda x: x[1]['tp_ratio'], reverse=True))

100%|██████████| 7372/7372 [00:00<00:00, 529205.33it/s]
100%|██████████| 7372/7372 [00:00<00:00, 553652.93it/s]

[('Inform-Police-Addr',
  {'fn_ratio': -1.0, 'fp_ratio': 0.0, 'num_acts': 2, 'tp_ratio': 1.0}),
 ('NoOffer-Hotel-Internet',
  {'fn_ratio': -1.0,
   'fp_ratio': 0.1724137931034483,
   'num_acts': 9,
   'tp_ratio': 0.8}),
 ('Request-Taxi-Depart',
  {'fn_ratio': -0.6237870081293455,
   'fp_ratio': 0.07758080086491426,
   'num_acts': 166,
   'tp_ratio': 0.5469227411610651}),
 ('Request-Taxi-Leave',
  {'fn_ratio': -0.05728465877021535,
   'fp_ratio': -0.3617903432143894,
   'num_acts': 230,
   'tp_ratio': 0.41949423815620995}),
 ('Request-Hotel-Parking',
  {'fn_ratio': -0.3673469387755102,
   'fp_ratio': -0.03700376767262281,
   'num_acts': 47,
   'tp_ratio': 0.4130434782608695}),
 ('OfferBook-Train-Day',
  {'fn_ratio': 0.0,
   'fp_ratio': -0.35714285714285715,
   'num_acts': 8,
   'tp_ratio': 0.4}),
 ('NoOffer-Attraction-none',
  {'fn_ratio': -0.5132210256957032,
   'fp_ratio': 0.13160838350719298,
   'num_acts': 33,
   'tp_ratio': 0.37777777777777777}),
 ('NoOffer-Train-none',
  {'fn_rati




In [17]:
ppo_win_turns = search_ppo_win_turns(ids='Request-Taxi-Leave',
                                     result_gpt2=result_gpt2_bert_0,
                                     result_ppo=result_ppo_bert_0,
                                     is_win_wer=True)
pprint(ppo_win_turns)

[{'action': ['Request-Taxi-Leave-?', 'Request-Taxi-Arrive-?'],
  'gpt2': {'fn_acts': ['Request-Taxi-Leave-?', 'Request-Taxi-Arrive-?'],
           'fp_acts': ['Request-Train-Leave-?', 'Request-Train-Arrive-?'],
           'gen_text': 'What time would you like to leave or arrive by?',
           'noised_text': 'What time do you like to leave arrive by?',
           'tp_acts': []},
  'ppo': {'fn_acts': [],
          'fp_acts': [],
          'gen_text': 'When would you like to leave the restaurant or arrive '
                      'by?',
          'noised_text': 'When would you like to leave the restaurant or '
                         'arrive by?',
          'tp_acts': ['Request-Taxi-Leave-?', 'Request-Taxi-Arrive-?']}},
 {'action': ['Request-Taxi-Leave-?'],
  'gpt2': {'fn_acts': ['Request-Taxi-Leave-?'],
           'fp_acts': ['Request-Booking-Time-?'],
           'gen_text': 'What time would you like to leave?',
           'noised_text': 'What time would like to Ruby.',
           'tp_

In [42]:
result_gpt2_milu_0 = get_result("baselines", "gpt2-checkpoints-milu-full-sys-background(0)")
acc_by_da_gpt2_milu_0 = compute_acc_by_da(result_gpt2_milu_0)

result_ppo_milu_0 = get_result("ppo", "milu-full-sys-noise-background(0)-seed12")
acc_by_da_ppo_milu_0 = compute_acc_by_da(result_ppo_milu_0)

diff_gpt2_ppo_milu_0 = compute_diff(acc_by_da_gpt2_milu_0, acc_by_da_ppo_milu_0)

pprint(sorted(diff_gpt2_ppo_milu_0.items(), key=lambda x: x[1]['tp_ratio'], reverse=True))

100%|██████████| 7372/7372 [00:00<00:00, 456902.34it/s]
100%|██████████| 7372/7372 [00:00<00:00, 523755.15it/s]

[('Inform-Police-Name',
  {'fn_ratio': -1.0, 'fp_ratio': 0.0, 'num_acts': 2, 'tp_ratio': 1.0}),
 ('NoOffer-Train-Day',
  {'fn_ratio': -0.2666666666666666,
   'fp_ratio': -0.27272727272727276,
   'num_acts': 5,
   'tp_ratio': 0.5}),
 ('Request-Hotel-Name',
  {'fn_ratio': 0.1288244766505636,
   'fp_ratio': -0.5173383317713215,
   'num_acts': 22,
   'tp_ratio': 0.380952380952381}),
 ('Request-Restaurant-Name',
  {'fn_ratio': -0.06428571428571428,
   'fp_ratio': -0.27225335195019784,
   'num_acts': 15,
   'tp_ratio': 0.3333333333333333}),
 ('Select-Hotel-none',
  {'fn_ratio': 0.10954026644787274,
   'fp_ratio': -0.39876218881511244,
   'num_acts': 132,
   'tp_ratio': 0.2876813375952791}),
 ('Recommend-Hotel-Parking',
  {'fn_ratio': -0.28180619644034277,
   'fp_ratio': 0.0,
   'num_acts': 36,
   'tp_ratio': 0.2777777777777778}),
 ('Request-Train-Dest',
  {'fn_ratio': -0.020973774567965675,
   'fp_ratio': -0.19421886581931327,
   'num_acts': 409,
   'tp_ratio': 0.21549384511001257}),
 ('Requ




In [43]:
ppo_win_turns = search_ppo_win_turns(ids='Request-Taxi-Leave',
                                     result_gpt2=result_gpt2_milu_0,
                                     result_ppo=result_ppo_milu_0)
pprint(ppo_win_turns)

[{'action': ['Request-Taxi-Arrive-?', 'Request-Taxi-Leave-?'],
  'gpt2': {'fn_acts': ['Request-Taxi-Leave-?'],
           'fp_acts': ['Request-Booking-Time-?'],
           'gen_text': 'What time would you like to leave or arrive by?',
           'noised_text': 'What time would you like to eat or arrive by?',
           'tp_acts': ['Request-Taxi-Arrive-?']},
  'ppo': {'fn_acts': ['Request-Taxi-Arrive-?'],
          'fp_acts': ['Request-Taxi-Depart-?'],
          'gen_text': 'When would you like to leave or arrive at the hotel by?',
          'noised_text': 'When would you like to leave or of the hotel by?',
          'tp_acts': ['Request-Taxi-Leave-?']}},
 {'action': ['Request-Taxi-Leave-?'],
  'gpt2': {'fn_acts': ['Request-Taxi-Leave-?'],
           'fp_acts': ['Request-Train-Leave-?'],
           'gen_text': 'What time would you like to leave?',
           'noised_text': 'What anytime would you like to leave?',
           'tp_acts': []},
  'ppo': {'fn_acts': [],
          'fp_acts': [