In [2]:
from utils import load_data
from collections import defaultdict, Counter

In [39]:
def oov_rate(train_data, test_data):
    """Print and return out-of-vocabulary (OOV) statistics for test edits.

    Every ``subword_edit.edit`` value found under the
    ``'subword-edits-append'`` key of each example is counted, separately
    for ``train_data`` and ``test_data``. Three percentages are reported:

    * Type OOV Rate: share of distinct test edits that never occur in
      training (``'K*'`` is counted like any other type here).
    * OOV Mass w/o K*: share of test edit occurrences, ``'K*'`` excluded,
      whose edit never occurs in training.
    * Edit Coverage w/o K*: share of test edit occurrences, ``'K*'``
      excluded, whose edit does occur in training.

    Args:
        train_data: iterable of examples; ``example['subword-edits-append']``
            must be an iterable of objects exposing an ``edit`` attribute.
        test_data: same structure as ``train_data``.

    Returns:
        dict with float percentages under the keys ``'type_oov_rate'``,
        ``'oov_mass'`` and ``'coverage'``. A ratio whose denominator is zero
        (no test edits at all, or only ``'K*'`` edits) is reported as 0.0
        instead of raising ``ZeroDivisionError``.
    """
    # Edit label left out of the two "w/o K*" mass metrics.
    # NOTE(review): presumably the "keep / no change" edit -- confirm.
    ignored_edit = 'K*'

    def count_edits(data):
        # One count per occurrence of an edit across all examples.
        return Counter(
            subword_edit.edit
            for example in data
            for subword_edit in example['subword-edits-append']
        )

    def percent(part, whole):
        # An empty denominator means there is nothing to measure: report 0.
        return (part / whole) * 100 if whole else 0.0

    train_edits = count_edits(train_data)
    test_edits = count_edits(test_data)

    unseen_types = 0
    unseen_mass = 0
    covered_mass = 0

    for edit, count in test_edits.items():
        is_unseen = edit not in train_edits
        if is_unseen:
            unseen_types += 1

        if edit == ignored_edit:
            # The denominator below excludes 'K*', so the numerators must
            # exclude it too -- even when 'K*' itself is unseen in training.
            continue

        if is_unseen:
            unseen_mass += count
        else:
            covered_mass += count

    # Counter returns 0 for a missing key, so this is safe without 'K*'.
    mass_total = sum(test_edits.values()) - test_edits[ignored_edit]

    type_oov_rate = percent(unseen_types, len(test_edits))
    oov_mass = percent(unseen_mass, mass_total)
    coverage = percent(covered_mass, mass_total)

    print(f'Type OOV Rate:        {type_oov_rate:.2f}%')
    print(f'OOV Mass w/o K*:      {oov_mass:.2f}%')
    print(f'Edit Coverage w/o K*: {coverage:.2f}%')

    return {
        'type_oov_rate': type_oov_rate,
        'oov_mass': oov_mass,
        'coverage': coverage,
    }

In [46]:
# Load the dev edits once; this object is passed as `test_data` to oov_rate
# by the comparison loops that use `dev_data`.
dev_data = load_data('edits_outputs_compressed/qalb14/dev_edits.json')

In [50]:
# For each configuration directory under edits_outputs_compressed/, load its
# training edits and print the OOV statistics of `dev_data` against them.
# NOTE(review): `dev_data` is not reloaded per directory -- confirm that
# scoring every configuration against the same dev edits is intended.
for output_dir in ['qalb14', 'qalb14_5', 'qalb14_4', 'qalb14_3']:
    train_data = load_data(f'edits_outputs_compressed/{output_dir}/train_edits.json')
    print(f'{output_dir}:')
    oov_rate(train_data, dev_data)
    print()

qalb14:
Type OOV Rate:        26.54%
OOV Mass w/o K*:      2.03%
Edit Coverage w/o K*: 97.97%

qalb14_5:
Type OOV Rate:        28.98%
OOV Mass w/o K*:      7.52%
Edit Coverage w/o K*: 92.48%

qalb14_4:
Type OOV Rate:        31.53%
OOV Mass w/o K*:      8.54%
Edit Coverage w/o K*: 91.46%

qalb14_3:
Type OOV Rate:        35.24%
OOV Mass w/o K*:      11.58%
Edit Coverage w/o K*: 88.42%



In [53]:
# For each configuration directory under edits_outputs_compressed_prune_10/,
# load its training edits and print the OOV statistics of `dev_data` against
# them.
# NOTE(review): presumably the training edit inventory was pruned with a
# threshold of 10 -- confirm what the `prune_10` suffix denotes. `dev_data`
# itself is whatever was loaded earlier and is not reloaded here.
for output_dir in ['qalb14', 'qalb14_5', 'qalb14_4', 'qalb14_3']:
    train_data = load_data(f'edits_outputs_compressed_prune_10/{output_dir}/train_edits.json')
    print(f'{output_dir}:')
    oov_rate(train_data, dev_data)
    print()

qalb14:
Type OOV Rate:        51.38%
OOV Mass w/o K*:      5.45%
Edit Coverage w/o K*: 94.55%

qalb14_5:
Type OOV Rate:        52.55%
OOV Mass w/o K*:      9.06%
Edit Coverage w/o K*: 90.94%

qalb14_4:
Type OOV Rate:        54.25%
OOV Mass w/o K*:      10.10%
Edit Coverage w/o K*: 89.90%

qalb14_3:
Type OOV Rate:        57.22%
OOV Mass w/o K*:      13.12%
Edit Coverage w/o K*: 86.88%



In [54]:
# For each configuration directory under edits_outputs_compressed_prune_20/,
# load its training edits and print the OOV statistics of `dev_data` against
# them.
# NOTE(review): presumably the training edit inventory was pruned with a
# threshold of 20 -- confirm what the `prune_20` suffix denotes. `dev_data`
# itself is whatever was loaded earlier and is not reloaded here.
for output_dir in ['qalb14', 'qalb14_5', 'qalb14_4', 'qalb14_3']:
    train_data = load_data(f'edits_outputs_compressed_prune_20/{output_dir}/train_edits.json')
    print(f'{output_dir}:')
    oov_rate(train_data, dev_data)
    print()

qalb14:
Type OOV Rate:        61.68%
OOV Mass w/o K*:      6.47%
Edit Coverage w/o K*: 93.53%

qalb14_5:
Type OOV Rate:        62.95%
OOV Mass w/o K*:      10.14%
Edit Coverage w/o K*: 89.86%

qalb14_4:
Type OOV Rate:        64.54%
OOV Mass w/o K*:      11.14%
Edit Coverage w/o K*: 88.86%

qalb14_3:
Type OOV Rate:        66.14%
OOV Mass w/o K*:      14.05%
Edit Coverage w/o K*: 85.95%



In [55]:
# Load the two dev files of the pnx-separated output; they serve as the
# `test_data` side of the "No Pnx" / "PNX" comparisons.
# NOTE(review): presumably `nopnx` / `pnx` are the non-punctuation and
# punctuation edit splits -- confirm.
dev_data_nopnx = load_data('edits_outputs_compressed_pnx_sep/qalb14/dev_edits_nopnx.json')
dev_data_pnx = load_data('edits_outputs_compressed_pnx_sep/qalb14/dev_edits_pnx.json')


In [57]:
# For each configuration directory under edits_outputs_compressed_pnx_sep/,
# load both training files (nopnx and pnx) and print OOV statistics for each
# against the matching dev split (`dev_data_nopnx` / `dev_data_pnx`).
# NOTE(review): the saved output of this cell lists only qalb14, qalb14_5 and
# qalb14_4 -- re-run to confirm the qalb14_3 files load and report.
for output_dir in ['qalb14', 'qalb14_5', 'qalb14_4', 'qalb14_3']:
    # Both files are loaded before anything is printed for this directory.
    train_data_nopnx = load_data(f'edits_outputs_compressed_pnx_sep/{output_dir}/train_edits_nopnx.json')
    train_data_pnx = load_data(f'edits_outputs_compressed_pnx_sep/{output_dir}/train_edits_pnx.json')
    print(f'{output_dir}:')
    print('No Pnx:')
    oov_rate(train_data_nopnx, dev_data_nopnx)
    print('PNX:')
    oov_rate(train_data_pnx, dev_data_pnx)
    print()



qalb14:
No Pnx:
Type OOV Rate:        26.74%
OOV Mass w/o K*:      2.39%
Edit Coverage w/o K*: 97.61%
PNX:
Type OOV Rate:        10.87%
OOV Mass w/o K*:      0.08%
Edit Coverage w/o K*: 99.92%

qalb14_5:
No Pnx:
Type OOV Rate:        29.16%
OOV Mass w/o K*:      10.67%
Edit Coverage w/o K*: 89.33%
PNX:
Type OOV Rate:        8.70%
OOV Mass w/o K*:      0.06%
Edit Coverage w/o K*: 99.94%

qalb14_4:
No Pnx:
Type OOV Rate:        31.01%
OOV Mass w/o K*:      12.11%
Edit Coverage w/o K*: 87.89%
PNX:
Type OOV Rate:        8.70%
OOV Mass w/o K*:      0.06%
Edit Coverage w/o K*: 99.94%



In [None]:
# For each configuration directory under edits_outputs_compressed_pnx_sep_10/,
# load both training files (nopnx and pnx) and print OOV statistics for each
# against the matching dev split (`dev_data_nopnx` / `dev_data_pnx`).
# NOTE(review): this cell has no saved output (never executed in the saved
# notebook); the dev splits used here are not reloaded by this cell.
# Presumably `_10` is a pruning threshold -- confirm.
for output_dir in ['qalb14', 'qalb14_5', 'qalb14_4', 'qalb14_3']:
    # Both files are loaded before anything is printed for this directory.
    train_data_nopnx = load_data(f'edits_outputs_compressed_pnx_sep_10/{output_dir}/train_edits_nopnx.json')
    train_data_pnx = load_data(f'edits_outputs_compressed_pnx_sep_10/{output_dir}/train_edits_pnx.json')
    print(f'{output_dir}:')
    print('No Pnx:')
    oov_rate(train_data_nopnx, dev_data_nopnx)
    print('PNX:')
    oov_rate(train_data_pnx, dev_data_pnx)
    print()

In [None]:
# For each configuration directory under edits_outputs_compressed_pnx_sep_20/,
# load both training files (nopnx and pnx) and print OOV statistics for each
# against the matching dev split (`dev_data_nopnx` / `dev_data_pnx`).
# NOTE(review): this cell has no saved output (never executed in the saved
# notebook); the dev splits used here are not reloaded by this cell.
# Presumably `_20` is a pruning threshold -- confirm.
for output_dir in ['qalb14', 'qalb14_5', 'qalb14_4', 'qalb14_3']:
    # Both files are loaded before anything is printed for this directory.
    train_data_nopnx = load_data(f'edits_outputs_compressed_pnx_sep_20/{output_dir}/train_edits_nopnx.json')
    train_data_pnx = load_data(f'edits_outputs_compressed_pnx_sep_20/{output_dir}/train_edits_pnx.json')
    print(f'{output_dir}:')
    print('No Pnx:')
    oov_rate(train_data_nopnx, dev_data_nopnx)
    print('PNX:')
    oov_rate(train_data_pnx, dev_data_pnx)
    print()