In [15]:
from collections import Counter

In [16]:
def read_data(path):
    """Read a blank-line-delimited, tab-separated edits file into sentences.

    Every non-blank line must hold at least two tab-separated columns: the
    input token (column 0) and its edit label (column 1). All occurrences of
    the literal '<s>' are removed from both columns. A blank line closes the
    current sentence; a trailing sentence without a closing blank line is
    still returned. Consecutive blank lines yield empty sentence entries.

    Args:
        path: Path of the TSV file to read.

    Returns:
        A list with one dict per sentence, each of the form
        {'sent': [token, ...], 'edits': [edit, ...]}.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
    """
    inputs, edits, sents = [], [], []

    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to be UTF-8, and the files hold non-ASCII text.
    with open(path, encoding='utf-8') as f:
        # Iterate lazily instead of materialising the whole file.
        for raw_line in f:
            stripped = raw_line.strip()
            if stripped:
                columns = stripped.split('\t')
                inputs.append(columns[0].replace('<s>', ''))
                edits.append(columns[1].replace('<s>', ''))
            else:
                sents.append({'sent': inputs, 'edits': edits})
                inputs, edits = [], []

    # Flush the last sentence when the file does not end with a blank line.
    if inputs or edits:
        sents.append({'sent': inputs, 'edits': edits})

    return sents

In [17]:
def read_splits(data_dir, dataset='qalb14'):
    """Load the train/dev edit files for every compression and granularity.

    Files are read from
    '{data_dir}/{edits_no_compressed|edits_compressed}/{dataset}/{word|subword}-level/{train|dev}_edits.modeling.tsv'.

    Args:
        data_dir: Root directory containing the edits folders.
        dataset: Name of the dataset sub-directory.

    Returns:
        A nested dict indexed as result[compression][granularity][split],
        where compression is 'compressed' or 'no-compressed', granularity is
        'word' or 'subword', and split is 'train' or 'dev'. Each leaf is the
        list returned by read_data.
    """
    folder_of = {'no-compressed': 'edits_no_compressed',
                 'compressed': 'edits_compressed'}

    # Pre-build the skeleton so the key order of the result is fixed
    # independently of the order in which the files are read.
    splits = {'compressed': {'word': {}, 'subword': {}},
              'no-compressed': {'word': {}, 'subword': {}}}

    for gran in ('word', 'subword'):
        for split in ('train', 'dev'):
            for compr in ('no-compressed', 'compressed'):
                path = (f'{data_dir}/{folder_of[compr]}/{dataset}/'
                        f'{gran}-level/{split}_edits.modeling.tsv')
                splits[compr][gran][split] = read_data(path)

    return splits


def read_splits_prune(data_dir, dataset='qalb14'):
    """Load the subword-level training edits for each pruning threshold.

    Training files come from '{data_dir}/edits_compressed_prune_{k}/...' for
    k in 10, 20 and 30. The dev file is read once from
    '{data_dir}/edits_compressed/...' and the same list object is shared by
    every entry of the result.

    Args:
        data_dir: Root directory containing the edits folders.
        dataset: Name of the dataset sub-directory.

    Returns:
        A dict keyed by 'prune_10', 'prune_20' and 'prune_30', each mapping
        to {'train': ..., 'dev': ...}.
    """
    train_by_threshold = {}
    for threshold in (10, 20, 30):
        train_by_threshold[threshold] = read_data(
            f'{data_dir}/edits_compressed_prune_{threshold}/{dataset}/subword-level/train_edits.modeling.tsv')

    dev_edits = read_data(
        f'{data_dir}/edits_compressed/{dataset}/subword-level/dev_edits.modeling.tsv')

    return {f'prune_{threshold}': {'train': train_edits, 'dev': dev_edits}
            for threshold, train_edits in train_by_threshold.items()}



def read_splits_prune_pnx_sep(data_dir, dataset='qalb14'):
    """Load the subword-level 'nopnx' and 'pnx' edit files, pruned and unpruned.

    Unpruned training files and both dev files come from
    '{data_dir}/edits_compressed_pnx_sep/...'; pruned training files come from
    '{data_dir}/edits_compressed_pnx_sep_prune_{k}/...' for k in 10, 20, 30.
    The dev list of each kind is shared by all entries of that kind.
    ('pnx' presumably stands for punctuation -- confirm with the data
    preparation code.)

    Args:
        data_dir: Root directory containing the edits folders.
        dataset: Name of the dataset sub-directory.

    Returns:
        A dict keyed by '{kind}_prune_{k}' with kind in 'nopnx', 'pnx' and k in
        0, 10, 20, 30 (0 meaning the unpruned files), each mapping to
        {'train': ..., 'dev': ...}.
    """
    kinds = ('nopnx', 'pnx')
    root = f'{data_dir}/edits_compressed_pnx_sep'

    def load_train(kind, threshold):
        # Threshold 0 denotes the unpruned folder; others carry a suffix.
        folder = root if threshold == 0 else f'{root}_prune_{threshold}'
        return read_data(
            f'{folder}/{dataset}/subword-level/train_edits_{kind}_edits.modeling.tsv')

    train = {}
    # Unpruned training data first, then the pruned variants kind by kind.
    for kind in kinds:
        train[(kind, 0)] = load_train(kind, 0)
    for kind in kinds:
        for threshold in (10, 20, 30):
            train[(kind, threshold)] = load_train(kind, threshold)

    dev = {}
    for kind in kinds:
        dev[kind] = read_data(
            f'{root}/{dataset}/subword-level/dev_edits_{kind}_edits.modeling.tsv')

    return {f'{kind}_prune_{threshold}': {'train': train[(kind, threshold)],
                                          'dev': dev[kind]}
            for kind in kinds
            for threshold in (0, 10, 20, 30)}


In [18]:
def data_stats_all(data, paper_print=False):
    """Print edit statistics for every compression/granularity combination.

    For each combination the OOV statistics of dev against train are obtained
    from get_oov_rate, then per split the total number of edits, the number
    of distinct edits and the number of error edits are computed. An edit
    counts as a non-error when it equals 'K*' or consists only of 'K'
    characters; every other edit counts as an error.

    Args:
        data: Nested dict indexed as data[compression][granularity][split],
            each leaf being a list of dicts with an 'edits' list.
        paper_print: When False, print one tab-separated row per split
            (split, granularity, compressed?, None, total edits, distinct
            edits, error edits; dev rows add token and type OOV counts with
            percentages). Otherwise print one row per combination
            (granularity, compressed?, 'All', None, distinct train edits,
            token OOV percentage).
    """
    def flatten(examples):
        # Concatenate the per-sentence edit lists into one flat list.
        return [edit for example in examples for edit in example['edits']]

    for compr in ('no-compressed', 'compressed'):
        is_compressed = compr == "compressed"
        for gran in ('word', 'subword'):
            splits = data[compr][gran]
            tok_oov, tok_oov_pct, type_oov, type_oov_pct = get_oov_rate(
                train_edits=flatten(splits['train']),
                test_edits=flatten(splits['dev']))

            for split in ('train', 'dev'):
                edits = flatten(splits[split])
                edits_cnt = Counter(edits)
                num_edits = len(edits)
                edits_uniq = len(edits_cnt)

                # Partition occurrences into error and non-error edits.
                errors = no_errors = 0
                for label, freq in edits_cnt.items():
                    if label == 'K*' or set(label) == {'K'}:
                        no_errors += freq
                    else:
                        errors += freq

                assert errors + no_errors == sum(edits_cnt.values())

                if paper_print == False:
                    row = (f'{split}\t{gran}\t{is_compressed}\t{None}\t{num_edits}\t{edits_uniq}'
                           f'\t{errors}')
                    if split == 'dev':
                        row += f'\t{tok_oov} ({tok_oov_pct:.2f}%)\t{type_oov} ({type_oov_pct:.2f}%)'
                    print(row)
                elif split == 'dev':
                    # Relies on 'train' having been visited just before 'dev'.
                    print(f'{gran}\t{is_compressed}\tAll\t{None}\t{unique_train_edits}\t{tok_oov_pct:.2f}%')
                else:
                    unique_train_edits = edits_uniq


def data_stats_prune(data, paper_print=False):
    """Print edit statistics for the pruned subword-level data.

    For each pruning threshold (10, 20, 30) the OOV statistics of dev against
    train are obtained from get_oov_rate, then per split the total number of
    edits, the number of distinct edits and the number of error edits are
    computed. An edit counts as a non-error when it equals 'K*' or consists
    only of 'K' characters; every other edit counts as an error.

    Args:
        data: Dict keyed by 'prune_10', 'prune_20' and 'prune_30', each
            mapping 'train' and 'dev' to a list of dicts with an 'edits' list.
        paper_print: When False, print one tab-separated row per split
            (split, 'subword', 'True', threshold, total edits, distinct
            edits, error edits; dev rows add token and type OOV counts with
            percentages). Otherwise print one row per threshold ('subword',
            'True', 'All', threshold, distinct train edits, token OOV
            percentage).
    """
    def flatten(examples):
        # Concatenate the per-sentence edit lists into one flat list.
        return [edit for example in examples for edit in example['edits']]

    for threshold in (10, 20, 30):
        splits = data[f'prune_{threshold}']
        tok_oov, tok_oov_pct, type_oov, type_oov_pct = get_oov_rate(
            train_edits=flatten(splits['train']),
            test_edits=flatten(splits['dev']))

        for split in ('train', 'dev'):
            edits = flatten(splits[split])
            edits_cnt = Counter(edits)
            num_edits = len(edits)
            edits_uniq = len(edits_cnt)

            # Partition occurrences into error and non-error edits.
            errors = no_errors = 0
            for label, freq in edits_cnt.items():
                if label == 'K*' or set(label) == {'K'}:
                    no_errors += freq
                else:
                    errors += freq

            assert errors + no_errors == sum(edits_cnt.values())

            if paper_print == False:
                row = (f'{split}\tsubword\tTrue\t{threshold}\t{num_edits}\t{edits_uniq}'
                       f'\t{errors}')
                if split == 'dev':
                    row += f'\t{tok_oov} ({tok_oov_pct:.2f}%)\t{type_oov} ({type_oov_pct:.2f}%)'
                print(row)
            elif split == 'dev':
                # Relies on 'train' having been visited just before 'dev'.
                print(f'subword\tTrue\tAll\t{threshold}\t{unique_train_edits}\t{tok_oov_pct:.2f}%')
            else:
                unique_train_edits = edits_uniq



def data_stats_prune_pnx_sep(data, paper_print=False):
    """Print edit statistics for the 'nopnx' and 'pnx' subword-level data.

    For each threshold (0, 10, 20, 30) and each kind ('nopnx', 'pnx') the OOV
    statistics of dev against train are obtained from get_oov_rate, then per
    split the total number of edits, the number of distinct edits and the
    number of error edits are computed. An edit counts as a non-error when it
    equals 'K*' or consists only of 'K' characters; every other edit counts
    as an error.

    Args:
        data: Dict keyed by '{kind}_prune_{threshold}', each mapping 'train'
            and 'dev' to a list of dicts with an 'edits' list.
        paper_print: When False, print one tab-separated row per split
            (kind, split, 'subword', 'True', threshold, total edits, distinct
            edits, error edits; dev rows add token and type OOV counts with
            percentages). Otherwise print one row per threshold and kind
            ('subword', 'True', kind, threshold, distinct train edits, token
            OOV percentage).
    """
    def flatten(examples):
        # Concatenate the per-sentence edit lists into one flat list.
        return [edit for example in examples for edit in example['edits']]

    for threshold in (0, 10, 20, 30):
        for exp in ('nopnx', 'pnx'):
            splits = data[f'{exp}_prune_{threshold}']
            tok_oov, tok_oov_pct, type_oov, type_oov_pct = get_oov_rate(
                train_edits=flatten(splits['train']),
                test_edits=flatten(splits['dev']))

            for split in ('train', 'dev'):
                edits = flatten(splits[split])
                edits_cnt = Counter(edits)
                num_edits = len(edits)
                edits_uniq = len(edits_cnt)

                # Partition occurrences into error and non-error edits.
                errors = no_errors = 0
                for label, freq in edits_cnt.items():
                    if label == 'K*' or set(label) == {'K'}:
                        no_errors += freq
                    else:
                        errors += freq

                assert errors + no_errors == sum(edits_cnt.values())

                if paper_print == False:
                    row = (f'{exp}\t{split}\tsubword\tTrue\t{threshold}\t{num_edits}\t{edits_uniq}'
                           f'\t{errors}')
                    if split == 'dev':
                        row += f'\t{tok_oov} ({tok_oov_pct:.2f}%)\t{type_oov} ({type_oov_pct:.2f}%)'
                    print(row)
                elif split == 'dev':
                    # Relies on 'train' having been visited just before 'dev'.
                    print(f'subword\tTrue\t{exp}\t{threshold}\t{unique_train_edits}\t{tok_oov_pct:.2f}%')
                else:
                    unique_train_edits = edits_uniq


def get_oov_rate(train_edits, test_edits):
    """Compute token-level and type-level out-of-vocabulary rates of edits.

    An edit is out-of-vocabulary (OOV) when it occurs in test_edits but never
    in train_edits.

    Args:
        train_edits: Iterable of hashable edit labels seen in training.
        test_edits: Iterable of hashable edit labels to check against them.

    Returns:
        A 4-tuple (oov, oov_percentage, type_oov, type_oov_percentage):
        the number of OOV test occurrences, that number as a percentage of
        all test occurrences, the number of distinct OOV test edits, and that
        number as a percentage of all distinct test edits. All four are zero
        when test_edits is empty.
    """
    train_edits_cnts = Counter(train_edits)
    test_edits_cnts = Counter(test_edits)

    # An empty test set has no OOVs; avoid dividing by zero.
    if not test_edits_cnts:
        return 0, 0.0, 0, 0.0

    # Hash-based membership keeps this linear; scanning the raw training
    # list once per test occurrence would be quadratic.
    unseen_types = [edit for edit in test_edits_cnts
                    if edit not in train_edits_cnts]

    oov = sum(test_edits_cnts[edit] for edit in unseen_types)
    oov_percentage = (oov / sum(test_edits_cnts.values())) * 100

    type_oov = len(unseen_types)
    type_oov_percentage = (type_oov / len(test_edits_cnts)) * 100

    return oov, oov_percentage, type_oov, type_oov_percentage

In [19]:
# Load every edit configuration for the 'qalb14-arabertv02' dataset.
# The path is relative, so it resolves against the kernel's working directory.
data_dir = '../data/msa-gec/edits/qalb14'
qalb14 = read_splits(data_dir=data_dir, dataset='qalb14-arabertv02')
qalb14_prune = read_splits_prune(data_dir=data_dir, dataset='qalb14-arabertv02')
qalb14_pnx_sep = read_splits_prune_pnx_sep(data_dir=data_dir, dataset='qalb14-arabertv02')

In [20]:
# Column header shared by the three paper-style tables printed below.
print(f'Gran.\tComp.\tSubset\tPrune\tUnique Edits\tToken OOVs')

# qalb14: all settings, then pruned, then the nopnx/pnx split, separated by blank lines.
data_stats_all(qalb14, paper_print=True)
print()
data_stats_prune(qalb14_prune, paper_print=True)
print()
data_stats_prune_pnx_sep(qalb14_pnx_sep, paper_print=True)

Gran.	Comp.	Subset	Prune	Unique Edits	Token OOVs
word	False	All	None	16221	1.00%
subword	False	All	None	9060	0.36%
word	True	All	None	10410	1.00%
subword	True	All	None	6170	0.36%

subword	True	All	10	683	0.75%
subword	True	All	20	442	1.02%
subword	True	All	30	329	1.24%

subword	True	nopnx	0	4799	0.27%
subword	True	pnx	0	160	0.01%
subword	True	nopnx	10	520	0.56%
subword	True	pnx	10	48	0.02%
subword	True	nopnx	20	335	0.75%
subword	True	pnx	20	35	0.05%
subword	True	nopnx	30	250	0.92%
subword	True	pnx	30	29	0.05%


In [21]:
# Load every edit configuration for the 'zaebuc-arabertv02' dataset.
# Note: this rebinds the module-level data_dir used by the earlier cell.
data_dir = '../data/msa-gec/edits/zaebuc'
zaebuc = read_splits(data_dir=data_dir, dataset='zaebuc-arabertv02')
zaebuc_prune = read_splits_prune(data_dir=data_dir, dataset='zaebuc-arabertv02')
zaebuc_pnx_sep = read_splits_prune_pnx_sep(data_dir=data_dir, dataset='zaebuc-arabertv02')

In [22]:
# Column header shared by the three paper-style tables printed below.
print(f'Gran.\tComp.\tSubset\tPrune\tUnique Edits\tToken OOVs')
# zaebuc: all settings, then pruned, then the nopnx/pnx split, separated by blank lines.
data_stats_all(zaebuc, paper_print=True)
print()
data_stats_prune(zaebuc_prune, paper_print=True)
print()
data_stats_prune_pnx_sep(zaebuc_pnx_sep, paper_print=True)

Gran.	Comp.	Subset	Prune	Unique Edits	Token OOVs
word	False	All	None	1097	2.94%
subword	False	All	None	905	1.85%
word	True	All	None	687	2.94%
subword	True	All	None	563	1.85%

subword	True	All	10	58	3.71%
subword	True	All	20	35	4.67%
subword	True	All	30	27	5.26%

subword	True	nopnx	0	498	1.74%
subword	True	pnx	0	23	0.06%
subword	True	nopnx	10	52	3.39%
subword	True	pnx	10	6	0.11%
subword	True	nopnx	20	30	4.31%
subword	True	pnx	20	6	0.11%
subword	True	nopnx	30	22	4.90%
subword	True	pnx	30	6	0.11%


In [23]:
# Load the edit configurations for the 'madar-arabertv02' dataset.
# Only the full and pruned splits are loaded here; no nopnx/pnx variant is read.
data_dir = '../data/da-gec/edits/madar'
madar = read_splits(data_dir=data_dir, dataset='madar-arabertv02')
madar_prune = read_splits_prune(data_dir=data_dir, dataset='madar-arabertv02')

In [24]:
# Column header shared by the two paper-style tables printed below.
print(f'Gran.\tComp.\tSubset\tPrune\tUnique Edits\tToken OOVs')
# madar: all settings, then pruned, separated by a blank line.
data_stats_all(madar, paper_print=True)
print()
data_stats_prune(madar_prune, paper_print=True)

Gran.	Comp.	Subset	Prune	Unique Edits	Token OOVs
word	False	All	None	1228	1.52%
subword	False	All	None	677	0.55%
word	True	All	None	741	1.52%
subword	True	All	None	454	0.55%

subword	True	All	10	84	1.33%
subword	True	All	20	52	2.02%
subword	True	All	30	45	2.28%
