In [9]:
import re
from string import punctuation
from camel_tools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
from collections import defaultdict, Counter

# Single-character punctuation/symbols: ASCII punctuation plus the Unicode
# punctuation/symbol set shipped with camel_tools.
_PNX_CHARS = punctuation + ''.join(UNICODE_PUNCT_SYMBOL_CHARSET)

# Value kept unchanged in case other cells read PNX directly.
PNX = _PNX_CHARS + '&amp;'

# The HTML entity '&amp;' is matched as a whole alternative. Putting it inside
# the character class (as before) only added the letters 'a', 'm' and 'p' to
# the class -- '&' and ';' are already in string.punctuation -- so plain Latin
# letters were being treated as punctuation.
pnx_patt = re.compile(r'(&amp;|[' + re.escape(_PNX_CHARS) + '])')

In [10]:
def read_data(path, encoding='utf-8'):
    """Read a text file and return its lines with surrounding whitespace removed.

    Args:
        path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            that non-ASCII (e.g. Arabic) content is decoded the same way on
            every platform instead of relying on the locale default.

    Returns:
        A list with one stripped string per line of the file (blank lines are
        kept as empty strings).
    """
    with open(path, encoding=encoding) as f:
        # Iterating the handle streams lines; no need to materialise readlines().
        return [line.strip() for line in f]

In [11]:
def reconstruct_edit(pnx_edit, no_pnx_edit):
    """Merge a punctuation edit string and a non-punctuation edit string.

    Both strings are tokenised into operations (`I_[...]`, `R_[...]`,
    `A_[...]`, a single `D`, a single `K`, or any other single character) and
    then walked in parallel, emitting one operation per step.

    Args:
        pnx_edit: Edit string holding the punctuation operations.
        no_pnx_edit: Edit string holding the remaining operations.

    Returns:
        The merged edit string.
    """
    token_re = re.compile(r'I_\[.*?\]+|R_\[.*?\]+|A_\[.*?\]+|D|K|.')

    def adds_text(op):
        """True for insert/append operations."""
        return op.startswith(('I', 'A'))

    def substitutes(op):
        """True for replace operations."""
        return op.startswith('R')

    pnx_ops = token_re.findall(pnx_edit)
    plain_ops = token_re.findall(no_pnx_edit)

    # Running tallies used by the insert/append heuristic below:
    #   - 'K' operations of the punctuation edit not yet consumed
    #   - non-insert/append operations of the other edit not yet paired
    pnx_keeps_left = pnx_ops.count('K')
    plain_left = sum(1 for op in plain_ops if not adds_text(op))

    merged = []
    p = q = 0

    while p < len(pnx_ops) and q < len(plain_ops):
        pnx_op = pnx_ops[p]
        plain_op = plain_ops[q]

        if pnx_op == 'K' and (plain_op in ('K', 'D', 'M') or substitutes(plain_op)):
            # Punctuation side has nothing to say here: take the other edit's op.
            merged.append(plain_op)
            pnx_keeps_left -= 1
            plain_left -= 1
            p += 1
            q += 1
        elif substitutes(pnx_op) and plain_op == 'K':
            # Punctuation replace paired with a plain 'K': take the replace.
            merged.append(pnx_op)
            plain_left -= 1
            p += 1
            q += 1
        elif adds_text(pnx_op) and pnx_keeps_left != 0 and plain_left == pnx_keeps_left:
            # Emit the punctuation insert/append only when the remaining
            # tallies on both sides line up.
            merged.append(pnx_op)
            p += 1
        else:
            # Otherwise advance the non-punctuation side only.
            merged.append(plain_op)
            q += 1

    # Whatever is left over: non-punctuation operations first, then punctuation.
    merged.extend(plain_ops[q:])
    merged.extend(pnx_ops[p:])

    return ''.join(merged)


In [34]:
def separate_pnx(edit):
    """
    Given an edit, returns two edits. One for pnx edits and one for no pnx edits.

    The edit string is tokenised into `I_[...]`, `R_[...]`, `A_[...]`
    operations, runs of `D`, runs of `K`, or any other single character.
    A bracketed operation whose payload (spaces ignored) is made up entirely
    of `pnx_patt` matches goes to the pnx edit; everything else goes to the
    no-pnx edit. Replace operations and non-bracketed tokens leave `K`
    placeholders on the opposite side.

    As a self-check, the two halves are merged back with `reconstruct_edit`
    and a diagnostic is printed when the result differs from `edit`.

    Args:
        edit: The combined edit string.

    Returns:
        dict with keys 'no_pnx_edit' and 'pnx_edit'. 'pnx_edit' is the empty
        string when no punctuation operation was found.
    """
    grouped_edits = re.findall(r'I_\[.*?\]+|R_\[.*?\]+|A_\[.*?\]+|D+|K+|.', edit)

    pnx_edit = ''
    no_pnx_edit = ''
    found_pnx = False

    for g_edit in grouped_edits:
        if g_edit.startswith(('A_[', 'I_[', 'R_[')):
            op = g_edit[0]
            # Payload between the brackets, with spaces dropped so that
            # something like '، "' still counts as punctuation only.
            seq = re.sub(op + r'_\[(.*?)\]', r'\1', g_edit).replace(' ', '')
            pnx_matches = pnx_patt.findall(seq)

            # Punctuation-only means: at least one match, and the matches
            # put back together cover the whole payload.
            if pnx_matches and ''.join(pnx_matches) == seq:
                pnx_edit += g_edit
                found_pnx = True
                if op == 'R':
                    # Replace consumes a token: leave a placeholder on the other side.
                    no_pnx_edit += 'K'
            else:
                no_pnx_edit += g_edit
                if op == 'R':
                    pnx_edit += 'K'
        else:
            # Runs of D / K, or any other single character.
            no_pnx_edit += g_edit
            # NOTE(review): the original guarded this line with
            # `not (startswith('I') and startswith('M') and startswith('A'))`,
            # which is always true (a string cannot start with all three).
            # The effective behaviour -- always pad with one 'K' per character
            # -- is kept; confirm whether an `or` was intended.
            pnx_edit += 'K' * len(g_edit)

    if not found_pnx:
        pnx_edit = ''

    result = {'no_pnx_edit': no_pnx_edit, 'pnx_edit': pnx_edit}

    # Round-trip self-check: report edits that do not merge back losslessly.
    re_edit = reconstruct_edit(pnx_edit=pnx_edit, no_pnx_edit=no_pnx_edit)
    if re_edit != edit:
        print(re_edit)
        print(edit)
        print(result)
        print()
    return result

In [35]:
# Load the edit strings, one per line, from a file relative to the working directory.
edits = read_data('compress_me.txt')

In [36]:
# Spot check on one edit mixing a K, a bracketed insert and a bracketed append.
separate_pnx('KI_[يين]A_[، "]')

> [0;32m/var/folders/b8/39h0_97j63lcj_0g_cml6phm0000gn/T/ipykernel_63042/878684517.py[0m(17)[0;36mseparate_pnx[0;34m()[0m
[0;32m     15 [0;31m            [0mseq[0m [0;34m=[0m [0mre[0m[0;34m.[0m[0msub[0m[0;34m([0m[0;34m' +'[0m[0;34m,[0m [0;34m''[0m[0;34m,[0m [0mseq[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     16 [0;31m            [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 17 [0;31m            [0;32mif[0m [0mpnx_patt[0m[0;34m.[0m[0mmatch[0m[0;34m([0m[0mseq[0m[0;34m)[0m [0;32mand[0m [0mpnx_patt[0m[0;34m.[0m[0mmatch[0m[0;34m([0m[0mseq[0m[0;34m)[0m[0;34m[[0m[0;36m1[0m[0;34m][0m [0;34m==[0m [0mseq[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     18 [0;31m                [0mpnx_edit[0m [0;34m+=[0m [0mg_edit[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     19 [0;31m                [0mfound_pnx[0m

In [15]:
# Spot check: merging 'KA_[.]' with 'K' should give back 'KA_[.]'.
reconstruct_edit(pnx_edit='KA_[.]', no_pnx_edit='K')

'KA_[.]'

In [17]:
# Show the split for every edit that has a punctuation component.
for edit in edits:
    parts = separate_pnx(edit)
    if not parts["pnx_edit"]:
        continue
    print(f'Edit:    {edit}')
    print(f'Pnx:     {parts["pnx_edit"]}')
    print(f'NoPnx:   {parts["no_pnx_edit"]}')
    print()

Edit:    I_[:]
Pnx:     I_[:]
NoPnx:   

Edit:    R_[!]
Pnx:     R_[!]
NoPnx:   K

Edit:    I_[.]
Pnx:     I_[.]
NoPnx:   

Edit:    I_[،]
Pnx:     I_[،]
NoPnx:   

Edit:    I_[؟]
Pnx:     I_[؟]
NoPnx:   

Edit:    R_[:]
Pnx:     R_[:]
NoPnx:   K

Edit:    R_[،]
Pnx:     R_[،]
NoPnx:   K

Edit:    I_[؛]
Pnx:     I_[؛]
NoPnx:   

Edit:    I_["]
Pnx:     I_["]
NoPnx:   

Edit:    R_[؟]
Pnx:     R_[؟]
NoPnx:   K

Edit:    R_[(]
Pnx:     R_[(]
NoPnx:   K

Edit:    R_[)]
Pnx:     R_[)]
NoPnx:   K

Edit:    R_[.]
Pnx:     R_[.]
NoPnx:   K

Edit:    R_["]
Pnx:     R_["]
NoPnx:   K

Edit:    I_[!]
Pnx:     I_[!]
NoPnx:   

Edit:    I_[-]
Pnx:     I_[-]
NoPnx:   

Edit:    R_[.]DDDD
Pnx:     R_[.]KKKK
NoPnx:   KDDDD

Edit:    R_[ ]I_[،]
Pnx:     KI_[،]
NoPnx:   R_[ ]

Edit:    R_[ا]I_[ ]R_[،]
Pnx:     KR_[،]
NoPnx:   R_[ا]I_[ ]K

Edit:    R_[ ]I_[.]
Pnx:     KI_[.]
NoPnx:   R_[ ]

Edit:    I_[(]
Pnx:     I_[(]
NoPnx:   

Edit:    I_[)]
Pnx:     I_[)]
NoPnx:   

Edit:    R_[!]I_[ ]KD
Pnx:     R_