## Import packages and Load Data

In [1]:
def load_data(file_path):
    """Read a tab-separated file into a list of column pairs, swapped.

    Every non-blank line is split on tab characters. The second field
    (stripped of surrounding whitespace, which includes the trailing
    newline) becomes the first tuple element and the first field becomes
    the second tuple element. Fields after the second are ignored.

    Blank lines and whitespace-only lines are skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of 2-tuples of strings, in file order.

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the file lazily instead of loading every line up front.
        for line_number, line in enumerate(file, start=1):
            fields = line.split('\t')  # split once and reuse both columns
            if len(fields) < 2:
                if not line.strip():
                    # Blank or whitespace-only separator line: nothing to load.
                    continue
                # Same exception type as a bare fields[1] lookup would raise,
                # but with enough context to locate the offending line.
                raise IndexError(
                    f'{file_path}:{line_number}: expected a tab-separated '
                    f'pair, got {line!r}'
                )
            data.append((fields[1].strip(), fields[0]))
    return data

# Load the three MaintNorm splits and pool them into one list of pairs.
train_data = load_data('../data/MaintNorm/train.norm')
test_data = load_data('../data/MaintNorm/test.norm')
val_data = load_data('../data/MaintNorm/val.norm')
full_data = train_data + test_data + val_data
# Remove duplicates while keeping first-seen order. dict.fromkeys is used
# instead of set() because the iteration order of a set of strings changes
# between interpreter runs (hash randomisation), which made every later
# step that depends on this order non-reproducible.
data = list(dict.fromkeys(full_data))

In [2]:
# Remove irrelevant pairs (e.g. same text, empty text, mask tags, etc.)
def remove_irrelevant(data):
    """Return the pairs from *data* that are worth keeping.

    A pair is dropped when any of the following holds:

    * both elements are equal, exactly or ignoring case;
    * the first element is the empty string;
    * the first element contains one of the mask tags listed below
      (matched case-sensitively, as a substring).

    Args:
        data: Iterable of indexable pairs of strings.

    Returns:
        A new list with the surviving pairs, in their original order.
    """
    mask_tags = ['<num>', '<number>', '<id>', '<sensitive>', '<date>']

    def is_relevant(first, second):
        # Guard clauses mirror the checks in their original order.
        if first == second or first == '':
            return False
        if first.lower() == second.lower():
            return False
        return not any(tag in first for tag in mask_tags)

    return [pair for pair in data if is_relevant(pair[0], pair[1])]

In [3]:
# Save into a corrections dictionary
def save_corrections(corrections, file_path):
    """Write correction pairs to a two-column CSV file.

    The file starts with the header row ``Correct,Wrong``; each pair then
    contributes one row holding its first and second element. Fields that
    contain a comma, a double quote or a line break are quoted by the csv
    module so the file stays parseable.

    Args:
        corrections: Iterable of indexable pairs; only elements 0 and 1 of
            each pair are written.
        file_path: Destination path. The file is created or overwritten and
            encoded as UTF-8.
    """
    import csv  # local import: the file has no visible top-level import block

    # newline='' is what the csv module requires of the file object; the
    # explicit '\n' terminator keeps the one-row-per-line layout.
    with open(file_path, 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        writer.writerow(('Correct', 'Wrong'))
        writer.writerows((pair[0], pair[1]) for pair in corrections)

In [4]:
# Remove irrelevant pairs and save the sorted corrections
corrections = remove_irrelevant(data)
# Sort on the whole pair, not just its first element: pairs that share a
# first element otherwise keep whatever arbitrary order `data` had, so the
# saved file could differ from one run to the next.
corrections = sorted(corrections)
save_corrections(corrections, '../data/Corrections/maintnorm_corrections.csv')