## Import packages and Load Data

In [17]:
import re
import csv

# Read corrections/abbreviations dictionary
def load_dictionary(file):
    """Load a two-column CSV into a ``{key: [values]}`` mapping sorted by key.

    The first line of the file is treated as a header and skipped. For each
    remaining row, column 0 is the key and column 1 is appended to that
    key's list, so a key that appears on several rows collects all of its
    values in file order.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A dict mapping each first-column string to the list of its
        second-column strings, with keys inserted in sorted order. An
        empty file yields an empty dict.
    """
    dictionary = {}
    # newline='' lets the csv module do its own line-ending handling, which
    # it needs in order to parse quoted fields containing line breaks.
    with open(file, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Ignore header; tolerate a completely empty file
        for row in reader:
            # csv.reader yields [] for blank lines; skip those, and any row
            # lacking a second column, instead of failing with IndexError.
            if len(row) < 2:
                continue
            dictionary.setdefault(row[0], []).append(row[1])
    return dict(sorted(dictionary.items()))

# Lookup tables built from the corrections/abbreviations CSV files.
# The files are read in the order listed; each table maps a first-column
# entry to the list of second-column values recorded for it.
mwo_typos, maintnorm_typos, abbreviations = map(
    load_dictionary,
    (
        '../data/Corrections/mwo_corrections.csv',
        '../data/Corrections/maintnorm_corrections.csv',
        '../data/Corrections/abbreviations.csv',
    ),
)

In [37]:
# Preprocessing
def preprocess(string):
    """Pad punctuation, slashes and hyphens in ``string`` with spaces.

    Three substitutions are applied in order, each to the result of the
    previous one:

    1. Every punctuation character except ``/`` and ``-`` gets a space on
       both sides.
    2. A ``/`` is rewritten as `` / `` when it sits between two runs of at
       least three word characters (optional whitespace around it allowed).
    3. A ``-`` is treated the same way as the slash in step 2.

    No whitespace is collapsed afterwards, so the result may contain
    consecutive or trailing spaces.
    """
    rules = (
        # Punctuation marks, excluding slashes and hyphens
        (r'([!"#$%&\'()*+,.:;<=>?@[\\\]^_`{|}~])', r" \1 "),
        # Slashes (/) between two tokens of 3+ word characters
        (r"((\w{3,})\s*\/\s*)(?=\w{3,})", r"\2 / "),
        # Hyphens (-) between two tokens of 3+ word characters
        (r"((\w{3,})\s*-\s*)(?=\w{3,})", r"\2 - "),
    )

    text = string
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text