## Imports and Load Dictionaries

In [530]:
import re
import csv
import random
from openai import OpenAI
from nltk.corpus import cmudict
from Levenshtein import distance as levenshtein_distance

# Load a two-column CSV file as a dictionary {original: [variations]}
def load_dictionary(file):
    """Load a two-column CSV file into a ``{original: [variations]}`` dictionary.

    The first row is treated as a header and skipped. Every following row adds
    its second column to the list stored under its first column, so a key that
    appears on several rows collects several variations (in file order).
    Blank rows and rows with fewer than two columns are ignored.

    Args:
        file: Path of the CSV file to read.

    Returns:
        dict: Keys in sorted order, each mapped to its list of variations.
        An empty file yields an empty dictionary.
    """
    dictionary = {}
    # newline='' lets the csv module handle line endings itself
    with open(file, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Ignore header (and tolerate a completely empty file)
        for row in reader:
            if len(row) < 2:  # Blank or incomplete row: nothing to record
                continue
            original, variation = row[0], row[1]
            dictionary.setdefault(original, []).append(variation)
    return dict(sorted(dictionary.items()))

# Return a copy of the dictionary with its entries in random order
def shuffle_dictionary(d):
    """Build a new dictionary holding the entries of ``d`` in shuffled order.

    The input dictionary itself is left untouched.
    """
    pairs = [*d.items()]
    random.shuffle(pairs)  # In-place shuffle of the (key, value) pairs
    return {key: value for key, value in pairs}

# The API key is no longer hardcoded: with no `api_key` argument the OpenAI
# client reads it from the OPENAI_API_KEY environment variable.
# NOTE: the key previously committed on this line must be treated as leaked and revoked.
client = OpenAI()

## Introduce Contractions
- The `CONTRACTIONS_DICT` dictionary maps expanded forms to their common English contractions (`{expanded: [contractions]}`)

In [4]:
# Load contractions {expand: [contractions]}
# (load_dictionary uses the first CSV column as the key and collects the
# second column of every matching row into the list of values)
CONTRACTIONS_DICT = load_dictionary('data/Corrections/contractions.csv')

# Introduce contractions in a sentence (default probability=0.5)
def introduce_contractions(sentence, chance=0.5):
    """Randomly replace expanded forms in ``sentence`` with contractions.

    The entries of ``CONTRACTIONS_DICT`` are visited in random order. For each
    expanded form that occurs in the sentence as a whole word
    (case-insensitive), there is a ``chance`` probability that every
    occurrence is replaced by one randomly chosen contraction.

    Args:
        sentence: Text to modify.
        chance: Probability, per matching entry, of applying the replacement.

    Returns:
        str: The sentence with zero or more contractions introduced.
    """
    contractions = shuffle_dictionary(CONTRACTIONS_DICT) # Shuffle
    for expanded, contracted in contractions.items():
        # Escape the key so it is matched literally, and only as a whole word
        pattern = re.compile(r'\b' + re.escape(expanded) + r'\b', re.I)
        # Test with the same whole-word pattern that performs the replacement,
        # so the chance is only rolled for entries that can really be replaced
        if pattern.search(sentence) and random.random() < chance:
            replacement = random.choice(contracted)
            # A callable replacement inserts the text verbatim (no backslash
            # or group-reference processing of the contraction)
            sentence = pattern.sub(lambda _match: replacement, sentence)
    return sentence

In [5]:
# Demonstrate introduce_contractions
sentences = [
    "pump iS NOt working",
    "air horn does not work",
    "machine will not start",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, for alignment

for sentence in sentences:
    print(f"{sentence:<{spacing}} -> {introduce_contractions(sentence)}")

pump iS NOt working    -> pump iS NOt working
air horn does not work -> air horn doesn't work
machine will not start -> machine will not start


## Introduce Jargons
- The `ABBREVIATIONS_DICT` jargon dictionary consists of abbreviations and acronyms commonly used by technicians in Maintenance Work Order (MWO) records
- It was derived from the [MaintNorm](https://github.com/nlp-tlp/maintnorm) dataset, the [`mwo_corrections` dictionary](https://github.com/nlp-tlp/mudlark/blob/main/mudlark/dictionaries/mwo_corrections.csv) from [mudlark](https://github.com/nlp-tlp/mudlark), and the MWO Annotation Guidelines

In [6]:
# Load abbreviations {original: [variations]}
# (a term listed on several CSV rows collects all of its variations)
ABBREVIATIONS_DICT = load_dictionary('data/Corrections/abbreviations.csv')

# Introduce abbreviations in a sentence (default probability=0.3)
def introduce_abbreviations(sentence, chance=0.3):
    """Randomly replace terms in ``sentence`` with abbreviations.

    The entries of ``ABBREVIATIONS_DICT`` are visited in random order. For each
    original term that occurs in the sentence as a whole word
    (case-insensitive), there is a ``chance`` probability that every
    occurrence is replaced by one randomly chosen variation. The variation is
    first passed through ``add_periods``, which may turn an initialism into
    its dotted form.

    Args:
        sentence: Text to modify.
        chance: Probability, per matching entry, of applying the replacement.

    Returns:
        str: The sentence with zero or more abbreviations introduced.
    """
    abbreviations = shuffle_dictionary(ABBREVIATIONS_DICT) # Shuffle
    for original, variations in abbreviations.items():
        # Escape the key so it is matched literally, and only as a whole word
        pattern = re.compile(r'\b' + re.escape(original) + r'\b', re.I)
        # Test with the same whole-word pattern that performs the replacement,
        # so the chance is only rolled for entries that can really be replaced
        if pattern.search(sentence) and random.random() < chance:
            variation = add_periods(original, random.choice(variations))
            # A callable replacement inserts the text verbatim (no backslash
            # or group-reference processing of the variation)
            sentence = pattern.sub(lambda _match: variation, sentence)
    return sentence

# Add periods to abbreviations if abbreviation matches initials (default probability=0.5)
def add_periods(original, abbreviation, chance=0.5):
    """Possibly turn an initialism into its dotted form.

    The initials are the first characters of the whitespace-separated words of
    ``original``. When ``abbreviation`` equals those initials (ignoring case),
    there is a ``chance`` probability of returning the initials, taken from
    ``original``, each followed by a period. Otherwise ``abbreviation`` is
    returned unchanged.
    """
    first_letters = [token[0] for token in original.split()]
    initials = ''.join(first_letters)
    if initials.lower() != abbreviation.lower():
        return abbreviation  # Not an initialism of the original term
    if random.random() < chance:
        return '.'.join(initials) + '.'
    return abbreviation

In [7]:
# Demonstrate introduce_abbreviations
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
    "Tele-Remote might have issues",
    "cracks in fire suppression mounts",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, for alignment

for sentence in sentences:
    print(f"{sentence:<{spacing}} -> {introduce_abbreviations(sentence)}")

pump is not working                          -> pump is not working
blown o-ring on left hand lift cylinder      -> blown o-ring on left hand lift cylinder
compressor oil pressure switch unserviceable -> compressor oil pressure swt unserviceable
Tele-Remote might have issues                -> tel/rem might have issues
cracks in fire suppression mounts            -> cracks in fire suppression mounts


## Introduce Typos
- `KEYBOARD_DICT` dictionary consists of adjacent keys on a standard QWERTY keyboard
- `CMU_DICT` pronouncing dictionary consists of phonetic transcriptions of English words

### Types of Typos
| **Typo Type**                                         | **Example**      | **Typo**        |
|-------------------------------------------------------|------------------|-----------------|
| Missing space between words                           | air conditioner  | airconditioner  |
| Additional space within words                         | permalube        | perma lube      |
| Swapped adjacent characters                           | crack            | carck           |
| Missing characters in a word                          | crack            | crak            |
| Double-up characters in a word                        | crack            | craack          |
| Incorrect character in a word (due to keys proximity) | crack            | xrack           |
| Extra characters in a word (due to keys proximity)    | crack            | cracvk          |
| Incorrect spelling (homophones)                       | motor            | moter           |

In [695]:
# Load keyboard adjacent letters {key: [adjacent]}
KEYBOARD_DICT = load_dictionary('data/Corrections/keyboard.csv')
# Pronouncing dictionary from nltk's cmudict corpus; the code below reads it
# as CMU_DICT[word][0], i.e. the first pronunciation stored for a word.
# NOTE(review): presumably requires the cmudict corpus to be available locally
# (e.g. via nltk.download('cmudict')) - confirm for fresh environments.
CMU_DICT = cmudict.dict()

# Missing spaces in a sentence
def omit_space(sentence):
    """ Remove one randomly chosen space character from the sentence.

    A sentence without any space is returned unchanged.
    """
    positions = []
    cursor = sentence.find(' ')
    while cursor != -1:  # Collect the index of every space
        positions.append(cursor)
        cursor = sentence.find(' ', cursor + 1)
    if len(positions) == 0: # No spaces to omit
        return sentence
    victim = random.choice(positions)
    return ''.join((sentence[:victim], sentence[victim + 1:]))

# Extra space in a word
def add_space(word):
    """ Insert a single space at a random position strictly inside the word.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) >= 2:
        # Positions 1..len-1 keep the space away from both ends of the word
        split_at = random.randint(1, len(word) - 1)
        return f"{word[:split_at]} {word[split_at:]}"
    return word  # Not enough characters to add a space

# Swap adjacent letters in a word
def swap_adjacent(word):
    """ Exchange two neighbouring characters at a random position in the word.

    Words shorter than two characters are returned unchanged.
    """
    if len(word) <= 1: # Not enough letters to swap
        return word
    left = random.randint(0, len(word) - 2)
    chars = list(word)
    chars[left], chars[left + 1] = chars[left + 1], chars[left]
    return ''.join(chars)

# Missing letter in a word
def omit_letter(word):
    """ Drop one randomly chosen character from the word.

    Words shorter than three characters are returned unchanged.
    """
    if len(word) < 3: # Do not omit from short words
        return word
    dropped = random.randint(0, len(word) - 1)
    return ''.join(char for pos, char in enumerate(word) if pos != dropped)

# Double up a letter in a word
def double_letter(word):
    """ Repeat one randomly chosen character of the word.

    An empty word is returned unchanged.
    """
    if not word: # Not a word
        return word
    pos = random.randint(0, len(word) - 1)
    return word[:pos] + word[pos] * 2 + word[pos + 1:]

# Replace a letter in a word with an adjacent letter (keyboard)
def adjacent_key(word):
    """ Substitute one randomly chosen character with a keyboard neighbour.

    The neighbour is drawn from KEYBOARD_DICT. The word is returned unchanged
    when it is empty or when the chosen character has no entry there.
    """
    if not word: # Not a word
        return word
    pos = random.randint(0, len(word) - 1)
    if word[pos] not in KEYBOARD_DICT:
        return word
    neighbour = random.choice(KEYBOARD_DICT[word[pos]])
    return ''.join((word[:pos], neighbour, word[pos + 1:]))

# Add adjacent letter before or after a letter in a word
def adjacent_add(word):
    """ Insert a keyboard neighbour next to one randomly chosen character.

    The neighbour is drawn from KEYBOARD_DICT and goes immediately before or
    immediately after the chosen character with equal probability. The word is
    returned unchanged when it is empty or when the chosen character has no
    entry in KEYBOARD_DICT.
    """
    if not word: # Not a word
        return word
    pos = random.randint(0, len(word) - 1)
    if word[pos] not in KEYBOARD_DICT:
        return word
    extra = random.choice(KEYBOARD_DICT[word[pos]])
    # Insert at pos to add before the character, at pos + 1 to add after it
    insert_at = pos if random.random() < 0.5 else pos + 1
    return word[:insert_at] + extra + word[insert_at:]

# Replace word with its homophone
def replace_homophone(word):
    """ Replace a word with one of its homophones, if available.

    CMU_DICT is queried with the lowercased word. Candidates are other entries
    whose first pronunciation equals the word's first pronunciation and whose
    spelling is at most one edit (Levenshtein distance) away.

    Returns a randomly chosen candidate, or the input word exactly as given
    (original casing preserved) when the word has no pronunciation entry or
    no candidate is found.
    """
    lookup = word.lower()  # Used for lookups only, so the input stays intact
    if lookup not in CMU_DICT:
        return word  # No pronunciation found
    word_pron = CMU_DICT[lookup][0]
    # Same pronunciation, different spelling, Levenshtein distance <= 1
    homophones = [
        w for w, pron in CMU_DICT.items()
        if w != lookup and pron[0] == word_pron and levenshtein_distance(lookup, w) <= 1
    ]
    if homophones: # Homophones found
        return random.choice(homophones)
    return word # No homophones found

In [546]:
# Demonstrate homophones
for key in ABBREVIATIONS_DICT:
    word = key.lower()
    if word in CMU_DICT:
        word_pron = CMU_DICT[word][0]
        # Same first pronunciation, different spelling, at most one edit away
        homophones = [
            w for w, pron in CMU_DICT.items()
            if pron[0] == word_pron and w != word and levenshtein_distance(word, w) <= 1
        ]
        if homophones:
            print(f"{word:<15} -> {homophones}")

bearing         -> ['baring', 'behring', 'bering']
bearings        -> ['barings']
bracket         -> ['brackett']
check           -> ['chek']
christmas       -> ["christmas'"]
chute           -> ['shute']
communications  -> ["communications'"]
controls        -> ["control's"]
conveyor        -> ['conveyer']
copper          -> ['kopper']
corner          -> ['korner']
cross           -> ['cros', 'kross']
first           -> ['furst']
floor           -> ['flor']
for             -> ['fore', 'forr', 'four']
from            -> ['frum']
group           -> ['groupe']
half            -> ['haff']
hazard          -> ['hazzard']
hour            -> ['our']
kwik            -> ['cwik']
lights          -> ["light's"]
low             -> ['lo', 'loe', 'loh', 'lowe']
mobile          -> ['mobil']
park            -> ['parc', 'parke']
please          -> ['pleas', 'plese']
point           -> ['.point', 'pointe']
problems        -> ["problem's"]
regulators      -> ["regulator's", "regulators'"]
right          

In [681]:
# Demonstrate typo functions
test_sentence = "pump is not working"
spacing = len(test_sentence)
print(f"{'Omit space':<15}: {test_sentence:<{spacing}} -> {omit_space(test_sentence)}")

# Pick one word and apply every word-level typo function to it in turn
words = test_sentence.split()
word = random.choice(words)
space_sentence = ' '.join(add_space(w) if w == word else w for w in words)
swap_sentence = ' '.join(swap_adjacent(w) if w == word else w for w in words)
omit_sentence = ' '.join(omit_letter(w) if w == word else w for w in words)
double_sentence = ' '.join(double_letter(w) if w == word else w for w in words)
key_sentence = ' '.join(adjacent_key(w) if w == word else w for w in words)
add_sentence = ' '.join(adjacent_add(w) if w == word else w for w in words)
homophone_sentence = ' '.join(replace_homophone(w) if w == word else w for w in words)

for label, typo_sentence in (
    ("Add space", space_sentence),
    ("Swap adjacent", swap_sentence),
    ("Omit letter", omit_sentence),
    ("Double letter", double_sentence),
    ("Adjacent key", key_sentence),
    ("Adjacent add", add_sentence),
    ("Homophone", homophone_sentence),
):
    print(f"{label:<15}: {word:<{spacing}} -> {typo_sentence}")


Omit space     : pump is not working -> pump is notworking
Add space      : working             -> pump is not wor king
Swap adjacent  : working             -> pump is not wokring
Omit letter    : working             -> pump is not workin
Double letter  : working             -> pump is not worrking
Adjacent key   : working             -> pump is not workong
Adjacent add   : working             -> pump is not workingh
Homophone      : working             -> pump is not werking


In [732]:
# Introduce different typos in a sentence (default probability=0.1)
def introduce_typos(sentence, chance=0.1):
    """Randomly introduce typos into a sentence.

    The sentence is split on whitespace. Each word has a ``chance``
    probability of being passed through one randomly chosen word-level typo
    function. The words are then re-joined with single spaces, and the
    sentence as a whole has a ``chance`` probability of losing one space
    (the "missing space between words" typo implemented by ``omit_space``).

    Args:
        sentence: Text to modify.
        chance: Probability of a typo per word, and of a missing space per sentence.

    Returns:
        str: The sentence with zero or more typos introduced.
    """
    word_typos = [add_space, swap_adjacent, omit_letter, double_letter,
                  adjacent_key, adjacent_add, replace_homophone]

    # For each word, there is a chance to introduce a typo type
    words = sentence.split()
    for i, word in enumerate(words):
        if random.random() < chance:
            typo_func = random.choice(word_typos)
            words[i] = typo_func(word)
    sentence = ' '.join(words)

    # omit_space works on the whole sentence, so it cannot sit in word_typos;
    # apply it here so that this typo type is actually produced
    if random.random() < chance:
        sentence = omit_space(sentence)

    return sentence

# Demonstrate introduce_typos
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
    "Tele-Remote might have issues",
    "cracks in fire suppression mounts",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, for alignment

for sentence in sentences:
    print(f"{sentence:<{spacing}} -> {introduce_typos(sentence)}")

pump is not working                          -> pump is not working
blown o-ring on left hand lift cylinder      -> blown o-ring on left hand lift cylibnder
compressor oil pressure switch unserviceable -> comprdssor oil pressure switch unserviceable
Tele-Remote might have issues                -> Tele-Remote might have issues
cracks in fire suppression mounts            -> cracks in fire suppression mounts


## LLM for Introducing Typos

In [None]:
def llm_introduce_typos(openai, sentence):
    """Ask a chat model to introduce at least one typo into ``sentence``.

    Args:
        openai: Client object exposing ``chat.completions.create`` (the
            notebook passes its OpenAI ``client``).
        sentence: Text to send to the model.

    Returns:
        str: The content of the returned completion. If the response has no
        choices, or the completion has no content, ``sentence`` is returned
        unchanged.
    """
    prompt = f"Introduce at least one typo in the following sentence: {sentence}"
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "system", "content": "You make spelling mistakes in the given sentences."},
                  {"role": "user", "content": prompt}],
        temperature=0.9,
        top_p=0.9,
        n=1
    )

    # n=1 requests a single completion; guard against an empty choice list
    # instead of failing on an unbound local variable
    if not response.choices:
        return sentence
    output = response.choices[0].message.content
    return sentence if output is None else output

# Demonstrate llm_introduce_typos. `sentences` is not defined in this cell: it
# is the list left in scope by an earlier cell. Each iteration sends one
# chat-completion request through `client`.
for sentence in sentences:
    out = llm_introduce_typos(client, sentence)
    print(out)

## Humanise Sentence Process
- The `humanise_sentence` function takes a sentence and introduces contractions, jargon, and typos, in that order, so that it reads more like text written by a person:
1. Introduce contractions
2. Introduce jargons
3. Introduce typos

In [764]:
# Humanise a MWO sentence
def humanise_sentence(sentence):
    """Run a sentence through the full humanising pipeline.

    The stages are applied in order: contractions, then abbreviations, then
    typos. Each stage receives the output of the previous one.
    """
    pipeline = (introduce_contractions, introduce_abbreviations, introduce_typos)
    for stage in pipeline:
        sentence = stage(sentence)
    return sentence

# Demonstrate humanise_sentence
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
    "Tele-Remote might have issues",
    "cracks in fire suppression mounts",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, for alignment

for sentence in sentences:
    print(f"{sentence:<{spacing}} -> {humanise_sentence(sentence)}")

pump is not working                          -> pump is not working
blown o-ring on left hand lift cylinder      -> blown o-ring on l/hand lift cylind
compressor oil pressure switch unserviceable -> comp oil press switch unsergiceable
Tele-Remote might have issues                -> Tele-Rem ote mkght've issues
cracks in fire suppression mounts            -> cracks in fire suppression mounts
