## Imports and Load Dictionaries

In [1]:
import os
import re
import csv
import random
import nltk
from dotenv import load_dotenv
from openai import OpenAI
from nltk.corpus import cmudict
from word_forms.word_forms import get_word_forms
from Levenshtein import distance as levenshtein_distance

# Load dictionaries
def load_dictionary(file):
    """Load a two-column CSV into a ``{original: [variations]}`` mapping.

    The first row is treated as a header and skipped. Column 0 is the key and
    column 1 is one variation; repeated keys accumulate their variations in
    file order. Rows with fewer than two columns (including blank lines) are
    skipped instead of raising ``IndexError``.

    Args:
        file: Path of the CSV file to read.

    Returns:
        dict: Keys sorted alphabetically, each mapped to a list of variations.
    """
    dictionary = {}
    # newline='' is what the csv module expects so that it handles line endings itself
    with open(file, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Ignore header (an empty file simply yields no rows)
        for row in reader:
            if len(row) < 2:  # Blank or malformed row
                continue
            original, variation = row[0], row[1]
            dictionary.setdefault(original, []).append(variation)
    return dict(sorted(dictionary.items()))

# Shuffle the dictionary
def shuffle_dictionary(d):
    """Return a new dict holding the items of ``d`` in a random order.

    The input mapping is left untouched; only the insertion order of the
    returned copy is randomised.
    """
    pairs = [(key, value) for key, value in d.items()]
    random.shuffle(pairs)  # In-place shuffle of the copied item list
    return {key: value for key, value in pairs}

# Load environment variables (presumably from a local .env file) so the API key is not hardcoded here
load_dotenv()
# os.getenv returns None when API_KEY is not set
api_key = os.getenv('API_KEY')
# Shared OpenAI client, passed to llm_introduce_typos in later cells
client = OpenAI(api_key=api_key)

## Introduce Contractions
- The `CONTRACTIONS_DICT` dictionary maps expanded phrases to their common English contractions (e.g. `is not` -> `isn't`)

In [2]:
# Load contractions {expanded: [contractions]}
# Keys are used as search phrases and values as candidate replacements by introduce_contractions
CONTRACTIONS_DICT = load_dictionary('../data/Corrections/contractions.csv')

# Introduce contractions in a sentence (default probability=0.5)
def introduce_contractions(sentence, chance=0.5):
    """Randomly replace expanded phrases in ``sentence`` with a contraction.

    Every key of ``CONTRACTIONS_DICT`` is visited in random order. When the
    key occurs in the sentence as a whole word/phrase (case-insensitive), it
    is replaced with probability ``chance`` by one randomly chosen
    contraction; all occurrences of that key receive the same contraction.

    Args:
        sentence: Text to modify.
        chance: Probability of contracting each phrase that is present.

    Returns:
        str: The sentence with zero or more phrases contracted.
    """
    contractions = shuffle_dictionary(CONTRACTIONS_DICT) # Shuffle
    for expanded, contracted in contractions.items():
        # Escape the key so it is matched literally, and use the same
        # whole-word pattern for both the presence test and the substitution.
        pattern = re.compile(r'\b' + re.escape(expanded) + r'\b', re.I)
        if pattern.search(sentence) and random.random() < chance:
            replacement = random.choice(contracted)
            # A callable replacement is inserted verbatim (no backslash/group expansion)
            sentence = pattern.sub(lambda _match: replacement, sentence)
    return sentence

In [3]:
# Demonstrate introduce_contractions on a few sample sentences
sentences = [
    "pump iS NOt working",
    "air horn does not work",
    "machine will not start",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, used to align the arrows
row_template = "{:<{}} -> {}"

for sentence in sentences:
    contracted = introduce_contractions(sentence)
    print(row_template.format(sentence, spacing, contracted))

pump iS NOt working    -> pump isn't working
air horn does not work -> air horn does not work
machine will not start -> machine will not start


## Introduce Jargons
- The `ABBREVIATIONS_DICT` jargon dictionary consists of abbreviations and acronyms commonly used by technicians in Maintenance Work Order (MWO) records
- The `ABBREVIATIONS_DICT` jargon dictionary was derived from the [MaintNorm](https://github.com/nlp-tlp/maintnorm) dataset, the [`mwo_corrections` dictionary](https://github.com/nlp-tlp/mudlark/blob/main/mudlark/dictionaries/mwo_corrections.csv) from [mudlark](https://github.com/nlp-tlp/mudlark), and the MWO Annotation Guidelines

### Types of Jargons
| Type of Jargon            | Example                 | Jargon       |
|---------------------------|-------------------------|--------------|
| Abbreviations             | service                 | svce         |
| Acronym / Initialism      | water treatment pump    | wtp          |
| Shortened words           | air conditioner         | air con      |
| Omit hyphens (-)          | o-ring                  | oring        |
| Include slashes (/)       | air conditioner         | a/c          |
| Include periods (.)       | water treatment pump    | w.t.p.       |
| Symbols                   | at                      | @            |
| Numerical abbreviations   | two way                 | 2 way        |

In [4]:
# Load abbreviations {original: [variations]}
# Keys are used as search phrases and values as candidate replacements by introduce_abbreviations
ABBREVIATIONS_DICT = load_dictionary('../data/Corrections/abbreviations.csv')

# Introduce abbreviations in a sentence (default probability=0.4)
def introduce_abbreviations(sentence, chance=0.4):
    """Randomly replace words/phrases in ``sentence`` with a jargon variation.

    Every key of ``ABBREVIATIONS_DICT`` is visited in random order. When the
    key occurs in the sentence as a whole word/phrase (case-insensitive), it
    is replaced with probability ``chance`` by one randomly chosen variation,
    which is first passed through ``add_periods``. All occurrences of that
    key receive the same variation.

    Args:
        sentence: Text to modify.
        chance: Probability of abbreviating each key that is present.

    Returns:
        str: The sentence with zero or more keys abbreviated.
    """
    abbreviations = shuffle_dictionary(ABBREVIATIONS_DICT) # Shuffle
    for original, variations in abbreviations.items():
        # Escape the key so it is matched literally, and use the same
        # whole-word pattern for both the presence test and the substitution.
        pattern = re.compile(r'\b' + re.escape(original) + r'\b', re.I)
        if pattern.search(sentence) and random.random() < chance:
            variation = add_periods(original, random.choice(variations))
            # A callable replacement is inserted verbatim (no backslash/group expansion)
            sentence = pattern.sub(lambda _match: variation, sentence)
    return sentence

# Add periods to abbreviations if condition is met (default probability=0.08)
def add_periods(original_word, abbreviated_word, chance=0.08):
    """Optionally decorate an abbreviation with period(s).

    Both inputs are lower-cased. With probability ``chance`` (drawn once per
    call) the abbreviation is punctuated according to how it relates to the
    original:

    * it equals the initials of the original's words -> ``w.t.p.``
    * it is a proper prefix of the first word, or a substring of the
      original that is not the first word itself -> ``abbr.``
    * its characters appear in the original in order -> ``abbr.``, unless it
      is just the original with hyphens removed or is one of the original's
      words, in which case it is returned without a period

    Otherwise the lower-cased abbreviation is returned unchanged.

    Args:
        original_word: The full word or phrase.
        abbreviated_word: The abbreviation chosen for it.
        chance: Probability of attempting to add period(s).

    Returns:
        str: The lower-cased abbreviation, possibly with period(s) added.
    """
    original = original_word.lower()
    abbreviation = abbreviated_word.lower()
    words = original.split()
    # Nothing to compare against (avoids indexing words[0] on an empty original)
    if not words or not abbreviation:
        return abbreviation
    # Single draw so the overall probability really is `chance`
    if not random.random() < chance:
        return abbreviation

    initials = ''.join(word[0] for word in words)
    # Check if abbreviation matches initials
    if initials == abbreviation:
        return '.'.join(initials) + '.'

    first_word = words[0]
    # Check if original starts with, or contains, the abbreviation
    if first_word != abbreviation and (first_word.startswith(abbreviation) or abbreviation in original):
        return abbreviation + '.'

    # Check if original has all abbreviation characters in order; the iterator
    # is shared across the checks so each match resumes after the previous one
    remaining = iter(original)
    if all(char in remaining for char in abbreviation):
        if original.replace('-', '') == abbreviation: # auto-greaser -> autogreaser
            return abbreviation
        if abbreviation in words:
            return abbreviation
        return abbreviation + '.'
    return abbreviation

In [5]:
# Demonstrate introduce_abbreviations on a few sample sentences
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
    "Tele-Remote might have issues",
    "cracks in fire suppression mounts",
    "replace broken lock",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, used to align the arrows
row_template = "{:<{}} -> {}"

for sentence in sentences:
    abbreviated = introduce_abbreviations(sentence)
    print(row_template.format(sentence, spacing, abbreviated))

pump is not working                          -> pmp is not working
blown o-ring on left hand lift cylinder      -> blown o-ring on l/hand lift cylinder
compressor oil pressure switch unserviceable -> compress oil pressure switch unserviceable
Tele-Remote might have issues                -> teleremote might have issues
cracks in fire suppression mounts            -> cracks in fire suppression mounts
replace broken lock                          -> replace brkn. lock


## Introduce Typos
- `KEYBOARD_DICT` dictionary consists of adjacent keys on a standard QWERTY keyboard
- `CMU_DICT` pronouncing dictionary consists of phonetic transcriptions of English words

### Types of Typos
| **Typo Type**                                         | **Example**      | **Typo**        |
|-------------------------------------------------------|------------------|-----------------|
| Missing space between words                           | air conditioner  | airconditioner  |
| Additional space within words                         | permalube        | perma lube      |
| Swapped adjacent characters                           | crack            | carck           |
| Missing characters in a word                          | crack            | crak            |
| Double-up characters in a word                        | crack            | craack          |
| Incorrect character in a word (due to keys proximity) | crack            | xrack           |
| Extra characters in a word (due to keys proximity)    | crack            | cracvk          |
| Incorrect spelling (homophones)                       | motor            | moter           |

In [6]:
# Load keyboard adjacent letters {key: [adjacent]}
KEYBOARD_DICT = load_dictionary('../data/Corrections/keyboard.csv')
# Download the cmudict corpus for NLTK (the captured output reports when it is already up-to-date)
nltk.download('cmudict')
# Word -> pronunciations lookup; replace_homophone compares the first entry, CMU_DICT[word][0]
CMU_DICT = cmudict.dict()

# Missing spaces in a sentence
def omit_space(sentence):
    """ Remove one randomly chosen space; a sentence without spaces is returned as-is. """
    candidates = []
    for position, character in enumerate(sentence):
        if character == ' ':
            candidates.append(position)
    if len(candidates) == 0: # Nothing to remove
        return sentence
    cut = random.choice(candidates)
    head, tail = sentence[:cut], sentence[cut + 1:]
    return head + tail

# Extra space in a word
def add_space(word):
    """ Split a word in two with a space at a random interior position. """
    length = len(word)
    if length >= 2:
        # Positions 1..length-1 keep at least one character on each side
        split_at = random.randint(1, length - 1)
        return ' '.join((word[:split_at], word[split_at:]))
    return word  # Too short to split

# Swap adjacent letters in a word
def swap_adjacent(word):
    """ Exchange a randomly chosen letter (never the first) with the letter after it. """
    if len(word) >= 3:
        left = random.randint(1, len(word) - 2)
        right = left + 1
        letters = list(word)
        letters[left], letters[right] = letters[right], letters[left]
        return ''.join(letters)
    return word # Fewer than three letters: nothing to swap

# Missing letter in a word
def omit_letter(word):
    """ Drop one randomly chosen letter (never the first) from a word of three or more letters. """
    if len(word) >= 3:
        drop = random.randint(1, len(word) - 1)
        return ''.join(letter for position, letter in enumerate(word) if position != drop)
    return word # Short words are left intact

# Double up a letter in a word
def double_letter(word):
    """ Randomly doubles one letter (never the first) in a given word.

    Words shorter than two characters are returned unchanged: there is no
    non-initial letter to double, and random.randint(1, 0) would raise
    ValueError.
    """
    if len(word) < 2: # No non-initial letter to double
        return word
    index = random.randint(1, len(word) - 1)
    return word[:index + 1] + word[index] + word[index + 1:]

# Replace a letter in a word with an adjacent letter (keyboard)
def adjacent_key(word):
    """ Randomly replaces a letter (never the first) with a keyboard-adjacent letter.

    Words shorter than two characters are returned unchanged, since
    random.randint(1, 0) would raise ValueError. The word is also returned
    unchanged when the chosen letter has no entry in KEYBOARD_DICT.
    """
    if len(word) < 2: # No non-initial letter to replace
        return word
    index = random.randint(1, len(word) - 1)
    letter = word[index]
    if letter in KEYBOARD_DICT:
        replacement = random.choice(KEYBOARD_DICT[letter])
        return word[:index] + replacement + word[index + 1:]
    return word

# Add adjacent letter before or after a letter in a word
def adjacent_add(word):
    """ Randomly inserts a keyboard-adjacent letter before or after a letter (never the first).

    Words shorter than two characters are returned unchanged, since
    random.randint(1, 0) would raise ValueError. The word is also returned
    unchanged when the chosen letter has no entry in KEYBOARD_DICT.
    """
    if len(word) < 2: # No non-initial letter to anchor the insertion
        return word
    index = random.randint(1, len(word) - 1)
    letter = word[index]
    if letter in KEYBOARD_DICT:
        addition = random.choice(KEYBOARD_DICT[letter])
        if random.random() < 0.5:
            return word[:index] + addition + word[index:]           # Add before
        else:
            return word[:index + 1] + addition + word[index + 1:]   # Add after
    return word

# Replace word with its homophone
def replace_homophone(word):
    """ Swap a word for a similarly spelled homophone when one exists.

    The word is lower-cased first. Candidates are the CMU_DICT entries whose
    first pronunciation equals the word's first pronunciation and whose
    spelling is within Levenshtein distance 1 of the word. If the word is not
    in CMU_DICT, or no candidate qualifies, the lower-cased word is returned.
    """
    lowered = word.lower()
    if lowered in CMU_DICT:
        target_pron = CMU_DICT[lowered][0]
        candidates = [
            other
            for other, prons in CMU_DICT.items()
            if prons[0] == target_pron
            and other != lowered
            and levenshtein_distance(lowered, other) <= 1
        ]
        if candidates: # At least one homophone found
            return random.choice(candidates)
    return lowered # No pronunciation or no homophone found

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [7]:
# Replace word with its form
def replace_forms(word):
    """ Replace a word with one of its forms, if available.

    All forms returned by get_word_forms are pooled, the word itself is
    removed, and one of the remaining forms is chosen at random. The pool is
    sorted before choosing so the result is reproducible for a given random
    seed (set iteration order for strings can differ between runs).

    Returns the word unchanged when it has no other forms.
    """
    response = get_word_forms(word)
    forms = set()
    for group in response.values():
        forms.update(group)
    forms.discard(word) # Remove original word (no-op if absent)
    if forms: # Forms found
        return random.choice(sorted(forms))
    return word # No forms found

# Demonstrate replace_forms on every key of the abbreviations dictionary
form_template = "{:<15} -> {}"
for key in ABBREVIATIONS_DICT.keys():
    new_word = replace_forms(key)
    print(form_template.format(key, new_word))

accident        -> accidents
accumulator     -> accumulators
adjust          -> adjuster
adjustment      -> adjuster
aftercooler     -> aftercooler
air conditioner -> air conditioner
air conditioner condenser -> air conditioner condenser
alternator      -> alternators
and             -> and
antiskid        -> antiskid
assembly        -> assemblies
at              -> ats
auto-greaser    -> auto-greaser
auto-lube       -> auto-lube
autogreaser     -> autogreaser
auxiliary       -> auxiliaries
battery         -> batteries
bearing         -> bearers
bearings        -> bearer
been            -> isn't
belt            -> belting
bill of materials -> bill of materials
boiler feed water pump -> boiler feed water pump
boilermaker     -> boilermaker
boilermakers    -> boilermakers
bottom          -> bottoming
bracket         -> bracketing
brake           -> brakes
break out       -> break out
breakdown maintenance -> breakdown maintenance
broken          -> breakablenesses
brown boveri and cie ->

In [8]:
# Demonstrate homophones: list the near-spelling homophones of each abbreviation key
homophone_template = "{:<15} -> {}"
for key in ABBREVIATIONS_DICT:
    word = key.lower()
    if word in CMU_DICT:
        word_pron = CMU_DICT[word][0]
        # Same first pronunciation, different spelling, at most one edit away
        homophones = [
            w
            for w, pron in CMU_DICT.items()
            if pron[0] == word_pron and w != word and levenshtein_distance(word, w) <= 1
        ]
        if homophones:
            print(homophone_template.format(word, homophones))

bearing         -> ['baring', 'behring', 'bering']
bearings        -> ['barings']
bracket         -> ['brackett']
check           -> ['chek']
christmas       -> ["christmas'"]
chute           -> ['shute']
communications  -> ["communications'"]
controls        -> ["control's"]
conveyor        -> ['conveyer']
copper          -> ['kopper']
corner          -> ['korner']
cross           -> ['cros', 'kross']
first           -> ['furst']
floor           -> ['flor']
for             -> ['fore', 'forr', 'four']
from            -> ['frum']
group           -> ['groupe']
half            -> ['haff']
hazard          -> ['hazzard']
hour            -> ['our']
kwik            -> ['cwik']
lights          -> ["light's"]
low             -> ['lo', 'loe', 'loh', 'lowe']
mobile          -> ['mobil']
park            -> ['parc', 'parke']
please          -> ['pleas', 'plese']
point           -> ['.point', 'pointe']
problems        -> ["problem's"]
regulators      -> ["regulator's", "regulators'"]
right          

In [9]:
# Demonstrate typo functions
test_sentence = "pump is not working"
spacing = len(test_sentence)
row_format = "{:<15}: {:<{}} -> {}"
print(row_format.format("Omit space", test_sentence, spacing, omit_space(test_sentence)))

words = test_sentence.split()
word = random.choice(words)  # The single word every word-level typo is applied to


def apply_to_chosen(typo_func):
    """ Rebuild the sentence with ``typo_func`` applied to each occurrence of the chosen word. """
    return ' '.join([typo_func(w) if w == word else w for w in words])


space_sentence = apply_to_chosen(add_space)
swap_sentence = apply_to_chosen(swap_adjacent)
omit_sentence = apply_to_chosen(omit_letter)
double_sentence = apply_to_chosen(double_letter)
key_sentence = apply_to_chosen(adjacent_key)
add_sentence = apply_to_chosen(adjacent_add)
homophone_sentence = apply_to_chosen(replace_homophone)

demo_rows = (
    ("Add space", space_sentence),
    ("Swap adjacent", swap_sentence),
    ("Omit letter", omit_sentence),
    ("Double letter", double_sentence),
    ("Adjacent key", key_sentence),
    ("Adjacent add", add_sentence),
    ("Homophone", homophone_sentence),
)
for label, result in demo_rows:
    print(row_format.format(label, word, spacing, result))


Omit space     : pump is not working -> pump is notworking
Add space      : not                 -> pump is no t working
Swap adjacent  : not                 -> pump is nto working
Omit letter    : not                 -> pump is no working
Double letter  : not                 -> pump is nott working
Adjacent key   : not                 -> pump is noy working
Adjacent add   : not                 -> pump is noit working
Homophone      : not                 -> pump is knot working


In [10]:
# Introduce different typos in a sentence (default probability=0.05)
# There is a chance no typos are introduced
def rule_introduce_typos(sentence, chance=0.05, max_typos=3):
    """ Apply rule-based typos to a sentence.

    First, with probability ``chance``, one space is removed from the
    sentence. Then up to ``max_typos`` distinct words are sampled, and each
    sampled word independently receives one word-level typo with probability
    ``chance``. The typo function is drawn using the relative weights below.
    The words are re-joined with single spaces.
    """
    word_typos = [add_space, swap_adjacent, omit_letter, double_letter, adjacent_key, adjacent_add, replace_homophone]
    word_typo_weights = [8, 16, 16, 17, 13, 16, 14]  # Relative weight of each typo function (sums to 100)

    # Sentence-level typo: drop a space
    sentence = omit_space(sentence) if random.random() < chance else sentence

    tokens = sentence.split()
    candidate_count = min(len(tokens), max_typos)
    for position in random.sample(range(len(tokens)), candidate_count):
        if random.random() < chance:
            (chosen_typo,) = random.choices(word_typos, weights=word_typo_weights, k=1)
            tokens[position] = chosen_typo(tokens[position])

    return ' '.join(tokens)

# Demonstrate rule_introduce_typos on a few sample sentences
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
    "Tele-Remote might have issues",
    "cracks in fire suppression mounts",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, used to align the arrows
row_template = "{:<{}} -> {}"

for sentence in sentences:
    typo_output = rule_introduce_typos(sentence)
    print(row_template.format(sentence, spacing, typo_output))

pump is not working                          -> pump is not working
blown o-ring on left hand lift cylinder      -> blown o-ring on left hand lifr cylinder
compressor oil pressure switch unserviceable -> compressor oil pressure switch unserviceable
Tele-Remote might have issues                -> Tele-Remote m ight have issues
cracks in fire suppression mounts            -> cracks in fire suppression mounts


## LLM for Introducing Typos

In [11]:
# Introduce typos in a sentence using an OpenAI chat model (default: gpt-4o-mini)
def llm_introduce_typos(openai, sentence, model="gpt-4o-mini", no_typo_chance=0.15):
    """Ask a chat model to add realistic typos to ``sentence``.

    Args:
        openai: Client object exposing ``chat.completions.create(...)``.
        sentence: Text to modify.
        model: Name of the chat model to query.
        no_typo_chance: Probability of returning the sentence untouched
            without calling the model.

    Returns:
        str: The model's reply with surrounding whitespace and one pair of
        matching surrounding quotes removed. The original sentence is
        returned when the model call is skipped or the reply has no text.
    """
    # Chance for no typos
    if random.random() < no_typo_chance:
        return sentence
    prompt = (
        "Introduce a few typos into the following sentence to make it look like it was written by a human. "
        "Use a mix of the following typo types, but avoid overdoing it. The typo types are:\n"
        "1. Missing space between words (e.g., air conditioner -> airconditioner)\n"
        "2. Additional space within words (e.g., permalube -> perma lube)\n"
        "3. Swapped adjacent characters (e.g., crack -> carck)\n"
        "4. Missing characters in a word (e.g., crack -> crak)\n"
        "5. Double-up characters in a word (e.g., crack -> craack)\n"
        "6. Incorrect character in a word (due to keys proximity) (e.g., crack -> xrack)\n"
        "7. Extra characters in a word (due to keys proximity) (e.g., crack -> cracvk)\n"
        "8. Incorrect spelling (homophones) (e.g., motor -> moter)\n\n"
        # The newline keeps the quoted sentence separate from the closing instruction
        f"Here is the sentence to modify: '{sentence}'\n"
        "Return the modified sentence and nothing else."
    )
    response = openai.chat.completions.create(
        model=model,
        messages=[{"role": "system", "content": "You are an expert in adding realistic typos to sentences."},
                  {"role": "user", "content": prompt}],
        temperature=0.9,
        top_p=0.9,
        n=1
    )
    content = response.choices[0].message.content
    if content is None:  # Reply carried no text
        return sentence
    content = content.strip()
    if not content:
        return sentence
    # The model sometimes echoes the quotes that wrap the sentence in the prompt
    if len(content) >= 2 and content[0] == content[-1] and content[0] in ("'", '"'):
        content = content[1:-1]
    return content

# Demonstrate llm_introduce_typos on a few sample sentences (calls the OpenAI API)
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
    "Tele-Remote might have issues",
    "cracks in fire suppression mounts",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, used to align the arrows
row_template = "{:<{}} -> {}"

for sentence in sentences:
    llm_output = llm_introduce_typos(client, sentence)
    print(row_template.format(sentence, spacing, llm_output))

pump is not working                          -> pump is not wrking
blown o-ring on left hand lift cylinder      -> blown o-ringon left hand lift cylnder
compressor oil pressure switch unserviceable -> compressor oill presure switch unserviceable
Tele-Remote might have issues                -> Tele-Remote might have issues
cracks in fire suppression mounts            -> craks in fire supression mounts


## Humanise Sentence Process
- `humanise_sentence` function takes a sentence and introduces contractions, jargons, and typos to make it more human-like
1. Introduce contractions
2. Introduce jargons
3. Introduce typos (rule-based or LLM)

In [12]:
# Humanise a MWO sentence
def humanise_sentence(sentence, llm=False):
    """ Run a sentence through contractions, abbreviations and typos, in that order.

    When ``llm`` is truthy the typo step uses llm_introduce_typos with the
    module-level ``client``; otherwise it uses rule_introduce_typos.
    """
    with_contractions = introduce_contractions(sentence)
    with_jargon = introduce_abbreviations(with_contractions)
    if not llm:
        return rule_introduce_typos(with_jargon)
    return llm_introduce_typos(client, with_jargon)

# Demonstrate the humanise pipeline, comparing rule-based and LLM typos on the same abbreviated sentence
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
    "Tele-Remote might have issues",
    "cracks in fire suppression mounts",
]
spacing = max(map(len, sentences))  # Width of the longest sentence, used to align the arrows
comparison_templates = ("Rule typo : {:<{}} -> {}", "GPT typo  : {:<{}} -> {}")

for sentence in sentences:
    contracted_sentence = introduce_contractions(sentence)
    abbreviated_sentence = introduce_abbreviations(contracted_sentence)
    rule_typo_sentence = rule_introduce_typos(abbreviated_sentence)
    llm_typo_sentence = llm_introduce_typos(client, abbreviated_sentence)
    for template, typo_sentence in zip(comparison_templates, (rule_typo_sentence, llm_typo_sentence)):
        print(template.format(sentence, spacing, typo_sentence))

Rule typo : pump is not working                          -> pump isn't working
GPT typo  : pump is not working                          -> pump isn’t wrking
Rule typo : blown o-ring on left hand lift cylinder      -> blown o/ring on lh lift chamber
GPT typo  : blown o-ring on left hand lift cylinder      -> blown o/ring on lhfift chamber
Rule typo : compressor oil pressure switch unserviceable -> comressor oil pressure s/w u/s
GPT typo  : compressor oil pressure switch unserviceable -> compreesor oil pressur s/w u/s
Rule typo : Tele-Remote might have issues                -> Tele-Remote might have issues
GPT typo  : Tele-Remote might have issues                -> Tele-Remot might have isses
Rule typo : cracks in fire suppression mounts            -> cracks in fire suppression mounts
GPT typo  : cracks in fire suppression mounts            -> cracks in fire suppression mounts


## Humanise generated MWO sentences

In [13]:
# Read MWO sentences from text file
def read_sentences(file, limit=20):
    """Return a random sample of the sentences marked with '~' in a text file.

    Only lines starting with '~' are kept; each is stripped of surrounding
    whitespace and its first two characters are dropped (presumably the '~'
    marker plus one separator character -- confirm against the file format).
    Filtering happens before sampling, so up to ``limit`` sentences are
    returned whenever the file contains that many marked lines.

    Args:
        file: Path of the UTF-8 text file to read.
        limit: Maximum number of sentences to return; ``None`` returns all.

    Returns:
        list[str]: The sampled sentences in random order.
    """
    with open(file, 'r', encoding='utf-8') as f:
        sentences = [line.strip()[2:] for line in f if line.startswith('~')]
    random.shuffle(sentences)
    if limit is None:
        return sentences
    return sentences[:limit]
    
# Humanise a random sample of generated MWO sentences with the rule-based typo step
path_sentences = read_sentences('../Generate/mwo_sentences/after_log.txt')
# default=0 keeps the cell from raising ValueError when no sentences were read
spacing = max((len(s) for s in path_sentences), default=0)
for sentence in path_sentences:
    output = humanise_sentence(sentence, llm=False)
    print("{:<{}} -> {}".format(sentence, spacing, output))

filter is leaking                           -> filt is leaking
repair needed for walk motor brake hose     -> repair needed 4 walk motor brk hose
transmission hose shows weeping             -> tranny hose shows weeping
needs replacing engine air conditioner hose -> needs co eng air-cond hose
leaking brake filter hose                   -> leaking brake filter hose
heater hose needs a replacement             -> heater hose needs a replacement
engine air conditioner unserviceable        -> engine air conditioner unserviceable
leak fuel in condenser fan hose             -> leak fuel in condenser fan hose
clevis pin has no grease                    -> clevis pin has no grease
change out needed for drag roller           -> c/out needed 4 drag roller
replace auto-greaser door window            -> replace auto-greaser door window
replace brake hose leaking air              -> replace brk hose leaking air
battery needs replacement for dropped cell  -> battery needs repl for dropped cell
filter 