## Imports

In [1]:
import csv
import json
import os
import random
import warnings

from openai import OpenAI

from llm_humanise_path import get_all_paths, process_mwo_response, get_fewshot_message, process_single_response, initialise_prompts, get_prompt
from llm_paraphrase import paraphrase_MWO

  from tqdm.autonotebook import tqdm, trange


## Initialise

In [180]:
# SECURITY: the API key must never be committed to the notebook. It is read
# from the OPENAI_API_KEY environment variable instead; a missing variable
# fails fast with a KeyError naming it. Any key that was previously pasted
# into this cell must be treated as leaked and revoked.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# All valid paths; a single one is sampled from this list further down.
data = get_all_paths(valid=True)
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Load a two-column CSV dictionary {original: [variations]}
def load_dictionary(file):
    """Load a two-column CSV file into a ``{original: [variations]}`` dict.

    The first row is treated as a header and skipped. Every following row
    contributes ``row[1]`` to the list stored under ``row[0]``, so an
    original that appears on several rows collects several variations (in
    file order).

    Args:
        file: path of the CSV file to read.

    Returns:
        dict: keys sorted alphabetically, each mapped to its list of
        variations. An empty (or header-only) file yields ``{}``.
    """
    dictionary = {}
    # newline='' is what the csv module expects so that it can handle line
    # endings (including newlines embedded in quoted fields) itself.
    with open(file, 'r', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Ignore header; the default avoids StopIteration on an empty file
        for row in reader:
            # csv.reader yields [] for blank lines; rows without a second
            # column carry no variation, so both are skipped instead of
            # raising IndexError.
            if len(row) < 2:
                continue
            original, variation = row[0], row[1]
            dictionary.setdefault(original, []).append(variation)
    return dict(sorted(dictionary.items()))


54	paths in object_property_paths
567	paths in process_agent_paths
402	paths in process_patient_paths
1743	paths in state_patient_paths
7	paths in object_property_state_paths
0	paths in object_process_state_paths
202	paths in state_agent_activity_paths
44	paths in state_agent_patient_paths
188	paths in process_agent_patient_paths
Total number of paths: 3207


## Same prompt VS Variant prompts

In [3]:
# Seed the RNG so that, for the same `data`, the same path is picked every run.
random.seed(42)
# `path` is read as a module-level global by the prompt functions defined
# below (they use path['object_name'] and path['event_name']).
path = random.choice(data)
# Prompt variants and instructions; consumed by variant_prompt() below.
base_prompts, instructions = initialise_prompts(client, num_variants=5, num_examples=1)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [42]:
# Same prompt
def same_prompt():
    """Ask for 5 completions of one fixed prompt in a single API call.

    Uses the module-level ``client`` and ``path``. The prompt text is the
    same for every completion (``n=5``), so any variety in the answers can
    only come from sampling.

    Returns:
        list: one sentence per returned choice, each passed through
        ``process_single_response``.
    """
    base_prompt = "Generate a Maintenance Work Order (MWO) sentence describing the following equipment and undesirable event."
    instruction = "The sentence can have a maximum of 8 words."
    prompt = f"{base_prompt}\nEquipment: {path['object_name']}\nUndesirable Event: {path['event_name']}\n{instruction}"

    # Few-shot examples first, the actual request last.
    messages = get_fewshot_message([base_prompt], [instruction], 1)
    messages = messages + [{"role": "user", "content": prompt}]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.9,
        top_p=0.9,
        n=5,
    )

    return [process_single_response(choice.message.content)
            for choice in completion.choices]

In [43]:
# Variant prompts
def variant_prompt():
    """Make 5 separate API calls, each with a prompt built by ``get_prompt``.

    Uses the module-level ``client``, ``path``, ``base_prompts`` and
    ``instructions``. The few-shot messages are built once and shared by all
    five requests; only the final user prompt is rebuilt per request.

    Returns:
        list: one sentence per returned choice across all calls, each passed
        through ``process_single_response``.
    """
    fewshot = get_fewshot_message(base_prompts, instructions, 1)
    collected = []
    for _ in range(5):
        user_prompt = get_prompt(base_prompts, instructions, path['object_name'], path['event_name'])
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=fewshot + [{"role": "user", "content": user_prompt}],
            temperature=0.9,
            top_p=0.9,
            n=1,
        )
        collected.extend(
            process_single_response(choice.message.content)
            for choice in completion.choices
        )
    return collected

In [57]:
# Demo: one batch from each strategy, printed as bullet lists
same = same_prompt()
variant = variant_prompt()

for title, results in (("GPT given same prompt:", same),
                       ("GPT given variant prompts:", variant)):
    print(title)
    for result in results:
        print(f"- {result}")

GPT given same prompt:
- park brake alarm fault detected
- park brake alarm fault detected
- park brake alarm fault detected
- park brake alarm fault detected
- park brake alarm fault detected
GPT given variant prompts:
- park brake alarm fault detected
- park brake shows alarm fault
- park brake alarm fault detected
- park brake has alarm fault
- park brake alarm fault detected


In [45]:
# Compare diversity over long run: 20 rounds of each strategy, then count
# how many distinct sentences each one produced.
same, variant = [], []
for _ in range(20):
    same += same_prompt()
    variant += variant_prompt()
print(f"Number of unique responses for same prompt: {len(set(same))}")
print(f"Number of unique responses for variant prompts: {len(set(variant))}")

Number of unique responses for same prompt: 3
Number of unique responses for variant prompts: 20


## N completions VS N sentences in ONE completion

In [46]:
# N sentences in ONE completion
def n_sentences():
    """Ask for 5 different sentences inside a single completion.

    Uses the module-level ``client`` and ``path``. One completion is
    requested (``n=1``) and its text is split into sentences by
    ``process_mwo_response``.

    Returns:
        list: every sentence ``process_mwo_response`` extracted from the
        returned choice(s), in order.
    """
    base_prompt = "Generate 5 different Maintenance Work Order (MWO) sentences describing the following equipment and undesirable event."
    instruction = "Each sentence can have a maximum of 8 words."
    prompt = f"{base_prompt}\nEquipment: {path['object_name']}\nUndesirable Event: {path['event_name']}\n{instruction}"

    # Few-shot examples first, the actual request last.
    messages = get_fewshot_message([base_prompt], [instruction], 5)
    messages = messages + [{"role": "user", "content": prompt}]

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages,
        temperature=0.9,
        top_p=0.9,
        n=1,
    )

    extracted = []
    for choice in completion.choices:
        extracted.extend(process_mwo_response(choice.message.content))
    return extracted

In [47]:
# Demo: 5 completions of one prompt vs 5 sentences inside one completion
n_completion = same_prompt()
n_sentence = n_sentences()

for title, results in (("Output in N completions:", n_completion),
                       ("Output within 1 completion:", n_sentence)):
    print(title)
    for result in results:
        print(f"- {result}")

Output in N completions:
- park brake alarm fault detected
- park brake alarm fault detected
- park brake alarm fault detected
- park brake alarm fault detected
- park brake alarm fault detected
Output within 1 completion:
- park brake alarm fault detected
- park brake shows alarm fault
- alarm fault in park brake system
- park brake has alarm fault
- park brake is experiencing alarm fault


In [48]:
# Compare diversity over long run: 20 rounds of each strategy, then count
# how many distinct sentences each one produced.
n_completion, n_sentence = [], []
for _ in range(20):
    n_completion += same_prompt()
    n_sentence += n_sentences()
print(f"Number of unique responses for N completions: {len(set(n_completion))}")
print(f"Number of unique responses for ONE completion: {len(set(n_sentence))}")

Number of unique responses for N completions: 3
Number of unique responses for ONE completion: 22


## Paraphrase VS Generation

In [60]:
# Paraphrase
def paraphrase():
    """Generate one MWO sentence for ``path`` and paraphrase it.

    Uses the module-level ``client`` and ``path``. A single completion is
    requested (``n=1``); the resulting sentence is then handed to
    ``paraphrase_MWO`` together with the object and event names as keywords.

    Returns:
        list: at most 5 distinct sentences. The generated sentence comes
        first, followed by its paraphrases in the order ``paraphrase_MWO``
        returned them. Empty if the API returned no choices.
    """
    base_prompt = "Generate a Maintenance Work Order (MWO) sentence describing the following equipment and undesirable event."
    instruction = "The sentence can have a maximum of 8 words."
    prompt = f"{base_prompt}\nEquipment: {path['object_name']}\nUndesirable Event: {path['event_name']}\n{instruction}"
    fewshot = get_fewshot_message([base_prompt], [instruction], 1)
    message = fewshot + [{"role": "user", "content": prompt}]

    response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=message,
                    temperature=0.9,
                    top_p=0.9,
                    n=1
            )

    outputs = [process_single_response(choice.message.content)
               for choice in response.choices]
    if not outputs:
        # Nothing was generated, so there is nothing to paraphrase. (The
        # previous code hit an unbound `output` variable in this case.)
        return []

    keywords = [path['object_name'], path['event_name']]
    # Paraphrase the last generated sentence (the only one, since n=1).
    paraphrases = paraphrase_MWO(client, outputs[-1], keywords, 5)
    outputs.extend(paraphrases)
    # Remove duplicates while keeping first-seen order. A set would shuffle
    # the sentences differently on every run (string hashing is randomised),
    # so truncating to 5 afterwards would keep an arbitrary subset and could
    # even drop the generated sentence itself.
    outputs = list(dict.fromkeys(outputs))
    return outputs[:5] # Only return 5 outputs

In [61]:
# Demo: paraphrasing one generated sentence vs generating 5 sentences directly
paraphrases = paraphrase()
n_sentence = n_sentences()

for title, results in (("Output for Paraphrasing:", paraphrases),
                       ("Output for Generation:", n_sentence)):
    print(title)
    for result in results:
        print(f"- {result}")

Output for Paraphrasing:
- park brake alarm fault detected
- park brake has an alarm fault identified
- the park brake shows an alarm fault
- detected an alarm fault in the park brake
- an alarm fault in the park brake found
Output for Generation:
- park brake alarm fault detected
- alarm fault in park brake system
- park brake has alarm fault
- park brake system shows alarm fault
- alarm fault present in park brake


In [62]:
# Compare diversity over long run: 20 rounds of each strategy, then count
# how many distinct sentences each one produced.
paraphrases, n_sentence = [], []
for _ in range(20):
    paraphrases += paraphrase()
    n_sentence += n_sentences()
print(f"Number of unique responses for paraphrasing: {len(set(paraphrases))}")
print(f"Number of unique responses for generation: {len(set(n_sentence))}")

Number of unique responses for paraphrasing: 49
Number of unique responses for generation: 20


## Other Paraphraser Models

In [10]:
def model_paraphrase(tokenizer, model, sentences):
    """Run each sentence through a seq2seq model and collect 5 candidates.

    Args:
        tokenizer: callable tokenizer that also provides ``batch_decode``.
        model: model exposing ``generate``.
        sentences: iterable of input sentences.

    Returns:
        dict: ``{sentence: decoded outputs}``, in input order. A sentence
        that appears more than once keeps the result of its last occurrence.
    """
    def _candidates(text):
        # One sentence per batch; the input is truncated to max_length=8.
        # NOTE(review): max_length here presumably counts tokens, not words,
        # so longer sentences may be cut short - confirm this is intended.
        encoded = tokenizer([text], truncation=True, padding='longest', max_length=8, return_tensors="pt")
        # NOTE(review): no sampling flag is passed alongside temperature, so
        # with beam search it likely has no effect - confirm.
        generated = model.generate(**encoded, max_length=15, num_beams=5, num_return_sequences=5, temperature=1.5)
        return tokenizer.batch_decode(generated, skip_special_tokens=True)

    return {text: _candidates(text) for text in sentences}

In [17]:
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Load the three pretrained tokenizer/model pairs.
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
t5_tokenizer = T5Tokenizer.from_pretrained("t5-large")
t5_model = T5ForConditionalGeneration.from_pretrained("t5-large")
pegasus_tokenizer = PegasusTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
pegasus_model = PegasusForConditionalGeneration.from_pretrained("tuner007/pegasus_paraphrase")

# Five MWO-style sentences plus one everyday sentence for comparison.
sentences = [
    "park brake alarm fault detected",
    "alarm fault in park brake system",
    "park brake has alarm fault",
    "park brake system shows alarm fault",
    "alarm fault present in park brake",
    "The boy is walking happily on the street",
]

bart_outputs = model_paraphrase(bart_tokenizer, bart_model, sentences)
t5_outputs = model_paraphrase(t5_tokenizer, t5_model, sentences)
pegasus_outputs = model_paraphrase(pegasus_tokenizer, pegasus_model, sentences)

# Same report layout for every model: heading, then each input sentence
# followed by its candidates as a bullet list.
for heading, model_outputs in (("BART Outputs:", bart_outputs),
                               ("T5 Outputs:", t5_outputs),
                               ("Pegasus Outputs:", pegasus_outputs)):
    print(heading)
    for sentence, outputs in model_outputs.items():
        print(f"{sentence}:")
        for output in outputs:
            print(f"- {output}")

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at tuner007/pegasus_paraphrase and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BART Outputs:
park brake alarm fault detected:
- park brake alarm fault detected
- park brake fault detected
- park brake alarm fault fault detected
- park brake alarm fault found
- park brake alarm detected
alarm fault in park brake system:
- alarm fault in park brake
- alarm fault in park brakes
- alarm fault in park car
- alarm fault in park assist
- alarm fault in park system
park brake has alarm fault:
- park brake has alarm fault
- park brake has alarm problem
- park brake has alarm fault may cause
- park brake has alarm fault may be
park brake system shows alarm fault:
- park brake system shows alarm fault
- park brake system shows alarm fault in California
- park brake system shows alarm fault in Florida
- park brake system shows alarm fault in China
- park brake system shows alarm fault in car
alarm fault present in park brake:
- alarm may be present in park
- alarm fault present in South Korea
- alarm fault present in South Africa
- alarm fault present in US nuclear plant
- a

## Introduce Contractions

In [181]:
# Load contractions {expand: [contractions]}
# Module-level mapping read by introduce_contractions(); the path is relative
# to the notebook's working directory.
contractions = load_dictionary('data/Corrections/contractions.csv')

# Introduce contractions in a sentence
def introduce_contractions(sentence, chance=0.5):
    """Randomly swap expanded forms in ``sentence`` for a contraction.

    Walks the module-level ``contractions`` mapping. For every expanded form
    found in the sentence, a random draw below ``chance`` replaces all of its
    occurrences with one randomly chosen contraction.

    Args:
        sentence: text to modify.
        chance: probability of applying each matching entry.

    Returns:
        str: the sentence after all applied replacements.
    """
    # NOTE(review): matching is plain substring search, so an expanded form
    # can also match inside a longer word - confirm this is acceptable.
    for expanded, short_forms in contractions.items():
        if expanded in sentence:
            if random.random() < chance:
                replacement = random.choice(short_forms)
                sentence = sentence.replace(expanded, replacement)
    return sentence

In [166]:
sentences = [
    "pump is not working",
    "air horn does not work",
    "machine will not start",
]
# Width of the longest input, used to align the arrows in one column.
spacing = max(map(len, sentences))

for sentence in sentences:
    print(f"{sentence:<{spacing}} -> {introduce_contractions(sentence)}")

pump is not working    -> pump is not working
air horn does not work -> air horn doesn't work
machine will not start -> machine will not start


## Introduce Jargon

In [182]:
# Load abbreviations {original: [variations]}
# Module-level mapping read by introduce_abbreviations(); the path is relative
# to the notebook's working directory.
abbreviations = load_dictionary('data/Corrections/abbreviations.csv')

# Introduce abbreviations in a sentence
def introduce_abbreviations(sentence, chance=0.3):
    """Randomly swap terms in ``sentence`` for one of their abbreviations.

    Walks the module-level ``abbreviations`` mapping. For every original term
    found in the sentence, a random draw below ``chance`` replaces all of its
    occurrences with one randomly chosen variation.

    Args:
        sentence: text to modify.
        chance: probability of applying each matching entry.

    Returns:
        str: the sentence after all applied replacements.
    """
    # NOTE(review): matching is plain substring search, so a term can also
    # match inside a longer word - confirm this is acceptable.
    for term, variations in abbreviations.items():
        if term in sentence:
            if random.random() < chance:
                replacement = random.choice(variations)
                sentence = sentence.replace(term, replacement)
    return sentence

In [196]:
sentences = [
    "pump is not working",
    "blown o-ring on left hand lift cylinder",
    "compressor oil pressure switch unserviceable",
]
# Width of the longest input, used to align the arrows in one column.
spacing = max(map(len, sentences))

for sentence in sentences:
    print(f"{sentence:<{spacing}} -> {introduce_abbreviations(sentence)}")

pump is not working                          -> pmp is not working
blown o-ring on left hand lift cylinder      -> blown 0-ring on l/ hand lift cylinder
compressor oil pressure switch unserviceable -> compress oil press switch unserviceable


## Introduce Typos

In [16]:
def introduce_typos(openai, sentence):
    """Ask the chat model to rewrite ``sentence`` with at least one typo.

    Args:
        openai: client object exposing ``chat.completions.create``.
        sentence: text to corrupt.

    Returns:
        The message content of the last choice in the response (a single
        completion is requested with ``n=1``).
    """
    system_message = {"role": "system", "content": "You make spelling mistakes in the given sentences."}
    user_message = {
        "role": "user",
        "content": f"Introduce at least one typo in the following sentence: {sentence}",
    }

    completion = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0.9,
        top_p=0.9,
        n=1,
    )

    # Keep the content of the last choice; with n=1 that is the only one.
    for choice in completion.choices:
        with_typos = choice.message.content
    return with_typos

# Run the typo generator over whatever `sentences` currently holds.
# NOTE(review): this cell does not define `sentences` itself. The output saved
# below shows park-brake sentences, while the nearest assignment above is the
# abbreviation demo list, so the cell appears to have been run out of order -
# confirm and define the intended inputs here.
for sentence in sentences:
    out = introduce_typos(client, sentence)
    print(out)

park brake alram fault detected
alrm fault in park brake system
park brake has alrm fault
park brake sistem shows alarm fault
alrm fault present in park brake
