## Import packages

In [1]:

import os
import re
import sys
import random
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from scipy.stats import chi2_contingency

# NOTE: "__file__" is a string literal, not the __file__ variable (which a
# notebook kernel does not define). abspath() resolves the literal against the
# current working directory, so dirname() yields the working directory itself.
current_dir = os.path.dirname(os.path.abspath("__file__"))
# Parent of the working directory, appended to sys.path before the `humanise`
# import below (presumably where that module lives - confirm against repo layout).
main_dir = os.path.join(current_dir, '..')
sys.path.append(main_dir)

from humanise import humanise_sentence, initialise_globals

# Must run before humanise_sentence() is used; what it loads is defined in humanise.
initialise_globals(main_dir)
# Text files kept next to the notebook, one sentence per line.
HUMAN_DATAPATH = os.path.join(current_dir, 'human.txt')
GENERATE_DATAPATH = os.path.join(current_dir, 'synthetic_generate.txt')
HUMANISE_DATAPATH = os.path.join(current_dir, 'synthetic_humanise.txt')

# Load environment variables (load_dotenv reads a .env file, if one is found)
load_dotenv()
# api_key is None when API_KEY is not set in the environment.
api_key = os.getenv("API_KEY")
# NOTE(review): `client` is not referenced by any other cell visible in this notebook.
client = OpenAI(api_key=api_key)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


## Create Turing Test data

In [2]:
# Function to load human data from MaintNorm dataset
def load_maintnorm_sentences(file_path):
    """Rebuild sentences from a tab-separated, one-token-per-line file.

    Each non-blank line is split on tabs; lines with fewer than two fields are
    ignored. The first field is the "dirty" token and the second the "clean"
    token. A blank line ends the current sentence.

    The clean token is always kept. The dirty token is skipped when its clean
    counterpart is ``<id>`` or ``-``. All tokens are lower-cased and joined
    with single spaces.

    A sentence is emitted only when both its dirty and clean sides contain at
    least one token. Token buffers are cleared at every sentence boundary, and
    the final sentence is emitted even if the file has no trailing blank line.

    Args:
        file_path: Path to the UTF-8 encoded file.

    Returns:
        Tuple ``(dirty_sentences, clean_sentences)`` of equally long lists of
        strings, index-aligned.
    """
    dirty_sentences = []
    clean_sentences = []
    current_dirty = []
    current_clean = []

    def _flush():
        # Emit the buffered sentence only if both sides are non-empty ...
        if current_dirty and current_clean:
            dirty_sentences.append(' '.join(current_dirty))
            clean_sentences.append(' '.join(current_clean))
        # ... but always reset, so tokens of a discarded sentence cannot
        # leak into the next one.
        current_dirty.clear()
        current_clean.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:  # Empty line means a new sentence
                _flush()
                continue
            parts = line.split('\t')
            if len(parts) > 1:
                dirty, clean = parts[0], parts[1]
                if clean not in ('<id>', '-'):
                    current_dirty.append(dirty.lower())
                current_clean.append(clean.lower())

    # The last sentence has no terminating blank line if the file ends abruptly.
    _flush()
    return dirty_sentences, clean_sentences

# Save all human data to text file
train_dirty, train_clean = load_maintnorm_sentences('../data/MaintNorm/train.norm')
test_dirty, test_clean = load_maintnorm_sentences('../data/MaintNorm/test.norm')
val_dirty, val_clean = load_maintnorm_sentences('../data/MaintNorm/val.norm')
full_dirty = train_dirty + test_dirty + val_dirty
# Remove duplicates while keeping first-seen order. set() ordering of strings
# changes between interpreter runs, which would make human.txt (and anything
# sampled from it) differ on every execution.
human_data = list(dict.fromkeys(full_dirty))
# Write explicitly as UTF-8: the file is read back with encoding='utf-8', and
# the platform default encoding is not UTF-8 everywhere (e.g. Windows).
with open(HUMAN_DATAPATH, 'w', encoding='utf-8') as f:
    for item in human_data:
        f.write("%s\n" % item)

# Function to load human sentences or synthetic sentences
def load_sentences(file_path):
    """Return the lines of a UTF-8 text file, each stripped of surrounding whitespace.

    Blank lines are kept and come back as empty strings.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = list(map(str.strip, handle))
    return stripped_lines

# Humanise generated synthetic data
synthetic_data = load_sentences(GENERATE_DATAPATH)
humanise_data = [humanise_sentence(s) for s in synthetic_data]
# Write explicitly as UTF-8: this file is read back with encoding='utf-8', and
# the platform default encoding is not UTF-8 everywhere (e.g. Windows).
with open(HUMANISE_DATAPATH, 'w', encoding='utf-8') as f:
    for item in humanise_data:
        f.write("%s\n" % item)

In [8]:
# Fixed seed so that re-running this cell regenerates the same target.csv and
# turing.csv. Evaluator answer files are only meaningful against the exact
# sample they were produced for, and this cell overwrites both CSVs.
TURING_SEED = 42
turing_rng = random.Random(TURING_SEED)  # private RNG: leaves global `random` state untouched

# Random 50 human data sentences
human_data = load_sentences(HUMAN_DATAPATH)
human_50 = pd.DataFrame(turing_rng.sample(human_data, 50), columns=['sentence'])
human_50['label'] = 'h'

# Random 50 synthetic data sentences
synthetic_data = load_sentences(HUMANISE_DATAPATH)
synthetic_50 = pd.DataFrame(turing_rng.sample(synthetic_data, 50), columns=['sentence'])
synthetic_50['label'] = 's'

# Combine and shuffle human and synthetic data
turing_data = pd.concat([human_50, synthetic_50])
turing_data = turing_data.sample(frac=1, random_state=TURING_SEED).reset_index(drop=True)

# target.csv keeps the ground-truth label; turing.csv is the unlabelled copy
turing_data.to_csv('target.csv', index=False)
turing_data.drop(columns=['label']).to_csv('turing.csv', index=False)

## Evaluate human predictions on Turing Test data

In [10]:
def evaluate_turing(target_file, turing_file):
    """Print how well one set of predictions matches the ground-truth labels.

    Both CSV files must have a ``sentence`` and a ``label`` column, with labels
    ``'h'`` (human) or ``'s'`` (synthetic). Rows are paired by sentence text;
    when a sentence occurs more than once, its k-th occurrence in one file is
    paired with its k-th occurrence in the other, so repeated sentences do not
    multiply into a cross product and inflate the counts.

    Printed sections: label frequencies of the prediction file, the 2x2
    confusion matrix, a chi-square test of independence on that matrix, and
    accuracy. If the chi-square test cannot be computed (SciPy raises
    ``ValueError`` when an expected frequency is zero, e.g. every prediction
    is the same label) the reason is printed instead of aborting.

    Args:
        target_file: CSV with the true labels.
        turing_file: CSV with the predicted labels.

    Returns:
        None. Results are printed.
    """
    target = pd.read_csv(target_file)
    turing = pd.read_csv(turing_file)

    # Occurrence counter per sentence -> unique (sentence, occurrence) join key
    target_keyed = target.assign(
        _occurrence=target.groupby('sentence', dropna=False).cumcount())
    turing_keyed = turing.assign(
        _occurrence=turing.groupby('sentence', dropna=False).cumcount())
    # label_x = true label (target), label_y = predicted label (turing)
    combine = pd.merge(target_keyed, turing_keyed, on=['sentence', '_occurrence'])

    # Counts ("positive" = human)
    tp = ((combine['label_x'] == 'h') & (combine['label_y'] == 'h')).sum()
    tn = ((combine['label_x'] == 's') & (combine['label_y'] == 's')).sum()
    fp = ((combine['label_x'] == 's') & (combine['label_y'] == 'h')).sum()
    fn = ((combine['label_x'] == 'h') & (combine['label_y'] == 's')).sum()

    # Contingency table: rows = predicted, columns = actual
    contingency_table = [[tp, fp], [fn, tn]]

    # Chi-square test
    try:
        res = chi2_contingency(contingency_table)
        chi2_error = None
    except ValueError as err:
        res = None
        chi2_error = err

    # Confusion matrix
    column_names = ['Actual Human', 'Actual Synthetic']
    index_names = ['Predicted Human', 'Predicted Synthetic']
    confusion_matrix = pd.DataFrame(contingency_table, columns=column_names, index=index_names)

    # Accuracy (NaN when no row carried a recognised label pair)
    total = tp + tn + fp + fn
    accuracy = (tp + tn) / total if total else float('nan')

    # Print results
    print('-------------------------------- Frequency of Labels')
    print(turing['label'].value_counts().to_string())
    
    print('---------------------------------- Confusion Matrix')
    print(confusion_matrix)
    
    print('----------------------------------- Chi-Square Test')
    if res is None:
        print(f'Not computed        : {chi2_error}')
    else:
        print(f'Chi-square          : {res.statistic:.4f}')
        print(f'p-value             : {res.pvalue:.4f}')
        print(f'Degrees of freedom  : {res.dof}')
        print('Expected frequencies:')
        print(res.expected_freq)
    
    print('--------------------------------------- Performance')
    print(f'Accuracy            : {accuracy:.4f}')

In [11]:
def filter_predictions(target_file, turing_file):
    """Print the sentences one set of predictions got wrong.

    Both CSV files must have a ``sentence`` and a ``label`` column, with labels
    ``'h'`` (human) or ``'s'`` (synthetic). Rows are paired by sentence text;
    when a sentence occurs more than once, its k-th occurrence in one file is
    paired with its k-th occurrence in the other, so repeated sentences are
    not cross-joined and reported as spurious errors.

    Args:
        target_file: CSV with the true labels.
        turing_file: CSV with the predicted labels.

    Returns:
        None. Two lists are printed: human sentences predicted synthetic, then
        synthetic sentences predicted human.
    """
    target = pd.read_csv(target_file)
    turing = pd.read_csv(turing_file)

    # Occurrence counter per sentence -> unique (sentence, occurrence) join key
    target_keyed = target.assign(
        _occurrence=target.groupby('sentence', dropna=False).cumcount())
    turing_keyed = turing.assign(
        _occurrence=turing.groupby('sentence', dropna=False).cumcount())
    # label_x = true label (target), label_y = predicted label (turing)
    combine = pd.merge(target_keyed, turing_keyed, on=['sentence', '_occurrence'])

    # Actually human, predicted synthetic
    false_synthetic = combine.loc[
        (combine['label_x'] == 'h') & (combine['label_y'] == 's'), 'sentence']
    # Actually synthetic, predicted human
    false_human = combine.loc[
        (combine['label_x'] == 's') & (combine['label_y'] == 'h'), 'sentence']
    
    print('----------------------------------- False Synthetic')
    for s in false_synthetic:
        print(s)
    print('--------------------------------------- False Human')
    for s in false_human:
        print(s)

def find_common(target_file, turing_files):
    """Print the sentences on which every evaluator gave the same label.

    ``target_file`` is read as the ground truth (first column: sentence text,
    plus a ``label`` column holding ``'h'`` or ``'s'``). The ``label`` column of
    each file in ``turing_files`` is attached as one extra column, aligned to
    the target by row position, so each prediction file must list the
    sentences in the same order as the target.

    The column is named after the evaluator, taken from a ``..._<name>_v<N>.csv``
    file name. File names that do not follow that pattern fall back to the
    file's base name without extension instead of failing.

    Four groups are printed, in this order: unanimous human on a synthetic
    sentence, unanimous synthetic on a human sentence, unanimous synthetic on
    a synthetic sentence, unanimous human on a human sentence.

    Args:
        target_file: CSV with the true labels.
        turing_files: Iterable of CSV paths, one per evaluator.

    Returns:
        None. Results are printed.
    """
    # Regex pattern extract name from filename
    pattern = r'(?<=_)[^_]+(?=_v\d+\.csv)'
    df = pd.read_csv(target_file)
    evaluator_columns = []
    for file in turing_files:
        match = re.search(pattern, file)
        if match:
            name = match.group()
        else:
            # No "_<name>_v<N>.csv" suffix: use the bare file name as the label
            name = os.path.splitext(os.path.basename(file))[0]
        df[name] = pd.read_csv(file)['label']
        if name not in evaluator_columns:
            evaluator_columns.append(name)

    # Select evaluator columns by name rather than by position, so extra
    # columns in the target file are never mistaken for predictions.
    predictions = df[evaluator_columns]
    all_human = predictions.eq('h').all(axis=1)
    all_synthetic = predictions.eq('s').all(axis=1)
    actual_human = df['label'] == 'h'
    actual_synthetic = df['label'] == 's'
    sentences = df.iloc[:, 0]

    sections = [
        # Everyone predicted human and actually synthetic
        ('--------------------- Common Predicted Human Actually Synthetic',
         actual_synthetic & all_human),
        # Everyone predicted synthetic and actually human
        ('--------------------- Common Predicted Synthetic Actually Human',
         actual_human & all_synthetic),
        # Everyone predicted synthetic and actually synthetic
        ('----------------- Common Predicted Synthetic Actually Synthetic',
         actual_synthetic & all_synthetic),
        # Everyone predicted human and actually human
        ('------------------------- Common Predicted Human Actually Human',
         actual_human & all_human),
    ]
    for header, mask in sections:
        print(header)
        for sentence in sentences[mask]:
            print(sentence)

## Evaluate individual predictions on Turing Test data

In [33]:
# Version 1, answer file "jf": print the summary statistics, then list the
# sentences whose predicted label differs from target.csv.
evaluate_turing('target.csv', 'Version1/turing_jf_v1.csv')
filter_predictions('target.csv', 'Version1/turing_jf_v1.csv')

-------------------------------- Frequency of Labels
label
h    54
s    45
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                29                25
Predicted Synthetic            20                25
----------------------------------- Chi-Square Test
Chi-square          : 0.5122
p-value             : 0.4742
Degrees of freedom  : 1
Expected frequencies:
[[26.72727273 27.27272727]
 [22.27272727 22.72727273]]
--------------------------------------- Performance
Accuracy            : 0.5455
----------------------------------- False Synthetic
c/o rear a/con condensor fans
rhs lower camframe inspect post shut
replace leaky lh tilt cyl hose
change out l/h lift cyl
l/h side stick cylinder leak
replace l/h front 1/4 window
arm rest u/s
no1 hoist generator flashed over
fw9593-fire suppression inspection
qr0218c/o rhbucket cylinder stauff clam
127 hour prevent maintenance tbc
648 hr preventative maintenance
c/o cab

In [34]:
# Version 1, answer file "cg": print the summary statistics, then list the
# sentences whose predicted label differs from target.csv.
evaluate_turing('target.csv', 'Version1/turing_cg_v1.csv')
filter_predictions('target.csv', 'Version1/turing_cg_v1.csv')

-------------------------------- Frequency of Labels
label
h    56
s    44
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                47                 9
Predicted Synthetic             3                41
----------------------------------- Chi-Square Test
Chi-square          : 55.5601
p-value             : 0.0000
Degrees of freedom  : 1
Expected frequencies:
[[28. 28.]
 [22. 22.]]
--------------------------------------- Performance
Accuracy            : 0.8800
----------------------------------- False Synthetic
c/o rear a/con condensor fans
over on hoist pinion
precleaner bowls on air con damaged
--------------------------------------- False Human
replace leaking lube pump
leak in air aircon hose
flt found in swg pump mtr lube
d/l air a/c fan speed resistor not working
hmu leaking hydraulic fluid
replace leaking air aircon hose
machine isn't starting
leak in air conditioner compress safety v/v
replace leakin

In [35]:
# Version 1, answer file "ms": print the summary statistics, then list the
# sentences whose predicted label differs from target.csv.
evaluate_turing('target.csv', 'Version1/turing_ms_v1.csv')
filter_predictions('target.csv', 'Version1/turing_ms_v1.csv')

-------------------------------- Frequency of Labels
label
h    65
s    35
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                40                25
Predicted Synthetic            10                25
----------------------------------- Chi-Square Test
Chi-square          : 8.6154
p-value             : 0.0033
Degrees of freedom  : 1
Expected frequencies:
[[32.5 32.5]
 [17.5 17.5]]
--------------------------------------- Performance
Accuracy            : 0.6500
----------------------------------- False Synthetic
rhs lower camframe inspect post shut
over on hoist pinion
replace pos 8 wheel end po
692 cables removed
replace faulty brake sensor1 task
cw coolant leak from #15 cylind
replace l/h front 1/4 window
oil leak near alt
fab rag bins
pcr room over alarm
--------------------------------------- False Human
hst drag brk filter shows signs of leaking
air cond thermostat not functioning properly
replace lea

In [36]:
# Version 1, answer file "mh": print the summary statistics, then list the
# sentences whose predicted label differs from target.csv.
evaluate_turing('target.csv', 'Version1/turing_mh_v1.csv')
filter_predictions('target.csv', 'Version1/turing_mh_v1.csv')

-------------------------------- Frequency of Labels
label
s    60
h    40
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                32                 8
Predicted Synthetic            18                42
----------------------------------- Chi-Square Test
Chi-square          : 22.0417
p-value             : 0.0000
Degrees of freedom  : 1
Expected frequencies:
[[20. 20.]
 [30. 30.]]
--------------------------------------- Performance
Accuracy            : 0.7400
----------------------------------- False Synthetic
rhs lower camframe inspect post shut
over on hoist pinion
check over issue with a/c system
replace pos 8 wheel end po
replace faulty brake sensor1 task
cw coolant leak from #15 cylind
replace drag ropes on d/line
hoist pony drive motor stopped working
intake fans 8 & 5 indication not working
repair damaged pos 6 guard
inspect hyd leak around articulation
qr0218c/o rhbucket cylinder stauff clam
127 hou

In [37]:
# Version 1, answer file "cw": print the summary statistics, then list the
# sentences whose predicted label differs from target.csv.
evaluate_turing('target.csv', 'Version1/turing_cw_v1.csv')
filter_predictions('target.csv', 'Version1/turing_cw_v1.csv')

-------------------------------- Frequency of Labels
label
h    52
s    48
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                27                25
Predicted Synthetic            23                25
----------------------------------- Chi-Square Test
Chi-square          : 0.0401
p-value             : 0.8414
Degrees of freedom  : 1
Expected frequencies:
[[26. 26.]
 [24. 24.]]
--------------------------------------- Performance
Accuracy            : 0.5200
----------------------------------- False Synthetic
c/o rear a/con condensor fans
check over issue with a/c system
replace pos 8 wheel end po
692 cables removed
replace faulty brake sensor1 task
cw coolant leak from #15 cylind
rep steering fault & ride control
replace l/h front 1/4 window
arm rest u/s
intake fans 8 & 5 indication not working
rh front light blown
no1 hoist generator flashed over
inspect hyd leak around articulation
stauff clamp missing b

## Analyse common predictions on Turing Test data

In [23]:
# Answer files compared for unanimous predictions. File order only affects the
# order of the added columns, not the printed result.
# NOTE(review): the "jf" and "cg" Version 1 files evaluated in earlier cells
# are not listed here - confirm that this is intentional.
evaluators = [
    'Version1/turing_ms_v1.csv',
    'Version1/turing_mh_v1.csv',
    'Version1/turing_cw_v1.csv'
]

# Print the sentences on which all listed evaluators gave the same label
find_common('target.csv', evaluators)

--------------------- Common Predicted Human Actually Synthetic
glt detected in brake
leak detected in axle oil cool hose
chip inicators are leaking
hose is leaking
pump motor hose requires rep
applied grease 2 engine air cond hose leak
fan pump hose needs extending
cabin air aircon drive has oil leak
greease required for roofheater hose
--------------------- Common Predicted Synthetic Actually Human
crestkin to repair bucket roll
replace top cab mounts
replace rh camshaft
front steer pins and bush worn tbc
dragline fan testing lobb
platform support cracks above rhs crane
cm inspection excavator 1500 hr
steering filter plate cracked
----------------- Common Predicted Synthetic Actually Synthetic
brake pmp bolt needs to b e replaced
swing brk cylinder is unserviceable
unserviceable contamination swth found
hydoil oil leak @ pipe cpl
fitting required 4 boom cyl hose
hst motor hose rrquires grease& rep
coolant pipe clamps need replacing
buc cylinder hose has air leak
oil leaking from pede

## Version 2 Evaluation

In [5]:
# Version 2, answer file "cg": scored against the Version 2 ground truth
# (Version2/target.csv), then the misclassified sentences are listed.
evaluate_turing('Version2/target.csv', 'Version2/turing_cg_v2.csv')
filter_predictions('Version2/target.csv', 'Version2/turing_cg_v2.csv')

-------------------------------- Frequency of Labels
label
s    60
h    40
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                30                10
Predicted Synthetic            20                40
----------------------------------- Chi-Square Test
Chi-square          : 15.0417
p-value             : 0.0001
Degrees of freedom  : 1
Expected frequencies:
[[20. 20.]
 [30. 30.]]
--------------------------------------- Performance
Accuracy            : 0.7000
----------------------------------- False Synthetic
repair faulty e circuit
tooth metric camera not workingtw
despragg drag ropes
investigate noise from pos 0
c/out pos 1 tyre
replace both front headlight assemblies
open electrical room for vistacam downloads
has a big delay in bucket crowd
supply cmqs get sled
replace transmission cracked
repair gearshift fault not selecting
ripper boot fell off
fire alarm drag walk cubicle
c/o u/s quick release valve

In [12]:
# Answer file "turing_cg_test.csv" scored against the top-level target.csv.
# NOTE(review): this cell sits under the "Version 2 Evaluation" heading but
# reads target.csv rather than Version2/target.csv - confirm which ground
# truth this answer file belongs to.
evaluate_turing('target.csv', 'turing_cg_test.csv')
filter_predictions('target.csv', 'turing_cg_test.csv')

-------------------------------- Frequency of Labels
label
s    51
h    49
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                40                 9
Predicted Synthetic            10                41
----------------------------------- Chi-Square Test
Chi-square          : 36.0144
p-value             : 0.0000
Degrees of freedom  : 1
Expected frequencies:
[[24.5 24.5]
 [25.5 25.5]]
--------------------------------------- Performance
Accuracy            : 0.8100
----------------------------------- False Synthetic
replace u/s remote controller batt char
ndt propel final gear rhs mt
unable to select gears
cable work repower from east sub
loose swing pinion bolts swing no1
propel motor/blower lube inspect
boom point sheave bumper rubber cracked
repair house light cab side middle
electric horn on dash not working
oga tyre rep
--------------------------------------- False Human
fuel leak in heater hose
coolant 