## Import packages

In [13]:
import os
import re
import sys
import math
import random
import krippendorff
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv
from scipy.stats import chi2_contingency, ttest_1samp

# "__file__" is passed as a string literal, not the __file__ variable, so
# abspath() resolves it against the current working directory and current_dir
# ends up being that directory (presumably because __file__ is not defined
# when this runs as a notebook cell).
current_dir = os.path.dirname(os.path.abspath("__file__"))
# Parent of current_dir; appended to sys.path before the humanise import below
main_dir = os.path.join(current_dir, '..')
sys.path.append(main_dir)

# Must come after the sys.path change above
# NOTE(review): presumably humanise lives in main_dir — confirm
from humanise import humanise_sentence, initialise_globals

# NOTE(review): initialise_globals is defined in humanise; what it sets up from
# main_dir is not visible here — confirm
initialise_globals(main_dir)
# Data files used by the cells below, all located next to current_dir
HUMAN_DATAPATH = os.path.join(current_dir, 'human.txt')
GENERATE_DATAPATH = os.path.join(current_dir, 'synthetic_generate_v3.txt')
HUMANISE_DATAPATH = os.path.join(current_dir, 'synthetic_humanise_v3.txt')

# Load environment variables (load_dotenv) and build the OpenAI client from API_KEY
load_dotenv()
api_key = os.getenv("API_KEY")
client = OpenAI(api_key=api_key)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


## Prepare Turing Test data

In [2]:
# Function to load human data from MaintNorm dataset
def load_maintnorm_sentences(file_path):
    """Read a MaintNorm ``.norm`` file into parallel dirty/clean sentence lists.

    The file is read as UTF-8. Each non-blank line is split on tabs; lines
    with fewer than two fields are ignored. The first field is the dirty
    token and the second the clean token. A blank line ends the current
    sentence.

    Args:
        file_path: Path of the ``.norm`` file to read.

    Returns:
        A ``(dirty_sentences, clean_sentences)`` tuple of equal-length lists.
        Tokens are lower-cased and joined with single spaces. Dirty tokens
        whose clean counterpart is ``<id>`` or ``-`` are left out of the dirty
        sentence; the clean token itself is still kept. A sentence is only
        emitted when both its dirty and clean sides have at least one token.
    """
    dirty_sentences = []
    clean_sentences = []
    current_dirty = []
    current_clean = []

    def flush():
        # Emit the buffered sentence if both sides have content, then always
        # reset, so tokens of a skipped sentence cannot leak into the next one.
        if current_dirty and current_clean:
            dirty_sentences.append(' '.join(current_dirty))
            clean_sentences.append(' '.join(current_clean))
        current_dirty.clear()
        current_clean.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:  # Empty line means a new sentence
                flush()
                continue
            parts = line.split('\t')
            if len(parts) > 1:
                dirty, clean = parts[0], parts[1]
                if clean not in ('<id>', '-'):
                    current_dirty.append(dirty.lower())
                current_clean.append(clean.lower())

    # The file may not end with a blank line; keep its final sentence as well
    flush()
    return dirty_sentences, clean_sentences

# Save all human data to text file
def save_human_data():
    """Write every distinct dirty MaintNorm sentence to ``HUMAN_DATAPATH``.

    Dirty sentences from the train, test and validation ``.norm`` files are
    concatenated in that order and duplicates are removed, keeping the first
    occurrence so the output order is the same on every run. One sentence is
    written per line, encoded as UTF-8 to match ``load_sentences``, which
    reads the file back as UTF-8.
    """
    norm_files = (
        '../data/MaintNorm/train.norm',
        '../data/MaintNorm/test.norm',
        '../data/MaintNorm/val.norm',
    )
    full_dirty = []
    for norm_file in norm_files:
        dirty, _ = load_maintnorm_sentences(norm_file)
        full_dirty.extend(dirty)

    # dict.fromkeys removes duplicates while preserving first-seen order
    human_data = list(dict.fromkeys(full_dirty))
    with open(HUMAN_DATAPATH, 'w', encoding='utf-8') as f:
        f.writelines(f'{item}\n' for item in human_data)

# Function to load human sentences or synthetic sentences
def load_sentences(file_path):
    """Return the lines of a UTF-8 text file with surrounding whitespace removed.

    Every line of the file yields one list entry, including blank lines
    (which become empty strings).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = list(map(str.strip, handle))
    return stripped_lines

# Function to sample by sections
def synthetic_sample(data, num_samples, num_sections):
    """Draw ``num_samples`` items from ``data``, spread across equal sections.

    ``data`` is cut into ``num_sections`` consecutive sections of
    ``len(data) // num_sections`` items; the leftover items at the end are
    handed out one each to the first sections. ``ceil(num_samples /
    num_sections)`` items are drawn without replacement from every section,
    and the pooled draws are then sampled down to exactly ``num_samples``.

    Args:
        data: Sequence of items to sample from; it is not modified.
        num_samples: Number of items to return.
        num_sections: Number of consecutive sections to sample from.

    Returns:
        A list of ``num_samples`` items in random order.

    Raises:
        ValueError: Raised by ``random.sample`` when a section holds fewer
            items than the per-section sample size.
    """
    section_size = len(data) // num_sections
    # list(...) copies each slice so the appends below never touch ``data``
    sections = [
        list(data[i * section_size:(i + 1) * section_size])
        for i in range(num_sections)
    ]
    # There are always fewer leftovers than sections, so zip pairs each
    # leftover with a distinct section
    leftovers = data[num_sections * section_size:]
    for section, extra in zip(sections, leftovers):
        section.append(extra)

    per_section = math.ceil(num_samples / num_sections)
    pooled = []
    for section in sections:
        pooled.extend(random.sample(section, per_section))

    # Rounding up can over-draw; trim to the requested size (this was
    # previously hard-coded to 50 regardless of num_samples)
    return random.sample(pooled, num_samples)

# Uncomment to save human data to text file
# save_human_data()

# Humanise generated synthetic data
synthetic_data = load_sentences(GENERATE_DATAPATH)
humanise_data = [humanise_sentence(s) for s in synthetic_data]

# Write as UTF-8: load_sentences() decodes this file as UTF-8 when it is read
# back, so relying on the platform default encoding could corrupt the text
with open(HUMANISE_DATAPATH, 'w', encoding='utf-8') as f:
    for item in humanise_data:
        f.write("%s\n" % item)

In [None]:
# Human half: 50 sentences drawn at random from the human file, tagged 'h'
human_data = load_sentences(HUMAN_DATAPATH)
human_50 = pd.DataFrame({'sentence': random.sample(human_data, 50)})
human_50['label'] = 'h'

# Synthetic half: 50 humanised sentences sampled across 8 sections, tagged 's'
synthetic_data = load_sentences(HUMANISE_DATAPATH)
synthetic_50 = pd.DataFrame({'sentence': synthetic_sample(synthetic_data, 50, 8)})
synthetic_50['label'] = 's'

# Stack both halves, then shuffle the rows and renumber the index
turing_data = pd.concat([human_50, synthetic_50])
turing_data = turing_data.sample(frac=1).reset_index(drop=True)

# target.csv keeps the labels; turing.csv is the same table without them
turing_data.to_csv('target.csv', index=False)
turing_data[['sentence']].to_csv('turing.csv', index=False)

## Evaluate annotators on Turing Test data

### Individual evaluation functions

In [45]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def evaluate_turing(target_file, turing_file, print_results=True):
    """Score one annotator's human/synthetic labels against the answer key.

    Both CSV files need ``sentence`` and ``label`` columns. Annotator labels
    are stripped and lower-cased, then joined to the target on ``sentence``.
    ``'h'`` (human) is treated as the positive class.

    Args:
        target_file: CSV with the true label of every sentence.
        turing_file: CSV with the annotator's label of every sentence.
        print_results: When true, print label frequencies, the confusion
            matrix, the chi-square test and the performance metrics.

    Returns:
        ``(accuracy, precision, recall, f1_score, human_percentage)``, where
        ``human_percentage`` is the share of rows in ``turing_file`` labelled
        ``'h'``. Any ratio with a zero denominator is reported as 0.0.
    """
    # Read files
    target = pd.read_csv(target_file)
    turing = pd.read_csv(turing_file)
    # Normalise case and stray whitespace so ' H' and 'h' count as the same label
    turing['label'] = turing['label'].str.strip().str.lower()
    # NOTE(review): an inner merge on 'sentence' multiplies rows when a sentence
    # occurs more than once in either file — confirm sentences are unique
    combine = pd.merge(target, turing, on='sentence')
    actual, predicted = combine['label_x'], combine['label_y']

    # Counts
    tp = ((actual == 'h') & (predicted == 'h')).sum()
    tn = ((actual == 's') & (predicted == 's')).sum()
    fp = ((actual == 's') & (predicted == 'h')).sum()
    fn = ((actual == 'h') & (predicted == 's')).sum()

    # Chi-square test; scipy raises ValueError when an expected frequency is
    # zero, e.g. when the annotator used only one of the two labels
    contingency_table = [[tp, fp], [fn, tn]]
    try:
        res = chi2_contingency(contingency_table)
    except ValueError:
        res = None

    # Confusion matrix
    column_names = ['Actual Human', 'Actual Synthetic']
    index_names = ['Predicted Human', 'Predicted Synthetic']
    confusion_matrix = pd.DataFrame(contingency_table, columns=column_names, index=index_names)

    # Accuracy, Precision, Recall, F1-score
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1_score = _safe_ratio(2 * (precision * recall), precision + recall)

    # Percentage of human and synthetic data; .get avoids a KeyError when a
    # label was never used
    label_counts = turing['label'].value_counts()
    num_human = label_counts.get('h', 0)
    num_synthetic = label_counts.get('s', 0)
    human_percentage = _safe_ratio(num_human, len(turing))
    synthetic_percentage = _safe_ratio(num_synthetic, len(turing))

    # Print results
    if print_results:
        print('-------------------------------- Frequency of Labels')
        print(f'Labelled human      : {num_human} ({human_percentage:.2f})')
        print(f'Labelled synthetic  : {num_synthetic} ({synthetic_percentage:.2f})')

        print('---------------------------------- Confusion Matrix')
        print(confusion_matrix)

        print('----------------------------------- Chi-Square Test')
        if res is None:
            print('Not computable (an expected frequency is zero)')
        else:
            print(f'Chi-square          : {res.statistic:.4f}')
            print(f'p-value             : {res.pvalue:.4f}')
            print(f'Degrees of freedom  : {res.dof}')
            print('Expected frequencies:')
            print(res.expected_freq)

        print('--------------------------------------- Performance')
        print(f'Accuracy            : {accuracy:.2f}')
        print(f'Precision           : {precision:.2f}')
        print(f'Recall              : {recall:.2f}')
        print(f'F1-score            : {f1_score:.2f}')

    return (accuracy, precision, recall, f1_score, human_percentage)

def filter_predictions(target_file, turing_file):
    """Print the sentences one annotator misclassified, grouped by error type.

    The annotator's labels are lower-cased and joined to the target labels on
    ``sentence``. Human sentences labelled synthetic are listed first, then
    synthetic sentences labelled human. Nothing is returned.
    """
    truth = pd.read_csv(target_file)
    guesses = pd.read_csv(turing_file)
    guesses['label'] = guesses['label'].str.lower()
    merged = truth.merge(guesses, on='sentence')

    # label_x is the true label, label_y the annotator's label
    actual, predicted = merged['label_x'], merged['label_y']
    error_groups = (
        # Actually human, predicted synthetic
        ('----------------------------------- False Synthetic',
         merged.loc[(actual == 'h') & (predicted == 's'), 'sentence']),
        # Actually synthetic, predicted human
        ('--------------------------------------- False Human',
         merged.loc[(actual == 's') & (predicted == 'h'), 'sentence']),
    )

    for header, sentences in error_groups:
        print(header)
        for sentence in sentences:
            print(sentence)

### Agreement evaluation functions

In [52]:
def average_performance(turing_files, accuracies, precisions, recalls, f1_scores, human_percentages):
    """Print and return the mean of each metric plus the annotator agreement.

    Each metric list holds one value per annotator and is averaged with a
    plain arithmetic mean. Agreement comes from
    ``annotator_agreement(turing_files)``.

    Returns:
        ``(avg_accuracy, avg_precision, avg_recall, avg_f1_score,
        avg_human_percentage, alpha_agreement)``.
    """
    def mean(values):
        return sum(values) / len(values)

    # Tuple elements evaluate left to right: the five means first, agreement last
    summary = (
        mean(accuracies),
        mean(precisions),
        mean(recalls),
        mean(f1_scores),
        mean(human_percentages),
        annotator_agreement(turing_files),
    )
    mean_acc, mean_prec, mean_rec, mean_f1, mean_human, agreement = summary

    # Print results
    print('----------------------------------- Overall Results')
    print(f'Average Accuracy    : {mean_acc:.2f}')
    print(f'Average Precision   : {mean_prec:.2f}')
    print(f'Average Recall      : {mean_rec:.2f}')
    print(f'Average F1-score    : {mean_f1:.2f}')
    print(f'Average Human %     : {mean_human:.2f}')
    print(f'Annotator Agreement : {agreement:.2f}')
    return summary

def find_common(target_file, turing_files):
    """Print the sentences on which every annotator gave the same label.

    Each annotator's ``label`` column is stripped, lower-cased and added to
    the target table as a column named after the annotator. Sentences are
    printed in four groups: unanimous false human, unanimous false synthetic,
    unanimous true synthetic and unanimous true human. Nothing is returned.

    Args:
        target_file: CSV whose first column holds the sentence and whose
            ``label`` column holds the true label (``'h'`` or ``'s'``).
        turing_files: Paths of the annotator CSV files. The annotator name is
            the filename part after the last underscore that precedes an
            optional ``_v<N>`` suffix and ``.csv`` (``turing_ms_v1.csv`` and
            ``turing_ms.csv`` both give ``ms``); if that pattern is absent,
            the file stem is used.
    """
    # Regex pattern extract name from filename; the version suffix is optional
    pattern = r'(?<=_)[^_]+(?=(?:_v\d+)?\.csv)'
    df = pd.read_csv(target_file)
    annotators = []
    for file in turing_files:
        match = re.search(pattern, file)
        if match:
            name = match.group()
        else:
            name = os.path.splitext(os.path.basename(file))[0]
        # NOTE(review): labels are aligned with the target by row position,
        # not by sentence — confirm annotator files keep the target's row order
        df[name] = pd.read_csv(file)['label'].str.strip().str.lower()
        if name not in annotators:
            annotators.append(name)

    # Select annotator columns by name rather than by position
    votes = df[annotators]
    all_human = votes.eq('h').all(axis=1)
    all_synthetic = votes.eq('s').all(axis=1)
    actual_human = df['label'] == 'h'
    actual_synthetic = df['label'] == 's'
    sentences = df.iloc[:, 0]

    groups = (
        # Everyone predicted human and actually synthetic
        ('--------------------- Common Predicted Human Actually Synthetic',
         actual_synthetic & all_human),
        # Everyone predicted synthetic and actually human
        ('--------------------- Common Predicted Synthetic Actually Human',
         actual_human & all_synthetic),
        # Everyone predicted synthetic and actually synthetic
        ('----------------- Common Predicted Synthetic Actually Synthetic',
         actual_synthetic & all_synthetic),
        # Everyone predicted human and actually human
        ('------------------------- Common Predicted Human Actually Human',
         actual_human & all_human),
    )
    for header, mask in groups:
        print(header)
        for sentence in sentences[mask]:
            print(sentence)

# Krippendorff's Alpha for annotator agreement
def annotator_agreement(turing_files):
    """Return Krippendorff's alpha (nominal) across the annotators' labels.

    Each file contributes one row of reliability data: its ``label`` column,
    stripped and lower-cased the same way ``evaluate_turing`` normalises
    labels, so that ``'H'`` and ``'h'`` are not counted as different
    categories.

    Args:
        turing_files: Paths of the annotator CSV files, each with a
            ``label`` column.

    Returns:
        The alpha value computed by ``krippendorff.alpha``.
    """
    # NOTE(review): rows are compared position by position, so every file is
    # assumed to list the same sentences in the same order — confirm
    labels = [
        pd.read_csv(file)['label'].str.strip().str.lower().tolist()
        for file in turing_files
    ]
    return krippendorff.alpha(reliability_data=labels, level_of_measurement='nominal')

## Evaluate individual predictions V1

In [8]:
# Score the "jf" label file against the V1 target and print the full report.
# NOTE(review): the recorded output below uses a different layout from what
# evaluate_turing prints now (4-decimal accuracy, raw value counts) — it looks
# like it came from an earlier version of the function; re-run to refresh
evaluate_turing('Version1/target_v1.csv', 'Version1/turing_jf_v1.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version1/target_v1.csv', 'Version1/turing_jf_v1.csv')

-------------------------------- Frequency of Labels
label
h    54
s    45
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                29                25
Predicted Synthetic            20                25
----------------------------------- Chi-Square Test
Chi-square          : 0.5122
p-value             : 0.4742
Degrees of freedom  : 1
Expected frequencies:
[[26.72727273 27.27272727]
 [22.27272727 22.72727273]]
--------------------------------------- Performance
Accuracy            : 0.5455


In [18]:
# Score the "cg" label file against the V1 target and print the full report
evaluate_turing('Version1/target_v1.csv', 'Version1/turing_cg_v1.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version1/target_v1.csv', 'Version1/turing_cg_v1.csv')

-------------------------------- Frequency of Labels
Labelled human      : 56 (0.56)
Labelled synthetic  : 44 (0.44)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                47                 9
Predicted Synthetic             3                41
----------------------------------- Chi-Square Test
Chi-square          : 55.5601
p-value             : 0.0000
Degrees of freedom  : 1
Expected frequencies:
[[28. 28.]
 [22. 22.]]
--------------------------------------- Performance
Accuracy            : 0.88
Precision           : 0.84
Recall              : 0.94
F1-score            : 0.89


In [19]:
# Score the "ms" label file against the V1 target and print the full report
evaluate_turing('Version1/target_v1.csv', 'Version1/turing_ms_v1.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version1/target_v1.csv', 'Version1/turing_ms_v1.csv')

-------------------------------- Frequency of Labels
Labelled human      : 65 (0.65)
Labelled synthetic  : 35 (0.35)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                40                25
Predicted Synthetic            10                25
----------------------------------- Chi-Square Test
Chi-square          : 8.6154
p-value             : 0.0033
Degrees of freedom  : 1
Expected frequencies:
[[32.5 32.5]
 [17.5 17.5]]
--------------------------------------- Performance
Accuracy            : 0.65
Precision           : 0.62
Recall              : 0.80
F1-score            : 0.70


In [20]:
# Score the "mh" label file against the V1 target and print the full report
evaluate_turing('Version1/target_v1.csv', 'Version1/turing_mh_v1.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version1/target_v1.csv', 'Version1/turing_mh_v1.csv')

-------------------------------- Frequency of Labels
Labelled human      : 40 (0.40)
Labelled synthetic  : 60 (0.60)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                32                 8
Predicted Synthetic            18                42
----------------------------------- Chi-Square Test
Chi-square          : 22.0417
p-value             : 0.0000
Degrees of freedom  : 1
Expected frequencies:
[[20. 20.]
 [30. 30.]]
--------------------------------------- Performance
Accuracy            : 0.74
Precision           : 0.80
Recall              : 0.64
F1-score            : 0.71


In [21]:
# Score the "cw" label file against the V1 target and print the full report
evaluate_turing('Version1/target_v1.csv', 'Version1/turing_cw_v1.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version1/target_v1.csv', 'Version1/turing_cw_v1.csv')

-------------------------------- Frequency of Labels
Labelled human      : 52 (0.52)
Labelled synthetic  : 48 (0.48)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                27                25
Predicted Synthetic            23                25
----------------------------------- Chi-Square Test
Chi-square          : 0.0401
p-value             : 0.8414
Degrees of freedom  : 1
Expected frequencies:
[[26. 26.]
 [24. 24.]]
--------------------------------------- Performance
Accuracy            : 0.52
Precision           : 0.52
Recall              : 0.54
F1-score            : 0.53


## Evaluate annotator agreement V1

In [53]:
# Label files included in the V1 aggregate (only ms, mh and cw are listed here)
evaluators = [
    'Version1/turing_ms_v1.csv',
    'Version1/turing_mh_v1.csv',
    'Version1/turing_cw_v1.csv'
]

# One list per metric; each gets one entry per label file
accuracy, precision, recall, f1_score, human_percentage = [], [], [], [], []
for e in evaluators:
    # print_results=False: only the returned metrics are needed here.
    # results is (accuracy, precision, recall, f1_score, human_percentage)
    results = evaluate_turing('Version1/target_v1.csv', e, False)
    accuracy.append(results[0])
    precision.append(results[1])
    recall.append(results[2])
    f1_score.append(results[3])
    human_percentage.append(results[4])

# Prints the averaged metrics and the annotator agreement, and returns them
overall_results = average_performance(evaluators, accuracy, precision, 
                                      recall, f1_score, human_percentage)

----------------------------------- Overall Results
Average Accuracy    : 0.64
Average Precision   : 0.64
Average Recall      : 0.66
Average F1-score    : 0.65
Average Human %     : 0.52
Annotator Agreement : 0.17


In [54]:
# Print the V1 sentences on which every file in `evaluators` gave the same label
find_common('Version1/target_v1.csv', evaluators)

--------------------- Common Predicted Human Actually Synthetic
replace leaking lube pump
hmu leaking hydraulic fluid
decking has several cracks
boom foot clevbis pin has no grease
diff lube hose insp for leaks
--------------------- Common Predicted Synthetic Actually Human
replace pos 8 wheel end po
replace faulty brake sensor1 task
cw coolant leak from #15 cylind
pcr room over alarm
----------------- Common Predicted Synthetic Actually Synthetic
chge out leaking axle oil cool
leak detected infan pump
cabindoor is leaking
leak in air aircon hose
replace leaking air aircon hose
plug has a leak
leaikng fluid fr swingbrake pump
gasket isleaking
hea t ssink clamp found brnk
machine isn't starting
hst motor hose shows a leak
cabin roof heater taps show leakking
engine fan pmp hose has a leak
------------------------- Common Predicted Human Actually Human
change out tyre pos 0
replace leaky lh tilt cyl hose
change out l/h lift cyl
l/h side stick cylinder leak
aftercooler gauge faulty
mv3594

## Evaluate individual predictions V2

In [56]:
# Score the "cg" label file against the V2 target and print the full report
evaluate_turing('Version2/target.csv', 'Version2/turing_cg.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version2/target.csv', 'Version2/turing_cg.csv')

-------------------------------- Frequency of Labels
Labelled human      : 48 (0.48)
Labelled synthetic  : 52 (0.52)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                31                17
Predicted Synthetic            19                33
----------------------------------- Chi-Square Test
Chi-square          : 6.7708
p-value             : 0.0093
Degrees of freedom  : 1
Expected frequencies:
[[24. 24.]
 [26. 26.]]
--------------------------------------- Performance
Accuracy            : 0.64
Precision           : 0.65
Recall              : 0.62
F1-score            : 0.63


(0.64, 0.6458333333333334, 0.62, 0.6326530612244898, 0.48)

In [57]:
# Score the "jf" label file against the V2 target and print the full report
evaluate_turing('Version2/target.csv', 'Version2/turing_jf.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version2/target.csv', 'Version2/turing_jf.csv')

-------------------------------- Frequency of Labels
Labelled human      : 49 (0.49)
Labelled synthetic  : 51 (0.51)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                25                24
Predicted Synthetic            25                26
----------------------------------- Chi-Square Test
Chi-square          : 0.0000
p-value             : 1.0000
Degrees of freedom  : 1
Expected frequencies:
[[24.5 24.5]
 [25.5 25.5]]
--------------------------------------- Performance
Accuracy            : 0.51
Precision           : 0.51
Recall              : 0.50
F1-score            : 0.51


(0.51, 0.5102040816326531, 0.5, 0.5050505050505051, 0.49)

In [58]:
# Score the "ms" label file against the V2 target and print the full report
evaluate_turing('Version2/target.csv', 'Version2/turing_ms.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version2/target.csv', 'Version2/turing_ms.csv')

-------------------------------- Frequency of Labels
Labelled human      : 63 (0.63)
Labelled synthetic  : 37 (0.37)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                37                26
Predicted Synthetic            13                24
----------------------------------- Chi-Square Test
Chi-square          : 4.2900
p-value             : 0.0383
Degrees of freedom  : 1
Expected frequencies:
[[31.5 31.5]
 [18.5 18.5]]
--------------------------------------- Performance
Accuracy            : 0.61
Precision           : 0.59
Recall              : 0.74
F1-score            : 0.65


(0.61, 0.5873015873015873, 0.74, 0.6548672566371682, 0.63)

In [59]:
# Score the "mh" label file against the V2 target and print the full report
evaluate_turing('Version2/target.csv', 'Version2/turing_mh.csv')
# Uncomment to list the sentences this annotator misclassified
# filter_predictions('Version2/target.csv', 'Version2/turing_mh.csv')

-------------------------------- Frequency of Labels
Labelled human      : 64 (0.64)
Labelled synthetic  : 36 (0.36)
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                36                28
Predicted Synthetic            14                22
----------------------------------- Chi-Square Test
Chi-square          : 2.1267
p-value             : 0.1447
Degrees of freedom  : 1
Expected frequencies:
[[32. 32.]
 [18. 18.]]
--------------------------------------- Performance
Accuracy            : 0.58
Precision           : 0.56
Recall              : 0.72
F1-score            : 0.63


(0.58, 0.5625, 0.72, 0.631578947368421, 0.64)

## Evaluate annotator agreement V2

In [62]:
# Label files included in the V2 aggregate (only ms and mh are listed here)
evaluators = [
    'Version2/turing_ms.csv',
    'Version2/turing_mh.csv'
]

# One list per metric; each gets one entry per label file
accuracy, precision, recall, f1_score, human_percentage = [], [], [], [], []
for e in evaluators:
    # print_results=False: only the returned metrics are needed here.
    # results is (accuracy, precision, recall, f1_score, human_percentage)
    results = evaluate_turing('Version2/target.csv', e, False)
    accuracy.append(results[0])
    precision.append(results[1])
    recall.append(results[2])
    f1_score.append(results[3])
    human_percentage.append(results[4])

# Prints the averaged metrics and the annotator agreement, and returns them
overall_results = average_performance(evaluators, accuracy, precision, 
                                      recall, f1_score, human_percentage)

----------------------------------- Overall Results
Average Accuracy    : 0.59
Average Precision   : 0.57
Average Recall      : 0.73
Average F1-score    : 0.64
Average Human %     : 0.64
Annotator Agreement : 0.29
