## Import packages

In [1]:
import os
import sys
import random
import pandas as pd
from scipy.stats import chi2_contingency

current_dir = os.path.dirname(os.path.abspath("__file__"))
main_dir = os.path.join(current_dir, '..')
sys.path.append(main_dir)

from humanise import humanise_sentence, initialise_globals

initialise_globals(main_dir)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


## Create Turing Test data

In [2]:
# Function to load human data from MaintNorm dataset
def load_maintnorm_sentences(file_path):
    dirty_sentences = []
    clean_sentences = []
    current_dirty = []
    current_clean = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:  # Empty line means a new sentence
                if current_dirty and current_clean:
                    dirty_sentences.append(' '.join(current_dirty))
                    clean_sentences.append(' '.join(current_clean))
                    current_dirty = []
                    current_clean = []
            else:
                parts = line.split('\t')
                if len(parts) > 1:
                    dirty, clean = parts[0], parts[1]
                    if not clean in ['<id>', '-']:
                        current_dirty.append(dirty.lower())
                    current_clean.append(clean.lower())
    return dirty_sentences, clean_sentences

# Save all human data to text file
train_dirty, train_clean = load_maintnorm_sentences('../data/MaintNorm/train.norm')
test_dirty, test_clean = load_maintnorm_sentences('../data/MaintNorm/test.norm')
val_dirty, val_clean = load_maintnorm_sentences('../data/MaintNorm/val.norm')
full_dirty = train_dirty + test_dirty + val_dirty
human_data = list(set(full_dirty)) # remove duplicates
with open('human.txt', 'w') as f:
    for item in human_data:
        f.write("%s\n" % item)

# Function to load human sentences or synthetic sentences
def load_sentences(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]
    
synthetic_data = load_sentences('synthetic_generate.txt')
humanise_data = [humanise_sentence(s) for s in synthetic_data]
with open('synthetic_humanise.txt', 'w') as f:
    for item in humanise_data:
        f.write("%s\n" % item)

In [3]:
# Random 50 human data sentences
human_data = load_sentences('human.txt')
human_50 = random.sample(human_data, 50)
human_50 = pd.DataFrame(human_50, columns=['sentence'])
human_50['label'] = 'h'

# Random 50 synthetic data sentences
synthetic_data = load_sentences('synthetic_humanise.txt')
synthetic_50 = random.sample(synthetic_data, 50)
synthetic_50 = pd.DataFrame(synthetic_50, columns=['sentence'])
synthetic_50['label'] = 's'

# Combine and shuffle human and synthetic data
turing_data = pd.concat([human_50, synthetic_50])
turing_data = turing_data.sample(frac=1).reset_index(drop=True)

turing_data.to_csv('target.csv', index=False)
turing_data.drop(columns=['label']).to_csv('turing.csv', index=False)

## Evaluate human predictions on Turing Test data

In [77]:
def evaluate_turing(target_file, turing_file):
    target = pd.read_csv(target_file)
    turing = pd.read_csv(turing_file)
    combine = pd.merge(target, turing, on='sentence')

    # Observed counts
    # Count of correct identifications
    correct_human = ((combine['label_x'] == 'h') & (combine['label_y'] == 'h')).sum()
    correct_synthetic = ((combine['label_x'] == 's') & (combine['label_y'] == 's')).sum()
    # Count of incorrect identifications
    incorrect_human = ((combine['label_x'] == 'h') & (combine['label_y'] == 's')).sum()
    incorrect_synthetic = ((combine['label_x'] == 's') & (combine['label_y'] == 'h')).sum()
    
    # Contingency table
    contingency_table = [[correct_human, incorrect_human],
                       [incorrect_synthetic, correct_synthetic]]

    # Chi-square test
    res = chi2_contingency(contingency_table)
    
    # Confusion matrix
    column_names = ['Actual Human', 'Actual Synthetic']
    index_names = ['Predicted Human', 'Predicted Synthetic']
    confusion_matrix = pd.DataFrame(contingency_table, columns=column_names, index=index_names)

    # Accuracy
    accuracy = (correct_human + correct_synthetic) / len(combine)
    
    # Precision, recall, F1-score
    precision = correct_human / (correct_human + incorrect_synthetic)
    recall = correct_human / (correct_human + incorrect_human)
    f1_score = 2 * (precision * recall) / (precision + recall)

    # Print results
    print('-------------------------------- Frequency of Labels')
    print(turing['label'].value_counts().to_string())
    
    print('---------------------------------- Confusion Matrix')
    print(confusion_matrix)
    
    print('----------------------------------- Chi-Square Test')
    print(f'Chi-square          : {res.statistic:.2f}')
    print(f'p-value             : {res.pvalue:.2f}')
    print(f'Degrees of freedom  : {res.dof}')
    print('Expected frequencies:')
    print(res.expected_freq)
    
    print('--------------------------------------- Performance')
    print(f'Accuracy            : {accuracy:.2f}')
    print(f'Precision           : {precision:.2f}')
    print(f'Recall              : {recall:.2f}')
    print(f'F1-score            : {f1_score:.2f}')

In [78]:
evaluate_turing('target.csv', 'turing_jadeyn.csv')

-------------------------------- Frequency of Labels
label
h    54
s    45
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                29                20
Predicted Synthetic            25                25
----------------------------------- Chi-Square Test
Chi-square          : 0.51
p-value             : 0.47
Degrees of freedom  : 1
Expected frequencies:
[[26.72727273 22.27272727]
 [27.27272727 22.72727273]]
--------------------------------------- Performance
Accuracy            : 0.54
Precision           : 0.54
Recall              : 0.59
F1-score            : 0.56


In [79]:
evaluate_turing('target.csv', 'turing_chay.csv')

-------------------------------- Frequency of Labels
label
h    56
s    44
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                47                 3
Predicted Synthetic             9                41
----------------------------------- Chi-Square Test
Chi-square          : 55.56
p-value             : 0.00
Degrees of freedom  : 1
Expected frequencies:
[[28. 22.]
 [28. 22.]]
--------------------------------------- Performance
Accuracy            : 0.88
Precision           : 0.84
Recall              : 0.94
F1-score            : 0.89


In [80]:
evaluate_turing('target.csv', 'turing_peter.csv')

-------------------------------- Frequency of Labels
label
h    65
s    35
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                36                14
Predicted Synthetic            29                21
----------------------------------- Chi-Square Test
Chi-square          : 1.58
p-value             : 0.21
Degrees of freedom  : 1
Expected frequencies:
[[32.5 17.5]
 [32.5 17.5]]
--------------------------------------- Performance
Accuracy            : 0.57
Precision           : 0.55
Recall              : 0.72
F1-score            : 0.63


In [81]:
evaluate_turing('target.csv', 'turing_ms.csv')

-------------------------------- Frequency of Labels
label
h    65
s    35
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                40                10
Predicted Synthetic            25                25
----------------------------------- Chi-Square Test
Chi-square          : 8.62
p-value             : 0.00
Degrees of freedom  : 1
Expected frequencies:
[[32.5 17.5]
 [32.5 17.5]]
--------------------------------------- Performance
Accuracy            : 0.65
Precision           : 0.62
Recall              : 0.80
F1-score            : 0.70
