## Import packages

In [2]:
import os
import sys
import random
import pandas as pd
from scipy.stats import chi2_contingency

# NOTE: "__file__" is a string literal here, not the module variable, so
# abspath() resolves it against the current working directory and dirname()
# then yields that working directory itself.
current_dir = os.path.dirname(os.path.abspath("__file__"))
# Parent of the working directory; passed to initialise_globals below.
main_dir = os.path.join(current_dir, '..')
# Must run before the import below so that `humanise` can be found on sys.path.
sys.path.append(main_dir)

from humanise import humanise_sentence, initialise_globals

# NOTE(review): presumably loads the resources humanise_sentence relies on,
# located relative to main_dir — confirm against the humanise module.
initialise_globals(main_dir)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


## Create Turing Test data

In [4]:
# Function to load human data from MaintNorm dataset
def load_maintnorm_sentences(file_path):
    """Load parallel dirty/clean sentences from a MaintNorm ``.norm`` file.

    The file is read as UTF-8 with one tab-separated ``dirty<TAB>clean`` token
    pair per line; a blank line ends the current sentence. Lines without a tab
    are ignored. All tokens are lower-cased.

    A dirty token is left out of the dirty sentence when its clean form is
    ``'<id>'`` or ``'-'``; the clean token itself is always kept.

    A sentence is emitted only when both its dirty and clean token lists are
    non-empty, so the two returned lists always have the same length.

    Args:
        file_path: Path to the ``.norm`` file.

    Returns:
        A tuple ``(dirty_sentences, clean_sentences)`` of space-joined strings.
    """
    dirty_sentences = []
    clean_sentences = []
    current_dirty = []
    current_clean = []

    def flush_sentence():
        # Emit the buffered sentence if both sides have content, then always
        # reset the buffers so tokens never leak into the following sentence.
        if current_dirty and current_clean:
            dirty_sentences.append(' '.join(current_dirty))
            clean_sentences.append(' '.join(current_clean))
        current_dirty.clear()
        current_clean.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:  # Empty line means a new sentence
                flush_sentence()
                continue
            parts = line.split('\t')
            if len(parts) > 1:
                dirty, clean = parts[0], parts[1]
                if clean not in ('<id>', '-'):
                    current_dirty.append(dirty.lower())
                current_clean.append(clean.lower())

    # The file may not end with a blank line; keep its final sentence.
    flush_sentence()
    return dirty_sentences, clean_sentences

# Save all human data to text file
train_dirty, train_clean = load_maintnorm_sentences('../data/MaintNorm/train.norm')
test_dirty, test_clean = load_maintnorm_sentences('../data/MaintNorm/test.norm')
val_dirty, val_clean = load_maintnorm_sentences('../data/MaintNorm/val.norm')
full_dirty = train_dirty + test_dirty + val_dirty
# Remove duplicates while keeping first-seen order, so human.txt has the same
# line order on every run (a set's iteration order for strings is not stable
# across interpreter sessions).
human_data = list(dict.fromkeys(full_dirty))
# Write as UTF-8 explicitly: load_sentences reads this file back as UTF-8, and
# the platform default encoding is not UTF-8 everywhere.
with open('human.txt', 'w', encoding='utf-8') as f:
    for item in human_data:
        f.write("%s\n" % item)

# Function to load human sentences or synthetic sentences
def load_sentences(file_path):
    """Read a UTF-8 text file and return one whitespace-stripped string per line.

    Blank lines are kept as empty strings.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            sentences.append(raw_line.strip())
    return sentences

In [5]:
# Fix the seed so that, given the same input files, the same sentences are
# drawn and shuffled into the same order on every run.
# NOTE(review): presumably humanise_sentence draws from the `random` module too;
# if it uses another RNG, that one must be seeded separately — confirm.
SEED = 42
random.seed(SEED)

# Random 50 human data sentences
human_data = load_sentences('human.txt')
human_sample = random.sample(human_data, 50)
human_50 = pd.DataFrame(human_sample, columns=['sentence'])
human_50['label'] = 'h'

# Random 50 synthetic data sentences
synthetic_data = load_sentences('synthetic.txt')
synthetic_sample = random.sample(synthetic_data, 50)
synthetic_humanised = [humanise_sentence(s) for s in synthetic_sample]
synthetic_50 = pd.DataFrame(synthetic_humanised, columns=['sentence'])
synthetic_50['label'] = 's'

# Combine and shuffle human and synthetic data
turing_data = pd.concat([human_50, synthetic_50])
turing_data = turing_data.sample(frac=1, random_state=SEED).reset_index(drop=True)

# target.csv keeps the true labels; turing.csv is the unlabelled copy.
# NOTE(review): evaluate_turing reads a `label` column from turing.csv, so
# presumably annotators add it by hand — re-running this cell overwrites that file.
turing_data.to_csv('target.csv', index=False)
turing_data.drop(columns=['label']).to_csv('turing.csv', index=False)

## Evaluate human predictions on Turing Test data

In [16]:
def evaluate_turing(target_file, turing_file):
    """Compare annotator guesses with the true labels and print test statistics.

    Both CSV files must contain a ``sentence`` column and a ``label`` column
    whose values are ``'h'`` (human) or ``'s'`` (synthetic). Rows are paired on
    ``sentence``; after the merge ``label_x`` comes from ``target_file`` (the
    true label) and ``label_y`` from ``turing_file`` (the guess).

    Prints the chi-square statistic, p-value, degrees of freedom, expected
    counts and accuracy. Nothing is returned.

    Accuracy is the number of matching 'h'/'s' pairs divided by the number of
    merged rows, so a row with any other label value counts as incorrect.

    Args:
        target_file: Path to the CSV holding the true labels.
        turing_file: Path to the CSV holding the annotator's labels.

    Raises:
        pandas.errors.MergeError: If a sentence appears more than once in
            either file. Such duplicates would otherwise be paired
            many-to-many and silently inflate the counts.
        KeyError: If either file has no ``label`` column.
    """
    target = pd.read_csv(target_file)
    turing = pd.read_csv(turing_file)
    # Insist on a strict one-to-one pairing of sentences between the two files.
    combine = pd.merge(target, turing, on='sentence', validate='one_to_one')

    truth = combine['label_x']
    guess = combine['label_y']

    # Observed counts
    # Count of correct identifications
    correct_human = ((truth == 'h') & (guess == 'h')).sum()
    correct_synthetic = ((truth == 's') & (guess == 's')).sum()
    # Count of incorrect identifications
    incorrect_human = ((truth == 'h') & (guess == 's')).sum()
    incorrect_synthetic = ((truth == 's') & (guess == 'h')).sum()

    # Contingency table: rows are the true label (h, s), columns the guess (h, s)
    observed_counts = [[correct_human, incorrect_human],
                       [incorrect_synthetic, correct_synthetic]]

    # Chi-square test of independence between true label and guess.
    # SciPy applies its default continuity (Yates) correction to a 2x2 table.
    chi2, p, dof, expected = chi2_contingency(observed_counts)

    # Accuracy
    accuracy = (correct_human + correct_synthetic) / len(combine)

    # Print results
    print(f'Chi-square: {chi2:.2f}')
    print(f'p-value: {p:.2f}')
    print(f'Degrees of freedom: {dof}')
    print('Expected counts:')
    print(expected)
    print(f'Accuracy: {accuracy:.2f}')

In [17]:
# Score the labels in turing.csv against the true labels in target.csv.
# NOTE(review): the data-creation cell writes turing.csv without a `label`
# column, so presumably annotators add that column before this runs — confirm.
evaluate_turing('target.csv', 'turing.csv')

Chi-square: 38.50
p-value: 0.00
Degrees of freedom: 1
Expected counts:
[[24. 26.]
 [24. 26.]]
Accuracy: 0.82
