## Import packages and load target data

In [12]:
import pandas as pd
from scipy.stats import chi2_contingency

# Reference table for the Turing Test; later cells join it on 'Sentence' and read
# its 'Human/Synthetic' column as the label to score against.
target = pd.read_csv(filepath_or_buffer='target.csv')

## Create Turing Test data

In [6]:
# Scratch check: a string holding only a newline is non-empty, hence truthy.
# NOTE(review): presumably the reason lines are stripped before the blank-line
# test in the loader below; this cell can be dropped once that is confirmed.
print(len('\n') > 0)

True


In [17]:
# Human data
def load_maintnorm_sentences(file_path):
    """Read a token-per-line, tab-separated ``.norm`` file into parallel sentence lists.

    Each non-blank line is expected to hold ``dirty<TAB>clean`` (extra columns are
    ignored; lines without a tab are skipped). A blank line ends the current
    sentence. Tokens are lower-cased and joined with single spaces.

    A dirty token is left out when its clean counterpart is ``'<id>'`` or ``'-'``;
    the clean token itself is always kept.
    NOTE(review): that asymmetry is inherited from the original code; confirm it
    is intended rather than an indentation slip.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(dirty_sentences, clean_sentences)`` tuple of equally long lists of
        strings, where index ``i`` in both lists refers to the same sentence.
        Sentences whose dirty or clean side ends up empty are omitted from both.
    """
    dirty_sentences = []
    clean_sentences = []
    current_dirty = []
    current_clean = []

    def _flush_sentence():
        # Emit only when both sides have tokens so the two lists stay index-aligned.
        if current_dirty and current_clean:
            dirty_sentences.append(' '.join(current_dirty))
            clean_sentences.append(' '.join(current_clean))
        # Always reset, even when nothing was emitted: otherwise tokens of a
        # skipped sentence (e.g. one made only of '<id>' tags) would leak into
        # the next sentence.
        current_dirty.clear()
        current_clean.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:  # Blank line marks the end of a sentence
                _flush_sentence()
                continue
            parts = line.split('\t')
            if len(parts) > 1:
                dirty, clean = parts[0], parts[1]
                if clean not in ('<id>', '-'):
                    current_dirty.append(dirty.lower())
                current_clean.append(clean.lower())

    # The file may not end with a blank line; flush whatever is still buffered.
    _flush_sentence()
    return dirty_sentences, clean_sentences

# Load the dirty/clean sentence pairs of every split.
train_dirty, train_clean = load_maintnorm_sentences('../data/MaintNorm/train.norm')
test_dirty, test_clean = load_maintnorm_sentences('../data/MaintNorm/test.norm')
val_dirty, val_clean = load_maintnorm_sentences('../data/MaintNorm/val.norm')
full_dirty = train_dirty + test_dirty + val_dirty

# Remove duplicates while keeping first-seen order. A set would also
# de-duplicate, but its string ordering changes between kernel runs (hash
# randomisation), which makes anything that later indexes `data` unreproducible.
data = list(dict.fromkeys(full_dirty))

# Show a short preview instead of dumping the whole corpus into the output.
print(f'{len(data)} unique sentences out of {len(full_dirty)}')
print(data[:5])




In [2]:
# Synthetic data (TODO: this cell is empty; add the code that loads the synthetic sentences)

## Evaluate human predictions on Turing Test data

In [14]:
# Labels assigned by the evaluator to the same sentences.
evaluator = pd.read_csv(filepath_or_buffer='evaluator.csv')

# Inner join on the sentence text. Both frames carry a 'Human/Synthetic' column,
# so pandas suffixes the one from `target` with _x and the evaluator's with _y.
evaluate = target.merge(evaluator, how='inner', on='Sentence')

# Share of merged rows where the evaluator's label equals the target label.
accuracy = evaluate['Human/Synthetic_x'].eq(evaluate['Human/Synthetic_y']).mean()

In [17]:
# Target label (_x) and evaluator label (_y) for every merged row.
true_label = evaluate['Human/Synthetic_x']
guessed_label = evaluate['Human/Synthetic_y']

# Boolean masks, computed once and combined below.
true_h, true_s = true_label == 'h', true_label == 's'
guess_h, guess_s = guessed_label == 'h', guessed_label == 's'

# Observed counts: "correct" means the evaluator's label matches the target.
correct_human = (true_h & guess_h).sum()
incorrect_human = (true_h & guess_s).sum()
incorrect_synthetic = (true_s & guess_h).sum()
correct_synthetic = (true_s & guess_s).sum()

# 2x2 contingency table: rows are the target label (h, s),
# columns are the evaluator's label (h, s).
observed_counts = [
    [correct_human, incorrect_human],
    [incorrect_synthetic, correct_synthetic],
]

# Chi-square test of independence between target label and evaluator label.
# NOTE(review): scipy applies Yates' continuity correction by default when
# dof == 1, and the chi-square approximation is unreliable when expected counts
# are small (rule of thumb: at least 5 per cell); confirm that the sample is
# large enough or consider an exact test.
chi2, p, dof, expected = chi2_contingency(observed_counts)

# Overall accuracy over all merged rows.
accuracy = (correct_human + correct_synthetic) / len(evaluate)

# Report everything in one call, one item per line.
print(
    f'Chi-square: {chi2:.2f}',
    f'p-value: {p:.2f}',
    f'Degrees of freedom: {dof}',
    'Expected counts:',
    expected,
    f'Accuracy: {accuracy:.2f}',
    sep='\n',
)

Chi-square: 0.00
p-value: 1.00
Degrees of freedom: 1
Expected counts:
[[2.5 2.5]
 [2.5 2.5]]
Accuracy: 0.60
