## Import packages

In [1]:
import os
import sys
import random
import pandas as pd
from scipy.stats import chi2_contingency

# "__file__" is a string literal here, not the __file__ variable (which is not
# defined inside a notebook kernel). abspath() resolves it against the current
# working directory, so dirname() yields the directory the kernel runs in.
current_dir = os.path.dirname(os.path.abspath("__file__"))
# Parent of the working directory; appended to sys.path so the import below
# can resolve (presumably the humanise module lives there — confirm).
main_dir = os.path.join(current_dir, '..')
sys.path.append(main_dir)

# Project-local import: must come after the sys.path change above.
from humanise import humanise_sentence, initialise_globals

# NOTE(review): opaque helper. The recorded cell output shows an NLTK cmudict
# download, which may be triggered here or by the import above — confirm.
initialise_globals(main_dir)

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\allis\AppData\Roaming\nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


## Create Turing Test data

In [2]:
# Function to load human data from MaintNorm dataset
def load_maintnorm_sentences(file_path):
    """Read a MaintNorm ``.norm`` file into parallel dirty/clean sentence lists.

    Each non-blank line is split on tabs; the first field is treated as the
    dirty token and the second as the clean token. A blank line ends the
    current sentence. Lines that do not contain at least two tab-separated
    fields are ignored.

    Dirty tokens whose clean counterpart is ``<id>`` or ``-`` are left out of
    the dirty sentence, while the clean token itself is always kept.

    Args:
        file_path: Path to the ``.norm`` file; it is read as UTF-8.

    Returns:
        A ``(dirty_sentences, clean_sentences)`` tuple of two equal-length
        lists of lower-cased, space-joined sentences. A sentence is emitted
        only when both its dirty and clean sides contain at least one token.
    """
    dirty_sentences = []
    clean_sentences = []
    current_dirty = []
    current_clean = []

    def flush_sentence():
        # Emit only complete pairs so the two output lists stay aligned.
        if current_dirty and current_clean:
            dirty_sentences.append(' '.join(current_dirty))
            clean_sentences.append(' '.join(current_clean))
        # Always reset both buffers: otherwise the clean tokens of a sentence
        # whose dirty side was filtered away would leak into the next sentence.
        current_dirty.clear()
        current_clean.clear()

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:  # Empty line means a new sentence
                flush_sentence()
                continue
            parts = line.split('\t')
            if len(parts) > 1:
                dirty, clean = parts[0], parts[1]
                if clean not in ('<id>', '-'):
                    current_dirty.append(dirty.lower())
                current_clean.append(clean.lower())

    # The file may end without a trailing blank line; keep the last sentence.
    flush_sentence()
    return dirty_sentences, clean_sentences

# Save all human data to text file
train_dirty, train_clean = load_maintnorm_sentences('../data/MaintNorm/train.norm')
test_dirty, test_clean = load_maintnorm_sentences('../data/MaintNorm/test.norm')
val_dirty, val_clean = load_maintnorm_sentences('../data/MaintNorm/val.norm')
full_dirty = train_dirty + test_dirty + val_dirty
# Remove duplicates while keeping first-seen order. set() iteration order for
# strings changes between interpreter runs (hash randomisation), which would
# make human.txt differ on every run.
human_data = list(dict.fromkeys(full_dirty))
# Write as UTF-8 explicitly: the file is read back as UTF-8, and the platform
# default encoding is not UTF-8 on every system.
with open('human.txt', 'w', encoding='utf-8') as f:
    for item in human_data:
        f.write("%s\n" % item)

# Read a one-sentence-per-line text file (human or synthetic data)
def load_sentences(file_path):
    """Return the lines of ``file_path`` with surrounding whitespace removed.

    The file is read as UTF-8. Every line yields one list entry, so a blank
    line produces an empty string.
    """
    sentences = []
    with open(file_path, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            sentences.append(raw_line.strip())
    return sentences
    
# Pass every generated sentence through humanise_sentence and save the results,
# one sentence per line.
synthetic_data = load_sentences('synthetic_generate.txt')
humanise_data = [humanise_sentence(s) for s in synthetic_data]
# Write as UTF-8 explicitly: the file is read back as UTF-8, and the platform
# default encoding is not UTF-8 on every system.
with open('synthetic_humanise.txt', 'w', encoding='utf-8') as f:
    for item in humanise_data:
        f.write("%s\n" % item)

In [3]:
# Fixed seed so that re-running this cell regenerates the same target.csv and
# turing.csv. Without it, every run overwrites the answer key with a different
# sample, and label files collected for an earlier sample can no longer be
# evaluated against it.
TURING_SEED = 42
# Dedicated generator: keeps the draw reproducible without touching the state
# of the global random module.
turing_rng = random.Random(TURING_SEED)

# Random 50 human data sentences
human_data = load_sentences('human.txt')
human_50 = pd.DataFrame(turing_rng.sample(human_data, 50), columns=['sentence'])
human_50['label'] = 'h'

# Random 50 synthetic data sentences
synthetic_data = load_sentences('synthetic_humanise.txt')
synthetic_50 = pd.DataFrame(turing_rng.sample(synthetic_data, 50), columns=['sentence'])
synthetic_50['label'] = 's'

# Combine and shuffle human and synthetic data
turing_data = pd.concat([human_50, synthetic_50])
turing_data = turing_data.sample(frac=1, random_state=TURING_SEED).reset_index(drop=True)

# target.csv keeps the true labels (answer key); turing.csv is the same rows
# without the label column.
turing_data.to_csv('target.csv', index=False)
turing_data.drop(columns=['label']).to_csv('turing.csv', index=False)

## Evaluate human predictions on Turing Test data

In [21]:
def _normalise_label(value):
    # Trim whitespace and lower-case string labels; leave non-strings (e.g. NaN) as they are.
    return value.strip().lower() if isinstance(value, str) else value


def _safe_ratio(numerator, denominator):
    # Float division that yields 0.0 instead of NaN when the denominator is zero.
    return float(numerator) / float(denominator) if denominator else 0.0


def evaluate_turing(target_file, turing_file):
    """Compare one set of Turing-test answers with the true labels and print a report.

    Both CSV files must have a ``sentence`` and a ``label`` column. Rows are
    matched with an inner merge on ``sentence``; ``'h'`` is treated as the
    positive class and ``'s'`` as the negative class. String labels are
    stripped and lower-cased before comparison, so ``' H'`` counts as ``'h'``.
    Rows whose label is neither ``'h'`` nor ``'s'`` (for example a missing
    answer) are not counted in any cell of the table.

    Args:
        target_file: CSV holding the true label of each sentence.
        turing_file: CSV holding the labels to evaluate.

    Returns:
        None. The label frequencies, confusion matrix, chi-square test and
        accuracy/precision/recall/F1 are printed to stdout. A ratio with a
        zero denominator is reported as 0.0, and a table for which the
        chi-square test cannot be computed is reported as such instead of
        aborting the report.
    """
    target = pd.read_csv(target_file)
    turing = pd.read_csv(turing_file)
    target['label'] = target['label'].map(_normalise_label)
    turing['label'] = turing['label'].map(_normalise_label)
    # NOTE(review): an inner merge multiplies rows when a sentence occurs more
    # than once in either file — confirm sentences are unique.
    combine = pd.merge(target, turing, on='sentence')
    actual = combine['label_x']     # label from target_file
    predicted = combine['label_y']  # label from turing_file

    # Counts
    tp = int(((actual == 'h') & (predicted == 'h')).sum())
    tn = int(((actual == 's') & (predicted == 's')).sum())
    fp = int(((actual == 's') & (predicted == 'h')).sum())
    fn = int(((actual == 'h') & (predicted == 's')).sum())

    # Contingency table: rows are predictions, columns are true labels
    contingency_table = [[tp, fp], [fn, tn]]

    # Chi-square test. SciPy raises ValueError when an expected frequency is
    # zero (an all-zero row or column); keep going so the rest of the report
    # is still printed. Tuple unpacking works whether SciPy returns a plain
    # tuple or a result object.
    chi2_error = None
    try:
        statistic, pvalue, dof, expected_freq = chi2_contingency(contingency_table)
    except ValueError as err:
        chi2_error = str(err)

    # Confusion matrix
    column_names = ['Actual Human', 'Actual Synthetic']
    index_names = ['Predicted Human', 'Predicted Synthetic']
    confusion_matrix = pd.DataFrame(contingency_table, columns=column_names, index=index_names)

    # Accuracy
    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)

    # Precision, recall, F1-score
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1_score = _safe_ratio(2 * precision * recall, precision + recall)

    # Print results
    print('-------------------------------- Frequency of Labels')
    print(turing['label'].value_counts().to_string())

    print('---------------------------------- Confusion Matrix')
    print(confusion_matrix)

    print('----------------------------------- Chi-Square Test')
    if chi2_error is None:
        print(f'Chi-square          : {statistic:.4f}')
        print(f'p-value             : {pvalue:.4f}')
        print(f'Degrees of freedom  : {dof}')
        print('Expected frequencies:')
        print(expected_freq)
    else:
        print(f'Not computable      : {chi2_error}')

    print('--------------------------------------- Performance')
    print(f'Accuracy            : {accuracy:.4f}')
    print(f'Precision           : {precision:.4f}')
    print(f'Recall              : {recall:.4f}')
    print(f'F1-score            : {f1_score:.4f}')

In [22]:
# Score the labels in turing_jf.csv against the true labels in target.csv
# (presumably "jf" identifies the annotator — confirm).
# NOTE(review): the recorded output totals 99 labels, so one sentence appears
# to be unlabelled in this file — confirm.
evaluate_turing('target.csv', 'turing_jf.csv')

-------------------------------- Frequency of Labels
label
h    54
s    45
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                29                25
Predicted Synthetic            20                25
----------------------------------- Chi-Square Test
Chi-square          : 0.5122
p-value             : 0.4742
Degrees of freedom  : 1
Expected frequencies:
[[26.72727273 27.27272727]
 [22.27272727 22.72727273]]
--------------------------------------- Performance
Accuracy            : 0.5455
Precision           : 0.5370
Recall              : 0.5918
F1-score            : 0.5631


In [23]:
# Score the labels in turing_cg.csv against the true labels in target.csv
# (presumably "cg" identifies the annotator — confirm).
evaluate_turing('target.csv', 'turing_cg.csv')

-------------------------------- Frequency of Labels
label
h    56
s    44
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                47                 9
Predicted Synthetic             3                41
----------------------------------- Chi-Square Test
Chi-square          : 55.5601
p-value             : 0.0000
Degrees of freedom  : 1
Expected frequencies:
[[28. 28.]
 [22. 22.]]
--------------------------------------- Performance
Accuracy            : 0.8800
Precision           : 0.8393
Recall              : 0.9400
F1-score            : 0.8868


In [24]:
# Score the labels in turing_pl.csv against the true labels in target.csv
# (presumably "pl" identifies the annotator — confirm).
evaluate_turing('target.csv', 'turing_pl.csv')

-------------------------------- Frequency of Labels
label
h    65
s    35
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                36                29
Predicted Synthetic            14                21
----------------------------------- Chi-Square Test
Chi-square          : 1.5824
p-value             : 0.2084
Degrees of freedom  : 1
Expected frequencies:
[[32.5 32.5]
 [17.5 17.5]]
--------------------------------------- Performance
Accuracy            : 0.5700
Precision           : 0.5538
Recall              : 0.7200
F1-score            : 0.6261


In [25]:
# Score the labels in turing_ms.csv against the true labels in target.csv
# (presumably "ms" identifies the annotator — confirm).
evaluate_turing('target.csv', 'turing_ms.csv')

-------------------------------- Frequency of Labels
label
h    65
s    35
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                40                25
Predicted Synthetic            10                25
----------------------------------- Chi-Square Test
Chi-square          : 8.6154
p-value             : 0.0033
Degrees of freedom  : 1
Expected frequencies:
[[32.5 32.5]
 [17.5 17.5]]
--------------------------------------- Performance
Accuracy            : 0.6500
Precision           : 0.6154
Recall              : 0.8000
F1-score            : 0.6957


In [26]:
# Score the labels in turing_mh.csv against the true labels in target.csv
# (presumably "mh" identifies the annotator — confirm).
evaluate_turing('target.csv', 'turing_mh.csv')

-------------------------------- Frequency of Labels
label
s    60
h    40
---------------------------------- Confusion Matrix
                     Actual Human  Actual Synthetic
Predicted Human                32                 8
Predicted Synthetic            18                42
----------------------------------- Chi-Square Test
Chi-square          : 22.0417
p-value             : 0.0000
Degrees of freedom  : 1
Expected frequencies:
[[20. 20.]
 [30. 30.]]
--------------------------------------- Performance
Accuracy            : 0.7400
Precision           : 0.8000
Recall              : 0.6400
F1-score            : 0.7111
