In [5]:
import csv

batch_path = "batch_results/Batch_4265208_batch_results_main.csv"

with open(batch_path, "r") as csv_file:
    csv_reader = csv.reader(csv_file)
    csv_lines = list(csv_reader)
    


# Get all columns but leave out 'Approve' 'Reject' since they are not in the Data yet
columns = csv_lines[0][:-2]
data_lines = csv_lines[1:]

# Read entries from previous annotated batch
csv_entries = []
for line in data_lines:
    entry = dict()
    for idx, col in enumerate(columns):
        entry[col] = line[idx]
    csv_entries.append(entry)



In [4]:
from dataset_utils import DatasetEntry
import json

pair_ids = []
dataset_entries = {}
for entry in csv_entries:
    # Get entry id
    pair_id = entry['Input.pair_id']
    pair_ids.append(pair_id)

    # If entry is not yet in dict create it
    if pair_id not in dataset_entries.keys():
        # Get poem details
        poem1 = entry['Input.poem1']
        poem2 = entry['Input.poem2']
        dataset1 = entry['Input.poem1_dataset']
        dataset2 = entry['Input.poem2_dataset']

        poem1 = poem1.replace("<eol>", "<br>")
        poem2 = poem2.replace("<eol>", "<br>")

        dataset_entry = DatasetEntry(pair_id, poem1, poem2, dataset1, dataset2)
        dataset_entries[pair_id] = dataset_entry

    dataset_entry = dataset_entries[pair_id]
    submitted_values = json.loads(str(entry['Answer.taskAnswers']))[0]
    dataset_entry.update_values(submitted_values)  
print(len(dataset_entries))

850


In [5]:
import csv
import random

from collections import Counter
from statistics import mean
import math
from random import shuffle

# Get all poems with their respective data set
unique_poems = []
pairs = []
old_poems = []
old_dataset_occ = []
for _, entry in dataset_entries.items():
    tuple1 = (entry.poem1, entry.dataset1)
    tuple2 = (entry.poem2, entry.dataset2)
    if tuple1 not in unique_poems:
        unique_poems.append(tuple1)
    if tuple2 not in unique_poems:
        unique_poems.append(tuple2)
    old_poems.extend([tuple1, tuple2]*3)
    old_dataset_occ.extend([entry.dataset1, entry.dataset2])
    pairs.append((tuple1, tuple2))

print(Counter(old_dataset_occ))

old_pairs = pairs
least_used_poems = list(set([poem for poem, count in Counter(old_poems).items() if count <= 1]))
unique_poems = list(set(old_poems))
print(len(old_poems))

n = 210
chosen_poems = []
# Choose 50% real poems
temp_poems = []
chance_real = 0.5
chance_not_real = 1. - chance_real

num_real_poems = math.floor(float(n) * chance_real)
num_not_real_poems = math.floor(float(n) * chance_not_real / 7)

print(f"Chances: Real({chance_real} - Not Real({chance_not_real})")
print(f"Num: Real({num_real_poems}) - Not Real({num_not_real_poems})")


poem_pool = unique_poems
while len(temp_poems) < num_real_poems:
    p = random.choice(poem_pool)
    if p not in temp_poems and p[1] == "gutenberg":
        temp_poems.append(p)
chosen_poems.extend(temp_poems)

source_datasets = []
for model in ['gpt2','lstm','ngram','hafez','deepspeare','jhamtani', 'true_poetry']:
    # Choose GPT-2 poems
    temp_poems = []
    while len(temp_poems) < num_not_real_poems:
        p = random.choice(poem_pool)
        if p not in temp_poems and p[1] == model:
            temp_poems.append(p)
            source_datasets.append(model)
    chosen_poems.extend(temp_poems)
print(f"Num unique poems overall: {len(unique_poems)}")
print(f"Num chosen poems: {len(chosen_poems)}")
print(f"Occurences data set in base poems: {Counter(source_datasets)}")

shuffle(chosen_poems)
chosen_poems = chosen_poems[:n]


# Create pairs
pairs = []
leftover_poems = list(set(old_poems) - set(chosen_poems))
extension_dataset_list = []
print(f"Num leftovers: {len(leftover_poems)}")
print(len(chosen_poems))


corpora = ['gpt2','lstm','ngram','hafez','deepspeare','jhamtani','gutenberg','true_poetry']
for poem in chosen_poems:
    # Choose 6 other poems differing from the current and not in the list
    for i in range(6):
        # choose a data set 
        probs = [chance_not_real / 7.] * 8
        probs[6] = chance_real
        dataset = random.choices(corpora, probs, k=1)[0]
        counter_part = random.choice(leftover_poems)
        extension_dataset_list.append(dataset)
        while counter_part[1] != dataset or (poem, counter_part) in pairs + old_pairs or (counter_part, poem) in pairs + old_pairs:
            counter_part = random.choice(leftover_poems)
        # Swap the two poems with a 50% chance
        if random.randint(0,1) == 0:
            poem, counter_part = counter_part, poem
        pairs.append((poem, counter_part))

new_pairs = pairs
print(f"Num new pairs: {len(new_pairs)}")
print(f"Occurences data sets as extension poems: {Counter(extension_dataset_list)}")
        
new_unique_poems = []
new_poems = []
num_real = 0
for pair in pairs:
    tuple1 = pair[0]
    tuple2 = pair[1]
    if tuple1[1] == "gutenberg" or tuple2[1] == "gutenberg":
        num_real += 1 
    # print(tuple2)
    new_poems.extend([tuple1, tuple2])
    if tuple1 not in new_unique_poems:
        new_unique_poems.append(tuple1)
    if tuple2 not in new_unique_poems:
        new_unique_poems.append(tuple2)

# Get stats about newly created pairs
print(f"Num new poems: {len(new_poems)}")
print(f"Num new unique poems in batch: {len(new_unique_poems)}")
print(f"Num real in new pairs: {num_real}")
print(f"Num new pairs: {len(pairs)}")


print(f"Num old poems {len(old_poems)}")
old_new_poems = old_poems + new_poems
final_occurences = Counter(old_new_poems)
# How many times a poem occurs on average
print(f"Mean num occurencences: {mean([count for _, count in final_occurences.items()])}")


print(f"Occurences Dataset in old and new combined: {Counter([dataset for _, dataset in old_new_poems])}")


# Check how many times each poem is compared to another one
print(f"Num old pairs: {len(old_pairs)}")
old_pairs = set(old_pairs)
print(len(old_pairs))
new_pairs = set(new_pairs)
print(len(new_pairs))
comparison_poems = []
for pair in (old_pairs | new_pairs):
    comparison_poems.append(pair[0])
    comparison_poems.append(pair[1])
single_comparison_counter = Counter(comparison_poems)
print()
print(f"Mean unique comparisons: {mean([count for poem, count in single_comparison_counter.items()])}")

Counter({'gutenberg': 526, 'ngram': 202, 'true_poetry': 180, 'jhamtani': 164, 'deepspeare': 163, 'hafez': 159, 'lstm': 153, 'gpt2': 153})
5100
Chances: Real(0.5 - Not Real(0.5)
Num: Real(105) - Not Real(15)
Num unique poems overall: 1469
Num chosen poems: 210
Occurences data set in base poems: Counter({'gpt2': 15, 'lstm': 15, 'ngram': 15, 'hafez': 15, 'deepspeare': 15, 'jhamtani': 15, 'true_poetry': 15})
Num leftovers: 1259
210
Num new pairs: 1260
Occurences data sets as extension poems: Counter({'gutenberg': 603, 'ngram': 102, 'lstm': 97, 'hafez': 97, 'true_poetry': 93, 'gpt2': 93, 'deepspeare': 90, 'jhamtani': 85})
Num new poems: 2520
Num new unique poems in batch: 974
Num real in new pairs: 908
Num new pairs: 1260
Num old poems 5100
Mean num occurencences: 5.187202178352621
Occurences Dataset in old and new combined: Counter({'gutenberg': 2760, 'ngram': 812, 'true_poetry': 737, 'hafez': 694, 'jhamtani': 672, 'deepspeare': 664, 'gpt2': 649, 'lstm': 632})
Num old pairs: 850
850
1260



# Create actual data set

In [6]:
from dataset_utils import PairwisePoemsExt as PairwisePoems, questions
from random import shuffle
import uuid
import jsonpickle

amt_pairs = []
dataset_list = []
for pair in new_pairs:
    poem1 = pair[0][0]
    dataset1 = pair[0][1]
    poem2 = pair[1][0]
    dataset2 = pair[1][1]
    
    dataset_list.extend([dataset1, dataset2])
    
    question_ids = list(questions.keys())
    shuffle(question_ids)
    question_set1 = [(questions[q_id], q_id) for q_id in question_ids[:5]]
    question_set2 = [(questions[q_id], q_id) for q_id in question_ids[5:]]
    
    # setup first pair
    pair_id = str(uuid.uuid4())
    
    pair1 = PairwisePoems(pair_id, poem1, poem2, dataset1, dataset2,
                         *question_set1[0],
                         *question_set1[1],
                         *question_set1[2],
                         *question_set1[3],
                         *question_set1[4])
    pair2 = PairwisePoems(pair_id, poem1, poem2, dataset1, dataset2,
                         *question_set2[0],
                         *question_set2[1],
                         *question_set2[2],
                         *question_set2[3],
                         *question_set2[4])
    amt_pairs.extend([pair1, pair2])

dataset_occ = Counter(dataset_list)
print(dataset_occ)
# Write to csv
dataset_json = jsonpickle.encode(amt_pairs, unpicklable=False)
dataset_dict = json.loads(dataset_json)
shuffle(dataset_dict)
fieldnames = dataset_dict[0].keys()
print(fieldnames)
with open("csv_dataset_ext.csv", "w+", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(dataset_dict)
    
    

Counter({'gutenberg': 1182, 'hafez': 217, 'ngram': 206, 'true_poetry': 197, 'gpt2': 190, 'jhamtani': 180, 'deepspeare': 175, 'lstm': 173})
dict_keys(['pair_id', 'poem1', 'poem2', 'poem1_dataset', 'poem2_dataset', 'question1', 'question1_id', 'question2', 'question2_id', 'question3', 'question3_id', 'question4', 'question4_id', 'question5', 'question5_id'])
