# Import main batch

In [3]:
from dataset_utils import DatasetEntry
import json
from collections import Counter
import csv

# Load the batch-results CSV and turn every data row into a
# {column name: cell value} dict, collected in `csv_entries`.
batch_path = "batch_results/Batch_4265208_batch_results_main.csv"

# newline="" is what the csv module asks for: it lets the reader do its own
# line-ending handling, so quoted fields containing line breaks stay intact.
with open(batch_path, "r", newline="") as csv_file:
    csv_reader = csv.reader(csv_file)
    csv_lines = list(csv_reader)

# Get all columns but leave out the trailing 'Approve' and 'Reject' header
# columns, since the data rows do not carry values for them yet.
columns = csv_lines[0][:-2]
data_lines = csv_lines[1:]

# One dict per data row; indexing by position (rather than zip) keeps the
# IndexError for a row that is shorter than the header instead of silently
# dropping columns.
csv_entries = [
    {col: line[idx] for idx, col in enumerate(columns)}
    for line in data_lines
]
    
# Group the CSV rows by pair id: `dataset_entries` maps each pair id to one
# DatasetEntry, and `pair_ids` records the ids in first-seen order.
pair_ids = []
dataset_entries = {}
for entry in csv_entries:
    pair_id = entry['Input.pair_id']

    # First row for this pair: register the id and build its DatasetEntry
    # from the poem texts and the data sets they come from.
    if pair_id not in dataset_entries:
        pair_ids.append(pair_id)
        dataset_entries[pair_id] = DatasetEntry(
            pair_id,
            entry['Input.poem1'],
            entry['Input.poem2'],
            entry['Input.poem1_dataset'],
            entry['Input.poem2_dataset'],
        )

    # 'Answer.taskAnswers' holds a JSON list; its first element is passed
    # on to the entry for this pair.
    submitted_values = json.loads(str(entry['Answer.taskAnswers']))[0]
    dataset_entries[pair_id].update_values(submitted_values)

# Creation of extension dataset 

In [4]:
import csv
import math
import random
from collections import Counter
from statistics import mean

# Get all poems with their respective data set.
# `pairs` / `old_pairs` hold the already existing (poem1, poem2) pairs and
# `old_poems` is the usage list the occurrence counts are derived from.
pairs = []
old_poems = []
for entry in dataset_entries.values():
    tuple1 = (entry.poem1, entry.dataset1)
    tuple2 = (entry.poem2, entry.dataset2)
    # Each pair adds both of its poems three times to the usage list.
    old_poems.extend([tuple1, tuple2] * 3)
    pairs.append((tuple1, tuple2))


old_pairs = pairs
# Counts are multiples of three, so `count <= 3` keeps the poems that occur
# exactly once across the existing pairs. Counter keys are already unique,
# so no extra set() is needed; skipping it keeps the list in first-seen
# order instead of a hash-dependent one.
least_used_poems = [poem for poem, count in Counter(old_poems).items() if count <= 3]
# Only the least used poems are candidates for the extension batch.
unique_poems = least_used_poems
print(len(old_poems))

# Pick the poems the extension batch is built around: a share of real
# ("gutenberg") poems plus an equal share for every generator model.
n = 200
chosen_poems = []
temp_poems = []
chance_real = 0.245
chance_not_real = 1. - chance_real
generator_models = ['gpt2', 'lstm', 'ngram', 'hafez', 'deepspeare', 'jhamtani', 'true_poetry']

num_real_poems = float(n) * chance_real
# The non-real share is split evenly over the generator models.
num_not_real_poems = float(n) * chance_not_real / len(generator_models)

print(f"Chances: Real({chance_real}) - Not Real({chance_not_real})")
print(f"Num: Real({num_real_poems}) - Not Real({num_not_real_poems})")


def _sample_from_dataset(poems, dataset, target):
    """Return distinct random poems from *poems* whose data set is *dataset*.

    Args:
        poems: sequence of (poem text, data set name) tuples.
        dataset: data set name to select, compared against tuple index 1.
        target: possibly fractional number of poems wanted; it is rounded
            up, which matches collecting poems while ``len(...) < target``.

    Returns:
        A list of distinct poem tuples in random order.

    Raises:
        ValueError: if fewer matching poems exist than requested (raised by
            ``random.sample``) - a retry loop would spin forever instead.
    """
    # dict.fromkeys drops duplicates while keeping first-seen order.
    candidates = list(dict.fromkeys(p for p in poems if p[1] == dataset))
    return random.sample(candidates, max(0, math.ceil(target)))


temp_poems = _sample_from_dataset(unique_poems, "gutenberg", num_real_poems)
chosen_poems.extend(temp_poems)

for model in generator_models:
    temp_poems = _sample_from_dataset(unique_poems, model, num_not_real_poems)
    chosen_poems.extend(temp_poems)
print(len(unique_poems))
print(len(chosen_poems))

# Create pairs: every chosen poem is matched with `partners_per_poem`
# distinct poems drawn from the least used poems that were not chosen.
pairs = []
leftover_poems = list(set(least_used_poems) - set(chosen_poems))
print(f"Num leftovers: {len(leftover_poems)}")

partners_per_poem = 6

# Both orientations of every existing pair go into one set, so a single
# lookup answers "were these two poems already compared?".
seen_pairs = set(old_pairs)
seen_pairs.update((second, first) for first, second in old_pairs)

for poem in chosen_poems:
    # Partners this poem has not been paired with yet (in either order).
    eligible = [
        candidate for candidate in leftover_poems
        if (poem, candidate) not in seen_pairs
    ]
    # random.sample draws distinct partners and raises ValueError when fewer
    # than `partners_per_poem` are eligible, where retrying random picks
    # would never terminate.
    for counter_part in random.sample(eligible, partners_per_poem):
        pairs.append((poem, counter_part))
        seen_pairs.add((poem, counter_part))
        seen_pairs.add((counter_part, poem))

# Summarise the newly created pairs: the poems they use, how many pairs
# contain a real ("gutenberg") poem, and the most frequently used poems.
new_poems = []
num_real = 0
for pair in pairs:
    tuple1, tuple2 = pair
    # Count the pair once if either side is a real poem.
    if "gutenberg" in (tuple1[1], tuple2[1]):
        num_real += 1
    new_poems.extend([tuple1, tuple2])

# dict.fromkeys de-duplicates in first-seen order without the quadratic
# cost of repeated list membership tests.
new_unique_poems = list(dict.fromkeys(new_poems))


print(f"Num unique poems in batch: {len(new_unique_poems)}")
print(num_real)
print(pairs[:10])
print(len(pairs))
counter = Counter(new_poems)
# A bare expression in the middle of a cell is discarded, so print it.
print(counter.most_common(10))


# Mean number of occurrences per poem over the old and the new usage lists.
print(len(old_poems))
old_new_poems = old_poems + new_poems
final_occurences = Counter(old_new_poems)
# Printed explicitly: only a cell's last expression is displayed, so the
# bare mean(...) here was computed and then thrown away.
print(mean(final_occurences.values()))


# Check how many times each poem is compared to another one,
# counting every distinct (first, second) pair once.
old_pairs = set(old_pairs)
print(len(old_pairs))
new_pairs = set(pairs)
print(len(new_pairs))
# Flatten the union of old and new pairs into its individual poems.
comparison_poems = [
    poem
    for first, second in ((p[0], p[1]) for p in old_pairs | new_pairs)
    for poem in (first, second)
]
single_comparison_counter = Counter(comparison_poems)
# Left as the final bare expression so the notebook displays the mean.
mean(single_comparison_counter.values())

5100
Chances: Real(0.245 - Not Real(0.755)
Num: Real(49.0) - Not Real(21.571428571428573)
1309
203
Num leftovers: 1106
Num unique poems in batch: 934
663
[(("no , for the purged ear apprehends<br>earth 's import , not the eye late dazed .<br>the voice said , “ call my works thy friends !<br>at nature dost thou shrink amazed ?<br>god is it who transcends . ”", 'gutenberg'), ('was always: though is not a moment run<br>and i shall list with music to his own<br>what would he not to tell me, for aright<br>knows that he loves, the art of all the sight', 'deepspeare')), (("no , for the purged ear apprehends<br>earth 's import , not the eye late dazed .<br>the voice said , “ call my works thy friends !<br>at nature dost thou shrink amazed ?<br>god is it who transcends . ”", 'gutenberg'), ('they wanna listen mister uncle ben. <br>have mercy goodness gracious bad email ! <br>not even bother calling me amen, <br>for whom my mother ought to send e mail. <br><br>too many people under handed swords,

2.798376184032476