# Retrieve annotations from MTurk batch result file for main dataset

In [8]:
import csv
from dataset_utils import DatasetEntry
import json
from hashlib import sha256

batch_file = "batch_results/Batch_4265208_batch_results_main.csv"

# Annotations of the main batch, keyed by entry id. Every CSV row contributes
# one set of submitted values; rows sharing an id are merged into the same
# DatasetEntry.
entries = dict()

# newline="" is what the csv module asks for, so that line breaks embedded in
# quoted fields reach the parser unchanged.
with open(batch_file, "r", newline="") as csv_file:
    csv_reader = csv.reader(csv_file)

    # Skip first line (the default keeps an empty file from raising)
    next(csv_reader, None)

    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to read there
            continue

        # Get entry id
        poem_id = row[27]
        # Get poem details
        poem1 = row[28]
        poem2 = row[29]
        dataset1 = row[30]
        dataset2 = row[31]

        # The last column holds a JSON list; its first element is handed to
        # the entry.
        submitted_values = json.loads(row[-1])[0]

        # Build a new entry only when this id has not been seen before
        entry = entries.get(poem_id)
        if entry is None:
            entry = DatasetEntry(poem_id, poem1, poem2, dataset1, dataset2)
        entry.update_values(submitted_values)
        entries[poem_id] = entry

main_entries = entries

print(len(main_entries))

{'id': '255a6de5-5f57-4302-a1fc-d040ca808c50', 'poem1': "going before thy path , him ,<br>to thee , away ! thy judgment !<br>stand by the faces i see ,<br>and though it 's not virgil<br>light as a shadow of death .<br>in his choice tiny while he was", 'poem2': 'look where the grass is gay<br>with summer blossoms , haply there he cowers ;<br>and search , from spray to spray ,<br>the leafy laurel-bowers ,<br>for well he loves the laurels and the flowers .', 'dataset1': 'lstm', 'dataset2': 'gutenberg', 'coherent': ['2', '2', '2'], 'grammatical': ['2', '1', '1'], 'melodious': ['2', '1', '1'], 'moved': ['2', '2', '1'], 'real': ['1', '2', '1'], 'rhyming': ['2', '1', '1'], 'readable': ['1', '2', '2'], 'comprehensible': ['2', '2', '1'], 'intense': ['1', '1', '2'], 'liking': ['1', '1', '2']}
850


In [9]:
import csv
from dataset_utils import DatasetEntry
import json
from hashlib import sha256


batch_file = "batch_results/Batch_4278643_batch_results_ext.csv"

# Annotations of the extension batch, keyed by entry id. Every CSV row
# contributes one set of submitted values; rows sharing an id are merged into
# the same DatasetEntry.
entries = dict()

# newline="" is what the csv module asks for, so that line breaks embedded in
# quoted fields reach the parser unchanged.
with open(batch_file, "r", newline="") as csv_file:
    csv_reader = csv.reader(csv_file)

    # Skip first line (the default keeps an empty file from raising)
    next(csv_reader, None)

    for row in csv_reader:
        if not row:
            # csv.reader yields [] for blank lines; nothing to read there
            continue

        # Get entry id
        poem_id = row[27]
        # Get poem details; "<eol>" line-break markers are rewritten to "<br>"
        poem1 = row[28].replace("<eol>", "<br>")
        poem2 = row[29].replace("<eol>", "<br>")
        dataset1 = row[30]
        dataset2 = row[31]

        # The last column holds a JSON list; its first element is handed to
        # the entry.
        submitted_values = json.loads(row[-1])[0]

        # Build a new entry only when this id has not been seen before
        entry = entries.get(poem_id)
        if entry is None:
            entry = DatasetEntry(poem_id, poem1, poem2, dataset1, dataset2)
        entry.update_values(submitted_values)
        entries[poem_id] = entry
ext_entries = entries
print(len(ext_entries))

1260


In [4]:
# Consolidate both batches into a single id -> entry dictionary. Start from a
# copy of the main entries and lay the extension entries on top, so an id
# present in both ends up with the extension entry.
cons_entries = dict(main_entries)
cons_entries.update(ext_entries)
print(len(cons_entries))

2110


In [5]:
import jsonpickle
from csv import DictWriter

# Split every consolidated entry into one DatasetEntry per annotation, so that
# each exported row carries a single value per category instead of a list.
split_entries = []
for pair_id, entry in cons_entries.items():
    poem1 = entry.poem1
    poem2 = entry.poem2
    dataset1 = entry.dataset1
    dataset2 = entry.dataset2
    # The number of annotations is read off the "coherent" list.
    # NOTE(review): assumes every category list has the same length - confirm.
    num_annos = len(entry.coherent)
    for i in range(num_annos):
        # Suffix the pair id with the annotation index to keep ids distinct
        pair_id_ = f"{pair_id}_{i}"
        new_entry = DatasetEntry(pair_id_, poem1, poem2, dataset1, dataset2)
        for cat in ["coherent", "grammatical", "moved", "real", "rhyming", "readable", "comprehensible", "intense",
                "liking", "melodious"]:
            att = getattr(entry, cat)[i]
            setattr(new_entry, cat, att)
        split_entries.append(new_entry)

print(len(split_entries))
# Round-trip through jsonpickle (unpicklable=False) to obtain plain dicts
split_entries_dict = json.loads(jsonpickle.encode(split_entries, unpicklable=False))

# Column order follows the key order of the first serialized entry
fieldnames = list(split_entries_dict[0].keys())
# newline="" lets the csv writer control line endings itself; without it,
# platforms that translate "\n" on write end up with blank rows in the file.
with open("annotated_datasets/consolidated_batches.csv", "w", newline="") as f:
    csv_writer = DictWriter(f, fieldnames)
    csv_writer.writeheader()
    csv_writer.writerows(split_entries_dict)


3810
['id', 'poem1', 'poem2', 'dataset1', 'dataset2', 'coherent', 'grammatical', 'melodious', 'moved', 'real', 'rhyming', 'readable', 'comprehensible', 'intense', 'liking']


# Real Poems list
For later identification of real poems, a file containing all real poems is created.

In [None]:
# Get Real Poems
# A poem is collected when the dataset label on its side of the pair is
# "gutenberg".
real_poems = []
for entry in split_entries:
    dataset1 = entry.dataset1
    dataset2 = entry.dataset2

    if dataset1 == "gutenberg":
        real_poems.append(entry.poem1)

    if dataset2 == "gutenberg":
        real_poems.append(entry.poem2)

# Drop duplicates while keeping first-seen order. A set would also remove
# them, but its iteration order changes between interpreter runs, which made
# the written file differ from run to run.
real_poems = list(dict.fromkeys(real_poems))

# One poem per line
with open("real_poems.txt", "w") as f:
    for real_poem in real_poems:
        f.write(real_poem + "\n")
        