# Read the scores produced by every method

In [11]:
import csv
from utils import POEM_FOLDER
from os.path import join


def get_poem_dataset_lookup():
    """
    Return a dictionary with poems as key and their respective dataset association as value.

    Reads ``data_poems/consolidated_batches.csv`` relative to the current
    working directory. Every non-empty row contributes two entries: column 1
    is mapped to column 3 and column 2 is mapped to column 4 (0-based column
    indices). When the same poem appears in several rows, the last row wins.

    Returns:
        dict: poem text -> dataset name.

    Raises:
        IndexError: if a non-empty row has fewer than five columns.
    """
    poem_dataset_lookup = {}
    batch_path = join("data_poems", "consolidated_batches.csv")
    # newline="" hands newline handling to the csv module (as its documentation
    # recommends), so line breaks inside quoted poems are not rewritten.
    with open(batch_path, "r", newline="") as batch_file:
        for row in csv.reader(batch_file):
            if not row:
                # Blank lines are parsed as empty rows; skip them instead of
                # failing on the column access below.
                continue
            poem1 = row[1]
            poem2 = row[2]
            dataset1 = row[3]
            dataset2 = row[4]

            poem_dataset_lookup[poem1] = dataset1
            poem_dataset_lookup[poem2] = dataset2
    return poem_dataset_lookup
        

def get_scores(method, subset=True):
    """
    Retrieves the scores generated by a specific method.

    Reads ``scores/<method>_subset.csv`` when ``subset`` is true and
    ``scores/<method>_.csv`` otherwise. The first row of the file is used as
    header; in every following row the first column is the poem and each
    remaining column is stored under the key ``<method>_<header name>``.
    Score values are kept as the strings read from the file.

    Args:
        method: Name of the scoring method; used for the file name and as
            prefix of the score keys.
        subset: Selects which score file is read (see above).

    Returns:
        tuple: ``(poem_scores, header)`` where ``poem_scores`` maps each poem
        to a dict containing ``"poem"``, ``"dataset"`` and the prefixed score
        columns, and ``header`` is the header row of the file (an empty list
        when the file is empty).

    Raises:
        KeyError: if a poem in the score file is not part of the dataset lookup.
    """
    poem_scores = {}
    poem_dataset_lookup = get_poem_dataset_lookup()
    subset_filename = "subset" if subset else ""
    # NOTE(review): with subset=False this resolves to "scores/<method>_.csv"
    # (trailing underscore) — confirm that this matches the files on disk.
    filename = "scores/" + method + f"_{subset_filename}.csv"
    # newline="" hands newline handling to the csv module, as its docs recommend.
    with open(filename, "r", newline="") as f:
        csv_reader = csv.reader(f)
        header = next(csv_reader, None)
        if header is None:
            # Empty file: nothing to read, avoid a bare StopIteration.
            return poem_scores, []
        for row in csv_reader:
            if not row:
                # Blank lines are parsed as empty rows; there is no poem to read.
                continue
            poem = row[0]
            dataset = poem_dataset_lookup[poem]
            scores = {"poem": poem, "dataset": dataset}
            for idx, score in enumerate(row[1:], 1):
                cat = method + "_" + header[idx]
                scores[cat] = score
            poem_scores[poem] = scores
    return poem_scores, header
                
# One {poem: scores} dictionary per scoring method, in the order listed here;
# the header returned alongside by get_scores is not needed.
method_scores = [
    get_scores(method_name)[0]
    for method_name in ('bertranker', 'crowdgppl', 'bws', 'gppl')
]


# Merge all dictionaries into one on poem basis

In [12]:
# Consolidate the per-method score dictionaries into one dictionary per poem.
# The poems of the first method define which poems are merged; every other
# method must contain the same poems (a missing poem raises KeyError).
cons_dicts = {}
base_scores = method_scores[0]
remaining_methods = method_scores[1:]
for poem, merged in base_scores.items():
    for scores_by_poem in remaining_methods:
        # Keys of later methods are added after (and override) earlier ones
        merged = {**merged, **scores_by_poem[poem]}
    cons_dicts[poem] = merged

print(len(cons_dicts))

519


# Write all scores into one large file

In [15]:
import csv 
import math

# Get any poem (in this case the first) to retrieve the column names
first_key = list(cons_dicts.keys())[0]
# Retrieve all field names stored under that poem entry; they define the
# columns (and their order) of the output file
fieldnames = list(cons_dicts[first_key].keys())

# newline="" prevents the csv writer from emitting extra blank lines on
# platforms that translate "\n" (see the csv module documentation)
with open("scores/consolidated_single_poems.csv", "w+", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(fieldnames)
    # Iterate over poems in the merged dictionary
    for p in cons_dicts:
        line = []
        # Collect all data for that poem
        for col in fieldnames:
            try:
                value = cons_dicts[p][col]
            except KeyError:
                # Report the missing column/poem and write an empty cell so
                # the remaining values stay aligned with the header
                print(col)
                print(p)
                value = ""
            line.append(value)
        writer.writerow(line)


# Normalize the previously collected scores

In [8]:
import csv
import math 
# Min-max normalize every score column of the consolidated file to [0, 1].
# The first two columns of each row are copied unchanged; all remaining
# columns are parsed as floats.
lines = []
with open("scores/consolidated_single_poems.csv", newline="") as f:
    csv_reader = csv.reader(f)
    header = next(csv_reader)
    # Initialize mins and maxs for each score column; the count is taken from
    # the header instead of being hard-coded
    num_score_columns = max(len(header) - 2, 0)
    mins = [math.inf] * num_score_columns
    maxs = [-math.inf] * num_score_columns
    for row in csv_reader:
        if not row:
            # Blank lines are parsed as empty rows and carry no scores
            continue
        lines.append(row)
        for i, num in enumerate(row[2:]):
            value = float(num)
            mins[i] = min(mins[i], value)
            maxs[i] = max(maxs[i], value)


normalized_lines = []
for line in lines:
    norm_line = line[:2]
    for i, num in enumerate(line[2:]):
        value_range = maxs[i] - mins[i]
        if value_range == 0:
            # Constant column: every value equals the minimum, so map it to
            # 0.0 instead of dividing by zero
            norm_val = 0.0
        else:
            norm_val = (float(num) - mins[i]) / value_range
        norm_line.append(norm_val)
    normalized_lines.append(norm_line)

# newline="" prevents the csv writer from emitting extra blank lines on
# platforms that translate "\n" (see the csv module documentation)
with open("scores/normalized_scores.csv", "w+", newline="") as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(header)
    csv_writer.writerows(normalized_lines)
    

[-0.2329796403646469, -0.25375258922576904, -0.2615964710712433, -0.2625853419303894, -0.25882112979888916, -0.26394417881965637, -0.25420114398002625, -0.2670304775238037, -0.260204017162323, -0.25886502861976624, -0.25799575448036194, -4.87951561733533, -4.87951561733533, -3.0437596331222236, -4.467384979067327, -5.503429999828406, -3.8889251577550006, -4.920130068629585, -5.725828063649544, -4.214829459834426, -3.484094375177864, -3.954453944379249, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492, -1.0623027755847492]
[-0.23271417617797852, -0.23600099980831146, -0.24809706211090088, -0.24998334050178528, -0.24576006829738617, -0.25107160210609436, -0.2386636883020401, -0.25433170795440674, -0.24786609411239624, -0.24846740067005157, -0.24420931935310364, 4.06

# Dataset samples

In [None]:
# Print up to ten example poems for every dataset found in the lookup
dataset_lookup = get_poem_dataset_lookup()
datasets = set(dataset_lookup.values())

dataset_samples = {}
for ds_name in datasets:
    print(ds_name)
    # Take the first 10 poems (in lookup order) that belong to this dataset
    picked = []
    for text, origin in dataset_lookup.items():
        if len(picked) >= 10:
            break
        if origin != ds_name:
            continue
        # "<br>" markers in the stored poem become real line breaks
        picked.append(text.replace("<br>", "\n"))
    dataset_samples[ds_name] = picked

for ds_name, poems in dataset_samples.items():
    print(f"~~~~~~~~~~~ {ds_name} ~~~~~~~~~~~")
    for text in poems:
        print("=========", text, "=========", sep="\n")
    print("\n\n")