# Evaluating on the Grand et al. data

All models are evaluated on all data except for the development set used in `grand_hyper`.

# Hyperparameter values

In [1]:
# Fixed hyperparameter values, passed to the evaluation cells below via the `method` dictionaries.
hyper_offset = 1.0  # "offset" for the fitted_seedwords and combined methods
hyper_jitter = False  # "jitter" for the fitted_seedwords and combined methods
hyper_average = True  # "do_average" for the fitted_seeddims and combined methods
hyper_alpha1 = 0.02  # "alpha" for the fitted_seeddims method
hyper_alpha2 = 0.05  # "alpha" for the combined method

In [2]:
numfolds = 5  # number of crossvalidation folds, passed as method["numfolds"]
num_randseeds = 3  # how many random seeds are drawn for each fitted method

In [3]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt

# Reading in the data.

## GloVe

In [4]:
# Zipped GloVe embeddings and the name of the text member inside the archive.
glove_path = "../glove/glove.42B.300d.zip"
glove_file = "glove.42B.300d.txt"

# Passed on as method["feature_dim"] by the fitted methods below.
feature_dim = 300

# Maps each token to its float32 embedding vector.
word_vectors = { }

with zipfile.ZipFile(glove_path) as azip, azip.open(glove_file) as f:
    for raw_line in f:
        # Lines are read as bytes: first field is the token, the rest are coefficients.
        fields = raw_line.split()
        word_vectors[fields[0].decode()] = np.array(fields[1:], dtype=np.float32)
print('glove vectors loaded')

glove vectors loaded


## Grand features

In [6]:
# Directory prefix for the rating CSV files; the seed-word spreadsheet lives in the same directory.
grandratings_dir = "../data/Grandetal-data/"
grandfeatures_path = grandratings_dir + "features.xlsx"
grandfeatures_df = pd.read_excel(grandfeatures_path)

# Directory prefix for the per-dataset frequency files read inside crossvalidation().
grand_freq_path = "../frequency_baseline/Grandetal/unsorted/"

  warn(msg)


## Function for reading a specific Grand dataset

In [7]:
# reading in Grand data
def read_grand_data(filename, grandratings_dir, grandfeatures_df):
    """Load one rating file and the seed words for its feature.

    Parameters
    ----------
    filename : str
        File name of the form ``<category>_<feature>.csv`` (the last four
        characters are stripped and the rest is split on ``_``).
    grandratings_dir : str
        Directory prefix; it is concatenated directly with ``filename``.
    grandfeatures_df : pandas.DataFrame
        Table with a ``Dimension`` column. In the row whose ``Dimension``
        equals the feature, the remaining columns hold the seed words: the
        first three are taken as positive, the rest as negative.

    Returns
    -------
    tuple
        ``(grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)``
        where ``df`` is the rating table with two added columns:
        ``Average`` (row sum of the rater columns divided by the number of
        rater columns) and ``Gold`` (z-score of ``Average``).
    """
    # extract category and feature
    grandcategory, grandfeature = filename[:-4].split("_")

    # read human ratings; the first column is the word, all others are raters
    df = pd.read_csv(grandratings_dir + filename)
    nspeakers = len(df.columns) - 1

    # Sum over exactly the rater columns. The slice end is computed before the
    # new column is added, so "Average" itself is never part of the sum, and
    # files with more than 25 raters are no longer truncated.
    df["Average"] = df.iloc[:, 1:1 + nspeakers].sum(axis=1) / nspeakers

    # z-scores of average ratings
    df["Gold"] = (df["Average"] - df["Average"].mean()) / df["Average"].std()

    # obtain seed words from the features table
    relevant_row = grandfeatures_df[grandfeatures_df.Dimension == grandfeature]
    seedwords = relevant_row.iloc[:, 1:].values.flatten().tolist()
    pos_seedwords = seedwords[:3]
    neg_seedwords = seedwords[3:]

    return (grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)

# Function for running crossvalidation

In [8]:
import eval_dim
import compute_dim
import statistics

# Two-word names in the frequency files, grouped by the separator used to join them.
# NOTE(review): presumably the joined form mirrors the spelling of the rated words — confirm.
_FREQ_UNDERSCORE_NAMES = frozenset(['north dakota'])
_FREQ_HYPHEN_NAMES = frozenset([
    'rhode island', 'south carolina', 'west virginia', 'south dakota',
    'north carolina', 'los angeles', 'new york', 'hong kong', 'new hampshire',
    'new jersey', 'san francisco', 'new mexico', 'ping pong',
])


def _normalize_frequency_word(key):
    # Join the first two tokens of a listed two-word name; return any other word unchanged.
    stripped = key.strip()
    if stripped in _FREQ_UNDERSCORE_NAMES:
        return '_'.join(key.split()[:2])
    if stripped in _FREQ_HYPHEN_NAMES:
        return '-'.join(key.split()[:2])
    return key


def _load_frequency_table(grandcategory, grandfeature):
    # Read the tab-separated frequency file for one category/feature pair and add log frequencies.
    # Reads the module-level `grand_freq_path` as the directory prefix.
    freq_file = grand_freq_path + 'freq_ranking.unsorted.' + grandcategory + '_' + grandfeature
    frequency_df = pd.read_csv(freq_file, sep = "\t", header = None)
    frequency_df.columns = ["word", "frequency"]
    frequency_df["log_frequency"] = np.log(frequency_df["frequency"])
    # Store the normalised names back into the table; the previous replace()
    # call discarded its result and therefore changed nothing.
    frequency_df["word"] = frequency_df["word"].map(_normalize_frequency_word)
    return frequency_df


def crossvalidation(filenames, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = 123, verbose = False):
    """Run k-fold crossvalidation of one method over a list of rating files.

    Example call:
    crossvalidation(traintestset, method, word_vectors, grandratings_dir, grandfeatures_df)

    Parameters
    ----------
    filenames : iterable of str
        Rating files, each read with ``read_grand_data``.
    method : dict
        Must contain ``"method"`` (one of ``"seedbased"``, ``"fitted"``,
        ``"fitted_seedwords"``, ``"fitted_seeddims"``, ``"combined"``,
        ``"frequency"``, ``"random"``) and ``"numfolds"``. The fitted methods
        additionally read ``"feature_dim"`` and, depending on the method,
        ``"offset"``, ``"jitter"``, ``"do_average"`` and ``"alpha"``.
    word_vectors : mapping
        Word -> vector; every rated word is looked up here.
    grandratings_dir, grandfeatures_df
        Passed through to ``read_grand_data``.
    random_seed : int
        Forwarded to the fitting functions in ``compute_dim``.
    verbose : bool
        If true, print overall OC_p / MSE statistics at the end.

    Returns
    -------
    list of dict
        One entry per file and fold with keys ``"ocp"``, ``"mse"``,
        ``"feature"`` and ``"category"``.

    Raises
    ------
    ValueError
        If ``method["method"]`` is not one of the names listed above.
    """
    all_evals = [ ]

    # NOTE(review): the fold assignment uses a fixed seed and does not depend
    # on `random_seed`, so every call draws the same sequence of folds.
    rng = np.random.default_rng(seed = 3)

    for filename in filenames:
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(
            filename, grandratings_dir, grandfeatures_df)

        category_feature_ocp = []

        # word vectors for every rated word of this dataset (df column "Row" holds the word)
        all_thisdata_vectors = [word_vectors[w] for w in df["Row"]]

        # Only the frequency baseline uses the frequency file, so only read it then.
        frequency_df = None
        if method["method"] == "frequency":
            frequency_df = _load_frequency_table(grandcategory, grandfeature)

        # crossvalidation setup: assign each datapoint to a fold
        fold = rng.integers(low = 0, high = method["numfolds"], size = len(df))

        # iterate over folds, evaluate for each of them
        for testfold in range(method["numfolds"]):
            print('testfold : ', testfold)
            test_indices = [i for i, f in enumerate(fold) if f == testfold]
            train_indices = [i for i, f in enumerate(fold) if f != testfold]

            gold_test = df["Gold"].iloc[test_indices].tolist()
            gold_train = df["Gold"].iloc[train_indices].tolist()
            words_train = df["Row"].iloc[train_indices].tolist()
            vec_train = [word_vectors[w] for w in words_train]

            # compute the dimension for this method, and its predictions for all words
            if method["method"] == "seedbased":
                dimension = compute_dim.dimension_seedbased(pos_seedwords, neg_seedwords, word_vectors)
                df["Pred"] = compute_dim.predict_coord_fromtrain(vec_train, gold_train, dimension, all_thisdata_vectors)

            elif method["method"] == "fitted":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings(
                    vec_train, gold_train,
                    method["feature_dim"],
                    random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            elif method["method"] == "fitted_seedwords":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seedwords(
                    vec_train, gold_train,
                    method["feature_dim"],
                    pos_seedwords, neg_seedwords, word_vectors,
                    offset = method["offset"], jitter = method["jitter"],
                    random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            elif method["method"] == "fitted_seeddims":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seeddims(
                    vec_train, gold_train,
                    method["feature_dim"],
                    pos_seedwords, neg_seedwords, word_vectors,
                    do_average = method["do_average"],
                    alpha = method["alpha"],
                    random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            elif method["method"] == "combined":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_combined(
                    vec_train, gold_train,
                    method["feature_dim"],
                    pos_seedwords, neg_seedwords, word_vectors,
                    offset = method["offset"], jitter = method["jitter"],
                    do_average = method["do_average"],
                    alpha = method["alpha"],
                    random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            elif method["method"] == "frequency":
                # NOTE(review): frequencies are matched to rated words by row
                # position (index alignment), not by the "word" column —
                # confirm that the frequency files follow the rating files' order.
                df["Pred"] = frequency_df["log_frequency"]

                pred_train = df["Pred"].iloc[train_indices].tolist()
                weight, bias = compute_dim.fit_dimension_coef(gold_train, pred_train)
                df["Pred"] = [(v - bias) / weight for v in df["Pred"]]

            elif method["method"] == "random":
                # Re-seeding on every fold makes the random predictions identical
                # across folds; note that this also resets NumPy's global RNG state.
                np.random.seed(8)
                df["Pred"] = np.random.uniform(-3, 3, size=len(df))

            else:
                raise ValueError(f"unknown method: {method['method']!r}")

            # NOTE(review): this rescaling is fitted on every row of df, i.e.
            # on training and test ratings alike — confirm this is intended.
            weight, bias = compute_dim.fit_dimension_coef(df["Gold"], df["Pred"])
            df["Pred"] = [(v - bias) / weight for v in df["Pred"]]

            # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
            # MSE: evaluate on test only
            ocp = eval_dim.pairwise_order_consistency_wrt(df["Gold"], df["Pred"], test_indices)
            pred_test = df["Pred"].iloc[test_indices].tolist()

            e = { "ocp" : ocp,
                  "mse" : eval_dim.mean_squared_error(gold_test, pred_test),
                  "feature" : grandfeature,
                  "category" : grandcategory}

            all_evals.append(e)
            category_feature_ocp.append(ocp)

        avg_cat_feat = sum(category_feature_ocp) / len(category_feature_ocp)
        print(grandcategory, grandfeature, ' ocp: ', f"{avg_cat_feat:.3f}", f"{statistics.mean(category_feature_ocp):.3f}")

    if verbose:
        ocps = [e["ocp"] for e in all_evals if e["ocp"] is not None]
        mses = [e["mse"] for e in all_evals if e["mse"] is not None]

        print("\n\nOverall", method["method"], f"OC_p {statistics.mean(ocps):.2f} ({statistics.stdev(ocps):.2f})", f"MSE mean {statistics.mean(mses):.2f} ({statistics.stdev(mses):.2f}) median {statistics.median(mses):.3f}")

    return all_evals

# Functions for aggregating results

Unlike the aggregation used for hyperparameter tuning, this one does not compare different parameter settings: it summarizes the results for a single setting.

In [9]:
from collections import defaultdict
import statistics

def eval_aggregate_by(evals, keylabels):
    """Group result dictionaries by the values stored under `keylabels`.

    Each group key is the tuple of stringified values of the given labels;
    the returned defaultdict maps that tuple to the list of matching result
    dictionaries, in input order.
    """
    grouped = defaultdict(list)

    for entry in evals:
        group_key = tuple(str(entry[label]) for label in keylabels)
        grouped[group_key].append(entry)

    return grouped


def eval_summary_by(evals, keylabel):
    """Return (mean, median, standard deviation) of the values under `keylabel`.

    Entries whose value is None are left out of all three statistics.
    """
    observed = []
    for entry in evals:
        value = entry[keylabel]
        if value is not None:
            observed.append(value)

    center = statistics.mean(observed)
    middle = statistics.median(observed)
    spread = statistics.stdev(observed)
    return (center, middle, spread)

def eval_eval(results):
    """Summarize crossvalidation results across category/feature pairs.

    The result dictionaries are grouped by category and feature. For every
    group we take the mean OC_p, the mean MSE and the median MSE; the function
    then returns the mean and standard deviation of each of these three
    per-group values, in the order

        (ocp mean, ocp sd, MSE-mean mean, MSE-mean sd, MSE-median mean, MSE-median sd).
    """
    # aggregate by condition = by category and feature
    per_condition = eval_aggregate_by(results, ["category", "feature"])

    ocps = []
    msemeans = []
    msemeds = []
    for cond_results in per_condition.values():
        # index 0 of a summary is the mean, index 1 the median
        ocps.append(eval_summary_by(cond_results, "ocp")[0])
        mse_summary = eval_summary_by(cond_results, "mse")
        msemeans.append(mse_summary[0])
        msemeds.append(mse_summary[1])

    # mean and standard deviation over the per-condition values
    return (statistics.mean(ocps), statistics.stdev(ocps),
            statistics.mean(msemeans), statistics.stdev(msemeans),
            statistics.mean(msemeds), statistics.stdev(msemeds))



# The data that is not in the development set

We set aside 6 category/feature pairs for development. We use the rest for testing through crossvalidation.

In [10]:
# All rating files in the Grand data directory.
# NOTE(review): os.listdir returns entries in arbitrary order, so the seeded
# sample below only reproduces the same development set if the listing order
# is the same. Sorting would make it deterministic, but would have to be
# mirrored wherever the development set is chosen (grand_hyper) — confirm
# before changing.
filenames = [f for f in os.listdir(grandratings_dir) if f.endswith("csv")]

import random
random.seed(789)
# hold out 6 files for development; the remaining files are evaluated by crossvalidation
devset = random.sample(filenames, 6)
traintestset = [f for f in filenames if f not in devset]
# [ filename[:-4].split("_") for filename in traintestset]
# [ filename[:-4].split("_") for filename in traintestset]

# Running the actual evaluation

## Frequency baseline

In [13]:
# Frequency baseline, run once (no random seeds involved).
method = dict(method="frequency", numfolds=numfolds)

results = crossvalidation(traintestset, method, word_vectors, grandratings_dir, grandfeatures_df)

ocp_mean, ocp_sd, msemean_mean, msemean_sd, msemed_mean, msemed_sd = eval_eval(results)

print(" ".join(["Frequency baseline:",
                f"OC_P mean {ocp_mean:.2f}",
                f"MSE mean {msemean_mean:.3f}",
                f"MSE median {msemed_mean:.2f}"]))
    

Frequency baseline: OC_P mean 0.58 MSE mean 134.866 MSE median 126.16


## Random baseline

In [14]:
# Random baseline, run once.
method = dict(method="random", numfolds=numfolds)

results = crossvalidation(traintestset, method, word_vectors, grandratings_dir, grandfeatures_df)

ocp_mean, ocp_sd, msemean_mean, msemean_sd, msemed_mean, msemed_sd = eval_eval(results)

print(" ".join(["Random baseline:",
                f"OC_P mean {ocp_mean:.2f}",
                f"MSE mean {msemean_mean:.3f}",
                f"MSE median {msemed_mean:.2f}"]))

Random baseline: OC_P mean 0.54 MSE mean 3151.139 MSE median 2736.24


## Seed-based dimensions

In [15]:
# Seed-based dimension, run once.
method = dict(method="seedbased", numfolds=numfolds)

results = crossvalidation(traintestset, method, word_vectors, grandratings_dir, grandfeatures_df)

In [16]:
# Summarize over category/feature pairs: mean (sd) of OC_p, of per-pair mean MSE and of per-pair median MSE.
ocp_mean, ocp_sd, msemean_mean, msemean_sd, msemed_mean, msemed_sd = eval_eval(results)

print(" ".join(["Seed-based method:",
                f"OC_P {ocp_mean:.3f} ({ocp_sd:.2f})",
                f"MSE mean {msemean_mean:.3f} ({msemean_sd:.2f})",
                f"MSE med {msemed_mean:.3f} ({msemed_sd:.2f})"]))
 

Seed-based method: OC_P 0.645 (0.10) MSE mean 45405.890 (316311.53) MSE med 50325.328 (352349.80)


## Fitted dimensions

In [17]:
# Fitted dimension: repeat the crossvalidation once per random seed and pool the results.
method = dict(method="fitted", numfolds=numfolds, feature_dim=feature_dim)

random.seed(5)
randoms = [random.randrange(0, 100) for _ in range(num_randseeds)]

results = [ ]
for rval in randoms:
    theseresults = crossvalidation(traintestset, method, word_vectors, grandratings_dir,
                                   grandfeatures_df, random_seed = rval)
    results.extend(theseresults)
    

In [18]:
# Summarize over category/feature pairs: mean (sd) of OC_p, of per-pair mean MSE and of per-pair median MSE.
ocp_mean, ocp_sd, msemean_mean, msemean_sd, msemed_mean, msemed_sd = eval_eval(results)

print(" ".join(["Fitted method:",
                f"OC_P {ocp_mean:.3f} ({ocp_sd:.2f})",
                f"MSE mean {msemean_mean:.3f} ({msemean_sd:.2f})",
                f"MSE med {msemed_mean:.3f} ({msemed_sd:.2f})"]))


Fitted method: OC_P 0.566 (0.03) MSE mean 78127.085 (512859.87) MSE med 111.436 (121.75)


## Fitted dimensions with seed words

In [11]:
# Fitted dimension with seed words: repeat the crossvalidation once per random seed and pool the results.
method = dict(method="fitted_seedwords",
              numfolds=numfolds,
              offset=hyper_offset,
              jitter=hyper_jitter,
              feature_dim=feature_dim)

random.seed(5)
randoms = [random.randrange(0, 100) for _ in range(num_randseeds)]

results = [ ]
for rval in randoms:
    theseresults = crossvalidation(traintestset, method, word_vectors, grandratings_dir,
                                   grandfeatures_df, random_seed = rval)
    results.extend(theseresults)


testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
cities temperature  ocp:  0.588 0.588
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
professions intelligence  ocp:  0.545 0.545
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
clothing location  ocp:  0.538 0.538
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
cities arousal  ocp:  0.569 0.569
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
clothing arousal  ocp:  0.605 0.605
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
states size  ocp:  0.621 0.621
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
sports intelligence  ocp:  0.562 0.562
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
clothing wealth  ocp:  0.526 0.526
testfold :  0
testfold :  1
testfold :  2
testfold :  3
testfold :  4
weather danger  ocp:  0.552 0.552
testfold :  0
testfold :  1
testfold :  2


In [12]:
# Summarize over category/feature pairs and report the hyperparameters that were used.
ocp_mean, ocp_sd, msemean_mean, msemean_sd, msemed_mean, msemed_sd = eval_eval(results)

print(" ".join(["Fitted, with seed words,",
                "offset", str(method["offset"]), "jitter", str(method["jitter"])]))
print(" ".join([f"OC_P  {ocp_mean:.3f} ({ocp_sd:.2f})",
                f"MSE mean {msemean_mean:.3f} ({msemean_sd:.2f})",
                f"MSE med {msemed_mean:.3f} ({msemed_sd:.2f})"]))


Fitted, with seed words, offset 1.0 jitter False
OC_P  0.562 (0.02) MSE mean 17284.245 (44588.95) MSE med 150.583 (132.42)


## Fitted dimensions with seed dimensions

In [None]:
# Fitted dimension with seed dimensions: repeat the crossvalidation once per random seed and pool the results.
method = dict(method="fitted_seeddims",
              numfolds=numfolds,
              alpha=hyper_alpha1,
              do_average=hyper_average,
              feature_dim=feature_dim)

random.seed(5)
randoms = [random.randrange(0, 100) for _ in range(num_randseeds)]

results = [ ]
for rval in randoms:
    theseresults = crossvalidation(traintestset, method, word_vectors, grandratings_dir,
                                   grandfeatures_df, random_seed = rval)
    results.extend(theseresults)

    
    

In [None]:
# Summarize over category/feature pairs and report the hyperparameters that were used.
ocp_mean, ocp_sd, msemean_mean, msemean_sd, msemed_mean, msemed_sd = eval_eval(results)

print(" ".join(["Fitted, with seed dim.s,",
                "alpha", str(method["alpha"]), "avg", str(method["do_average"])]))
print(" ".join([f"OC_P  {ocp_mean:.3f} ({ocp_sd:.2f})",
                f"MSE mean {msemean_mean:.3f} ({msemean_sd:.2f})",
                f"MSE med {msemed_mean:.3f} ({msemed_sd:.2f})"]))


## Fitted dimensions with seeds as words and dimensions

In [None]:
# Combined method (seed words and seed dimensions): repeat the crossvalidation
# once per random seed and pool the results.
method = {"method": "combined",
          "numfolds" : numfolds,
          "alpha" : hyper_alpha2,
          "do_average" : hyper_average,
          "offset" : hyper_offset,
          "jitter" : hyper_jitter,
          "feature_dim" : feature_dim}


random.seed(5)
# Use the shared seed count instead of a hard-coded 3, as the other fitted
# methods do, so that changing num_randseeds affects every method alike.
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]
results = [ ]

for rval in randoms:
    theseresults = crossvalidation(traintestset, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = rval)
    results += theseresults

In [None]:
# Summarize over category/feature pairs and report the hyperparameters that were used.
ocp_mean, ocp_sd, msemean_mean, msemean_sd, msemed_mean, msemed_sd = eval_eval(results)

print(" ".join(["Fitted, with seed words and dim.s,",
                "alpha", str(method["alpha"]), "avg", str(method["do_average"]),
                "offset", str(method["offset"]), "jitter", str(method["jitter"])]))
print(" ".join([f"OC_P  {ocp_mean:.3f} ({ocp_sd:.2f})",
                f"MSE mean {msemean_mean:.3f} ({msemean_sd:.2f})",
                f"MSE med {msemed_mean:.3f} ({msemed_sd:.2f})"]))
