# Evaluating on the Grand et al. data

All models are evaluated on all datasets except for the development set used in `grand_hyper`.

# Reading in the data.

## GloVe

In [1]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt

In [2]:
# Zip archive with the GloVe embeddings, and the name of the text file inside it.
glove_path = "glove/glove.42B.300d.zip"
glove_file = "glove.42B.300d.txt"

# Dimensionality of the embeddings; handed to the fitted methods in later cells.
feature_dim = 300

# Maps each word (str) to its embedding (float32 numpy array).
word_vectors = dict()

# Read the embeddings line by line directly from the archive.
# Each line holds the word followed by its whitespace-separated coordinates.
with zipfile.ZipFile(glove_path) as archive, archive.open(glove_file) as embedding_lines:
    for raw_line in embedding_lines:
        fields = raw_line.split()
        # lines are read as bytes, so the word has to be decoded
        key = fields[0].decode()
        word_vectors[key] = np.array(fields[1:], dtype=np.float32)

## Grand features

In [3]:
# Directory with the human rating files, one "<category>_<feature>.csv" per dataset.
grandratings_dir = "Grand_etal_csv/"

# Excel sheet with the seed words for each feature dimension.
# The location can be overridden through the GRAND_FEATURES_PATH environment
# variable, so the notebook also runs on machines other than the original author's;
# without the variable, the original path is used.
grandfeatures_path = os.environ.get(
    "GRAND_FEATURES_PATH",
    "/Users/kee252/Data/grand_directions_in_space/features.xlsx")

grandfeatures_df = pd.read_excel(grandfeatures_path)

  warn(msg)


## Function for reading a specific Grand dataset

In [4]:
# reading in Grand data
def read_grand_data(filename, grandratings_dir, grandfeatures_df):
    """Read one Grand et al. rating file together with the seed words of its feature.

    Parameters
    ----------
    filename : str
        Name of the rating file, of the form "<category>_<feature>.csv".
    grandratings_dir : str
        Directory prefix that is concatenated with `filename` as-is,
        so it has to end in a path separator.
    grandfeatures_df : pandas.DataFrame
        Seed word table. The row whose "Dimension" entry equals the feature is used;
        its remaining columns hold the seed words, the first three positive,
        the rest negative.

    Returns
    -------
    tuple
        (category, feature, pos_seedwords, neg_seedwords, df), where df is the
        rating table with two added columns: "Average" (mean rating over all
        speaker columns) and "Gold" (z-score of "Average").
    """
    # extract category and feature; the last 4 characters are the ".csv" extension
    grandcategory, grandfeature = filename[:-4].split("_")

    # read human ratings, make gold column
    df = pd.read_csv(grandratings_dir + filename)
    # every column but the first holds the ratings of one speaker
    nspeakers = len(df.columns) - 1
    # Average over ALL speaker columns. A fixed slice of 25 columns would
    # underestimate the average whenever a file has more than 25 speakers.
    df["Average"] = df.iloc[:, 1:].sum(axis=1) / nspeakers
    # z-scores of average ratings
    df["Gold"] = (df["Average"] - df["Average"].mean()) / df["Average"].std()

    # obtain seed words from excel file
    relevant_row = grandfeatures_df[grandfeatures_df.Dimension == grandfeature]
    seedwords = relevant_row.iloc[:, 1:].values.flatten().tolist()
    pos_seedwords = seedwords[:3]
    neg_seedwords = seedwords[3:]

    return (grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)

# Function for running crossvalidation

In [5]:
import eval_dim
import compute_dim
import statistics

def crossvalidation(filenames, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = 123, verbose = False):
    """Cross-validate one method on each of the given Grand et al. datasets.

    For every file, the items are randomly assigned to method["numfolds"] folds.
    For each fold, a dimension is computed with the chosen method from the
    training items and/or the seed words, predictions are made for all items
    of the dataset, and the test fold is scored.

    Parameters
    ----------
    filenames : iterable of str
        Rating file names, each passed to read_grand_data.
    method : dict
        "method" selects the approach: "seedbased", "fitted", "fitted_seedwords",
        "fitted_seeddims" or "combined". "numfolds" is the number of folds.
        The fitted approaches also read "feature_dim"; "fitted_seedwords" and
        "combined" read "offset" and "jitter"; "fitted_seeddims" and "combined"
        read "do_average" and "alpha".
    word_vectors : mapping
        Word to vector; must contain every word in the "Row" column of each dataset.
    grandratings_dir, grandfeatures_df
        Passed through to read_grand_data.
    random_seed : int
        Seeds the assignment of items to folds, and is handed on to the
        fitting functions in compute_dim.
    verbose : bool
        If true, print mean and standard deviation of OC_p and mean,
        standard deviation and median of MSE over all results.

    Returns
    -------
    list of dict
        One entry per (dataset, fold) with keys "ocp", "mse", "feature", "category".

    Raises
    ------
    Exception
        If method["method"] is not one of the names listed above.
    """
    # One seeded generator for the whole call: the fold assignment used to come
    # from the unseeded global numpy state, which made results differ between runs
    # even for the same random_seed.
    rng = np.random.RandomState(random_seed)

    all_evals = [ ]

    for filename in filenames:
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename,
                                                                                        grandratings_dir,
                                                                                        grandfeatures_df)

        # word vectors and gold values for this dataset, in row order:
        # the "Row" column holds the word, "Gold" the z-scored average rating
        all_thisdata_vectors = [word_vectors[w] for w in df["Row"]]
        all_thisdata_gold = df["Gold"].tolist()

        # crossvalidation setup: assign each datapoint to a fold
        fold = rng.randint(method["numfolds"], size = len(all_thisdata_gold))

        # iterate over folds, evaluate for each of them
        for testfold in range(method["numfolds"]):
            # positions of the test and training items for this fold
            test_indices = np.flatnonzero(fold == testfold).tolist()
            train_indices = np.flatnonzero(fold != testfold).tolist()

            gold_test = [all_thisdata_gold[i] for i in test_indices]
            gold_train = [all_thisdata_gold[i] for i in train_indices]
            vec_train = [all_thisdata_vectors[i] for i in train_indices]

            # compute seed-based dimension, and its predictions
            if method["method"] == "seedbased":
                dimension = compute_dim.dimension_seedbased(pos_seedwords, neg_seedwords, word_vectors)
                df["Pred"] = compute_dim.predict_coord_fromtrain(vec_train, gold_train, dimension, all_thisdata_vectors)

            elif method["method"] == "fitted":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings(vec_train, gold_train,
                                                                                   method["feature_dim"],
                                                                                   random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            elif method["method"] == "fitted_seedwords":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seedwords(vec_train, gold_train,
                                                                method["feature_dim"],
                                                                pos_seedwords, neg_seedwords, word_vectors,
                                                                offset = method["offset"], jitter = method["jitter"],
                                                                random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            elif method["method"] == "fitted_seeddims":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seeddims(vec_train, gold_train,
                                                                method["feature_dim"],
                                                                pos_seedwords, neg_seedwords, word_vectors,
                                                                do_average = method["do_average"],
                                                                alpha = method["alpha"],
                                                                random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            elif method["method"] == "combined":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_combined(vec_train, gold_train,
                                                                method["feature_dim"],
                                                                pos_seedwords, neg_seedwords, word_vectors,
                                                                offset = method["offset"], jitter = method["jitter"],
                                                                do_average = method["do_average"],
                                                                alpha = method["alpha"],
                                                                random_seed = random_seed)
                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            else:
                raise Exception("shouldn't be here")

            # predictions for the test items only, selected by position
            pred_test = df["Pred"].iloc[test_indices].tolist()

            # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
            # MSE: evaluate on test only
            e = { "ocp" : eval_dim.pairwise_order_consistency_wrt(df["Gold"], df["Pred"], test_indices),
                  "mse" : eval_dim.mean_squared_error(gold_test, pred_test),
                  "feature" : grandfeature,
                  "category" : grandcategory}

            all_evals.append(e)

    if verbose:
        # results whose score is None are left out of the summary
        ocps = [e["ocp"] for e in all_evals if e["ocp"] is not None]
        mses = [e["mse"] for e in all_evals if e["mse"] is not None]

        print("\n\nOverall", method["method"], 
              f"OC_p {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})", 
              f"MSE mean {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f}) median {statistics.median(mses):.3f}")

    return all_evals


# Functions for aggregating results

Same as for hyperparameter optimization.

In [6]:
from collections import defaultdict
import statistics

# given a list of results dictionaries, 
# group them by the given dictionary keys
def eval_aggregate_by(evals, keylabels):
    """Group result dictionaries by their values for the given keys.

    evals: list of result dictionaries.
    keylabels: keys whose values, converted to strings, form the group label.

    Returns a defaultdict mapping each tuple of stringified values to the
    list of result dictionaries that share it, in their original order.
    """
    grouped = defaultdict(list)

    for result in evals:
        group_label = tuple(str(result[label]) for label in keylabels)
        grouped[group_label].append(result)

    return grouped


# given a list of results dictionaries,
# compute mean, median and standard deviation over values for a particular key
def eval_summary_by(evals, keylabel):
    """Summarize the values stored under one key across result dictionaries.

    evals: list of result dictionaries.
    keylabel: key whose values are summarized; entries that are None are skipped.

    Returns the tuple (mean, median, standard deviation) of the remaining values.
    """
    observed = []
    for result in evals:
        value = result[keylabel]
        if value is not None:
            observed.append(value)

    average = statistics.mean(observed)
    middle = statistics.median(observed)
    spread = statistics.stdev(observed)
    return (average, middle, spread)

# The data that is not in the development set

We set aside 6 category/feature pairs for development. We use the rest for testing through crossvalidation.

In [7]:
# All rating files in the Grand et al. directory.
# NOTE(review): os.listdir returns entries in arbitrary order, so the seeded sample
# below selects the same development set only if the listing order is the same --
# confirm that the split matches the one used for hyperparameter optimization.
filenames = [f for f in os.listdir(grandratings_dir) if f.endswith("csv")]

import random
# fixed seed for drawing the development set
random.seed(789)
# 6 datasets are set aside for development
devset = random.sample(filenames, 6)
# all remaining datasets are used for testing
traintestset = [f for f in filenames if f not in devset]
# show the [category, feature] pairs of the test datasets
[ filename[:-4].split("_") for filename in traintestset]

[['cities', 'temperature'],
 ['professions', 'intelligence'],
 ['clothing', 'location'],
 ['cities', 'arousal'],
 ['clothing', 'arousal'],
 ['states', 'size'],
 ['sports', 'intelligence'],
 ['clothing', 'wealth'],
 ['weather', 'danger'],
 ['professions', 'danger'],
 ['clothing', 'size'],
 ['animals', 'size'],
 ['sports', 'wealth'],
 ['professions', 'valence'],
 ['names', 'wealth'],
 ['cities', 'cost'],
 ['cities', 'wealth'],
 ['professions', 'gender'],
 ['states', 'religiosity'],
 ['clothing', 'age'],
 ['weather', 'wetness'],
 ['professions', 'wealth'],
 ['myth', 'valence'],
 ['clothing', 'cost'],
 ['professions', 'age'],
 ['myth', 'size'],
 ['sports', 'danger'],
 ['names', 'gender'],
 ['sports', 'gender'],
 ['professions', 'location'],
 ['sports', 'speed'],
 ['states', 'temperature'],
 ['professions', 'arousal'],
 ['cities', 'size'],
 ['states', 'wealth'],
 ['sports', 'arousal'],
 ['clothing', 'gender'],
 ['weather', 'temperature'],
 ['cities', 'religiosity'],
 ['animals', 'intelligen

# Running the actual evaluation

## Seed-based dimensions

In [8]:
# number of crossvalidation folds, used by every method below
numfolds = 5
# number of random seeds that each fitted method is run with
num_randseeds = 3

In [9]:
# Seed-based method: only the method name and the number of folds are needed.
method = dict(method="seedbased", numfolds=numfolds)

results = crossvalidation(
    traintestset,
    method,
    word_vectors,
    grandratings_dir,
    grandfeatures_df,
)

In [10]:
# Summaries over all per-fold results: (mean, median, standard deviation).
ocp_mean, _, ocp_sd = eval_summary_by(results, "ocp")
mse_mean, mse_med, mse_sd = eval_summary_by(results, "mse")

# Report mean and standard deviation for OC_P, but the median for MSE.
print(" ".join(("Seed-based method:",
                f"OC_P mean {ocp_mean:.3f} ({ocp_sd:.2f})",
                f"MSE median {mse_med:.3f}")))
 

Seed-based method: OC_P mean 0.635 (0.12)MSE median 226.407


## Fitted dimensions

In [12]:
method = dict(method="fitted", numfolds=numfolds, feature_dim=feature_dim)

# Fixed seed, so the same model seeds are drawn on every run.
random.seed(5)
randoms = [random.randrange(0, 100) for _ in range(num_randseeds)]

# per-fold results, pooled over all seeds
results = list()

for rval in randoms:
    theseresults = crossvalidation(
        traintestset, method, word_vectors,
        grandratings_dir, grandfeatures_df,
        random_seed=rval,
    )

    # progress report for this seed: mean OC_P and median MSE
    ocp_mean, _, _ = eval_summary_by(theseresults, "ocp")
    _, mse_med, _ = eval_summary_by(theseresults, "mse")
    print(ocp_mean, mse_med)

    results.extend(theseresults)
    

0.5421174858513682 91.0626807798399
0.5219122305305846 234.07023055288454
0.5694117891829683 38.33066970437781


In [13]:
# Group the pooled results by dataset, i.e. by category/feature pair.
results_bycond = eval_aggregate_by(results, ["category", "feature"])

# Per dataset: mean OC_P and median MSE over its folds and seeds.
ocps, mses = [], []
for cond_results in results_bycond.values():
    ocps.append(eval_summary_by(cond_results, "ocp")[0])
    mses.append(eval_summary_by(cond_results, "mse")[1])

# Mean and standard deviation of the per-dataset values.
ocp_mean, ocp_sd = statistics.mean(ocps), statistics.stdev(ocps)
msemed_mean, msemed_sd = statistics.mean(mses), statistics.stdev(mses)

print("Fitted method:",
      f"OC_P mean {ocp_mean:.3f} ({ocp_sd:.2f})",
      f"MSE median {msemed_mean:.3f} ({msemed_sd:.2f})")


Fitted method: OC_P mean 0.544 (0.03)MSE median 114.021 (100.04)


## Fitted dimensions with seed words

In [15]:
method = dict(
    method="fitted_seedwords",
    numfolds=numfolds,
    offset=2.0,
    jitter=True,
    feature_dim=feature_dim,
)

# Fixed seed, so the same model seeds are drawn on every run.
random.seed(5)
randoms = [random.randrange(0, 100) for _ in range(num_randseeds)]

# per-fold results, pooled over all seeds
results = list()

for rval in randoms:
    theseresults = crossvalidation(
        traintestset, method, word_vectors,
        grandratings_dir, grandfeatures_df,
        random_seed=rval,
    )

    # progress report for this seed: mean OC_P and median MSE
    ocp_mean, _, _ = eval_summary_by(theseresults, "ocp")
    _, mse_med, _ = eval_summary_by(theseresults, "mse")
    print(ocp_mean, mse_med)

    results.extend(theseresults)


0.5311938588842575 291.70447837975917
0.5291527131751409 185.7156647626582
0.545760398712621 111.50139446948805


In [16]:
# Group the pooled results by dataset, i.e. by category/feature pair.
results_bycond = eval_aggregate_by(results, ["category", "feature"])

# Per dataset: mean OC_P and median MSE over its folds and seeds.
ocps, mses = [], []
for cond_results in results_bycond.values():
    ocps.append(eval_summary_by(cond_results, "ocp")[0])
    mses.append(eval_summary_by(cond_results, "mse")[1])

# Mean and standard deviation of the per-dataset values.
ocp_mean, ocp_sd = statistics.mean(ocps), statistics.stdev(ocps)
msemed_mean, msemed_sd = statistics.mean(mses), statistics.stdev(mses)

print("Fitted, with seed words,",
      "offset", method["offset"], "jitter", method["jitter"],
      f"OC_P mean {ocp_mean:.3f} ({ocp_sd:.2f})",
      f"MSE median {msemed_mean:.3f} ({msemed_sd:.2f})")


Fitted, with seed words, offset 2.0 jitter True OC_P mean 0.535 (0.03) MSE median 229.438 (261.97)


## Fitted dimensions with seed dimensions

In [17]:
method = dict(
    method="fitted_seeddims",
    numfolds=numfolds,
    alpha=0.02,
    do_average=True,
    feature_dim=feature_dim,
)

# Fixed seed, so the same model seeds are drawn on every run.
random.seed(5)
randoms = [random.randrange(0, 100) for _ in range(num_randseeds)]

# per-fold results, pooled over all seeds
results = list()

for rval in randoms:
    theseresults = crossvalidation(
        traintestset, method, word_vectors,
        grandratings_dir, grandfeatures_df,
        random_seed=rval,
    )

    # progress report for this seed: mean OC_P and median MSE
    ocp_mean, _, _ = eval_summary_by(theseresults, "ocp")
    _, mse_med, _ = eval_summary_by(theseresults, "mse")
    print(ocp_mean, mse_med)

    results.extend(theseresults)

    
    

0.6529228362960654 5.701850589982518
0.6541907368516412 6.940450263703243
0.6362253492701136 10.110086721618455


In [18]:
# Group the pooled results by dataset, i.e. by category/feature pair.
results_bycond = eval_aggregate_by(results, ["category", "feature"])

# Per dataset: mean OC_P and median MSE over its folds and seeds.
ocps, mses = [], []
for cond_results in results_bycond.values():
    ocps.append(eval_summary_by(cond_results, "ocp")[0])
    mses.append(eval_summary_by(cond_results, "mse")[1])

# Mean and standard deviation of the per-dataset values.
ocp_mean, ocp_sd = statistics.mean(ocps), statistics.stdev(ocps)
msemed_mean, msemed_sd = statistics.mean(mses), statistics.stdev(mses)

print("Fitted, with seed dim.s,",
      "alpha", method["alpha"], "avg", method["do_average"],
      f"OC_P mean {ocp_mean:.3f} ({ocp_sd:.2f})",
      f"MSE median {msemed_mean:.3f} ({msemed_sd:.2f})")


Fitted, with seed dim.s, alpha 0.02 avg True OC_P mean 0.648 (0.11) MSE median 64.431 (138.39)


## Fitted dimensions with seeds as words and dimensions

In [19]:
# Combined method: seed words are used both as extra words
# (offset, jitter) and as seed dimensions (alpha, do_average).
method = {"method": "combined",
          "numfolds" : numfolds,
          "alpha" : 0.05,
          "do_average" : True,
          "offset" : 2,
          "jitter" : True,
          "feature_dim" : feature_dim}


# Fixed seed, so the same model seeds are drawn on every run.
random.seed(5)
# use the shared num_randseeds setting, like the other fitted methods,
# instead of a hard-coded count
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]
# per-fold results, pooled over all seeds
results = [ ]

for rval in randoms:
    theseresults = crossvalidation(traintestset, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = rval)
    
    # progress report for this seed: mean OC_P and median MSE
    ocp_mean, _, _ = eval_summary_by(theseresults, "ocp")
    _, mse_med, _ = eval_summary_by(theseresults, "mse")
    
    print(ocp_mean, mse_med)

    results += theseresults

0.7928723629760392 0.6837275364436712
0.7927436748503687 0.6409548804488194
0.7854006256738303 0.7083679941557908


In [20]:
# Group the pooled results by dataset, i.e. by category/feature pair.
results_bycond = eval_aggregate_by(results,["category", "feature"])

# Per dataset: mean OC_P and median MSE over its folds and seeds.
ocps = [eval_summary_by(cond_results, "ocp")[0] for cond_results in results_bycond.values()]
mses = [eval_summary_by(cond_results, "mse")[1] for cond_results in results_bycond.values()]

# Mean and standard deviation of the per-dataset values.
ocp_mean = statistics.mean(ocps)
ocp_sd = statistics.stdev(ocps)
msemed_mean= statistics.mean(mses)
msemed_sd = statistics.stdev(mses)

# The label names the combined method; it used to repeat the label of the
# seed-dimension method, which made the two result lines indistinguishable.
print("Fitted, with seed words and dim.s,", 
      "alpha", method["alpha"], "avg", method["do_average"],
      "offset", method["offset"], "jitter", method["jitter"],
      f"OC_P mean {ocp_mean:.3f} ({ocp_sd:.2f})",
      f"MSE median {msemed_mean:.3f} ({msemed_sd:.2f})")


Fitted, with seed dim.s, alpha 0.05 avg True offset 2 jitter True OC_P mean 0.790 (0.05) MSE median 0.754 (0.32)
