# Hyperparameter optimization: Contextualized embeddings 

Grand et al. data

Hyperparameter optimization on a development set.

# Global settings: set these before running the notebook

In [None]:
# Directory that holds the human rating CSV files (one file per category/feature pair).
grandratings_dir = "../data/Grandetal-data/"
# Excel sheet from which the seed words of each feature dimension are read.
grandfeatures_path = "../data/Grandetal-data/features.xlsx"

# Location of the precomputed contextualized embedding files; keep exactly one line active.
# embpath = "./"
embpath = "../vectors/bert_vectors"       
# embpath = "../vectors/roberta-large-vectors"

In [None]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt
import pickle

# Reading the data

## Contextualized embeddings

In [None]:
# global settings
# number of different random seeds each hyperparameter setting is run with
num_randseeds = 3
# handed to the fitting routines as method["feature_dim"]
# (presumably the dimensionality of the embedding vectors -- TODO confirm)
feature_dim = 1024
# key into bert_large_vecs_paths that selects which embedding file is loaded
whichbert = "robltop4"

In [None]:
# Map each embedding variant to its file inside the embpath directory.
# os.path.join inserts the path separator, so embpath works with or without
# a trailing slash (plain string concatenation glued directory and file name together).
bert_large_vecs_paths = { "ltop4" : os.path.join(embpath, "bert-large-uncased.top4layers.Grandetal.npz"),
                          "robltop4" : os.path.join(embpath, "roberta-large.Grandetal.top4layers.pkl")}

# the file selected by the global setting whichbert
filename = bert_large_vecs_paths[whichbert]
print("using contextualized embeddings", filename, "\n")

if filename.endswith("npz"):
    # for npz files: the loaded archive is indexed by word
    data = np.load(filename)
elif filename.endswith("pkl"):
    # for pkl files: a dictionary word -> vector object
    # NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source
    with open(filename, "rb") as f:
        data2 = pickle.load(f)
    # convert every value to a numpy array
    # (presumably the values are torch tensors on the CPU -- TODO confirm)
    data = dict([(w, v.numpy()) for w, v in data2.items()])
else:
    raise Exception("should not be here")



In [None]:
# Re-key the embeddings: two-token entries get their tokens joined into one
# string, all other entries keep their key unchanged.
word_vectors = {}

for word in data:
    vector = data[word]

    # entries without a space are stored as they are
    if ' ' not in word:
        word_vectors[word] = vector
        continue

    first, second = word.split()
    # "north dakota" is joined with an underscore, every other pair with a hyphen
    joiner = '_' if ('north' in first and 'dakota' in second) else '-'
    word = first + joiner + second
    print(word)
    word_vectors[word] = vector
        


## Grand features

In [None]:


# Seed word table; read_grand_data selects rows of it by the "Dimension" column.
grandfeatures_df = pd.read_excel(grandfeatures_path)

## Function for reading a specific Grand dataset

In [None]:
# reading in Grand data
def read_grand_data(filename, grandratings_dir, grandfeatures_df):
    """Read one rating file and return its metadata, seed words and gold ratings.

    Parameters
    ----------
    filename : str
        Name of the rating file, of the form ``<category>_<feature>.csv``
        (the last four characters are stripped as the extension).
    grandratings_dir : str
        Directory that contains the rating file, with or without trailing separator.
    grandfeatures_df : pandas.DataFrame
        Seed word table. The row whose ``Dimension`` equals the feature is used;
        all its cells after the first column are taken as seed words.

    Returns
    -------
    tuple
        ``(grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)`` where
        ``df`` is the rating table extended by the columns ``Average`` (sum of all
        rating columns divided by the number of rating columns) and ``Gold``
        (z-score of ``Average``).

    Raises
    ------
    ValueError
        If the file name without extension does not consist of exactly two
        parts separated by one underscore.
    """
    # extract category and feature
    grandcategory, grandfeature = filename[:-4].split("_")

    # read human ratings, make gold column
    df = pd.read_csv(os.path.join(grandratings_dir, filename))
    # every column except the first one holds the ratings of one speaker
    nspeakers = len(df.columns) - 1
    # sum over ALL rating columns (no fixed upper column bound), so the numerator
    # always covers the same speakers that are counted in nspeakers
    df["Average"] = df.iloc[:, 1:].sum(axis = 1) / nspeakers
    # z-scores of average ratings
    df["Gold"] = (df["Average"] - df["Average"].mean()) / df["Average"].std()

    # obtain seed words from excel file
    relevant_row = grandfeatures_df[grandfeatures_df.Dimension == grandfeature]
    seedwords = relevant_row.iloc[:, 1:].values.flatten().tolist()
    # the first three entries are the positive seeds, the remainder the negative seeds
    pos_seedwords = seedwords[:3]
    neg_seedwords = seedwords[3:]

    return (grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)

# Function for running crossvalidation

In [None]:
import eval_dim
import compute_dim
import statistics

def crossvalidation(filenames, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = 123, verbose = False):
    """Cross-validate one dimension-finding method on a list of rating files.

    Parameters
    ----------
    filenames : list of str
        Rating files, each read with read_grand_data.
    method : dict
        Method description. Always needs "method" (one of "seedbased", "fitted",
        "fitted_seedwords", "fitted_seeddims", "combined") and "numfolds".
        The fitted methods additionally need "feature_dim"; "fitted_seedwords"
        and "combined" need "offset" and "jitter"; "fitted_seeddims" and
        "combined" need "do_average" and "alpha".
    word_vectors : dict
        Mapping word -> vector; must contain every word in the "Row" column.
    grandratings_dir, grandfeatures_df
        Passed through to read_grand_data.
    random_seed : int
        Seeds the assignment of data points to folds and is passed on to the
        fitting routines.
    verbose : bool
        If True, print mean/sd of OC_p and mean/sd/median of MSE over all folds.

    Returns
    -------
    list of dict
        One dictionary per (file, fold) with the keys "ocp", "mse", "feature"
        and "category".

    Raises
    ------
    Exception
        If method["method"] is not one of the known method names.
    """
    # Dedicated generator for the fold assignment: the folds are reproducible
    # for a given random_seed and the global numpy random state stays untouched.
    fold_rng = np.random.default_rng(random_seed)
    numfolds = method["numfolds"]

    all_evals = [ ]

    for filename in filenames:
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename,
                                                                                        grandratings_dir,
                                                                                        grandfeatures_df)

        # word vectors and gold ratings (z-scored averages) for this dataset, in row order
        all_thisdata_vectors = [word_vectors[w] for w in df["Row"]]
        all_thisdata_gold = df["Gold"].tolist()

        # crossvalidation setup: draw a fold number for every datapoint
        fold = fold_rng.integers(numfolds, size = len(all_thisdata_gold))

        # iterate over folds, evaluate for each of them
        for testfold in range(numfolds):
            # positions of the test and training items of this fold
            test_indices =  [i for i, f in enumerate(fold) if f == testfold]
            train_indices = [i for i, f in enumerate(fold) if f != testfold]

            gold_test =  [all_thisdata_gold[i] for i in test_indices]
            gold_train = [all_thisdata_gold[i] for i in train_indices]
            vec_train =  [all_thisdata_vectors[i] for i in train_indices]

            methodname = method["method"]

            if methodname == "seedbased":
                # seed-based dimension; predictions are made for ALL items of the dataset
                dimension = compute_dim.dimension_seedbased(pos_seedwords, neg_seedwords, word_vectors)
                df["Pred"] = compute_dim.predict_coord_fromtrain(vec_train, gold_train, dimension, all_thisdata_vectors)

            else:
                # all fitted variants return (dimension, weight, bias) and share the prediction step
                if methodname == "fitted":
                    dimension, weight, bias = compute_dim.dimension_fitted_fromratings(vec_train, gold_train,
                                                                    method["feature_dim"],
                                                                    random_seed = random_seed)

                elif methodname == "fitted_seedwords":
                    dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seedwords(vec_train, gold_train,
                                                                    method["feature_dim"],
                                                                    pos_seedwords, neg_seedwords, word_vectors,
                                                                    offset = method["offset"], jitter = method["jitter"],
                                                                    random_seed = random_seed)

                elif methodname == "fitted_seeddims":
                    dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seeddims(vec_train, gold_train,
                                                                    method["feature_dim"],
                                                                    pos_seedwords, neg_seedwords, word_vectors,
                                                                    do_average = method["do_average"],
                                                                    alpha = method["alpha"],
                                                                    random_seed = random_seed)

                elif methodname == "combined":
                    dimension, weight, bias = compute_dim.dimension_fitted_fromratings_combined(vec_train, gold_train,
                                                                    method["feature_dim"],
                                                                    pos_seedwords, neg_seedwords, word_vectors,
                                                                    offset = method["offset"], jitter = method["jitter"],
                                                                    do_average = method["do_average"],
                                                                    alpha = method["alpha"],
                                                                    random_seed = random_seed)

                else:
                    raise Exception("shouldn't be here")

                df["Pred"] = compute_dim.predict_coord_fromline(all_thisdata_vectors, dimension, weight, bias)

            # set lookup instead of list lookup keeps the selection of test predictions linear
            test_index_set = set(test_indices)
            pred_test = [p for i, p in enumerate(df["Pred"]) if i in test_index_set]

            # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
            # MSE: evaluate on test only
            e = { "ocp" : eval_dim.pairwise_order_consistency_wrt(df["Gold"], df["Pred"], test_indices),
                  "mse" : eval_dim.mean_squared_error(gold_test, pred_test),
                  "feature" : grandfeature,
                  "category" : grandcategory}

            all_evals.append(e)

    if verbose:
        # folds whose measure is None are left out of the summary
        ocps = [e["ocp"] for e in all_evals if e["ocp"] is not None]
        mses = [e["mse"] for e in all_evals if e["mse"] is not None]

        print("\n\nOverall", method["method"], 
              f"OC_p {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})", 
              f"MSE mean {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f}) median {statistics.median(mses):.3f}")

    return all_evals


# Function for aggregating crossvalidation results

We assume results that are dictionaries. 

First, a function for aggregating results by particular labels, into a single dictionary. For example, when results are aggregated by category and feature, the result will be a dictionary whose keys are category/feature tuples, and the values are lists of result dictionaries. 

Second, a function that computes mean, median, and standard deviation, over a list of result dictionaries, for a given label.

In [None]:
from collections import defaultdict
import statistics

# group a list of results dictionaries
# by the values they hold for the given dictionary keys
def eval_aggregate_by(evals, keylabels):
    """Group result dictionaries by their values for ``keylabels``.

    Returns a ``defaultdict(list)`` whose keys are tuples of the stringified
    values (one entry per label, in the order of ``keylabels``) and whose
    values are the lists of result dictionaries sharing that key.
    """
    grouped = defaultdict(list)

    for result in evals:
        group_key = tuple(map(str, (result[label] for label in keylabels)))
        grouped[group_key].append(result)

    return grouped


# given a list of results dictionaries,
# compute mean, median and standard deviation over values for a particular key
def eval_summary_by(evals, keylabel):
    """Summarize the values stored under ``keylabel`` in a list of result dictionaries.

    Entries whose value is None are ignored.

    Returns
    -------
    tuple
        ``(mean, median, standard deviation)``. The sample standard deviation
        needs at least two values; with a single value it is reported as NaN
        so that mean and median remain available.

    Raises
    ------
    statistics.StatisticsError
        If no non-None value is present.
    """
    vals = [e[keylabel] for e in evals if e[keylabel] is not None]

    # statistics.stdev raises for fewer than two data points
    spread = statistics.stdev(vals) if len(vals) > 1 else float("nan")

    return (statistics.mean(vals), statistics.median(vals), spread)

# given a dictionary of results (parameters -> result dictionary list),
# * for each parameter setting, aggregate by category and feature
# * for each category/feature, compute mean ocp and median mse values
# * for the parameter setting, compute mean and sd of these values over all category/feature pairs
def eval_hyper(results, parameternames):
    """Summarize crossvalidation results per hyperparameter setting.

    ``results`` maps a parameter setting to its list of result dictionaries.
    For each setting one summary dictionary is produced. It holds the
    parameter values named in ``parameternames`` (read from the first result
    of the setting) plus "ocp_mean", "ocp_sd", "msemed_mean" and "msemed_sd".
    The summaries are returned as a list, in the order of ``results``.
    """
    summaries = [ ]

    for setting_results in results.values():
        # parameter values of this setting, taken from its first result
        summary = {par: setting_results[0][par] for par in parameternames}

        # one group of results per condition = category/feature pair
        condition_groups = list(eval_aggregate_by(setting_results, ["category", "feature"]).values())

        # per condition: mean of OC_p (index 0 of the summary), median of MSE (index 1)
        ocps = [eval_summary_by(group, "ocp")[0] for group in condition_groups]
        mses = [eval_summary_by(group, "mse")[1] for group in condition_groups]

        # mean and standard deviation over the conditions
        summary["ocp_mean"] = statistics.mean(ocps)
        summary["ocp_sd"] = statistics.stdev(ocps)
        summary["msemed_mean"] = statistics.mean(mses)
        summary["msemed_sd"] = statistics.stdev(mses)

        summaries.append(summary)

    return summaries


# Making a development set

In [None]:
# All rating files. The listing is sorted because os.listdir returns entries in
# arbitrary order; without a fixed order the seeded sample below could pick a
# different development set on another machine or file system.
filenames = sorted(f for f in os.listdir(grandratings_dir) if f.endswith("csv"))

import random
# fixed seed: the development set is the same on every run
random.seed(789)
# six rating files form the development set
devset = random.sample(filenames, 6)
# show the category/feature pairs that were selected
[ filename[:-4].split("_") for filename in devset]

# Determining hyperparameters

## Fitted dimensions with seeds as words: offset, jitter

First experiments looked like there was a lot of variance in result with different random seeds. To check into this, we run the hyperparameter tests n times with different random seeds. We then look at mean and standard deviation of the two measures we focus on: mean OC_p and median MSE.

In [None]:
# hyperparameter grid: jitter on/off, 20 offsets between 0.1 and 2
jitter_vals = [True, False]
offset_vals= np.linspace(0.1, 2, num=20)

# (offset, jitter) as strings -> all result dictionaries obtained with that setting
results = defaultdict(list)

# reproducible list of random seeds for the repeated runs
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

# every (seed, jitter, offset) combination: seed outermost, offset innermost
settings = [(randval, jval, oval)
            for randval in randoms
            for jval in jitter_vals
            for oval in offset_vals]

for randval, jval, oval in settings:
    method = { "method": "fitted_seedwords",
               "feature_dim" : feature_dim,
               "numfolds" : 5,
               "offset" : oval,
               "jitter" : jval}

    theseresults = crossvalidation(devset, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = randval)

    # progress report: mean OC_p and median MSE over all folds of this run
    ocp_mean = eval_summary_by(theseresults, "ocp")[0]
    mse_med = eval_summary_by(theseresults, "mse")[1]
    print(oval, jval, ocp_mean, mse_med)

    # tag every fold result with its setting and file it under that setting
    setting_key = (str(oval), str(jval))
    for r in theseresults:
        r.update({"offset":oval, "j": jval})
        results[setting_key].append(r)
    


In [None]:
# One row per (offset, jitter) setting with the columns
# ocp_mean / ocp_sd and msemed_mean / msemed_sd computed by eval_hyper.
df = pd.DataFrame(eval_hyper(results, ["offset", "j"]))
df.head()


We plot mean OC_P values, with standard deviation. 

In [None]:
%matplotlib inline
# settings without jitter: mean OC_p against offset, error bars = ocp_sd
df[df.j == False].plot(y = "ocp_mean",x = "offset", yerr = "ocp_sd")

In [None]:
# settings with jitter: mean OC_p against offset, error bars = ocp_sd
df[df.j == True].plot(y = "ocp_mean", x = "offset", yerr = "ocp_sd")

We plot mean values of median MSE, with standard deviation. 

In [None]:
# settings without jitter: mean of the per-condition median MSE against offset, error bars = msemed_sd
df[df.j == False].plot(y = "msemed_mean", x = "offset", yerr = "msemed_sd")

In [None]:
# settings with jitter: mean of the per-condition median MSE against offset, error bars = msemed_sd
df[df.j == True].plot(y = "msemed_mean", x = "offset", yerr = "msemed_sd")

### Parameters for seeds as words


**BERT, last 4 layers**:

The differences in performance are small compared to the error bars. So we go with an **offset of 1.0 with no jitter**, same as for GloVe.

**RoBERTa, last 4 layers**:

We again go with an **offset of 1.0 with no jitter**, for the same reason.


In [None]:
# the ten settings with the highest mean OC_p
df.sort_values(by = "ocp_mean", ascending = False).head(10)

In [None]:
# the ten settings with the lowest mean of median MSE
df.sort_values(by = "msemed_mean", ascending = True).head(10)

## Fitted dimensions with seeds as dimensions: alpha, averaging

We first run an exploration, with only one random seed.

In [None]:
# hyperparameter grid: averaging on/off, 10 alpha values between 0.001 and 0.25
average_vals = [True, False]
alpha_vals= np.linspace(0.001, 0.25, num=10)

# (alpha, averaging) as strings -> all result dictionaries obtained with that setting
results = defaultdict(list)

# exploration run: a single fixed random seed
randoms = [123]

# every (seed, averaging, alpha) combination: seed outermost, alpha innermost
settings = [(randval, avgval, alphaval)
            for randval in randoms
            for avgval in average_vals
            for alphaval in alpha_vals]

for randval, avgval, alphaval in settings:
    method = { "method": "fitted_seeddims",
               "feature_dim" : feature_dim,
               "numfolds" : 5,
               "do_average" : avgval,
               "alpha" : alphaval}

    theseresults = crossvalidation(devset, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = randval)

    # progress report: mean OC_p and median MSE over all folds of this run
    ocp_mean = eval_summary_by(theseresults, "ocp")[0]
    mse_med = eval_summary_by(theseresults, "mse")[1]
    print(alphaval, avgval, ocp_mean, mse_med)

    # tag every fold result with its setting and file it under that setting
    setting_key = (str(alphaval), str(avgval))
    for r in theseresults:
        r.update({"alpha":alphaval, "avg": avgval})
        results[setting_key].append(r)



In [None]:
# One row per (alpha, averaging) setting with the columns
# ocp_mean / ocp_sd and msemed_mean / msemed_sd computed by eval_hyper.
df = pd.DataFrame(eval_hyper(results, ["alpha", "avg"]))
df.head()


In [None]:
# with averaging: mean OC_p against alpha (rows sorted by alpha), error bars = ocp_sd
df[df.avg == True].sort_values(by = "alpha").plot(x = "alpha", y = "ocp_mean", yerr = "ocp_sd")

In [None]:
# without averaging: mean OC_p against alpha (rows sorted by alpha), error bars = ocp_sd
df[df.avg == False].sort_values(by = "alpha").plot(x = "alpha", y = "ocp_mean", yerr = "ocp_sd")

In [None]:
# with averaging: mean of the per-condition median MSE against alpha, error bars = msemed_sd
df[df.avg == True].sort_values(by = "alpha").plot(x = "alpha", y = "msemed_mean", yerr = "msemed_sd")

In [None]:
# without averaging: mean of the per-condition median MSE against alpha, error bars = msemed_sd
df[df.avg == False].sort_values(by = "alpha").plot(x = "alpha", y = "msemed_mean", yerr = "msemed_sd")

We see that overall the performance is best, in terms of both OC_P and MSE, for small values of alpha, though again there is a large error bar. We explore the low range of alpha in more detail, both with and without averaging. 

In [None]:
# finer hyperparameter grid: averaging on/off, 10 alpha values between 0.001 and 0.15
average_vals = [True, False]
alpha_vals= np.linspace(0.001, 0.15, num=10)

# (alpha, averaging) as strings -> all result dictionaries obtained with that setting
results = defaultdict(list)

# reproducible list of random seeds for the repeated runs
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

# every (seed, averaging, alpha) combination: seed outermost, alpha innermost
settings = [(randval, avgval, alphaval)
            for randval in randoms
            for avgval in average_vals
            for alphaval in alpha_vals]

for randval, avgval, alphaval in settings:
    method = { "method": "fitted_seeddims",
               "feature_dim" : feature_dim,
               "numfolds" : 5,
               "do_average" : avgval,
               "alpha" : alphaval}

    theseresults = crossvalidation(devset, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = randval)

    # progress report: mean OC_p and median MSE over all folds of this run
    ocp_mean = eval_summary_by(theseresults, "ocp")[0]
    mse_med = eval_summary_by(theseresults, "mse")[1]
    print(alphaval, avgval, ocp_mean, mse_med)

    # tag every fold result with its setting and file it under that setting
    setting_key = (str(alphaval), str(avgval))
    for r in theseresults:
        r.update({"alpha":alphaval, "avg": avgval})
        results[setting_key].append(r)

In [None]:
# One row per (alpha, averaging) setting with the columns
# ocp_mean / ocp_sd and msemed_mean / msemed_sd computed by eval_hyper.
df = pd.DataFrame(eval_hyper(results, ["alpha", "avg"]))
df.head()


We again plot average OC_P mean and MSE median values, with error bars. 

In [None]:
%matplotlib inline

# with averaging: mean OC_p against alpha, error bars = ocp_sd
df[df.avg == True].plot(y = "ocp_mean", x = "alpha", yerr = "ocp_sd")

In [None]:
# without averaging: mean OC_p against alpha, error bars = ocp_sd
df[df.avg == False].plot(y = "ocp_mean", x = "alpha", yerr = "ocp_sd")

In [None]:
# without averaging: mean of the per-condition median MSE against alpha, error bars = msemed_sd
df[df.avg == False].plot(y = "msemed_mean", x = "alpha", yerr = "msemed_sd")

In [None]:
# with averaging: mean of the per-condition median MSE against alpha, error bars = msemed_sd
df[df.avg == True].plot(y = "msemed_mean", x = "alpha", yerr = "msemed_sd")

We again look at the numbers.

In [None]:
# with averaging: the three settings with the highest mean OC_p
df[df.avg == True].sort_values(by = 'ocp_mean', ascending = False).head(3)

In [None]:
# without averaging: the three settings with the highest mean OC_p
df[df.avg == False].sort_values(by = 'ocp_mean', ascending = False).head(3)

In [None]:
# without averaging: the three settings with the lowest mean of median MSE
df[df.avg == False].sort_values(by = 'msemed_mean').head(3)

In [None]:
# with averaging: the three settings with the lowest mean of median MSE
df[df.avg == True].sort_values(by = 'msemed_mean').head(3)

### Parameters for the fitted model with seed dimensions



**BERT last 4 layers**

Best performance is for lowest values of alpha. In terms of OC_P, it does not matter whether we average or not, but MSE values are lower with averaging, so we choose **alpha = 0.001, with averaging**

**RoBERTA last 4 layers**

Best performance is again for lowest values of alpha, and performance is slightly better with averaging so we again use **alpha = 0.001, with averaging**

In [None]:
# Always raises, so a "Run All" halts at this cell; the cells below only run when executed by hand.
# NOTE(review): presumably a deliberate guard before the combined-model sweep -- confirm it is still wanted.
raise Exception("stop")

# Combined model: seeds as words and dimensions

We fix jitter, offset, and averaging to the best values determined above as they don't seem to have made much of a difference.


In [None]:
# hyperparameter grid for the combined model: only alpha is varied,
# averaging, jitter and offset are held at one value each
average_vals = [True]
alpha_vals= np.linspace(0.001, 0.15, num=10)
jitter_vals = [False]
offset_vals= [ 1.0]


# (alpha, averaging, offset, jitter) as strings -> all result dictionaries obtained with that setting
results = defaultdict(list)

# reproducible list of random seeds for the repeated runs
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

for randval in randoms:
    for avgval in average_vals:
        for alphaval in alpha_vals:
            for oval in offset_vals:
                for jval in jitter_vals:

                    method = { "method": "combined",
                              "feature_dim" : feature_dim,
                              "numfolds" : 5,
                              "do_average" : avgval,
                              "alpha" : alphaval,
                              "offset" : oval,
                              "jitter" : jval}


                    theseresults = crossvalidation(devset, method, word_vectors, grandratings_dir, grandfeatures_df, random_seed = randval)

                    # progress report: mean OC_p and median MSE over all folds of this run
                    ocp_mean, _, _ = eval_summary_by(theseresults, "ocp")
                    _, mse_med, _ = eval_summary_by(theseresults, "mse")
                    print(alphaval, avgval, oval, jval, ocp_mean, mse_med)

                    # The key covers ALL four varied parameters. With a key of only
                    # (alpha, averaging), runs that differ in offset or jitter would be
                    # pooled into one setting as soon as those lists hold more than one value.
                    setting_key = (str(alphaval), str(avgval), str(oval), str(jval))

                    for r in theseresults:
                        r.update({"alpha":alphaval, "avg": avgval, "offset": oval, "j": jval})
                        results[setting_key].append(r)




In [None]:
# One row per parameter setting with the columns
# ocp_mean / ocp_sd and msemed_mean / msemed_sd computed by eval_hyper.
df = pd.DataFrame(eval_hyper(results, ["alpha", "avg", "offset", "j"]))
df.head()


## Plotting the results



In [None]:
# combined model: mean OC_p against alpha, error bars = ocp_sd
df.plot(y = "ocp_mean", x = "alpha", yerr = "ocp_sd")

In [None]:
# combined model: mean of the per-condition median MSE against alpha, error bars = msemed_sd
df.plot(y = "msemed_mean", x = "alpha", yerr = "msemed_sd")

In [None]:
# the settings with the highest mean OC_p (first five rows)
df.sort_values(by = "ocp_mean", ascending = False).head()

In [None]:
# the settings with the lowest mean of median MSE (first five rows)
df.sort_values(by = "msemed_mean", ascending = True).head()

### Parameters for the fitted model with seed words and seed dimensions


**BERT last 4 layers**

Both OC_P and MSE have a clear best value at **alpha = 0.02**. We use **averaging, no jitter, and an offset of 1.0**.

**RoBERTA last 4 layers**

Again, the clear best value is **alpha = 0.02**. We use **averaging, no jitter, and an offset of 1.0**.

