# Evaluating on the Pavlick and Nenkova style data, using BERT embeddings

# Global settings

Expected locations of data:
* Pavlick/Nenkova data: "style-data-Pavlick-Nenkova_2015", subdirectory of current directory
* BERT, RoBERTa: "emb", subdirectory of current directory

In [1]:
# Root directory of the Pavlick/Nenkova style data; the per-dataset
# score file names are appended to this path when the ratings are read.
pavlick_path = "./style-data-Pavlick-Nenkova_2015/"


In [2]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt
import statistics
from collections import defaultdict
import random
import pickle

In [3]:
# Experiment-wide constants:
#   num_randseeds - number of random seeds drawn for each fitted method
#   feature_dim   - stored as "feature_dim" in the fitted-method settings
#   numfolds      - stored as "numfolds" in every method's settings
num_randseeds, feature_dim, numfolds = 3, 1024, 5


In [4]:
# Embedding variants and their hyperparameters.

# Directory holding the pickled embeddings, per dataset.
bert_dirs = dict(complexity = "emb", formality = "emb")

# One entry per embedding variant. "path" maps each dataset to its pickle
# file; the remaining keys are hyperparameters, identical for both variants.
berts = {
    variant: {
        "path" : paths,
        "offset" : 1.0,
        "jitter" : False,
        "alpha1" : 0.001,
        "average" : True,
        "alpha2" : 0.02,
    }
    for variant, paths in (
        ("ltop4", {"complexity" : "bert-large-uncased.complexity.top4layers.pkl",
                   "formality" : "bert-large-uncased.formality.top4layers.pkl"}),
        ("robltop4", {"complexity" : "roberta-large.complexity.top4layers.pkl",
                      "formality" : "roberta-large.formality.top4layers.pkl"}),
    )
}

In [5]:
# Which embedding variant (a key of `berts`) and which dataset this run uses.
thisbert, thisdataset = "robltop4", "formality"

# Reading the data

## BERT

In [6]:
# Pickle file with the embeddings for the selected dataset and variant.
filename = os.path.join(
    bert_dirs[thisdataset],
    berts[thisbert]["path"][thisdataset],
)

print("Using BERT", thisbert, filename)

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(filename, "rb") as f:
    data = pickle.load(f)

# Map every word to a numpy array obtained from the stored value's .numpy().
word_vectors = {w: v.numpy() for w, v in data.items()}
    

Using BERT robltop4 ./roberta-large.formality.top4layers.pkl


## Pavlick and Nenkova data

There are 1,160 complexity scores and 1,274 formality scores.

For each of the two datasets, we z-score the ratings so that they are on a scale similar to that of the Grand et al. ratings.

In [7]:
# File with the filtered human scores for each dataset; the path is
# appended to pavlick_path when the ratings are read.
human_filtered_name = dict(
    complexity = "complexity/human/filtered_complexity_human_scores.txt",
    formality = "formality/human/filtered_formality_human_scores.txt",
)


In [8]:

# Read the whitespace-separated, header-less score file of the selected dataset.
# The separator is a raw string: "\s" inside a plain string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
data_df = pd.read_csv(pavlick_path + human_filtered_name[thisdataset], sep = r"\s+", header = None)
# Name the three columns of the file.
data_df.columns = ["rating", "word", "sd"]
data_df

Unnamed: 0,rating,word,sd
0,1.428571,someplace,3.78
1,1.571429,chow,3.36
2,1.571429,yeah,2.15
3,1.714286,dressing,2.93
4,2.571429,grandma,4.43
...,...,...,...
1269,97.000000,scrutiny,4.55
1270,97.285714,endorsement,5.31
1271,97.857143,inequality,5.67
1272,98.000000,adapted,5.29


In [9]:
# z-score the ratings: subtract the mean, then divide by the standard
# deviation as computed by pandas' Series.std().
data_df["z"] = (
    data_df["rating"]
    .sub(data_df["rating"].mean())
    .div(data_df["rating"].std())
)
data_df

Unnamed: 0,rating,word,sd,z
0,1.428571,someplace,3.78,-1.694189
1,1.571429,chow,3.36,-1.689661
2,1.571429,yeah,2.15,-1.689661
3,1.714286,dressing,2.93,-1.685134
4,2.571429,grandma,4.43,-1.657968
...,...,...,...,...
1269,97.000000,scrutiny,4.55,1.334787
1270,97.285714,endorsement,5.31,1.343842
1271,97.857143,inequality,5.67,1.361953
1272,98.000000,adapted,5.29,1.366481


# Seeds

Here the seeds come in pairs. Marianna extracted them from the Pavlick/Nenkova "pairs" data by using the top-rated pairs. 

In [10]:
# Seed pairs per dataset, one "first - second" pair per line.
seeds_str = {
    "complexity" : """work - employment
further - subsequently
strong - powerful
train - railway
shown - indicated""",
    "formality" : """winner - recipient
terrible - disastrous
membership - affiliation
highest - paramount
test - verify""",
}

# Split the selected dataset's block into lines, and every line at "-"
# into a tuple of whitespace-stripped words.
data_seeds = [
    tuple(word.strip() for word in line.split("-"))
    for line in seeds_str[thisdataset].split("\n")
]

data_seeds
    

[('winner', 'recipient'),
 ('terrible', 'disastrous'),
 ('membership', 'affiliation'),
 ('highest', 'paramount'),
 ('test', 'verify')]

## Function for running crossvalidation

In [11]:
import eval_dim
import compute_dim
import statistics

def crossvalidation(method, word_vectors, df, seedpairs, random_seed = 123):
    """Crossvalidate one dimension-finding method on a dataframe of rated words.

    Parameters
    ----------
    method : dict
        Settings of the method. Always read: "method" (one of "seedbased",
        "fitted", "fitted_seedwords", "fitted_seeddims", "combined") and
        "numfolds". The fitted variants also read "feature_dim";
        "fitted_seedwords" and "combined" read "offset" and "jitter";
        "fitted_seeddims" and "combined" read "do_average" and "alpha".
    word_vectors : mapping
        Word -> vector. Must contain every word in df.word; it is also handed
        to the compute_dim routines together with the seed words.
    df : pandas.DataFrame
        Needs the columns "word" and "z" (gold ratings). NOTE: the column
        "Pred" is (over)written in place in every fold, so after the call it
        holds the predictions of the last fold.
    seedpairs : sequence of 2-tuples
        The first members are used as negative seed words, the second members
        as positive seed words.
    random_seed : int
        Forwarded to the fitting routines of compute_dim. It does not affect
        the fold assignment (fixed seed below) and is not used by "seedbased".

    Returns
    -------
    list of dict
        One dictionary per fold with the keys "ocp" and "mse".

    Raises
    ------
    ValueError
        If method["method"] is not one of the known method names.
    """
    known_methods = ("seedbased", "fitted", "fitted_seedwords", "fitted_seeddims", "combined")
    method_name = method["method"]
    if method_name not in known_methods:
        # fail fast, before any fold is computed, and say what was wrong
        raise ValueError(f"unknown method {method_name!r}; expected one of: {', '.join(known_methods)}")

    neg_seedwords = [neg for neg, _ in seedpairs]
    pos_seedwords = [pos for _, pos in seedpairs]

    # vectors and gold ratings of all datapoints, in dataframe row order
    all_vectors = [ word_vectors[w] for w in df.word]
    all_gold = df["z"].tolist()

    # crossvalidation setup: give fold indices to datapoints.
    # The generator seed is fixed, so the split is the same on every call.
    rng = np.random.default_rng(seed = 3)
    fold = rng.integers(low = 0, high = method["numfolds"], size = len(df.word))

    # store the evaluation results from the different test folds
    all_evals = [ ]

    # iterate over folds, evaluate for each of them
    for testfold in range(method["numfolds"]):
        # row positions of the training and test data for this fold
        test_indices =  [i for i, f in enumerate(fold) if f == testfold]
        train_indices = [i for i, f in enumerate(fold) if f != testfold]

        gold_test =  [all_gold[i] for i in test_indices]
        gold_train = [all_gold[i] for i in train_indices]
        vec_train =  [all_vectors[i] for i in train_indices]

        # compute the dimension on the training data, then predict for ALL words
        if method_name == "seedbased":
            dimension = compute_dim.dimension_seedbased(pos_seedwords, neg_seedwords, word_vectors, paired = True)
            df["Pred"] = compute_dim.predict_coord_fromtrain(vec_train, gold_train, dimension, all_vectors)

        else:
            if method_name == "fitted":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings(
                    vec_train, gold_train,
                    method["feature_dim"],
                    random_seed = random_seed)

            elif method_name == "fitted_seedwords":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seedwords(
                    vec_train, gold_train,
                    method["feature_dim"],
                    pos_seedwords, neg_seedwords, word_vectors,
                    offset = method["offset"], jitter = method["jitter"],
                    random_seed = random_seed)

            elif method_name == "fitted_seeddims":
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seeddims(
                    vec_train, gold_train,
                    method["feature_dim"],
                    pos_seedwords, neg_seedwords, word_vectors,
                    do_average = method["do_average"],
                    alpha = method["alpha"],
                    random_seed = random_seed,
                    paired = True)

            else:
                # "combined" -- the only name left after the validation above
                dimension, weight, bias = compute_dim.dimension_fitted_fromratings_combined(
                    vec_train, gold_train,
                    method["feature_dim"],
                    pos_seedwords, neg_seedwords, word_vectors,
                    offset = method["offset"], jitter = method["jitter"],
                    do_average = method["do_average"],
                    alpha = method["alpha"],
                    random_seed = random_seed,
                    paired = True)

            df["Pred"] = compute_dim.predict_coord_fromline(all_vectors, dimension, weight, bias)

        # predictions for the test rows only, selected by position
        pred_test = df["Pred"].iloc[test_indices].tolist()

        # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
        # MSE: evaluate on test only
        all_evals.append({
            "ocp" : eval_dim.pairwise_order_consistency_wrt(df["z"], df["Pred"], test_indices),
            "mse" : eval_dim.mean_squared_error(gold_test, pred_test),
        })

    return all_evals


## Aggregating results

This again differs from the Grand et al. setup because there are no sub-conditions, just a single dataset.
We directly aggregate over all results in the list of results dictionaries.

In [12]:
from collections import defaultdict
import statistics

# given a list of results dictionaries,
# compute mean, median and standard deviation over values for a particular key
def eval_summary_by(evals, keylabel):
    """Summarize the values stored under `keylabel` in a list of result dicts.

    Entries whose value is None are skipped.

    Parameters
    ----------
    evals : iterable of dict
        Result dictionaries; each must contain `keylabel`.
    keylabel : hashable
        Key whose values are summarized.

    Returns
    -------
    tuple
        (mean, median, standard deviation). The standard deviation is NaN when
        fewer than two values remain, because statistics.stdev is undefined
        (and raises) for a single data point.

    Raises
    ------
    statistics.StatisticsError
        If no non-None value is available.
    """
    vals = [e[keylabel] for e in evals if e[keylabel] is not None]

    # a single value still has a mean and a median; only the spread is undefined
    spread = statistics.stdev(vals) if len(vals) > 1 else float("nan")

    return (statistics.mean(vals), statistics.median(vals), spread)

# given a list of results dictionaries (e.g. pooled over crossvalidation
# folds and over runs with different random seeds),
# report the mean OC_P plus the mean and median MSE
def eval_eval(results):
    """Return (mean "ocp", mean "mse", median "mse") over `results`.

    `results` is passed unchanged to eval_summary_by, once per key.
    """
    ocp_summary = eval_summary_by(results, "ocp")
    mse_summary = eval_summary_by(results, "mse")

    # each summary is (mean, median, stdev); the stdevs are not reported
    return ocp_summary[0], mse_summary[0], mse_summary[1]


# Evaluation


## Seed-based

In [13]:
# Settings for the seed-based method.
method = dict(method = "seedbased", numfolds = numfolds)

print("Seed-based method, data", thisdataset)

# One crossvalidation run with the default random_seed, then aggregate over folds.
results = crossvalidation(method, word_vectors, data_df, data_seeds)
ocp_mean, mse_mean, mse_med = eval_eval(results)

print(
    f"OC_P mean {ocp_mean:.3f}",
    f"MSE mean {mse_mean:.3f}",
    f"MSE median {mse_med:.3f}",
)
 

Seed-based method, data formality
OC_P mean 0.674 MSE mean 213.606 MSE median 223.004


# Fitted

In [14]:
# Settings for the fitted method.
method = dict(method = "fitted", numfolds = numfolds, feature_dim = feature_dim)

print("Fitted method, data", thisdataset)

# Draw the per-run random seeds from a fixed seed.
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

# Pool the per-fold results of all runs into one flat list.
results = [ ]
for rval in randoms:
    results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                   random_seed = rval))

ocp_mean, mse_mean, mse_med = eval_eval(results)

print(
    f"OC_P mean {ocp_mean:.3f}",
    f"MSE mean {mse_mean:.3f}",
    f"MSE median {mse_med:.3f}",
)


Fitted method, data formality
OC_P mean 0.529 MSE mean 2957.067 MSE median 324.998


# Fitted, with seeds as words

In [15]:
# Settings for the fitted method that uses the seeds as words.
method = dict(
    method = "fitted_seedwords",
    numfolds = numfolds,
    offset = 2.0,
    jitter = True,
    feature_dim = feature_dim,
)

print("Fitted method with seed words, data", thisdataset)

# Draw the per-run random seeds from a fixed seed.
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

# Pool the per-fold results of all runs into one flat list.
results = [ ]
for rval in randoms:
    results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                   random_seed = rval))

ocp_mean, mse_mean, mse_med = eval_eval(results)

print(
    f"OC_P mean {ocp_mean:.3f}",
    f"MSE mean {mse_mean:.3f}",
    f"MSE median {mse_med:.3f}",
)


Fitted method with seed words, data formality
OC_P mean 0.525 MSE mean 2152.185 MSE median 778.103


# Fitted, with seed dimensions

In [16]:
# Settings for the fitted method that uses seed dimensions.
method = dict(
    method = "fitted_seeddims",
    numfolds = numfolds,
    alpha = 0.02,
    do_average = True,
    feature_dim = feature_dim,
)

print("Fitted method with seed dimensions, data", thisdataset)

# Draw the per-run random seeds from a fixed seed.
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

# Pool the per-fold results of all runs into one flat list.
results = [ ]
for rval in randoms:
    results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                   random_seed = rval))

ocp_mean, mse_mean, mse_med = eval_eval(results)

print(
    f"OC_P mean {ocp_mean:.3f}",
    f"MSE mean {mse_mean:.3f}",
    f"MSE median {mse_med:.3f}",
)


Fitted method with seed dimensions, data formality
OC_P mean 0.656 MSE mean 7.742 MSE median 7.286


# Fitted, with seeds as words and dimensions

In [17]:
# Settings for the combined method (seeds as words and as dimensions).
method = dict(
    method = "combined",
    numfolds = numfolds,
    alpha = 0.05,
    do_average = True,
    offset = 2,
    jitter = True,
    feature_dim = feature_dim,
)

print("Fitted method with seeds as words and dim.s, data", thisdataset)

# Draw the per-run random seeds from a fixed seed.
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

# Pool the per-fold results of all runs into one flat list.
results = [ ]
for rval in randoms:
    results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                   random_seed = rval))

ocp_mean, mse_mean, mse_med = eval_eval(results)

print(
    f"OC_P mean {ocp_mean:.3f}",
    f"MSE mean {mse_mean:.3f}",
    f"MSE median {mse_med:.3f}",
)


Fitted method with seeds as words and dim.s, data formality
OC_P mean 0.710 MSE mean 2.404 MSE median 2.372


### 