# Evaluating on the Pavlick and Nenkova style data, using word type embeddings

# Global settings

In [1]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt
import statistics
from collections import defaultdict
import random



In [2]:
# Experiment-wide settings.
num_randseeds, numfolds = 3, 5   # random restarts per fitted method; crossvalidation folds
featuredim = 300                 # presumably the embedding size; the cells below read `feature_dim` instead

# Hyperparameters for the fitted dimensions.
# NOTE(review): the method dictionaries in the evaluation cells spell out their
# own offset / jitter / alpha values rather than reading these -- confirm which
# set is the intended one.
param_alpha, param_alpha_comb = 0.02, 0.05
param_average = True
param_jitter = False
param_offset = 1.0

print(param_alpha)

0.02


# Reading in the data.

## GloVe

In [3]:
# Zipped GloVe vectors and the name of the text member inside the archive.
glove_path = "../glove/glove.42B.300d.zip"
glove_file = "glove.42B.300d.txt"

feature_dim = 300

# word -> float32 embedding vector
word_vectors = { }

with zipfile.ZipFile(glove_path) as archive, archive.open(glove_file) as handle:
    # the archive member is read in binary mode, so every line is a bytes object:
    # the first token is the word, all remaining tokens are the vector entries
    for raw_line in handle:
        tokens = raw_line.split()
        key = tokens[0].decode()
        word_vectors[key] = np.array(tokens[1:], dtype=np.float32)

print('ok')

ok


## Pavlick and Nenkova data

There are 1,160 complexity scores and 1,274 formality scores.

For each of the two datasets, we z-score the ratings so they will be on a similar scale as the Grand et al ratings.

In [30]:
# Data locations. Each directory can be overridden through an environment
# variable so the notebook also runs on machines other than the authors';
# the defaults are the original hard-coded locations.
# (alternative checkout: "/Users/kee252/Projects/Marianna/interpretable-dimensions/style-data-Pavlick-Nenkova_2015/")
pavlick_path = os.environ.get(
    "PAVLICK_DATA_DIR",
    "../../github-dimensions/interpretable-dimensions/style-data-Pavlick-Nenkova_2015/")
# the loading cells build file names by plain string concatenation,
# so the directory has to end in a slash
if not pavlick_path.endswith("/"):
    pavlick_path += "/"
formality_human_filtered_name = "formality/human/filtered_formality_human_scores.txt"
complexity_human_filtered_name = "complexity/human/filtered_complexity_human_scores.txt"

# directory holding the word frequency tables, stored without a trailing slash
frequency_dir = os.environ.get(
    "FREQ_RANKING_DIR",
    "/Users/marianna/Documents/NSF-Katrin/jupyter-notebooks/freq_ranking_results.style").rstrip("/")
formality_frequency_file = frequency_dir + "/freq_ranking.unsorted.formality"
complexity_frequency_file = frequency_dir + "/freq_ranking.unsorted.complexity"

print(complexity_human_filtered_name)

complexity/human/filtered_complexity_human_scores.txt


In [31]:
# Human formality ratings: whitespace-separated columns without a header row.
# The separator regex is written as a raw string, because "\s" in a plain
# string literal is an invalid escape sequence and triggers a warning.
formality_df = pd.read_csv(pavlick_path + formality_human_filtered_name, sep = r"\s+", header = None)
# "sd" is presumably the standard deviation of the individual ratings -- TODO confirm
formality_df.columns = ["rating", "word", "sd"]
formality_df

Unnamed: 0,rating,word,sd
0,1.428571,someplace,3.78
1,1.571429,chow,3.36
2,1.571429,yeah,2.15
3,1.714286,dressing,2.93
4,2.571429,grandma,4.43
...,...,...,...
1269,97.000000,scrutiny,4.55
1270,97.285714,endorsement,5.31
1271,97.857143,inequality,5.67
1272,98.000000,adapted,5.29


In [32]:
# preview of the z-scored formality ratings: (rating - mean) / standard deviation
formality_df["rating"].pipe(lambda r: (r - r.mean()) / r.std())

0      -1.694189
1      -1.689661
2      -1.689661
3      -1.685134
4      -1.657968
          ...   
1269    1.334787
1270    1.343842
1271    1.361953
1272    1.366481
1273    1.371008
Name: rating, Length: 1274, dtype: float64

In [33]:
# store the z-scored rating in a new column "z": subtract the mean rating,
# then divide by the standard deviation of the ratings
formality_df["z"] = (
    formality_df["rating"]
    .sub(formality_df["rating"].mean())
    .div(formality_df["rating"].std())
)
formality_df

Unnamed: 0,rating,word,sd,z
0,1.428571,someplace,3.78,-1.694189
1,1.571429,chow,3.36,-1.689661
2,1.571429,yeah,2.15,-1.689661
3,1.714286,dressing,2.93,-1.685134
4,2.571429,grandma,4.43,-1.657968
...,...,...,...,...
1269,97.000000,scrutiny,4.55,1.334787
1270,97.285714,endorsement,5.31,1.343842
1271,97.857143,inequality,5.67,1.361953
1272,98.000000,adapted,5.29,1.366481


In [34]:
# Human complexity ratings: whitespace-separated columns without a header row.
# The separator regex is written as a raw string, because "\s" in a plain
# string literal is an invalid escape sequence and triggers a warning.
complexity_df = pd.read_csv(pavlick_path + complexity_human_filtered_name, sep = r"\s+", header = None)
# "sd" is presumably the standard deviation of the individual ratings -- TODO confirm
complexity_df.columns = ["rating", "word", "sd"]
complexity_df

Unnamed: 0,rating,word,sd
0,0.428571,woman,0.79
1,1.285714,walk,2.21
2,1.571429,tells,3.31
3,1.857143,last,4.10
4,1.857143,next,4.49
...,...,...,...
1155,94.428571,systematic,6.70
1156,95.428571,diplomatic,6.43
1157,96.285714,referendum,5.25
1158,96.714286,archaeological,5.68


In [35]:
# store the z-scored rating in a new column "z": subtract the mean rating,
# then divide by the standard deviation of the ratings
complexity_df["z"] = (
    complexity_df["rating"]
    .sub(complexity_df["rating"].mean())
    .div(complexity_df["rating"].std())
)
complexity_df

Unnamed: 0,rating,word,sd,z
0,0.428571,woman,0.79,-1.492158
1,1.285714,walk,2.21,-1.465328
2,1.571429,tells,3.31,-1.456384
3,1.857143,last,4.10,-1.447441
4,1.857143,next,4.49,-1.447441
...,...,...,...,...
1155,94.428571,systematic,6.70,1.450255
1156,95.428571,diplomatic,6.43,1.481557
1157,96.285714,referendum,5.25,1.508388
1158,96.714286,archaeological,5.68,1.521803


In [36]:
# Frequency tables, one per dataset: whitespace-separated columns
# (word, frequency) without a header row. The natural log of the frequency
# is added as an extra column.
# The separator regex is written as a raw string, because "\s" in a plain
# string literal is an invalid escape sequence and triggers a warning.
frequency_df_formality = pd.read_csv(formality_frequency_file, sep = r"\s+", header = None)
frequency_df_formality.columns = ["word", "frequency"]
frequency_df_formality["log_frequency"] = np.log(frequency_df_formality["frequency"])

frequency_df_complexity = pd.read_csv(complexity_frequency_file, sep = r"\s+", header = None)
frequency_df_complexity.columns = ["word", "frequency"]
frequency_df_complexity["log_frequency"] = np.log(frequency_df_complexity["frequency"])

print(frequency_df_formality)
print(frequency_df_complexity)

             word  frequency  log_frequency
0       someplace     533015      13.186305
1            chow     369072      12.818747
2            yeah   11643327      16.270244
3        dressing    3817659      15.155148
4         grandma    1134937      13.942088
...           ...        ...            ...
1269     scrutiny    2737436      14.822532
1270  endorsement    4669066      15.356470
1271   inequality    2893527      14.877987
1272      adapted    5603112      15.538833
1273     exchange   38565747      17.467875

[1274 rows x 3 columns]
                word  frequency  log_frequency
0              woman   51276703      17.752747
1               walk   27633191      17.134528
2              tells   14620721      16.497950
3               last  241219808      19.301219
4               next  183243510      19.026326
...              ...        ...            ...
1155      systematic    5141697      15.452894
1156      diplomatic    2659134      14.793511
1157      referendum    

# Seeds

Here the seeds come in pairs. Marianna extracted them from the Pavlick/Nenkova "pairs" data by using the top-rated pairs. 

In [37]:
# Seed pairs for the complexity dimension, written one "<word> - <word>" pair per line.
complexity_seeds_str = """work - employment
further - subsequently
strong - powerful
train - railway
shown - indicated"""

# one tuple per line, holding the whitespace-stripped pieces around the "-"
complexity_seeds = [
    tuple(piece.strip() for piece in line.split("-"))
    for line in complexity_seeds_str.split("\n")
]

complexity_seeds
    

[('work', 'employment'),
 ('further', 'subsequently'),
 ('strong', 'powerful'),
 ('train', 'railway'),
 ('shown', 'indicated')]

In [38]:
# Seed pairs for the formality dimension, written one "<word> - <word>" pair per line.
formality_seeds_str = """winner - recipient
terrible - disastrous
membership - affiliation
highest - paramount
test - verify"""

# one tuple per line, holding the whitespace-stripped pieces around the "-"
formality_seeds = [
    tuple(piece.strip() for piece in line.split("-"))
    for line in formality_seeds_str.split("\n")
]

formality_seeds

[('winner', 'recipient'),
 ('terrible', 'disastrous'),
 ('membership', 'affiliation'),
 ('highest', 'paramount'),
 ('test', 'verify')]

## Function for running crossvalidation

In [42]:
import eval_dim
import compute_dim
import statistics

def crossvalidation(method, word_vectors, df, seedpairs, random_seed = 123):
    """Crossvalidate one dimension-prediction method on one rating dataset.

    Parameters
    ----------
    method : dict
        Settings of the run. method["method"] selects the approach
        ("seedbased", "fitted", "fitted_seedwords", "fitted_seeddims",
        "combined" or "frequency"), method["numfolds"] the number of folds.
        The fitted variants also read "feature_dim" and, depending on the
        variant, "offset", "jitter", "do_average" and "alpha".
    word_vectors : mapping word -> vector
        Has to contain every word in df.word.
    df : pandas.DataFrame
        Needs the columns "word" and "z" (gold rating). The column "Pred" is
        (over)written in place in every fold, so after the call it holds the
        predictions made in the last fold.
    seedpairs : iterable of 2-tuples
        The first member of each pair is used as negative seed word, the
        second as positive seed word.
    random_seed : int
        Passed on to the fitting routines of compute_dim. The assignment of
        datapoints to folds does not depend on it; that always uses a fixed seed.

    Returns
    -------
    list of dict
        One dictionary per fold with the keys "ocp" and "mse".

    Raises
    ------
    ValueError
        For the "frequency" method when df is neither the global formality_df
        nor the global complexity_df, as no frequency table is known for it.
    Exception
        For an unknown method["method"].
    """
    neg_seedwords = [n for n, _ in seedpairs]
    pos_seedwords = [p for _, p in seedpairs]

    all_vectors = [ word_vectors[w] for w in df.word]
    num_items = len(all_vectors)

    # crossvalidation setup: give fold indices to datapoints.
    # The seed is fixed, so every call on the same data uses the same folds.
    rng = np.random.default_rng(seed = 3)
    fold = rng.integers(low = 0, high = method["numfolds"], size = num_items)

    # store the evaluation results from the different test folds
    all_evals = [ ]

    # iterate over folds, evaluate for each of them
    for testfold in range(method["numfolds"]):
        # positions of the training and test datapoints for this fold
        test_indices =  [i for i in range(num_items) if fold[i] == testfold]
        train_indices = [i for i in range(num_items) if fold[i] != testfold]

        gold_test =  df["z"].iloc[test_indices].tolist()
        gold_train = df["z"].iloc[train_indices].tolist()

        vec_train = [all_vectors[i] for i in train_indices]

        # every branch writes predictions for ALL datapoints into df["Pred"]
        if method["method"] == "seedbased":
            # seed-based dimension; the training data is handed to the prediction step
            dimension = compute_dim.dimension_seedbased(pos_seedwords, neg_seedwords, word_vectors, paired = True)
            df["Pred"] = compute_dim.predict_coord_fromtrain(vec_train, gold_train, dimension, all_vectors)

        elif method["method"] == "fitted":
            dimension, weight, bias = compute_dim.dimension_fitted_fromratings(vec_train, gold_train,
                                                                               method["feature_dim"],
                                                                               random_seed = random_seed)

            df["Pred"] = compute_dim.predict_coord_fromline(all_vectors, dimension, weight, bias)

        elif method["method"] == "fitted_seedwords":
            dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seedwords(vec_train, gold_train,
                                                            method["feature_dim"],
                                                            pos_seedwords, neg_seedwords, word_vectors,
                                                            offset = method["offset"], jitter = method["jitter"],
                                                            random_seed = random_seed)

            df["Pred"] = compute_dim.predict_coord_fromline(all_vectors, dimension, weight, bias)

        elif method["method"] == "fitted_seeddims":
            dimension, weight, bias = compute_dim.dimension_fitted_fromratings_seeddims(vec_train, gold_train,
                                                            method["feature_dim"],
                                                            pos_seedwords, neg_seedwords, word_vectors,
                                                            do_average = method["do_average"],
                                                            alpha = method["alpha"],
                                                            random_seed = random_seed,
                                                            paired = True)
            df["Pred"] = compute_dim.predict_coord_fromline(all_vectors, dimension, weight, bias)

        elif method["method"] == "combined":
            dimension, weight, bias = compute_dim.dimension_fitted_fromratings_combined(vec_train, gold_train,
                                                            method["feature_dim"],
                                                            pos_seedwords, neg_seedwords, word_vectors,
                                                            offset = method["offset"], jitter = method["jitter"],
                                                            do_average = method["do_average"],
                                                            alpha = method["alpha"],
                                                            random_seed = random_seed,
                                                            paired = True)
            df["Pred"] = compute_dim.predict_coord_fromline(all_vectors, dimension, weight, bias)

        elif method["method"] == "frequency":
            # The frequency table is picked by object identity with the global data frames.
            # The assignment is aligned on the index, i.e. row i of the frequency table
            # is taken to describe row i of df.
            # NOTE(review): nothing checks that the words in the two tables agree.
            if df is formality_df:
                df["Pred"] = frequency_df_formality["log_frequency"]
            elif df is complexity_df:
                df["Pred"] = frequency_df_complexity["log_frequency"]
            else:
                # without this, a missing or stale "Pred" column would be evaluated
                raise ValueError("frequency method: no frequency table known for this data frame")

        else:
            raise Exception("shouldn't be here")

        # predictions for the test datapoints only, selected by position
        pred_test = df["Pred"].iloc[test_indices].tolist()

        # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
        # MSE: evaluate on test only
        e = { "ocp" : eval_dim.pairwise_order_consistency_wrt(df["z"], df["Pred"], test_indices),
              "mse" : eval_dim.mean_squared_error(gold_test, pred_test) }

        all_evals.append(e)

    return all_evals


## Aggregating results

This differs from the Grand et al. evaluation in that there are no sub-conditions, just a single dataset.
We therefore aggregate directly over all results in the list of result dictionaries.

In [40]:
from collections import defaultdict
import statistics

def eval_summary_by(evals, keylabel):
    """Summarize the values stored under one key in a list of result dictionaries.

    Entries whose value for keylabel is None are left out.

    Parameters
    ----------
    evals : list of dict
        result dictionaries, each of which has the key keylabel
    keylabel : hashable
        the key whose values are summarized

    Returns
    -------
    tuple (mean, median, standard deviation)
        The standard deviation is NaN when fewer than two values are left,
        because it is not defined then.

    Raises
    ------
    statistics.StatisticsError
        when no values are left at all
    """
    vals = [e[keylabel] for e in evals if e[keylabel] is not None]

    # statistics.stdev() needs at least two datapoints
    spread = statistics.stdev(vals) if len(vals) > 1 else float("nan")

    return (statistics.mean(vals), statistics.median(vals), spread)

def eval_eval(results):
    """Condense a list of result dictionaries into the three reported numbers.

    results holds one dictionary per evaluation, with the keys "ocp" and
    "mse"; in this notebook the dictionaries all stem from the same dataset,
    but from different crossvalidation folds and from runs with different
    random seeds.

    Returns the tuple (mean of "ocp", mean of "mse", median of "mse").
    """
    ocp_stats = eval_summary_by(results, "ocp")
    mse_stats = eval_summary_by(results, "mse")

    # each summary is (mean, median, standard deviation)
    ocp_mean, _, _ = ocp_stats
    mse_mean, mse_med, _ = mse_stats
    return ocp_mean, mse_mean, mse_med


# Evaluation


## Frequency baseline

In [41]:
# Baseline: the log frequency of a word serves as its predicted rating.
method = {"method": "frequency", "numfolds" : numfolds}

print("Frequency baseline")

for data_label, data_df, data_seeds in zip(("Formality", "Complexity"),
                                           (formality_df, complexity_df),
                                           (formality_seeds, complexity_seeds)):

    # crossvalidation() itself looks up frequency_df_formality / frequency_df_complexity
    results = crossvalidation(method, word_vectors, data_df, data_seeds)

    ocp_mean, mse_mean, mse_med = eval_eval(results)

    report = [f"OC_P mean {ocp_mean:.3f}",
              f"MSE mean {mse_mean:.3f}",
              f"MSE median {mse_med:.3f}"]
    print("\t", data_label, "dataset", *report)
    

Frequency baseline
goldvalues :  [-1.6896614587105296, -1.6896614587105296, -1.6579681341329078, -1.626274809555286, -1.6172195830169154, -1.5764710319009227, -1.5628881604000422, -1.5628881604000422, -1.5402500940541155, -1.5085567694764936, -1.5085567694764936, -1.5040291562073083, -1.4995015112447982, -1.4904462847064277, -1.4859186714372423, -1.4768634448988718, -1.4361148620895545, -1.4180044090128132, -1.3998939242427473, -1.3908386977043767, -1.381783471166006, -1.3682005996651254, -1.36367298639594, -1.3591453731267547, -1.3591453731267547, -1.350090146588384, -1.3229244352799474, -1.3183968220107622, -1.3183968220107622, -1.3138692087415769, -1.3048139505098815, -1.3048139505098815, -1.295758723971511, -1.2912311107023255, -1.282175884163955, -1.277648239201445, -1.277648239201445, -1.264065399393889, -1.264065399393889, -1.2595377861247037, -1.2504825595863331, -1.2459549146238231, -1.2459549146238231, -1.2368996880854524, -1.2323720748162672, -1.2233168482778964, -1.22331684

## Seed-based

In [None]:
# Seed-based dimension, crossvalidated on both datasets.
method = { "method": "seedbased",
          "numfolds" : numfolds}

print("Seed-based method")

for data_label, data_df, data_seeds in [ ("Formality", formality_df, formality_seeds),
                                         ("Complexity", complexity_df, complexity_seeds) ]:

    # (the leftover debugging output that dumped formality_df on every
    # iteration, also for the complexity data, has been removed)
    results = crossvalidation(method, word_vectors, data_df, data_seeds)

    ocp_mean, mse_mean, mse_med = eval_eval(results)

    print("\t", data_label, "dataset",
          f"OC_P mean {ocp_mean:.3f}",
          f"MSE mean {mse_mean:.3f}",
          f"MSE median {mse_med:.3f}")
 

# Fitted

In [16]:
# Fitted dimension: crossvalidate once per random seed and pool the fold results.
method = {"method": "fitted", "numfolds" : numfolds, "feature_dim" : feature_dim}

print("Fitted method")

# reproducible list of random seeds for the fitting runs
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

for data_label, data_df, data_seeds in zip(("Formality", "Complexity"),
                                           (formality_df, complexity_df),
                                           (formality_seeds, complexity_seeds)):
    # pool the per-fold results of all random seeds
    results = [ ]
    for rval in randoms:
        results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                       random_seed = rval))

    ocp_mean, mse_mean, mse_med = eval_eval(results)

    report = [f"OC_P mean {ocp_mean:.3f}",
              f"MSE mean {mse_mean:.3f}",
              f"MSE median {mse_med:.3f}"]
    print("\t", data_label, "dataset", *report)


Fitted method
testfold
testfold
testfold


KeyboardInterrupt: 

# Fitted, with seeds as words

In [None]:
# Fitted dimension with the seed words added as datapoints:
# crossvalidate once per random seed and pool the fold results.
method = {"method": "fitted_seedwords",
          "numfolds" : numfolds,
          "feature_dim" : feature_dim,
          "offset" : 2.0,
          "jitter" : True}

print("Fitted method with seed words")

# reproducible list of random seeds for the fitting runs
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

for data_label, data_df, data_seeds in zip(("Formality", "Complexity"),
                                           (formality_df, complexity_df),
                                           (formality_seeds, complexity_seeds)):
    # pool the per-fold results of all random seeds
    results = [ ]
    for rval in randoms:
        results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                       random_seed = rval))

    ocp_mean, mse_mean, mse_med = eval_eval(results)

    report = [f"OC_P mean {ocp_mean:.3f}",
              f"MSE mean {mse_mean:.3f}",
              f"MSE median {mse_med:.3f}"]
    print("\t", data_label, "dataset", *report)


# Fitted, with seed dimensions

In [None]:
# Fitted dimension that also uses the seed dimensions:
# crossvalidate once per random seed and pool the fold results.
method = {"method": "fitted_seeddims",
          "numfolds" : numfolds,
          "feature_dim" : feature_dim,
          "alpha" : 0.02,
          "do_average" : True}

print("Fitted method with seed dimensions")

# reproducible list of random seeds for the fitting runs
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

for data_label, data_df, data_seeds in zip(("Formality", "Complexity"),
                                           (formality_df, complexity_df),
                                           (formality_seeds, complexity_seeds)):
    # pool the per-fold results of all random seeds
    results = [ ]
    for rval in randoms:
        results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                       random_seed = rval))

    ocp_mean, mse_mean, mse_med = eval_eval(results)

    report = [f"OC_P mean {ocp_mean:.3f}",
              f"MSE mean {mse_mean:.3f}",
              f"MSE median {mse_med:.3f}"]
    print("\t", data_label, "dataset", *report)


# Fitted, with seeds as words and dimensions

In [None]:
# Fitted dimension using the seeds both as words and as dimensions:
# crossvalidate once per random seed and pool the fold results.
method = {"method": "combined",
          "numfolds" : numfolds,
          "feature_dim" : feature_dim,
          "offset" : 2,
          "jitter" : True,
          "alpha" : 0.05,
          "do_average" : True}

print("Fitted method with seeds as words and dim.s")

# reproducible list of random seeds for the fitting runs
random.seed(5)
randoms = [random.randrange(0,100) for _ in range(num_randseeds)]

for data_label, data_df, data_seeds in zip(("Formality", "Complexity"),
                                           (formality_df, complexity_df),
                                           (formality_seeds, complexity_seeds)):
    # pool the per-fold results of all random seeds
    results = [ ]
    for rval in randoms:
        results.extend(crossvalidation(method, word_vectors, data_df, data_seeds,
                                       random_seed = rval))

    ocp_mean, mse_mean, mse_med = eval_eval(results)

    report = [f"OC_P mean {ocp_mean:.3f}",
              f"MSE mean {mse_mean:.3f}",
              f"MSE median {mse_med:.3f}"]
    print("\t", data_label, "dataset", *report)
