# Reading the GLoVE data

In [1]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt

In [2]:
glove_path = "glove/glove.42B.300d.zip"
glove_file = "glove.42B.300d.txt"

feature_dim = 300

word_vectors = { }

with zipfile.ZipFile(glove_path) as azip:
    with azip.open(glove_file) as f:
        for line in f:
            values = line.split()
            word = values[0].decode()
            vector = np.array(values[1:], dtype=np.float32)
            word_vectors[word] = vector

# Read Grand features

In [3]:
grandratings_dir = "Grand_etal_csv/"
grandfeatures_path = "/Users/kee252/Data/grand_directions_in_space/features.xlsx"

grandfeatures_df = pd.read_excel(grandfeatures_path)

  warn(msg)


# Functions: Seed-based dimensions

In [4]:
# averaging over seed pair vectors
def average_dim_vector(seeds_pos, seeds_neg, space):
    diffvectors = [ ]
    
    for negword in seeds_neg:
        for posword in seeds_pos:
            diffvectors.append(space[posword] - space[negword])

    # average
    dimvec = np.mean(diffvectors, axis = 0)
    return dimvec

# scalar projection of a vector along a given direction:
# length of the projection vector
# (vec * direction) / ||direction||
def vector_scalar_projection(vec, direction):
    dir_veclen = math.sqrt(np.dot(direction, direction))
    return np.dot(vec, direction) / dir_veclen

# projection of a vector along a direction,
# where we want the actual vector, not its length
# (vec * direction1) * direction1
def vector_projection(vec, direction):
    dir_veclen = math.sqrt(np.dot(direction, direction))
    direction1 = direction / dir_veclen
    return np.dot(vec, direction1) * direction1

from scipy import stats

# fitted dimensions come with a weight and bias for prediction. 
# we can compute those also for seed-based dimensions
# to make predictions on the same order of magnitude as the original ratings.
# We need that in order to do a Mean Squared Error (MSE) evaluation.
# 
# This function uses linear regression to compute weight and bias. formula:
#
# model_rating ~ weight * gold_rating + bias
#
# formulated this way round to match the formulation in the objective function
# of the fitted dimensions model
def coef_for_seed_dimension(gold_ratings, model_ratings):
    result = stats.linregress(gold_ratings, model_ratings)
    
    return (result.slope, result.intercept)

# given a scalar projection on a seed-based dimension,
# along with weight and bias computed by
# coef_for_seed_dimension,
# compute a predicted rating.
def sprediction(sprojection, weight, bias):
    return (sprojection - bias) / weight

# Functions: Fitted dimensions

## Original fitted dimension formulation

In [5]:
# computing a fitted dimension based on a list of word vectors and matching list of gold ratings.
# feature_dim is dimensionality of the vectors. 
# This fits a dimension using an objective function based on the Jameel and Schockaert idea
def ideal_dimension(word_vectors_list, gold_ratings, feature_dim, random_seed = 123):
    torch.manual_seed(random_seed) 

    # we compute: a vector of same dimensionality as our embeddings,
    # and a weight and a bias constant
    feature_vector = torch.randn(feature_dim, requires_grad=True) # dtype=torch.float32)
    weight_constant = torch.randn(1, requires_grad=True) 
    bias_constant = torch.randn(1, requires_grad=True)    

    
    optimizer = optim.Adam([feature_vector, weight_constant, bias_constant], lr=0.01)
    # optimizer = optim.SGD([feature_vector, weight_constant, bias_constant], lr=0.001)

    # Number of optimization steps
    num_steps = 1000

    losses = []

    # Gradient clipping threshold
    max_norm = 1.0  

    for step in range(num_steps):
        total_loss = 0

        # for i in range(len(X_train)):
        # 	word_embedding = torch.tensor(X_train[i]) 
        # 	gold_rating = y_train[i]

        for i in range(len(word_vectors_list)):
            word_embedding = torch.tensor(word_vectors_list[i])
            gold_rating = gold_ratings[i]

            dot_product = torch.dot(word_embedding, feature_vector)
            weighted_gold = gold_rating * weight_constant
            loss = ((dot_product - weighted_gold - bias_constant) ** 2)

            total_loss += loss

        # Average loss over all words in X_train or in the whole human dic (2,801 annotated single words in total)
        # avg_loss = total_loss / len(X_train)
        avg_loss = total_loss / len(word_vectors_list)

        avg_loss.backward()

        # Compute the gradient norms and monitor them during training

        # feature_vector_grad_norm = torch.norm(feature_vector.grad)
        # print(f"Step {step+1}, Feature Vector Gradient Norm: {feature_vector_grad_norm.item()}")

        torch.nn.utils.clip_grad_norm_([feature_vector, weight_constant, bias_constant], max_norm)
        optimizer.step()

        losses.append(avg_loss.item())
    
    return (feature_vector.detach().numpy(), weight_constant.item(), bias_constant.item())

# we have defined our loss to shoot for
# (vec * direction) \approx weight* goldrating + bias
# hence the predicted rating is
# predictedrating = ((vec * direction) - bias ) / weight
def iprediction(vec, direction, weight, bias):
    return (np.dot(vec, direction) - bias) / weight
    

## Fitted dimension with seeds as additional words

In [6]:
# combined method: fitted dimension with seeds
# thisdata_vectors: vectors for category words
# thisdata_gold: gold ratings for category words
# feature_dim: dimensionality
# pos_seedwords: list of positive seedwords
# neg_seedwords: list of negative seedwords
# word_vectors: mapping word-> vector
# offset: synthetic rating for seed words should be this far beyond
#   the rating of the most positive/most negative word
# jitter: if true: add a bit of random variation to seed word ratings
def fitted_dimension_withseedwords(thisdata_vectors, thisdata_gold, feature_dim, 
                                   pos_seedwords, neg_seedwords, word_vectors, offset = 0.5, jitter = False):
    # adding seed words
    
    lowvalue = min(thisdata_gold) - offset
    highvalue = max(thisdata_gold) + offset
        
    for seed in pos_seedwords:
        thisdata_vectors.append( word_vectors [ seed ])
        j = random.uniform(0.001, 0.005) if jitter else 0
        thisdata_gold.append( highvalue + j)

    for seed in neg_seedwords:
        thisdata_vectors.append( word_vectors [ seed ])
        j = random.uniform(0.001, 0.005) if jitter else 0
        thisdata_gold.append( lowvalue - j)
            
            
    return ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)


## Fitted dimensions with combined loss function: similarity to seed dimensions and match with human ratings

In [16]:
# fitted dimension, with seeds too

# Weight/bias defined as trainable variables to optimize during backpropagation along with the feature vector
#
# parameters:
# - word vectors list: list of vectors for the words in our category
# - gold ratings: gold ratings on the dimension of interest, 
#    in the same order as the word vectors list
# - feature_dim: dimensionality of the vectors
# - random seed
#
# returns: 
# - computed vector for the ideal dimension
# - weight and bias such that vector * idealvec \approx weight * goldrating + bias
#
def fitted_dimension_withseeddims(word_vectors_list, gold_ratings, seed_dims, feature_dims, alpha = 0.5, random_seed = 123):
    torch.manual_seed(random_seed) 

    # we compute: a vector of same dimensionality as our embeddings,
    # and a weight and a bias constant
    feature_vector = torch.randn(feature_dim, requires_grad=True) # dtype=torch.float32)
    weight_constant = torch.randn(1, requires_grad=True) 
    bias_constant = torch.randn(1, requires_grad=True)    

    
    optimizer = optim.Adam([feature_vector, weight_constant, bias_constant], lr=0.01)
    # optimizer = optim.SGD([feature_vector, weight_constant, bias_constant], lr=0.001)

    # Number of optimization steps
    num_steps = 1000

    losses = []
    
    criterion2 = torch.nn.CosineEmbeddingLoss()

    # Gradient clipping threshold
    max_norm = 1.0  

    for step in range(num_steps):
        total_loss1 = 0

        # for i in range(len(X_train)):
        # 	word_embedding = torch.tensor(X_train[i]) 
        # 	gold_rating = y_train[i]

        for i in range(len(word_vectors_list)):
            word_embedding = torch.tensor(word_vectors_list[i])
            gold_rating = gold_ratings[i]

            dot_product = torch.dot(word_embedding, feature_vector)
            weighted_gold = gold_rating * weight_constant
            loss1 = ((dot_product - weighted_gold - bias_constant) ** 2)

            total_loss1 += loss1
            
        
        avg_loss1 = total_loss1 / len(word_vectors_list)
        loss2 = sum([criterion2( feature_vector, d, torch.tensor(1.0)) for d in seed_dims]) / len(seed_dims)
        total_loss = alpha*avg_loss1 + (1-alpha) * loss2

        total_loss.backward()

        # Compute the gradient norms and monitor them during training

        # feature_vector_grad_norm = torch.norm(feature_vector.grad)
        # print(f"Step {step+1}, Feature Vector Gradient Norm: {feature_vector_grad_norm.item()}")

        torch.nn.utils.clip_grad_norm_([feature_vector, weight_constant, bias_constant], max_norm)
        optimizer.step()

        losses.append(total_loss.item())

    
    return (feature_vector.detach().numpy(), weight_constant.item(), bias_constant.item())

# Function for reading in a specific Grand dataset

In [8]:
# reading in Grand data
def read_grand_data(filename, grandratings_dir, grandfeatures_df):
    # extract category and feature
    grandcategory, grandfeature = filename[:-4].split("_")
        
    # read human ratings, make gold column
    df = pd.read_csv(grandratings_dir + filename)
    df["Average"] = [row.iloc[1:26].sum() / 25 for _, row in df.iterrows()]
    # z-scores of average ratings
    df["Gold"] = (df["Average"] - df["Average"].mean()) / df["Average"].std()
        
    # obtain seed words from excel file
    relevant_row = grandfeatures_df[grandfeatures_df.Dimension == grandfeature]
    seedwords = relevant_row.iloc[:, 1:].values.flatten().tolist()
    pos_seedwords = seedwords[:3]
    neg_seedwords = seedwords[3:]
    
    return (grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)

# Evaluation measures: OC_p, MSE

In [9]:
import itertools

# pairwise order consistency, normal definition
def pairwise_order_consistency(goldvalues, modelvalues):
    if len(goldvalues) != len(modelvalues):
        raise Exception("shouldn't be here")
        
    outcomes = [ ]
    for i1, i2 in itertools.combinations(range(len(goldvalues)), 2):
        goldrel = (goldvalues[i1] > goldvalues[i2])
        modelrel = (modelvalues[i1] > modelvalues[i2])
        outcomes.append(int(goldrel == modelrel))
        
    return sum(outcomes) / len(outcomes)

# pairwise order consistency only for a subset of the model values 
def pairwise_order_consistency_wrt(goldvalues, modelvalues, test_indices):
    if len(goldvalues) != len(modelvalues):
        raise Exception("shouldn't be here")
        
    outcomes = [ ]
    for i1 in test_indices:
        for i2 in range(len(goldvalues)):
            if i1 == i2: continue
            
            goldrel = (goldvalues[i1] > goldvalues[i2])
            modelrel = (modelvalues[i1] > modelvalues[i2])
            outcomes.append(int(goldrel == modelrel))
        
    return sum(outcomes) / len(outcomes)

# mean squared error
def mean_squared_error(goldvalues, modelvalues):
    if len(goldvalues) != len(modelvalues):
        raise Exception("shouldn't be here")
    return sum([(g - m)**2 for g, m in zip(goldvalues, modelvalues)]) / len(goldvalues)

# Making a development set

In [13]:
filenames = [f for f in os.listdir(grandratings_dir) if f.endswith("csv")]
[ filename[:-4].split("_") for filename in filenames]

import random
random.seed(789)
devset = random.sample(filenames, 6)
[ filename[:-4].split("_") for filename in devset]

[['cities', 'danger'],
 ['states', 'political'],
 ['animals', 'wetness'],
 ['cities', 'intelligence'],
 ['animals', 'weight'],
 ['names', 'age']]

# Running an evaluation with crossvalidation on the development set

In [19]:
import numpy as np
import statistics

print("Training and test split")

numfolds = 5
offset = 1.0
alpha = 0.1
jitter = False

all_evals = [ ]

for filename in devset: 
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        
        # storage for word vectors and gold values for this dataset
        all_thisdata_vectors = []
        all_thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            all_thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            all_thisdata_gold.append( row.Gold)
      
        # crossvalidation setup: give indices to datapoints
        fold = np.random.randint(numfolds, size = len(all_thisdata_gold))
        
        # store the evaluation results from the different test folds
        evals = [ ]
        
        # iterate over folds, evaluate for each of them
        for testfold in range(numfolds):
            # compute training and test data for this fold
            test_indices =  [i for i in range(len(all_thisdata_gold)) if fold[i] == testfold]
            train_indices = [i for i in range(len(all_thisdata_gold)) if fold[i] != testfold]
        
            gold_test =  [ell["Gold"] for _, ell in df.iloc[ test_indices ].iterrows()]
            gold_train = [ ell["Gold"] for _, ell in df.iloc[ train_indices ].iterrows()]
            words_test =  [ell["Row"] for _, ell in df.iloc[ test_indices].iterrows()]
            words_train = [ell["Row"] for _, ell in df.iloc[ train_indices].iterrows()]
            vec_test =  [word_vectors[ w ] for w in words_test]
            vec_train = [word_vectors[ w ] for w in words_train ]
        

            # compute seed-based dimension, and its predictions
            seed_dim = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
            # prediction: first scalar projection, then compute weight and bias,
            # and use them to make predictions
            p0 = [vector_scalar_projection( word_vectors[w], seed_dim) for w in df["Row"]]
            weight, bias = coef_for_seed_dimension(df["Gold"], p0)
            df["SPred"] = [sprediction(p, weight, bias) for p in p0]
            
            
            # compute fitted dimension, and its predictions
            fitted_dim, fitted_wt, fitted_bias = ideal_dimension(vec_train, gold_train, feature_dim)
            df["FPred"] = [ iprediction( word_vectors[ w], fitted_dim, fitted_wt, fitted_bias) for w in df["Row"]]
            
            
            # compute fitted dimension with seeds, and its predictions
            fitted_dim, fitted_wt, fitted_bias = fitted_dimension_withseedwords(vec_train, gold_train, feature_dim, 
                                                                                pos_seedwords, neg_seedwords, word_vectors,
                                                                                offset = offset, jitter = jitter)
            df["FSPred"] = [ iprediction( word_vectors[ w], fitted_dim, fitted_wt, fitted_bias) for w in df["Row"]]
            
            # compute fitted dimension with seed dimensions, and its predictions
            diffvectors = [ ]
    
            for negword in neg_seedwords:
                for posword in pos_seedwords:
                    diffvectors.append(word_vectors[posword] - word_vectors[negword])
                    
            dimvec = torch.from_numpy(np.mean(diffvectors, axis = 0))
            
            fitted_dim, fitted_wt, fitted_bias = fitted_dimension_withseeddims(vec_train, gold_train, [dimvec], 
                                                                               feature_dim, alpha = alpha)
        
            
            df["FS2Pred"] = [ iprediction( word_vectors[ w], fitted_dim, fitted_wt, fitted_bias) for w in df["Row"]]
            
            # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
            # MSE: evaluate on test only
            e = { "ocp_s" : pairwise_order_consistency_wrt(df["Gold"], df["SPred"], test_indices),
                  "ocp_f" : pairwise_order_consistency_wrt(df["Gold"], df["FPred"], test_indices),
                  "ocp_fs": pairwise_order_consistency_wrt(df["Gold"], df["FSPred"], test_indices),
                  "ocp_fs2": pairwise_order_consistency_wrt(df["Gold"], df["FS2Pred"], test_indices),
                  "mse_s" : mean_squared_error(gold_test, [p for i, p in enumerate(df["SPred"]) if i in test_indices]),
                  "mse_f" : mean_squared_error(gold_test, [p for i, p in enumerate(df["FPred"]) if i in test_indices]),
                  "mse_fs": mean_squared_error(gold_test, [p for i, p in enumerate(df["FSPred"]) if i in test_indices]),
                  "mse_fs2": mean_squared_error(gold_test, [p for i, p in enumerate(df["FS2Pred"]) if i in test_indices])}
            
            evals.append(e)
            all_evals.append(e)
        
        print(grandcategory, "-", grandfeature)
        print("\t", end = "")
        for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed"), ("fs2", "Fitted-SeedD")]:
            ocps = [e["ocp_" + suffix] for e in evals]
            print(name, f"OC_p {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})", end = " ")
        print("\n\t", end = "")
        for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed"), ("fs2", "Fitted-SeedD")]:
            mses = [e["mse_" + suffix] for e in evals]
            print(name, f"MSE {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f})", end = " ")
        print()

Training and test split
cities - danger
	Seed OC_p 0.770 (0.03) Fitted OC_p 0.651 (0.04) Fitted-Seed OC_p 0.658 (0.04) Fitted-SeedD OC_p 0.826 (0.04) 
	Seed MSE 0.975 (0.38) Fitted MSE 12.674 (5.61) Fitted-Seed MSE 11.837 (3.38) Fitted-SeedD MSE 0.594 (0.20) 
states - political
	Seed OC_p 0.663 (0.06) Fitted OC_p 0.604 (0.06) Fitted-Seed OC_p 0.591 (0.08) Fitted-SeedD OC_p 0.768 (0.03) 
	Seed MSE 4.877 (2.78) Fitted MSE 14.012 (9.66) Fitted-Seed MSE 22.456 (9.13) Fitted-SeedD MSE 0.945 (0.12) 
animals - wetness
	Seed OC_p 0.755 (0.05) Fitted OC_p 0.635 (0.14) Fitted-Seed OC_p 0.633 (0.14) Fitted-SeedD OC_p 0.774 (0.11) 
	Seed MSE 0.569 (0.26) Fitted MSE 9.231 (7.49) Fitted-Seed MSE 23.144 (18.38) Fitted-SeedD MSE 0.691 (0.32) 
cities - intelligence
	Seed OC_p 0.584 (0.06) Fitted OC_p 0.586 (0.06) Fitted-Seed OC_p 0.606 (0.10) Fitted-SeedD OC_p 0.802 (0.05) 
	Seed MSE 35.250 (22.71) Fitted MSE 10.895 (4.53) Fitted-Seed MSE 10.555 (4.67) Fitted-SeedD MSE 0.716 (0.35) 
animals - weight
	S

In [20]:
for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed"), ("fs2", "Fitted-SeedD")]:
    ocps = [e["ocp_" + suffix] for e in all_evals]
    print(name, f"OC_p mean {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})")
print()

for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed"), ("fs2", "Fitted-SeedD")]:
    mses = [e["mse_" + suffix] for e in evals]
    print(name, f"MSE med {statistics.median(mses):.3f} mean {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f})") 
    


Seed OC_p mean 0.667 (0.10)
Fitted OC_p mean 0.623 (0.08)
Fitted-Seed OC_p mean 0.623 (0.08)
Fitted-SeedD OC_p mean 0.813 (0.06)

Seed MSE med 1.318 mean 1.499 (0.49)
Fitted MSE med 13.149 mean 14.792 (4.35)
Fitted-Seed MSE med 16.654 mean 16.658 (3.29)
Fitted-SeedD MSE med 0.361 mean 0.391 (0.16)
