# running through all the datasets in Grand et al

# Reading the GLoVE data

In [1]:
import os
from scipy import stats
import numpy as np 
import pandas as pd
import zipfile
import math
import sklearn
import torch
import torch.optim as optim
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
import matplotlib.pyplot as plt

In [2]:
glove_path = "glove/glove.42B.300d.zip"
glove_file = "glove.42B.300d.txt"

feature_dim = 300

word_vectors = { }

with zipfile.ZipFile(glove_path) as azip:
    with azip.open(glove_file) as f:
        for line in f:
            values = line.split()
            word = values[0].decode()
            vector = np.array(values[1:], dtype=np.float32)
            word_vectors[word] = vector

# Read Grand features

In [3]:
grandratings_dir = "Grand_etal_csv/"
grandfeatures_path = "/Users/kee252/Data/grand_directions_in_space/features.xlsx"

grandfeatures_df = pd.read_excel(grandfeatures_path)

  warn(msg)


# Relevant functions

In [4]:
# averaging over seed pair vectors
def average_dim_vector(seeds_pos, seeds_neg, space):
    diffvectors = [ ]
    
    for negword in seeds_neg:
        for posword in seeds_pos:
            diffvectors.append(space[posword] - space[negword])

    # average
    dimvec = np.mean(diffvectors, axis = 0)
    return dimvec

# scalar projection of a vector along a given direction:
# length of the projection vector
# (vec * direction) / ||direction||
def vector_scalar_projection(vec, direction):
    dir_veclen = math.sqrt(np.dot(direction, direction))
    return np.dot(vec, direction) / dir_veclen

# projection of a vector along a direction,
# where we want the actual vector, not its length
# (vec * direction1) * direction1
def vector_projection(vec, direction):
    dir_veclen = math.sqrt(np.dot(direction, direction))
    direction1 = direction / dir_veclen
    return np.dot(vec, direction1) * direction1

# computing a fitted dimension based on a list of word vectors and matching list of gold ratings.
# feature_dim is dimensionality of the vectors. 
# This fits a dimension using an objective function based on the Jameel and Schockaert idea
def ideal_dimension(word_vectors_list, gold_ratings, feature_dim, random_seed = 123):
    torch.manual_seed(random_seed) 

    # we compute: a vector of same dimensionality as our embeddings,
    # and a weight and a bias constant
    feature_vector = torch.randn(feature_dim, requires_grad=True) # dtype=torch.float32)
    weight_constant = torch.randn(1, requires_grad=True) 
    bias_constant = torch.randn(1, requires_grad=True)    

    
    optimizer = optim.Adam([feature_vector, weight_constant, bias_constant], lr=0.01)
    # optimizer = optim.SGD([feature_vector, weight_constant, bias_constant], lr=0.001)

    # Number of optimization steps
    num_steps = 1000

    losses = []

    # Gradient clipping threshold
    max_norm = 1.0  

    for step in range(num_steps):
        total_loss = 0

        # for i in range(len(X_train)):
        # 	word_embedding = torch.tensor(X_train[i]) 
        # 	gold_rating = y_train[i]

        for i in range(len(word_vectors_list)):
            word_embedding = torch.tensor(word_vectors_list[i])
            gold_rating = gold_ratings[i]

            dot_product = torch.dot(word_embedding, feature_vector)
            weighted_gold = gold_rating * weight_constant
            loss = ((dot_product - weighted_gold - bias_constant) ** 2)

            total_loss += loss

        # Average loss over all words in X_train or in the whole human dic (2,801 annotated single words in total)
        # avg_loss = total_loss / len(X_train)
        avg_loss = total_loss / len(word_vectors_list)

        avg_loss.backward()

        # Compute the gradient norms and monitor them during training

        # feature_vector_grad_norm = torch.norm(feature_vector.grad)
        # print(f"Step {step+1}, Feature Vector Gradient Norm: {feature_vector_grad_norm.item()}")

        torch.nn.utils.clip_grad_norm_([feature_vector, weight_constant, bias_constant], max_norm)
        optimizer.step()

        losses.append(avg_loss.item())
    
    return (feature_vector.detach().numpy(), weight_constant.item(), bias_constant.item())

# we have defined our loss to shoot for
# (vec * direction) \approx weight* goldrating + bias
# hence the predicted rating is
# predictedrating = ((vec * direction) - bias ) / weight
def iprediction(vec, direction, weight, bias):
    return (np.dot(vec, direction) - bias) / weight
    


In [5]:
# reading in Grand data
def read_grand_data(filename, grandratings_dir, grandfeatures_df):
    # extract category and feature
    grandcategory, grandfeature = filename[:-4].split("_")
        
    # read human ratings, make gold column
    df = pd.read_csv(grandratings_dir + filename)
    df["Average"] = [row.iloc[1:26].sum() / 25 for _, row in df.iterrows()]
    # z-scores of average ratings
    df["Gold"] = (df["Average"] - df["Average"].mean()) / df["Average"].std()
        
    # obtain seed words from excel file
    relevant_row = grandfeatures_df[grandfeatures_df.Dimension == grandfeature]
    seedwords = relevant_row.iloc[:, 1:].values.flatten().tolist()
    pos_seedwords = seedwords[:3]
    neg_seedwords = seedwords[3:]
    
    return (grandcategory, grandfeature, pos_seedwords, neg_seedwords, df)

In [6]:
from scipy import stats

# fitted dimensions come with a weight and bias for prediction. 
# we can compute those also for seed-based dimensions
# to make predictions on the same order of magnitude as the original ratings.
# We need that in order to do a Mean Squared Error (MSE) evaluation.
# 
# This function uses linear regression to compute weight and bias. formula:
#
# model_rating ~ weight * gold_rating + bias
#
# formulated this way round to match the formulation in the objective function
# of the fitted dimensions model
def coef_for_seed_dimension(gold_ratings, model_ratings):
    result = stats.linregress(gold_ratings, model_ratings)
    
    return (result.slope, result.intercept)

# given a scalar projection on a seed-based dimension,
# along with weight and bias computed by
# coef_for_seed_dimension,
# compute a predicted rating.
def sprediction(sprojection, weight, bias):
    return (sprojection - bias) / weight

# Correlation with human ratings: seed-based versus fitted dimensions

We compute fitted dimensions from the entirety of the human ratings data. We also compute seed-based dimensions.
We evaluate both of them using Pearson's r for correlation with human ratings.

We find that we can in all cases compute a near-perfect dimension matching the human ratings. 

In [7]:
results_fitted = { }

print("Seed-based versus fitted dimensions, correlation with human ratings")

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        # make average seed-based dimension, use for predictions
        dimension_vec = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
        df["Pred"] = [vector_scalar_projection( word_vectors[w], dimension_vec) for w in df["Row"]]
        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            thisdata_gold.append( row.Gold)
        
        ideal_dim, ideal_wt, ideal_bias = ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        
        df["IPred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
        
        # evaluate
        res_s = stats.pearsonr(df["Gold"], df["Pred"])
        res_i = stats.pearsonr(df["Gold"], df["IPred"])
        
        print(f"{grandcategory:12s}-{grandfeature:12s} Pred r={res_s.statistic:.3f} p={res_s.pvalue:.3f} IPred ={res_i.statistic:.3f} p={res_i.pvalue:.3f}")
        
        results_fitted[ ( grandcategory, grandfeature) ] = {
            "grandcategory" : grandcategory,
            "grandfeature" : grandfeature,
            "seed_r" : res_s.statistic,
            "seed_p" : res_s.pvalue,
            "fitted_r" : res_i.statistic,
            "fitted_p" : res_i.pvalue } 


Seed-based versus fitted dimensions, correlation with human ratings
cities      -temperature  Pred r=-0.086 p=0.553 IPred =0.986 p=0.000
professions -intelligence Pred r=0.468 p=0.001 IPred =0.999 p=0.000
cities      -intelligence Pred r=0.161 p=0.265 IPred =0.997 p=0.000
clothing    -location     Pred r=0.323 p=0.022 IPred =0.996 p=0.000
cities      -arousal      Pred r=0.001 p=0.996 IPred =0.994 p=0.000
clothing    -arousal      Pred r=0.185 p=0.199 IPred =0.987 p=0.000
states      -size         Pred r=0.331 p=0.019 IPred =0.992 p=0.000
sports      -intelligence Pred r=0.613 p=0.000 IPred =0.998 p=0.000
names       -age          Pred r=0.616 p=0.000 IPred =0.999 p=0.000
clothing    -wealth       Pred r=0.369 p=0.008 IPred =0.998 p=0.000
weather     -danger       Pred r=0.790 p=0.000 IPred =0.996 p=0.000
professions -danger       Pred r=0.446 p=0.001 IPred =0.993 p=0.000
clothing    -size         Pred r=0.097 p=0.503 IPred =0.993 p=0.000
animals     -size         Pred r=0.668 p=0.000 

In [17]:
print("Percentage with significant correlations:")
seed_sig = sum([r["seed_p"] <= 0.05 for r in results_fitted.values() if r["seed_r"] > 0])
print("Seed-based:", round(seed_sig / len(results_fitted), 3))
fitted_sig = sum([r["fitted_p"] <= 0.05 for r in results_fitted.values() if r["fitted_r"] > 0])
print("Fitted:", round(fitted_sig / len(results_fitted), 3))

Percentage with significant correlations:
Seed-based: 0.679
Fitted: 1.0


# Scrambled ratings

The fitted dimensions above work very well, maybe too well. Is it maybe possible to compute a fitted dimension from any arbitrary ratings collection? We test this by scrambling ratings, so that they are arbitrary. Do we still manage to compute meaningful fitted dimensions? We again test using Pearson's r for correlation.

The result is that yes, we actually do manage to compute axes on which the words are put in arbitrary bizarre orders, showing that the fitted dimensions have too many degrees of freedom.

In [28]:
import random

print("Scrambled ratings")

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        # make average seed-based dimension, use for predictions
        dimension_vec = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
        df["Pred"] = [vector_scalar_projection( word_vectors[w], dimension_vec) for w in df["Row"]]
        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            thisdata_gold.append( row.Gold)
      
        #SCRAMBLING
        random.shuffle(thisdata_gold)
        
        ideal_dim, ideal_wt, ideal_bias = ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        
        df["IPred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
        
        # evaluate, again against the scrambled ratings
        res_s = stats.pearsonr(thisdata_gold, df["Pred"])
        res_i = stats.pearsonr(thisdata_gold, df["IPred"])
        
        print(f"{grandcategory:12s}-{grandfeature:12s} Pred r={res_s.statistic:.3f} p={res_s.pvalue:.3f} IPred ={res_i.statistic:.3f} p={res_i.pvalue:.3f}")
        results_fitted[(grandcategory, grandfeature)]["fitted_scrambled_r"] = res_i.statistic
        results_fitted[(grandcategory, grandfeature)]["fitted_scrambled_p"] = res_i.pvalue
        results_fitted[(grandcategory, grandfeature)]["seed_scrambled_r"] = res_s.statistic
        results_fitted[(grandcategory, grandfeature)]["seed_scrambled_p"] = res_s.pvalue             
  

Scrambled ratings
cities      -temperature  Pred r=-0.030 p=0.835 IPred =0.997 p=0.000
professions -intelligence Pred r=-0.275 p=0.056 IPred =0.981 p=0.000
cities      -intelligence Pred r=-0.010 p=0.946 IPred =0.997 p=0.000
clothing    -location     Pred r=-0.098 p=0.497 IPred =0.993 p=0.000
cities      -arousal      Pred r=-0.015 p=0.919 IPred =0.991 p=0.000
clothing    -arousal      Pred r=-0.090 p=0.532 IPred =0.992 p=0.000
states      -size         Pred r=0.017 p=0.904 IPred =0.996 p=0.000
sports      -intelligence Pred r=0.054 p=0.710 IPred =0.995 p=0.000
names       -age          Pred r=0.046 p=0.751 IPred =0.994 p=0.000
clothing    -wealth       Pred r=-0.323 p=0.022 IPred =0.996 p=0.000
weather     -danger       Pred r=-0.097 p=0.567 IPred =0.988 p=0.000
professions -danger       Pred r=0.014 p=0.926 IPred =0.998 p=0.000
clothing    -size         Pred r=0.110 p=0.448 IPred =0.990 p=0.000
animals     -size         Pred r=-0.242 p=0.167 IPred =0.993 p=0.000
sports      -wealth  

In [29]:
print("Percentage with significant correlations:")
sig = sum([r["fitted_scrambled_p"] <= 0.05 for r in results_fitted.values() if r["fitted_scrambled_r"] > 0])
print("Fitted dimensions w scrambled ratings:", round(sig / len(results_fitted), 3))
sig = sum([r["seed_scrambled_p"] <= 0.05 for r in results_fitted.values() if r["seed_scrambled_r"] > 0])
print("Seed dimensions w scrambled ratings:", round(sig / len(results_fitted), 3))

Percentage with significant correlations:
Fitted dimensions w scrambled ratings: 1.0
Seed dimensions w scrambled ratings: 0.054


In the above test, the ratings are scrambled, but the words all come from a common category, like "animals". What if the words are random too? We run an exacerbated version of this test where we don't just scramble the ratings, we exchange the words by random ones.

This time, we mostly do not find any correlation between gold and predicted values. This means that we cannot fit a dimension through arbitrary words, but for words that do belong to the same category, there are too many ways of fitting a dimension.

In [11]:

print("Scrambled ratings, arbitrary words")

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        # make average seed-based dimension, use for predictions
        dimension_vec = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
        df["Pred"] = [vector_scalar_projection( word_vectors[w], dimension_vec) for w in df["Row"]]
        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            thisdata_gold.append( row.Gold)
      
        # Scrambling the ratings
        random.shuffle(thisdata_gold)
        # exchanging the vectors for those of arbitrary words
        thisdata_vectors = [word_vectors[random.choice(list(word_vectors.keys()))] for w in thisdata_vectors]
        
        ideal_dim, ideal_wt, ideal_bias = ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        
        df["IPred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
        
        # evaluate, again against the scrambled ratings
        res_s = stats.pearsonr(thisdata_gold, df["Pred"])
        res_i = stats.pearsonr(thisdata_gold, df["IPred"])
        
        print(f"{grandcategory:12s}-{grandfeature:12s} Pred r={res_s.statistic:.3f} p={res_s.pvalue:.3f} IPred ={res_i.statistic:.3f} p={res_i.pvalue:.3f}")
        results_fitted[(grandcategory, grandfeature)]["fitted_rand_scrambled_r"] = res_i.statistic
        results_fitted[(grandcategory, grandfeature)]["fitted_rand_scrambled_p"] = res_i.pvalue               
  

Scrambled ratings, arbitrary words
cities      -temperature  Pred r=0.150 p=0.298 IPred =-0.033 p=0.821
professions -intelligence Pred r=0.095 p=0.514 IPred =0.036 p=0.806
cities      -intelligence Pred r=-0.099 p=0.496 IPred =0.074 p=0.612
clothing    -location     Pred r=0.025 p=0.864 IPred =0.182 p=0.206
cities      -arousal      Pred r=0.065 p=0.652 IPred =-0.098 p=0.500
clothing    -arousal      Pred r=0.095 p=0.512 IPred =0.127 p=0.380
states      -size         Pred r=-0.081 p=0.576 IPred =-0.162 p=0.261
sports      -intelligence Pred r=-0.068 p=0.640 IPred =-0.197 p=0.171
names       -age          Pred r=0.148 p=0.305 IPred =-0.059 p=0.683
clothing    -wealth       Pred r=0.076 p=0.601 IPred =0.202 p=0.159
weather     -danger       Pred r=0.113 p=0.507 IPred =0.088 p=0.604
professions -danger       Pred r=0.273 p=0.057 IPred =-0.155 p=0.286
clothing    -size         Pred r=-0.259 p=0.070 IPred =-0.358 p=0.011
animals     -size         Pred r=0.277 p=0.113 IPred =0.064 p=0.721
sp

In [19]:
print("Percentage with significant correlations:")
sig = sum([r["fitted_rand_scrambled_p"] <= 0.05 for r in results_fitted.values() if r["fitted_rand_scrambled_r"] > 0])
print("Fitted dimensions, random words, scrambled ratings:", round(sig / len(results_fitted), 3))

Percentage with significant correlations:
Fitted dimensions, random words, scrambled ratings: 0.0


# Seed words as additional input to fitted dimensions

What if we give the fitted dimension model the seed words along with the word/ratings pairs? Will that "disambiguate" the dimension through the words of a category, so that we cannot fit arbitrary scrambled ratings?

While we are at it, what happens if we fit a dimension only with the seed words?

To do this, we have to give the seed words made-up ratings. Ratings are z-scores, so the seed words should get ratings either at around -2 or +2, far out on the scale. 

But does the Grand et al data give us hints which seed words should be at +2 and which at -2? Are the positive seed words always the ones that should have high values on the scale? Let's check first. We see that indeed the positive seeds are always the ones that should have a high value on the scale. 

In [13]:
for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)

        df0 = df.sort_values(by = "Gold")
        lowest = df0.Row[0]
        lowestval = df0.Gold[0]
        highest = df0.Row.iat[-1]
        highestval = df0.Gold.iat[-1]
        print(grandcategory, grandfeature, 
              "pseed:", pos_seedwords[0], "nseed:", neg_seedwords[0],
              f"lowrating: {lowest} ({lowestval:.2f})", 
              f"highrating: {highest} ({highestval:.2f})")

cities temperature pseed: hot nseed: cold lowrating: amsterdam (-1.01) highrating: dallas (2.19)
professions intelligence pseed: intelligent nseed: stupid lowrating: actor (-0.38) highrating: doctor (1.68)
cities intelligence pseed: intelligent nseed: stupid lowrating: amsterdam (0.75) highrating: tokyo (2.26)
clothing location pseed: indoor nseed: outdoor lowrating: bathrobe (2.07) highrating: pajamas (2.18)
cities arousal pseed: interesting nseed: boring lowrating: amsterdam (1.18) highrating: new-york (1.90)
clothing arousal pseed: interesting nseed: boring lowrating: bathrobe (-0.62) highrating: watch (2.47)
states size pseed: large nseed: small lowrating: alabama (-0.26) highrating: texas (2.72)
sports intelligence pseed: intelligent nseed: stupid lowrating: aerobics (-0.56) highrating: golf (2.03)
names age pseed: old nseed: young lowrating: anthony (-0.84) highrating: donald (1.97)
clothing wealth pseed: rich nseed: poor lowrating: bathrobe (-0.27) highrating: tuxedo (2.80)
weat

We first train fitted dimensions on seeds only. We set the ratings for the seed words to be 0.5 beyond the highest/lowest rating in the dataset, with a bit of jitter added. 

We use only the dimension and re-scale, as we cannot expect the weight and bias to be useful in this case. 

We see that these dimensions don't work very well.

In [22]:
print("Seed-based versus fitted *seed-based* dimensions")

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        # make average seed-based dimension, use for predictions
        dimension_vec = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
        df["Pred"] = [vector_scalar_projection( word_vectors[w], dimension_vec) for w in df["Row"]]
        
        # make ideal dimension, use for predictions
        lowvalue = df["Gold"].min() - 0.5
        highvalue = df["Gold"].max() + 0.5
        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []
        
        for seed in pos_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            thisdata_gold.append( highvalue + random.uniform(0.001, 0.005))

        for seed in neg_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            thisdata_gold.append( lowvalue - random.uniform(0.001, 0.005))

        
        ideal_dim, _, _= ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        
        df["IPred"] = [vector_scalar_projection( word_vectors[w], ideal_dim) for w in df["Row"]]
        
        # evaluate
        res_s = stats.pearsonr(df["Gold"], df["Pred"])
        res_i = stats.pearsonr(df["Gold"], df["IPred"])
        
        print(f"{grandcategory:12s}-{grandfeature:12s} Pred r={res_s.statistic:.3f} p={res_s.pvalue:.3f} IPred ={res_i.statistic:.3f} p={res_i.pvalue:.3f}")
        results_fitted[(grandcategory, grandfeature)]["fitted_seed_r"] = res_i.statistic
        results_fitted[(grandcategory, grandfeature)]["fitted_seed_p"] = res_i.pvalue               
  


Seed-based versus fitted *seed-based* dimensions
cities      -temperature  Pred r=-0.086 p=0.553 IPred =-0.095 p=0.511
professions -intelligence Pred r=0.468 p=0.001 IPred =0.564 p=0.000
cities      -intelligence Pred r=0.161 p=0.265 IPred =-0.212 p=0.139
clothing    -location     Pred r=0.323 p=0.022 IPred =-0.178 p=0.217
cities      -arousal      Pred r=0.001 p=0.996 IPred =-0.270 p=0.058
clothing    -arousal      Pred r=0.185 p=0.199 IPred =0.169 p=0.241
states      -size         Pred r=0.331 p=0.019 IPred =0.102 p=0.480
sports      -intelligence Pred r=0.613 p=0.000 IPred =0.384 p=0.006
names       -age          Pred r=0.616 p=0.000 IPred =-0.012 p=0.932
clothing    -wealth       Pred r=0.369 p=0.008 IPred =0.307 p=0.030
weather     -danger       Pred r=0.790 p=0.000 IPred =0.259 p=0.121
professions -danger       Pred r=0.446 p=0.001 IPred =-0.143 p=0.327
clothing    -size         Pred r=0.097 p=0.503 IPred =0.132 p=0.362
animals     -size         Pred r=0.668 p=0.000 IPred =0.041 

In [23]:
print("Percentage with significant correlations:")
sig = sum([r["fitted_seed_p"] <= 0.05 for r in results_fitted.values() if r["fitted_seed_r"] > 0])
print("Fitted dimensions, seed-based:", round(sig / len(results_fitted), 3))
seed_sig = sum([r["seed_p"] <= 0.05 for r in results_fitted.values() if r["seed_r"] > 0])
print("For comparison, seed-based:", round(seed_sig / len(results_fitted), 3))

Percentage with significant correlations:
Fitted dimensions, seed-based: 0.214
For comparison, seed-based: 0.679


Next we fit dimensions with both seed words and ratings. This works very well: we again get significant correlations in all cases. 

In [24]:
print("Seed-based versus fitted *seed-based* dimensions with ratings")

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        # make average seed-based dimension, use for predictions
        dimension_vec = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
        df["Pred"] = [vector_scalar_projection( word_vectors[w], dimension_vec) for w in df["Row"]]
        
        # make ideal dimension, use for predictions
        lowvalue = df["Gold"].min() - 0.5
        highvalue = df["Gold"].max() + 0.5
        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []
        
        for seed in pos_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            thisdata_gold.append( highvalue + random.uniform(0.001, 0.005))

        for seed in neg_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            thisdata_gold.append( lowvalue - random.uniform(0.001, 0.005))

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            thisdata_gold.append( row.Gold)
        
        ideal_dim, ideal_wt, ideal_bias = ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        
        df["IPred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
                
        # evaluate
        res_s = stats.pearsonr(df["Gold"], df["Pred"])
        res_i = stats.pearsonr(df["Gold"], df["IPred"])
        
        print(f"{grandcategory:12s}-{grandfeature:12s} Pred r={res_s.statistic:.3f} p={res_s.pvalue:.3f} IPred ={res_i.statistic:.3f} p={res_i.pvalue:.3f}")
        results_fitted[(grandcategory, grandfeature)]["fitted_rseed_r"] = res_i.statistic
        results_fitted[(grandcategory, grandfeature)]["fitted_rseed_p"] = res_i.pvalue               
  


Seed-based versus fitted *seed-based* dimensions with ratings
cities      -temperature  Pred r=-0.086 p=0.553 IPred =0.985 p=0.000
professions -intelligence Pred r=0.468 p=0.001 IPred =0.989 p=0.000
cities      -intelligence Pred r=0.161 p=0.265 IPred =0.992 p=0.000
clothing    -location     Pred r=0.323 p=0.022 IPred =0.989 p=0.000
cities      -arousal      Pred r=0.001 p=0.996 IPred =0.997 p=0.000
clothing    -arousal      Pred r=0.185 p=0.199 IPred =0.995 p=0.000
states      -size         Pred r=0.331 p=0.019 IPred =0.983 p=0.000
sports      -intelligence Pred r=0.613 p=0.000 IPred =0.990 p=0.000
names       -age          Pred r=0.616 p=0.000 IPred =0.998 p=0.000
clothing    -wealth       Pred r=0.369 p=0.008 IPred =0.993 p=0.000
weather     -danger       Pred r=0.790 p=0.000 IPred =0.996 p=0.000
professions -danger       Pred r=0.446 p=0.001 IPred =0.990 p=0.000
clothing    -size         Pred r=0.097 p=0.503 IPred =0.994 p=0.000
animals     -size         Pred r=0.668 p=0.000 IPred 

In [25]:
print("Percentage with significant correlations:")
sig = sum([r["fitted_rseed_p"] <= 0.05 for r in results_fitted.values() if r["fitted_rseed_r"] > 0])
print("Fitted dimensions, seed-based:", round(sig / len(results_fitted), 3))
seed_sig = sum([r["seed_p"] <= 0.05 for r in results_fitted.values() if r["seed_r"] > 0])
print("For comparison, seed-based:", round(seed_sig / len(results_fitted), 3))

Percentage with significant correlations:
Fitted dimensions, seed-based: 1.0
For comparison, seed-based: 0.679


# Does having the seeds helps against under-determined dimensions?

Previously we saw that we were able to compute perfect fitted dimensions with scrambled ratings, as long as the words were from a coherent category. Our guess was that this was because there are many ways in which words from a coherent category relate to each other, so that the dimension is under-determined given just the word embeddings. So when we add seed words to the training data, does it get harder to fit dimensions with scrambled ratings?

Nope. We still get significant correlations in all cases. Maybe the words just overrule the seeds. We may need a more complex objective function that lets us give more weight to the seeds. 

So, how well are we predicting the seed words at the end of the procedure? 

In [31]:
print("Scrambled ratings, fitted dimensions with seed words")

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)

        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            thisdata_gold.append( row.Gold)
      
        #SCRAMBLING
        random.shuffle(thisdata_gold)
        thisdata_gold_noseed = thisdata_gold.copy()
        
        # adding seed words, with non-scrambled ratings
        lowvalue = df["Gold"].min() - 0.5
        highvalue = df["Gold"].max() + 0.5
        
        for seed in pos_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            thisdata_gold.append( highvalue + random.uniform(0.001, 0.005))

        for seed in neg_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            thisdata_gold.append( lowvalue - random.uniform(0.001, 0.005))
            
            
        ideal_dim, ideal_wt, ideal_bias = ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        
        df["IPred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
        
        # evaluate, again against the scrambled ratings
        res_i = stats.pearsonr(thisdata_gold_noseed, df["IPred"])
        
        print(f"{grandcategory:12s}-{grandfeature:12s} IPred ={res_i.statistic:.3f} p={res_i.pvalue:.3f}")
        results_fitted[(grandcategory, grandfeature)]["fitted_rseed_scrambled_r"] = res_i.statistic
        results_fitted[(grandcategory, grandfeature)]["fitted_rseed_scrambled_p"] = res_i.pvalue
        
  

Scrambled ratings, fitted dimensions with seed words
cities      -temperature  IPred =0.984 p=0.000
professions -intelligence IPred =0.979 p=0.000
cities      -intelligence IPred =0.996 p=0.000
clothing    -location     IPred =0.996 p=0.000
cities      -arousal      IPred =0.996 p=0.000
clothing    -arousal      IPred =0.996 p=0.000
states      -size         IPred =0.998 p=0.000
sports      -intelligence IPred =0.991 p=0.000
names       -age          IPred =0.996 p=0.000
clothing    -wealth       IPred =0.994 p=0.000
weather     -danger       IPred =0.989 p=0.000
professions -danger       IPred =0.994 p=0.000
clothing    -size         IPred =0.992 p=0.000
animals     -size         IPred =0.995 p=0.000
sports      -wealth       IPred =0.995 p=0.000
professions -valence      IPred =0.991 p=0.000
names       -wealth       IPred =0.997 p=0.000
cities      -cost         IPred =0.994 p=0.000
cities      -wealth       IPred =0.992 p=0.000
professions -gender       IPred =0.994 p=0.000
states 

In [32]:
print("Percentage with significant correlations:")
sig = sum([r["fitted_rseed_scrambled_p"] <= 0.05 for r in results_fitted.values() if r["fitted_rseed_scrambled_r"] > 0])
print("Fitted dimensions w scrambled ratings:", round(sig / len(results_fitted), 3))


Percentage with significant correlations:
Fitted dimensions w scrambled ratings: 1.0


# Training and test set

When we set aside a test set, and train only on part of the data, we can test how well the dimensions generalize. 

The original seed dimensions, trained on all ratings, don't generalize. Will fitted dimensions with seeds generalize? 

## Evaluation measure issues

We cannot use Pearson's r anymore because the p values become unreliable with small datasets. Grand et al evaluate using OC_p, pairwise order consistency. Another obvious option is Mean Squared Error MSE, basically squared residuals. If we switch evaluations like that, will the results still tell the same story? We first check for correlation among evaluation measures when training on the whole data, testing for seed-based dimensions, fitted dimensions, and fitted dimensions with seeds. 


In [38]:
import itertools

# pairwise order consistency, normal definition
def pairwise_order_consistency(goldvalues, modelvalues):
    if len(goldvalues) != len(modelvalues):
        raise Exception("shouldn't be here")
        
    outcomes = [ ]
    for i1, i2 in itertools.combinations(range(len(goldvalues)), 2):
        goldrel = (goldvalues[i1] > goldvalues[i2])
        modelrel = (modelvalues[i1] > modelvalues[i2])
        outcomes.append(int(goldrel == modelrel))
        
    return sum(outcomes) / len(outcomes)

# pairwise order consistency only for a subset of the model values 
def pairwise_order_consistency_wrt(goldvalues, modelvalues, test_indices):
    if len(goldvalues) != len(modelvalues):
        raise Exception("shouldn't be here")
        
    outcomes = [ ]
    for i1 in test_indices:
        for i2 in range(len(goldvalues)):
            if i1 == i2: continue
            
            goldrel = (goldvalues[i1] > goldvalues[i2])
            modelrel = (modelvalues[i1] > modelvalues[i2])
            outcomes.append(int(goldrel == modelrel))
        
    return sum(outcomes) / len(outcomes)

# mean squared error
def mean_squared_error(goldvalues, modelvalues):
    if len(goldvalues) != len(modelvalues):
        raise Exception("shouldn't be here")
    return sum([(g - m)**2 for g, m in zip(goldvalues, modelvalues)]) / len(goldvalues)

In [46]:
# combined method: fitted dimension with seeds
def fitted_dimension_withseeds(thisdata_vectors, thisdata_gold, feature_dim, 
                               pos_seedwords, neg_seedwords, word_vectors, offset = 0.5, jitter = False):
    # adding seed words
    
    lowvalue = min(thisdata_gold) - offset
    highvalue = max(thisdata_gold) + offset
        
    for seed in pos_seedwords:
        thisdata_vectors.append( word_vectors [ seed ])
        j = random.uniform(0.001, 0.005) if jitter else 0
        thisdata_gold.append( highvalue + j)

    for seed in neg_seedwords:
        thisdata_vectors.append( word_vectors [ seed ])
        j = random.uniform(0.001, 0.005) if jitter else 0
        thisdata_gold.append( lowvalue - j)
            
            
    return ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)


In [None]:
# training on all data, computing more evaluation measures 
# so we can test for correlation among evaluation measures

rvalues_seed = [ ]
rvalues_fit = [ ]
rvalues_sfit = [ ]
ocp_seed = [ ]
ocp_fit = [ ]
ocp_sfit = [ ]
msevalues_seed = [ ]
msevalues_fit = [ ]
msevalues_sfit = [ ]

offset = 0.5
jitter = False

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        # make average seed-based dimension, use for predictions
        dimension_vec = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
        df["Pred"] = [vector_scalar_projection( word_vectors[w], dimension_vec) for w in df["Row"]]
        sweight, sbias = coef_for_seed_dimension(df["Gold"], df["Pred"])
        adjusted_spred = sprediction(df["Pred"], sweight, sbias)
        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            thisdata_gold.append( row.Gold)
        
        ideal_dim, ideal_wt, ideal_bias = ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        
        df["IPred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
        
        # make fitted dimension with seeds, use for predictions    
        lowvalue = df["Gold"].min() - offset
        highvalue = df["Gold"].max() + offset
        
        for seed in pos_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            j = random.uniform(0.001, 0.005) if jitter else 0
            thisdata_gold.append( highvalue + j)

        for seed in neg_seedwords:
            thisdata_vectors.append( word_vectors [ seed ])
            j = random.uniform(0.001, 0.005) if jitter else 0
            thisdata_gold.append( lowvalue - j)
            
            
        ideal_dim, ideal_wt, ideal_bias = ideal_dimension(thisdata_vectors, thisdata_gold, feature_dim)
        df["I2Pred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
        
        # evaluate
        res_s = stats.pearsonr(df["Gold"], df["Pred"])
        res_i = stats.pearsonr(df["Gold"], df["IPred"])
        res_2 = stats.pearsonr(df["Gold"], df["I2Pred"])
        ocp_s = pairwise_order_consistency(df["Gold"], df["Pred"])
        ocp_i = pairwise_order_consistency(df["Gold"], df["IPred"])
        ocp_2 = pairwise_order_consistency(df["Gold"], df["I2Pred"])
        mse_s = mean_squared_error(df["Gold"], adjusted_spred)
        mse_i = mean_squared_error(df["Gold"], df["IPred"])
        mse_2 = mean_squared_error(df["Gold"], df["I2Pred"])
        
        rvalues_seed.append(res_s.statistic)
        rvalues_fit.append(res_i.statistic)
        rvalues_sfit.append(res_2.statistic)
        ocp_seed.append(ocp_s)
        ocp_fit.append(ocp_i)
        ocp_sfit.append(ocp_2)
        msevalues_seed.append(mse_s)
        msevalues_fit.append(mse_i)
        msevalues_sfit.append(mse_2)
        
        
        print(f"{grandcategory:12s}-{grandfeature:12s} Pred r={res_s.statistic:.2f} p={res_s.pvalue:.2f} ocp={ocp_s:.2f} mse={mse_s:.2f}",
              f"IPred ={res_i.statistic:.2f} p={res_i.pvalue:.2f} ocp={ocp_i:.2f} mse={mse_i:.2f}",
              f"ISPred ={res_2.statistic:.2f} p={res_2.pvalue:.2f} ocp={ocp_2:.2f} mse={mse_2:.2f}")
        

In [42]:
import statistics

# average evaluation ratings
print(f"Mean and sd OC_P, seed-based: {statistics.mean(ocp_seed):.3f} {statistics.stdev(ocp_seed):.3f}")
print(f"Mean and sd OC_P, fitted: {statistics.mean(ocp_fit):.3f} {statistics.stdev(ocp_fit):.3f}")
print(f"Mean and sd OC_P, fit+seed: {statistics.mean(ocp_sfit):.3f} {statistics.stdev(ocp_sfit):.3f}")
print()

print(f"Median, mean and sd MSE, seed-based: {statistics.median(msevalues_seed):.3f} {statistics.mean(msevalues_seed):.3f} {statistics.stdev(msevalues_seed):.3f}")
print(f"Median, mean and sd MSE, fitted: {statistics.median(msevalues_fit):.3f} {statistics.mean(msevalues_fit):.3f} {statistics.stdev(msevalues_fit):.3f}")
print(f"Median, mean and sd MSE, fit_seed: {statistics.median(msevalues_sfit):.3f} {statistics.mean(msevalues_sfit):.3f} {statistics.stdev(msevalues_sfit):.3f}")
 
    
# Let's do that again and remove extreme outliers among MSE values
new_mseseed = [ ]
removing_ix = [ ]
for i in range(len(msevalues_seed)):
    if msevalues_seed[i] > 1000:
        removing_ix.append(i)
    else:
        new_mseseed.append(msevalues_seed[i])
        
print(f"Median, mean and sd MSE, seed-based, fixed: {statistics.median(new_mseseed):.3f} {statistics.mean(new_mseseed):.3f} {statistics.stdev(new_mseseed):.3f}")


Mean and sd OC_P, seed-based: 0.641 0.108
Mean and sd OC_P, fitted: 0.967 0.012
Mean and sd OC_P, fit+seed: 0.962 0.015

Median, mean and sd MSE, seed-based: 3.812 37839.374 278945.854
Median, mean and sd MSE, fitted: 0.059 0.074 0.056
Median, mean and sd MSE, fit_seed: 0.068 0.079 0.056
Median, mean and sd MSE, seed-based, fixed: 3.493 30.575 57.855


In [44]:
# correlation across ratings
res = stats.pearsonr(rvalues_seed, ocp_seed)
print(f"Correlation Pearson/OC_p, seed-based: {res.statistic:.3f} {res.pvalue:.3f}")
res = stats.pearsonr(rvalues_fit, ocp_fit)
print(f"Correlation Pearson/OC_p, fitted: {res.statistic:.3f} {res.pvalue:.3f}")
res = stats.pearsonr(rvalues_sfit, ocp_sfit)
print(f"Correlation Pearson/OC_p, fit+seed: {res.statistic:.3f} {res.pvalue:.3f}")
print()

res = stats.pearsonr(rvalues_seed, msevalues_seed)
print(f"Correlation Pearson/MSE, seed-based: {res.statistic:.3f} {res.pvalue:.3f}")
res = stats.pearsonr(rvalues_fit, msevalues_fit)
print(f"Correlation Pearson/MSE, fitted: {res.statistic:.3f} {res.pvalue:.3f}")
res = stats.pearsonr(rvalues_sfit, msevalues_sfit)
print(f"Correlation Pearson/MSE, fit+seed: {res.statistic:.3f} {res.pvalue:.3f}")


new_rseed = [x for i, x in enumerate(rvalues_seed) if i not in removing_ix]
res = stats.pearsonr(new_rseed, new_mseseed)
print(f"Correlation Pearson/MSE, seed-based, fixed: {res.statistic:.3f} {res.pvalue:.3f}")

print("Number of removed datapoints due to gigantic MSE", len(removing_ix))

Correlation Pearson/OC_p, seed-based: 0.972 0.000
Correlation Pearson/OC_p, fitted: 0.642 0.000
Correlation Pearson/OC_p, fit+seed: 0.487 0.000

Correlation Pearson/MSE, seed-based: -0.186 0.170
Correlation Pearson/MSE, fitted: -0.398 0.002
Correlation Pearson/MSE, fit+seed: -0.138 0.312
Correlation Pearson/MSE, seed-based, fixed: -0.720 0.000
Number of removed datapoints due to gigantic MSE 3


### What do we learn from this?

OC_p is highly correlated with Pearson's r values when computed over the whole dataset. With MSE we do get a highly significant correlation for the fitted dimensions. We also get it for the seed-based dimension if we exclude 3 rogue datapoints. But for the fitted+seed dimensions the correlation is not significant, so MSE may be measuring something different.

So we go ahead with OCP and MSE. We want to view OC_p as a stand-in for Pearson's r, however, caveat: We're going to use OC_p differently below, as pairwise order consistency of test set compared to training set. 

In any case: We can evaluate the train/test split using OC_p and MSE instead of R. 
This lets us avoid the problem of evaluating on a tiny dataset: We can compute OC_P with respect to all predicted values, 
and MSE is per individual datapoint anyway, so it's not a problem that there aren't many datapoints to compare.

So, with these new measures, what do we learn about generalization of fitted dimensions?

In [49]:
import numpy as np

print("Training and test split")

numfolds = 5
offset = 1.0
jitter = False

all_evals = [ ]

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        
        # storage for word vectors and gold values for this dataset
        all_thisdata_vectors = []
        all_thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            all_thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            all_thisdata_gold.append( row.Gold)
      
        # crossvalidation setup: give indices to datapoints
        fold = np.random.randint(numfolds, size = len(all_thisdata_gold))
        
        # store the evaluatresults from the different test folds
        evals = [ ]
        
        # iterate over folds, evaluate for each of them
        for testfold in range(numfolds):
            # compute training and test data for this fold
            test_indices =  [i for i in range(len(all_thisdata_gold)) if fold[i] == testfold]
            train_indices = [i for i in range(len(all_thisdata_gold)) if fold[i] != testfold]
        
            gold_test =  [ell["Gold"] for _, ell in df.iloc[ test_indices ].iterrows()]
            gold_train = [ ell["Gold"] for _, ell in df.iloc[ train_indices ].iterrows()]
            words_test =  [ell["Row"] for _, ell in df.iloc[ test_indices].iterrows()]
            words_train = [ell["Row"] for _, ell in df.iloc[ train_indices].iterrows()]
            vec_test =  [word_vectors[ w ] for w in words_test]
            vec_train = [word_vectors[ w ] for w in words_train ]
        

            # compute seed-based dimension, and its predictions
            seed_dim = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
            p0 = [vector_scalar_projection( word_vectors[w], seed_dim) for w in df["Row"]]
            weight, bias = coef_for_seed_dimension(df["Gold"], p0)
            df["SPred"] = [sprediction(p, weight, bias) for p in p0]
            
            
            # compute fitted dimension, and its predictions
            fitted_dim, fitted_wt, fitted_bias = ideal_dimension(vec_train, gold_train, feature_dim)
            df["FPred"] = [ iprediction( word_vectors[ w], fitted_dim, fitted_wt, fitted_bias) for w in df["Row"]]
            
            
            # compute fitted dimension with seeds, and its predictions
            fitted_dim, fitted_wt, fitted_bias = fitted_dimension_withseeds(vec_train, gold_train, feature_dim, 
                                                                            pos_seedwords, neg_seedwords, word_vectors,
                                                                            offset = offset, jitter = jitter)
            df["FSPred"] = [ iprediction( word_vectors[ w], fitted_dim, fitted_wt, fitted_bias) for w in df["Row"]]
            
            # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
            # MSE: evaluate on test only
            e = { "ocp_s" : pairwise_order_consistency_wrt(df["Gold"], df["SPred"], test_indices),
                  "ocp_f" : pairwise_order_consistency_wrt(df["Gold"], df["FPred"], test_indices),
                  "ocp_fs": pairwise_order_consistency_wrt(df["Gold"], df["FSPred"], test_indices),
                  "mse_s" : mean_squared_error(gold_test, [p for i, p in enumerate(df["SPred"]) if i in test_indices]),
                  "mse_f" : mean_squared_error(gold_test, [p for i, p in enumerate(df["FPred"]) if i in test_indices]),
                  "mse_fs": mean_squared_error(gold_test, [p for i, p in enumerate(df["FSPred"]) if i in test_indices])}
            
            evals.append(e)
            all_evals.append(e)
        
        print(grandcategory, "-", grandfeature)
        print("\t", end = "")
        for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
            ocps = [e["ocp_" + suffix] for e in evals]
            print(name, f"OC_p {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})", end = " ")
        print("\n\t", end = "")
        for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
            mses = [e["mse_" + suffix] for e in evals]
            print(name, f"MSE {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f})", end = " ")
        print()

Training and test split
cities - temperature
	Seed OC_p 0.529 (0.03) Fitted OC_p 0.477 (0.08) Fitted-Seed OC_p 0.495 (0.09) 
	Seed MSE 143.853 (39.16) Fitted MSE 27.238 (8.72) Fitted-Seed MSE 24.798 (9.13) 
professions - intelligence
	Seed OC_p 0.619 (0.05) Fitted OC_p 0.674 (0.09) Fitted-Seed OC_p 0.668 (0.08) 
	Seed MSE 3.543 (1.34) Fitted MSE 7.126 (3.16) Fitted-Seed MSE 13.969 (7.30) 
cities - intelligence
	Seed OC_p 0.590 (0.10) Fitted OC_p 0.611 (0.04) Fitted-Seed OC_p 0.601 (0.04) 
	Seed MSE 39.146 (27.80) Fitted MSE 13.576 (9.36) Fitted-Seed MSE 12.986 (8.74) 
clothing - location
	Seed OC_p 0.583 (0.07) Fitted OC_p 0.604 (0.06) Fitted-Seed OC_p 0.579 (0.12) 
	Seed MSE 8.363 (4.88) Fitted MSE 12.970 (5.55) Fitted-Seed MSE 21.090 (8.64) 
cities - arousal
	Seed OC_p 0.516 (0.08) Fitted OC_p 0.578 (0.11) Fitted-Seed OC_p 0.581 (0.11) 
	Seed MSE 2135790.901 (672025.07) Fitted MSE 13.399 (7.08) Fitted-Seed MSE 11.887 (6.33) 
clothing - arousal
	Seed OC_p 0.561 (0.04) Fitted OC_p 0.60

In [50]:
for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
    ocps = [e["ocp_" + suffix] for e in all_evals]
    print(name, f"OC_p mean {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})")
print()

for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
    mses = [e["mse_" + suffix] for e in evals]
    print(name, f"MSE med {statistics.median(mses):.3f} mean {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f})") 
    


Seed OC_p mean 0.646 (0.11)
Fitted OC_p mean 0.595 (0.10)
Fitted-Seed OC_p mean 0.596 (0.10)

Seed MSE med 1.141 mean 1.281 (0.70)
Fitted MSE med 11.315 mean 10.332 (2.99)
Fitted-Seed MSE med 13.712 mean 15.887 (6.17)


### Results so far

Seed-based dimensions show better results than fitted dimensions, both on terms of OC_p and MSE. There is no clear distinction between fitted dimensions and fitted dimensions with seed in terms of OC_p. In terms of MSE, adding the seeds to the fitted dimensions deteriorates performance. 

But is this the right way of adding seeds to the fitted dimensions? Another way would be to have a loss that considers both fit of ratings and given seed-based dimensions. An argument for trying this is that fitted+seed dimensions are still able to fit scrambled ratings. 

# Adding seeds to the fitted dimensions, version 2

In this formulation, we actually use the seed-based dimensions to "disambiguate" what we want the fitted dimension to do. Our loss function now has two parts, one from the human ratings, and one from the seed-based dimensions. (Currently I'm just summing up the losses from the different seed-based dimensions. One could also try to learn weights for them.)

Here is the model: 

In [65]:
# fitted dimension, with seeds too

# Weight/bias defined as trainable variables to optimize during backpropagation along with the feature vector
#
# parameters:
# - word vectors list: list of vectors for the words in our category
# - gold ratings: gold ratings on the dimension of interest, 
#    in the same order as the word vectors list
# - feature_dim: dimensionality of the vectors
# - random seed
#
# returns: 
# - computed vector for the ideal dimension
# - weight and bias such that vector * idealvec \approx weight * goldrating + bias
#
def fitted_dimension_withseeds(word_vectors_list, gold_ratings, seed_dims, feature_dims, alpha = 0.5, random_seed = 123):
    torch.manual_seed(random_seed) 

    # we compute: a vector of same dimensionality as our embeddings,
    # and a weight and a bias constant
    feature_vector = torch.randn(feature_dim, requires_grad=True) # dtype=torch.float32)
    weight_constant = torch.randn(1, requires_grad=True) 
    bias_constant = torch.randn(1, requires_grad=True)    

    
    optimizer = optim.Adam([feature_vector, weight_constant, bias_constant], lr=0.01)
    # optimizer = optim.SGD([feature_vector, weight_constant, bias_constant], lr=0.001)

    # Number of optimization steps
    num_steps = 1000

    losses = []
    
    criterion2 = torch.nn.CosineEmbeddingLoss()

    # Gradient clipping threshold
    max_norm = 1.0  

    for step in range(num_steps):
        total_loss1 = 0

        # for i in range(len(X_train)):
        # 	word_embedding = torch.tensor(X_train[i]) 
        # 	gold_rating = y_train[i]

        for i in range(len(word_vectors_list)):
            word_embedding = torch.tensor(word_vectors_list[i])
            gold_rating = gold_ratings[i]

            dot_product = torch.dot(word_embedding, feature_vector)
            weighted_gold = gold_rating * weight_constant
            loss1 = ((dot_product - weighted_gold - bias_constant) ** 2)

            total_loss1 += loss1
            
        
        avg_loss1 = total_loss1 / len(word_vectors_list)
        loss2 = sum([criterion2( feature_vector, d, torch.tensor(1.0)) for d in seed_dims]) / len(seed_dims)
        total_loss = alpha*avg_loss1 + (1-alpha) * loss2

        total_loss.backward()

        # Compute the gradient norms and monitor them during training

        # feature_vector_grad_norm = torch.norm(feature_vector.grad)
        # print(f"Step {step+1}, Feature Vector Gradient Norm: {feature_vector_grad_norm.item()}")

        torch.nn.utils.clip_grad_norm_([feature_vector, weight_constant, bias_constant], max_norm)
        optimizer.step()

        losses.append(total_loss.item())

    
    return (feature_vector.detach().numpy(), weight_constant.item(), bias_constant.item())

## Scrambled ratings again

We again scramble the ratings but keep the words, so that they are from the same category. With the seed-based dimensions to pull the model in the direction of the intended property, it should be harder for the model to learn dimensions for scrambled ratings -- and indeed it is. 

Alpha is the parameter for how to combine the two loss functions, where alpha is the contribution of the human ratings, and (1-alpha) the contribution from the seed dimensions. I am not really sure on what scale the losses from the human ratings are, so it's hard to say what alpha means. 


In [70]:
print("Scrambled ratings, fitted dimensions with seed words, new formulation")

alpha = 0.03

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)

        
        # make ideal dimension, use for predictions
        thisdata_vectors = []
        thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            thisdata_gold.append( row.Gold)
      
        #SCRAMBLING
        random.shuffle(thisdata_gold)

        diffvectors = [ ]
    
        for negword in neg_seedwords:
            for posword in pos_seedwords:
                diffvectors.append(torch.from_numpy(word_vectors[posword] - word_vectors[negword]))
            
        ideal_dim, ideal_wt, ideal_bias = fitted_dimension_withseeds(thisdata_vectors, thisdata_gold, diffvectors, feature_dim, alpha = alpha)
        
        df["IPred"] = [iprediction( word_vectors[w], ideal_dim, ideal_wt, ideal_bias) for w in df["Row"]]
        
        # evaluate, again against the scrambled ratings
        res_i = stats.pearsonr(thisdata_gold, df["IPred"])
        
        print(f"{grandcategory:12s}-{grandfeature:12s} IPred r={res_i.statistic:.3f} p={res_i.pvalue:.3f}")
        
        
  

Scrambled ratings, fitted dimensions with seed words, new formulation
cities      -temperature  IPred r=0.864 p=0.000
professions -intelligence IPred r=0.313 p=0.029
cities      -intelligence IPred r=0.163 p=0.257
clothing    -location     IPred r=0.562 p=0.000
cities      -arousal      IPred r=0.343 p=0.015
clothing    -arousal      IPred r=0.551 p=0.000
states      -size         IPred r=0.004 p=0.978
sports      -intelligence IPred r=0.274 p=0.054
names       -age          IPred r=0.866 p=0.000
clothing    -wealth       IPred r=0.634 p=0.000
weather     -danger       IPred r=0.312 p=0.060
professions -danger       IPred r=0.422 p=0.003
clothing    -size         IPred r=0.661 p=0.000
animals     -size         IPred r=-0.173 p=0.327
sports      -wealth       IPred r=0.022 p=0.880
professions -valence      IPred r=-0.131 p=0.368
names       -wealth       IPred r=0.240 p=0.093
cities      -cost         IPred r=0.670 p=0.000
cities      -wealth       IPred r=0.706 p=0.000
professions -gen

## So, how about a training and test split?

With the combined loss, do we do better at predicting results for the test data? 

In [79]:
import numpy as np

print("Training and test split, two-loss formulation for fitted dimensions")

numfolds = 5
alpha = 0.1

all_evals = [ ]

for filename in os.listdir(grandratings_dir):
    if filename.endswith("csv"):
        grandcategory, grandfeature, pos_seedwords, neg_seedwords, df = read_grand_data(filename, grandratings_dir, grandfeatures_df)
        
        
        # storage for word vectors and gold values for this dataset
        all_thisdata_vectors = []
        all_thisdata_gold = []

        for row in df.itertuples():
            # row.Row is the word. look it up in word_vectors
            all_thisdata_vectors.append( word_vectors[ row.Row ])
            # gold rating: use z-scored average
            all_thisdata_gold.append( row.Gold)
      
        # crossvalidation setup: give indices to datapoints
        fold = np.random.randint(numfolds, size = len(all_thisdata_gold))
        
        # store the evaluatresults from the different test folds
        evals = [ ]
        
        # iterate over folds, evaluate for each of them
        for testfold in range(numfolds):
            # compute training and test data for this fold
            test_indices =  [i for i in range(len(all_thisdata_gold)) if fold[i] == testfold]
            train_indices = [i for i in range(len(all_thisdata_gold)) if fold[i] != testfold]
        
            gold_test =  [ell["Gold"] for _, ell in df.iloc[ test_indices ].iterrows()]
            gold_train = [ ell["Gold"] for _, ell in df.iloc[ train_indices ].iterrows()]
            words_test =  [ell["Row"] for _, ell in df.iloc[ test_indices].iterrows()]
            words_train = [ell["Row"] for _, ell in df.iloc[ train_indices].iterrows()]
            vec_test =  [word_vectors[ w ] for w in words_test]
            vec_train = [word_vectors[ w ] for w in words_train ]
        

            # compute seed-based dimension, and its predictions
            seed_dim = average_dim_vector(pos_seedwords, neg_seedwords, word_vectors)
            p0 = [vector_scalar_projection( word_vectors[w], seed_dim) for w in df["Row"]]
            weight, bias = coef_for_seed_dimension(df["Gold"], p0)
            df["SPred"] = [sprediction(p, weight, bias) for p in p0]
            
            
            # compute fitted dimension, and its predictions
            fitted_dim, fitted_wt, fitted_bias = ideal_dimension(vec_train, gold_train, feature_dim)
            df["FPred"] = [ iprediction( word_vectors[ w], fitted_dim, fitted_wt, fitted_bias) for w in df["Row"]]
            
            
            # compute fitted dimension with seeds, and its predictions
            diffvectors = [ ]
    
            for negword in neg_seedwords:
                for posword in pos_seedwords:
                    diffvectors.append(word_vectors[posword] - word_vectors[negword])
                    
            dimvec = torch.from_numpy(np.mean(diffvectors, axis = 0))
            
            fitted_dim, fitted_wt, fitted_bias = fitted_dimension_withseeds(thisdata_vectors, thisdata_gold, [dimvec], 
                                                                            feature_dim, alpha = alpha)
        
            
            df["FSPred"] = [ iprediction( word_vectors[ w], fitted_dim, fitted_wt, fitted_bias) for w in df["Row"]]
            
            # order consistency pairwise: test values tested for their ordering wrt. all values, training and test
            # MSE: evaluate on test only
            e = { "ocp_s" : pairwise_order_consistency_wrt(df["Gold"], df["SPred"], test_indices),
                  "ocp_f" : pairwise_order_consistency_wrt(df["Gold"], df["FPred"], test_indices),
                  "ocp_fs": pairwise_order_consistency_wrt(df["Gold"], df["FSPred"], test_indices),
                  "mse_s" : mean_squared_error(gold_test, [p for i, p in enumerate(df["SPred"]) if i in test_indices]),
                  "mse_f" : mean_squared_error(gold_test, [p for i, p in enumerate(df["FPred"]) if i in test_indices]),
                  "mse_fs": mean_squared_error(gold_test, [p for i, p in enumerate(df["FSPred"]) if i in test_indices])}
            
            evals.append(e)
            all_evals.append(e)
        
        print(grandcategory, "-", grandfeature)
        print("\t", end = "")
        for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
            ocps = [e["ocp_" + suffix] for e in evals]
            print(name, f"OC_p {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})", end = " ")
        print("\n\t", end = "")
        for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
            mses = [e["mse_" + suffix] for e in evals]
            print(name, f"MSE {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f})", end = " ")
        print()

Training and test split, two-loss formulation for fitted dimensions
cities - temperature
	Seed OC_p 0.523 (0.05) Fitted OC_p 0.507 (0.08) Fitted-Seed OC_p 0.645 (0.05) 
	Seed MSE 119.793 (62.04) Fitted MSE 24.528 (7.56) Fitted-Seed MSE 956.412 (315.99) 
professions - intelligence
	Seed OC_p 0.630 (0.04) Fitted OC_p 0.654 (0.04) Fitted-Seed OC_p 0.361 (0.04) 
	Seed MSE 3.236 (1.73) Fitted MSE 8.965 (5.01) Fitted-Seed MSE 1522530.221 (722445.99) 
cities - intelligence
	Seed OC_p 0.581 (0.05) Fitted OC_p 0.569 (0.09) Fitted-Seed OC_p 0.533 (0.06) 
	Seed MSE 34.913 (23.60) Fitted MSE 11.133 (4.65) Fitted-Seed MSE 1139667.756 (351579.68) 
clothing - location
	Seed OC_p 0.601 (0.06) Fitted OC_p 0.554 (0.08) Fitted-Seed OC_p 0.508 (0.07) 
	Seed MSE 7.964 (4.10) Fitted MSE 12.845 (5.89) Fitted-Seed MSE 98.463 (68.91) 
cities - arousal
	Seed OC_p 0.525 (0.05) Fitted OC_p 0.585 (0.06) Fitted-Seed OC_p 0.583 (0.05) 
	Seed MSE 2076351.291 (809620.70) Fitted MSE 13.715 (6.49) Fitted-Seed MSE 1080.6

In [80]:
for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
    ocps = [e["ocp_" + suffix] for e in all_evals]
    print(name, f"OC_p mean {statistics.mean(ocps):.3f} ({statistics.stdev(ocps):.2f})")
print()

for suffix, name in [("s", "Seed"), ("f", "Fitted"), ("fs", "Fitted-Seed")]:
    mses = [e["mse_" + suffix] for e in evals]
    print(name, f"MSE med {statistics.median(mses):.3f} mean {statistics.mean(mses):.3f} ({statistics.stdev(mses):.2f})") 
    


Seed OC_p mean 0.647 (0.11)
Fitted OC_p mean 0.597 (0.10)
Fitted-Seed OC_p mean 0.501 (0.14)

Seed MSE med 1.327 mean 1.418 (0.75)
Fitted MSE med 12.054 mean 10.173 (5.82)
Fitted-Seed MSE med 6.178 mean 6.286 (2.27)


# Where are we now

So far I have not found a formulation based on fitted dimensions that outperforms seed-based dimensions on unseen data for the Grand datasets. I still think our current best hypothesis is that words of the same category have too many properties in common, so that the direction of the fitted dimension is underdetermined, and the model can overfit to the given ratings. And I still think that using seeds whiler computing fitted dimensions is our best option to "tell" the model which property we mean. 

Things I've tried so far:

* Throwing the seeds in with the category words, but with extreme ratings. I've used numbers that are 0.5 to 1 units out from the most extreme ratings, in terms of z-scores. 

* Making a two-part loss function, where one part is about having a high cosine similarity to given seed-based dimensions, and the other part is about matching human ratings on category words. But I'm not sure I've been doing it right, I don't have much epxerience with pytorch. Setting the mixing parameter alpha to zero should give all weight to the seed dimensions, and should get me numbers that are basically the same as with a seed-based dimension -- but it doesn't, it's lower. So what was I doing wrong?

* If we add high match with given seed-based dimensions to the loss functions, could we in principle also learn weights for the different seed-based dimensions, so that it would learn to give high weight to the ones that match with the human ratings, and low weight to the ones that are pointing in the wrong direction?

I'm making a new notebook with just the definitions needed to further explore the train/test problem. I'll also make a development set so that we don't always test on all the Grand data.
