In [None]:
!pip install gensim
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
from gensim.downloader import load

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
#identifier of the embedding set that is passed to gensim.downloader's load()
WORD2VEC_MODEL_NAME = "word2vec-google-news-300"
model = load(WORD2VEC_MODEL_NAME)



In [None]:
#functions to be used later
def normalize(vec):
  """Return ``vec`` divided by its norm (as computed by ``np.linalg.norm``)."""
  magnitude = np.linalg.norm(vec)
  return vec / magnitude

def distance_with_concept(word, concept1, concept2, model):
  """Project ``word`` onto the axis that runs from ``concept1`` to ``concept2``.

  The word vector is scaled to unit length, the axis is the unit-length
  difference ``model[concept2] - model[concept1]``, and the returned value is
  their dot product (i.e. the cosine between the word vector and the axis).

  Raises:
    ValueError: if ``word``, ``concept1`` or ``concept2`` is not a key of
      ``model.key_to_index``.
  """
  vocabulary = model.key_to_index
  for token in (word, concept1, concept2):
    if token in vocabulary:
      continue
    raise ValueError(f"{token} not in vocabulary")

  unit_word = normalize(model[word])

  #direction pointing from concept1 towards concept2, scaled to unit length
  direction = model[concept2] - model[concept1]
  unit_axis = normalize(direction)

  return np.dot(unit_word, unit_axis)

#extended version with multiple anchor pairs
#to make sure all anchor words are used, I took the average of all high anchor words and the average of all low anchor words
#then I obtained the axis vector for each dimension by subtracting the low average from the high average
def distance_with_concepts_multi_averaged(word, low_anchors, high_anchors, model):
  """Score ``word`` on the axis from the mean low-anchor vector to the mean high-anchor vector.

  The vectors of ``low_anchors`` and of ``high_anchors`` are averaged separately,
  so the two collections may have different lengths and every anchor contributes.
  The axis is the unit-length difference (high mean - low mean); the result is the
  dot product of that axis with the unit-length vector of ``word``.

  Args:
    word: word to score.
    low_anchors: iterable of words marking the low end of the dimension.
    high_anchors: iterable of words marking the high end of the dimension.
    model: embedding lookup supporting ``model[w]`` and ``model.key_to_index``.

  Returns:
    The dot product described above (a numpy scalar).

  Raises:
    TypeError: if an anchor argument is a single string instead of a collection of words.
    ValueError: if an anchor collection is empty, or if any word is not in the vocabulary.
  """
  #a bare string would otherwise be split into single characters by list()
  if isinstance(low_anchors, str) or isinstance(high_anchors, str):
    raise TypeError("low_anchors and high_anchors must be collections of words, not a single string")

  #materialize so that tuples and other iterables are accepted as well as lists
  low_anchors = list(low_anchors)
  high_anchors = list(high_anchors)

  #np.mean over an empty list is NaN, which would silently turn the score into NaN
  if not low_anchors or not high_anchors:
    raise ValueError("low_anchors and high_anchors must each contain at least one word")

  for w in [word, *low_anchors, *high_anchors]:
    if w not in model.key_to_index:
      raise ValueError(f"{w} not in vocabulary")
  v_word = normalize(model[word])

  v_high_mean = np.mean([model[w] for w in high_anchors], axis=0)
  v_low_mean = np.mean([model[w] for w in low_anchors], axis=0)

  #axis points from the low end of the dimension towards the high end
  avg_axis = normalize(v_high_mean - v_low_mean)

  return np.dot(v_word, avg_axis)

In [None]:
#looking at some examples
#the low and high anchor lists may have different lengths: each list is averaged
#before the axis is built, so every anchor word listed here contributes to the score
example_word = "able"
example_anchors = {
    "valence": (
        ["unhappy", "annoyed", "unsatisfied", "melancholic", "despaired", "bored"],
        ["happy", "pleased", "satisfied", "contented", "hopeful"],
    ),
    #"unaroused" was taken out because it is not in the vocabulary
    #"awake" replaces "wide-awake", which is not in the vocabulary
    "arousal": (
        ["relaxed", "calm", "sluggish", "dull", "sleepy"],
        ["stimulated", "excited", "frenzied", "jittery", "awake", "aroused"],
    ),
    #"control" replaces "in control" and "cared" replaces "cared-for";
    #neither original phrase is in the vocabulary
    "dominance": (
        ["controlled", "influenced", "cared", "awed", "submissive", "guided"],
        ["control", "influential", "important", "dominant", "autonomous", "controlling"],
    ),
}

#each entry maps an axis name to its (low anchors, high anchors) pair
for axis_name, (low, high) in example_anchors.items():
    print(f"{example_word} on {axis_name} axis:", distance_with_concepts_multi_averaged(example_word, low_anchors=low, high_anchors=high, model=model))

able on valence axis: 0.25455704
able on arousal axis: 0.08567465
able on dominance axis: -0.03819998


In [None]:
#define anchor words using instructions for participants in Warriner et al. (2013)
#each dimension has a list for its low end and a list for its high end

#valence
valence_low = ["unhappy", "annoyed", "unsatisfied", "melancholic", "despaired", "bored"]
valence_high = ["happy", "pleased", "satisfied", "contented", "hopeful"]

#arousal
arousal_low = ["relaxed", "calm", "sluggish", "dull", "sleepy"]
arousal_high = ["stimulated", "excited", "frenzied", "jittery", "awake", "aroused"]

#dominance
dominance_low = ["controlled", "influenced", "cared", "awed", "submissive", "guided"]
dominance_high = ["control", "influential", "important", "dominant", "autonomous", "controlling"]

In [None]:
#ask for the word list through the Colab upload dialog
from google.colab import files
uploaded = files.upload() #choose "words" file

#wrap the uploaded content in a file-like buffer and parse it as an Excel sheet
import io
words_buffer = io.BytesIO(uploaded['words.xlsx'])
df = pd.read_excel(words_buffer)
print(df.head())

Saving words.xlsx to words.xlsx
          Word
0     aardvark
1      abalone
2      abandon
3  abandonment
4        abbey


In [None]:
#compute embedding scores of the three emotions for each word
#(low, high) anchor lists in the order valence, arousal, dominance
anchor_pairs = [
    (valence_low, valence_high),
    (arousal_low, arousal_high),
    (dominance_low, dominance_high),
]
valence_scores, arousal_scores, dominance_scores = [], [], []

for word in df['Word']:
    if word not in model.key_to_index:
        #words missing from the embedding vocabulary get the placeholder string "na"
        print(f"'{word}' not in vocabulary, assigning 'na'.")
        v, a, d = "na", "na", "na"
    else:
        v, a, d = [
            distance_with_concepts_multi_averaged(word, low, high, model)
            for low, high in anchor_pairs
        ]

    valence_scores.append(v)
    arousal_scores.append(a)
    dominance_scores.append(d)

# one row per input word, in the same order as df
score_columns = {
    'Word': df['Word'],
    'valence': valence_scores,
    'arousal': arousal_scores,
    'dominance': dominance_scores,
}
df_scores = pd.DataFrame(score_columns)

# write the scores to an Excel file (without the index column)
df_scores.to_excel("word_emotional_scores.xlsx", index=False)

'acknowledgement' not in vocabulary, assigning 'na'.
'action figure' not in vocabulary, assigning 'na'.
'aircraft carrier' not in vocabulary, assigning 'na'.
'alarm clock' not in vocabulary, assigning 'na'.
'altar boy' not in vocabulary, assigning 'na'.
'apple juice' not in vocabulary, assigning 'na'.
'archaeology' not in vocabulary, assigning 'na'.
'axe' not in vocabulary, assigning 'na'.
'barbie doll' not in vocabulary, assigning 'na'.
'baseball bat' not in vocabulary, assigning 'na'.
'bass guitar' not in vocabulary, assigning 'na'.
'black snake' not in vocabulary, assigning 'na'.
'blue spruce' not in vocabulary, assigning 'na'.
'boa constrictor' not in vocabulary, assigning 'na'.
'board game' not in vocabulary, assigning 'na'.
'brass knuckle' not in vocabulary, assigning 'na'.
'breaking and entering' not in vocabulary, assigning 'na'.
'business man' not in vocabulary, assigning 'na'.
'caddie' not in vocabulary, assigning 'na'.
'can opener' not in vocabulary, assigning 'na'.
'catalog

In [None]:
#ask for the second Excel file through the Colab upload dialog
uploaded = files.upload() #choose "Warriner_et_al._(2013)" file

#wrap the uploaded content in a file-like buffer and parse it as an Excel sheet
warriner_buffer = io.BytesIO(uploaded['Warriner_et_al._(2013).xlsx'])
df = pd.read_excel(warriner_buffer)
print(df.head())

Saving Warriner_et_al._(2013).xlsx to Warriner_et_al._(2013).xlsx
   Unnamed: 0         Word  V.Mean.Sum  V.SD.Sum  V.Rat.Sum  A.Mean.Sum  \
0           1     aardvark        6.26      2.21         19        2.41   
1           2      abalone        5.30      1.59         20        2.65   
2           3      abandon        2.84      1.54         19        3.73   
3           4  abandonment        2.63      1.74         19        4.95   
4           5        abbey        5.85      1.69         20        2.20   

   A.SD.Sum  A.Rat.Sum  D.Mean.Sum  D.SD.Sum  ...  A.Rat.L  A.Mean.H  A.SD.H  \
0      1.40         22        4.27      1.75  ...       11      2.55    1.29   
1      1.90         20        4.95      1.79  ...       12      2.38    1.92   
2      2.43         22        3.32      2.50  ...       11      3.82    2.14   
3      2.64         21        2.64      1.81  ...       14      5.29    2.63   
4      1.70         20        5.00      2.02  ...        9      2.55    1.92   

  

In [None]:
#parse the uploaded Excel file once more so that all of its columns are available
original_buffer = io.BytesIO(uploaded['Warriner_et_al._(2013).xlsx'])
df_original = pd.read_excel(original_buffer)

#left join on 'Word': every row of the original file is kept and the
#embedding scores are attached where df_scores has a matching word
merged_df = df_original.merge(df_scores, on='Word', how='left')

#show the first few rows of the merged DataFrame
display(merged_df.head())

#write the merged DataFrame to a new Excel file (without the index column)
merged_df.to_excel("combined.xlsx", index=False)

Unnamed: 0.1,Unnamed: 0,Word,V.Mean.Sum,V.SD.Sum,V.Rat.Sum,A.Mean.Sum,A.SD.Sum,A.Rat.Sum,D.Mean.Sum,D.SD.Sum,...,A.Rat.H,D.Mean.L,D.SD.L,D.Rat.L,D.Mean.H,D.SD.H,D.Rat.H,valence,arousal,dominance
0,1,aardvark,6.26,2.21,19,2.41,1.4,22,4.27,1.75,...,11,4.12,1.64,8,4.43,1.99,7,-0.149426,-0.064354,-0.041666
1,2,abalone,5.3,1.59,20,2.65,1.9,20,4.95,1.79,...,8,5.55,2.21,11,4.36,1.03,11,0.004894,-0.06554,0.02424
2,3,abandon,2.84,1.54,19,3.73,2.43,22,3.32,2.5,...,11,2.77,2.09,13,4.11,2.93,9,-0.199115,0.002747,-0.114986
3,4,abandonment,2.63,1.74,19,4.95,2.64,21,2.64,1.81,...,7,2.31,1.45,16,3.08,2.19,12,-0.133572,0.040948,-0.099895
4,5,abbey,5.85,1.69,20,2.2,1.7,20,5.0,2.02,...,11,4.83,2.18,18,5.43,1.62,7,-0.051369,-0.063565,-0.08843


In [None]:
#keep only the rows in which no cell holds the placeholder string "na"
#(the scoring loop assigns "na" to words that are not in the model vocabulary)
rows_without_na = merged_df.ne("na").all(axis=1)
#.copy() makes the filtered frame independent of the unfiltered one, so adding or
#overwriting columns on it later does not trigger pandas' SettingWithCopyWarning
merged_df = merged_df[rows_without_na].copy()
print(len(df))
print(len(merged_df)) #13787; original 13915

13915
13787


In [None]:
#rescale each human rating column to the range [-1, 1]
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(-1, 1))
#fit_transform refits the scaler on every call, so each column is scaled by its own min and max
for rating_col in ('V.Mean.Sum', 'A.Mean.Sum', 'D.Mean.Sum'):
    merged_df[f'{rating_col}_scaled'] = scaler.fit_transform(merged_df[[rating_col]])

In [None]:
#calculate correlation coefficients and p-values
from scipy.stats import pearsonr

cols = ['valence', 'arousal', 'dominance', 'V.Mean.Sum_scaled', 'A.Mean.Sum_scaled', 'D.Mean.Sum_scaled']

#convert columns to numeric, coercing anything unparseable to NaN
for col in cols:
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

#drop rows that now contain NaN in the specified columns
df_cleaned = merged_df.dropna(subset=cols)


#create empty DataFrame for formatted results
combined_matrix = pd.DataFrame(index=cols, columns=cols, dtype=object)

#compute correlation and p-value for each pair
#every column in cols was converted by pd.to_numeric above, so all of them are
#numeric here and no per-pair dtype check is needed
for i in cols:
    for j in cols:
        corr, pval = pearsonr(df_cleaned[i], df_cleaned[j])
        #both values are rounded to 3 decimals, so p-values below 0.0005 print as 0.000
        combined_matrix.loc[i, j] = f"{corr:.3f} ({pval:.3f})"


print("Correlation Coefficient (p-value) Matrix:")
print(combined_matrix)

Correlation Coefficient (p-value) Matrix:
                          valence         arousal       dominance  \
valence             1.000 (0.000)   0.084 (0.000)   0.368 (0.000)   
arousal             0.084 (0.000)   1.000 (0.000)   0.072 (0.000)   
dominance           0.368 (0.000)   0.072 (0.000)   1.000 (0.000)   
V.Mean.Sum_scaled   0.341 (0.000)   0.004 (0.604)   0.001 (0.937)   
A.Mean.Sum_scaled  -0.136 (0.000)   0.148 (0.000)  -0.055 (0.000)   
D.Mean.Sum_scaled   0.346 (0.000)  -0.005 (0.581)   0.053 (0.000)   

                  V.Mean.Sum_scaled A.Mean.Sum_scaled D.Mean.Sum_scaled  
valence               0.341 (0.000)    -0.136 (0.000)     0.346 (0.000)  
arousal               0.004 (0.604)     0.148 (0.000)    -0.005 (0.581)  
dominance             0.001 (0.937)    -0.055 (0.000)     0.053 (0.000)  
V.Mean.Sum_scaled     1.000 (0.000)    -0.183 (0.000)     0.718 (0.000)  
A.Mean.Sum_scaled    -0.183 (0.000)     1.000 (0.000)    -0.178 (0.000)  
D.Mean.Sum_scaled     0.718 (0