### A newer embedding model that may be interesting to use, with more parameters and more embedding dimensions
Zhang et al. (2025). Qwen3 Embedding: Advancing Text Embedding and Reranking Through Foundation Models. https://arxiv.org/abs/2506.05176

Specifically, the "Qwen3-Embedding-0.6B" model: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B

In [None]:
#run this on a GPU instead of the CPU to reduce processing time
#on Google Colab: open "Runtime", choose "Change runtime type", select "GPU", then click "Save"

In [1]:
#install dependencies
!pip install transformers accelerate sentencepiece
import torch
from transformers import AutoModel, AutoTokenizer
import numpy as np
import pandas as pd



In [2]:
#load the model + tokenizer
#downloaded from Hugging Face: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B
model_name = "Qwen/Qwen3-Embedding-0.6B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# `dtype` replaces the `torch_dtype` keyword, which the installed transformers
# release reports as deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
# NOTE(review): transformers releases that predate the rename only understand
# `torch_dtype` -- confirm the installed version if this call fails.
model = AutoModel.from_pretrained(
    model_name, dtype="auto", device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

In [4]:
#test it out: embed one sentence and check the size of the result
text = ["This is a test sentence."]

# Tokenize the one-sentence batch, then move each tensor to the model's device
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    outputs = model(**inputs)
    # simple mean pool: average the last hidden state over dim=1
    embeddings = outputs.last_hidden_state.mean(dim=1)

embeddings.shape  #shape of the pooled embedding; the recorded run shows 1024 dimensions

torch.Size([1, 1024])

In [5]:
model.device  #the model is on the GPU when this reports "cuda"

device(type='cuda', index=0)

In [11]:
def embed_word(word):
    """Embed one word with the Qwen3 model and return it as a normalized vector.

    The word is tokenized as a one-element batch and moved to the model's
    device. The model runs without gradient tracking, its last hidden state is
    averaged over ``dim=1``, and the result is squeezed, cast to float32 and
    copied to a CPU NumPy array. That array is passed through ``normalize``
    before being returned.
    """
    batch = tokenizer([word], return_tensors="pt", padding=True, truncation=True)
    batch = batch.to(model.device)

    with torch.no_grad():
        hidden_states = model(**batch).last_hidden_state
        pooled = hidden_states.mean(dim=1).squeeze().float()
        vector = pooled.cpu().numpy()

    return normalize(vector)

def in_vocab(word):
    """Return True if the Qwen tokenizer produces at least one token for ``word``.

    Non-string values (for example ``None``, numbers, or the float NaN that
    pandas uses for blank spreadsheet cells) are reported as not tokenizable
    instead of being handed to the tokenizer.
    """
    if not isinstance(word, str):
        return False
    return len(tokenizer.tokenize(word)) > 0

def normalize(vec):
    """Divide ``vec`` by its norm as computed by ``np.linalg.norm``.

    No zero check is performed, so a zero-norm input results in a division
    by zero.
    """
    length = np.linalg.norm(vec)
    return vec / length

#vector dimension (single low/high)
def distance_with_concept(word, concept1, concept2):
    """Score ``word`` along the axis that runs from ``concept1`` to ``concept2``.

    All three inputs are embedded with ``embed_word``. The axis is the
    normalized difference ``concept2 - concept1`` and the score is the dot
    product of the word embedding with that axis.

    Raises:
        ValueError: if ``in_vocab`` rejects any of the three inputs.
    """
    terms = (word, concept1, concept2)
    for term in terms:
        if not in_vocab(term):
            raise ValueError(f"{term} not tokenizable by Qwen tokenizer")

    # Embedded in the order word, concept1, concept2
    target, start, end = (embed_word(term) for term in terms)

    direction = normalize(end - start)
    return np.dot(target, direction)

#vector dimension (averaged multiple pairs)
def distance_with_concepts_multi_averaged(word, low_anchors, high_anchors):
    """Score ``word`` along the axis between the averaged low and high anchors.

    The embeddings of ``high_anchors`` and of ``low_anchors`` are each averaged.
    The axis is the normalized difference (high mean minus low mean) and the
    score is the dot product of the word embedding with that axis.

    Raises:
        ValueError: if ``in_vocab`` rejects the word or any anchor.
    """
    for term in [word] + low_anchors + high_anchors:
        if not in_vocab(term):
            raise ValueError(f"{term} not tokenizable by Qwen tokenizer")

    target = embed_word(word)

    # High anchors are embedded before low anchors
    high_vectors = [embed_word(anchor) for anchor in high_anchors]
    low_vectors = [embed_word(anchor) for anchor in low_anchors]
    high_mean = np.mean(high_vectors, axis=0)
    low_mean = np.mean(low_vectors, axis=0)

    direction = normalize(high_mean - low_mean)
    return np.dot(target, direction)

#vector dimension (average all pairwise high-low pairs)
def distance_with_concepts_multi_pairs(word, low_anchors, high_anchors):
    """Score ``word`` along the mean of all pairwise high-minus-low directions.

    A direction ``embed(high) - embed(low)`` is formed for every (high, low)
    anchor pair. The directions are averaged, the average is normalized, and
    the score is the dot product of the word embedding with that axis.

    Algebraically, the mean of all pairwise differences equals
    ``mean(high) - mean(low)``, so apart from floating-point rounding this
    gives the same score as ``distance_with_concepts_multi_averaged``.

    Args:
        word: the word to score.
        low_anchors: iterable of words marking the low end of the axis.
        high_anchors: iterable of words marking the high end of the axis.

    Raises:
        ValueError: if ``in_vocab`` rejects the word or any anchor.
    """
    # Materialize once so any iterable (list, tuple, generator) can be reused
    low_anchors = list(low_anchors)
    high_anchors = list(high_anchors)

    for term in [word, *low_anchors, *high_anchors]:
        if not in_vocab(term):
            raise ValueError(f"{term} not tokenizable by Qwen tokenizer")

    v_word = embed_word(word)

    # Embed every anchor exactly once. Calling embed_word inside the pair loop
    # would run the model 2 * len(high) * len(low) times for the same inputs.
    high_vecs = [embed_word(anchor) for anchor in high_anchors]
    low_vecs = [embed_word(anchor) for anchor in low_anchors]

    # Same pair order as a nested loop: high anchors outer, low anchors inner
    directions = [h_vec - l_vec for h_vec in high_vecs for l_vec in low_vecs]

    axis = normalize(np.mean(directions, axis=0))
    return np.dot(v_word, axis)

In [8]:
#anchors: word lists marking the two ends of the shape axis
#shape_low is passed as low_anchors and shape_high as high_anchors when scoring
shape_low  = ["spiky", "sharp", "pointy"]
shape_high = ["round", "curved", "smooth"]

In [9]:
# Colab-only cell: read the word list from a workbook uploaded via the browser
import io
from google.colab import files

uploaded = files.upload()  #choose "words" file

# files.upload() returns the file contents keyed by file name
words_bytes = uploaded['words.xlsx']
df = pd.read_excel(io.BytesIO(words_bytes))
print(df.head())

Saving words.xlsx to words.xlsx
         Word
0    necklace
1  watermelon
2        rind
3     handsaw
4     pumpkin


In [12]:
import sys
import time

shape_scores = []
total = len(df)

# enumerate() supplies a 1-based position for the progress display, so the
# counter no longer depends on the DataFrame's index labels. Iterating over the
# single 'Word' column also avoids building a Series per row as iterrows() does.
for position, word in enumerate(df['Word'], start=1):
    # --- progress display ---
    # "\r" rewrites the same output line on every iteration
    sys.stdout.write(f"\rProcessing {position}/{total} words...")
    sys.stdout.flush()
    # -------------------------

    # Non-string cells (e.g. NaN from a blank spreadsheet row) are not passed
    # to the tokenizer; they get the same "na" placeholder as untokenizable words.
    if isinstance(word, str) and in_vocab(word):
        score = distance_with_concepts_multi_pairs(word, shape_low, shape_high)
    else:
        score = "na"

    shape_scores.append(score)

# Print newline so the prompt is clean afterward
print()

# One score (or "na") per word, in the same order as df['Word']
df_scores = pd.DataFrame({
    'Word': df['Word'],
    'ShapeScoreAvg': shape_scores})

###note that batching would make processing time even faster

Processing 1756/1756 words...


In [13]:
# Upload the ratings workbook via the browser dialog
uploaded = files.upload()  #choose "Sidhu_et_al._(2021)_cleaned" file

# Parse the uploaded bytes into dfAll and preview the first rows
ratings_bytes = uploaded['Sidhu_et_al._(2021)_cleaned.xlsx']
dfAll = pd.read_excel(io.BytesIO(ratings_bytes))
print(dfAll.head())

Saving Sidhu_et_al._(2021)_cleaned.xlsx to Sidhu_et_al._(2021)_cleaned.xlsx
         Word  ShapeRating  LENGTH      Freq  Valence  Arousal  \
0    necklace     0.630586       6  0.309377     6.85     3.52   
1  watermelon     0.732784       9  0.206202     6.75     4.64   
2        rind     0.516273       4  0.222485      NaN      NaN   
3     handsaw     0.252606       6  0.015352      NaN      NaN   
4     pumpkin     0.741543       6  1.320069     7.00     3.43   

   Brys_Concreteness  image_m  image_sd  Compound  Proper   AoA   Size  \
0               4.96    6.543     0.840         0       0  5.00  2.771   
1               4.89      NaN       NaN         1       0  4.22    NaN   
2               4.48      NaN       NaN         0       0  8.95    NaN   
3               5.00      NaN       NaN         1       0  8.56    NaN   
4               4.90    6.849     0.359         0       0  4.78  3.206   

   Nmorph  CW_Concreteness  SoundScore  PerWin_Iconicity  Derived_Iconicity  
0   

In [15]:
#merge the embedding-based shape scores (df_scores) into the ratings frame (dfAll)
#how="left" keeps every dfAll row, including words that have no score
df_merge = dfAll.merge(df_scores, on="Word", how="left")

#removing na rows: keep only rows in which no column holds the "na" placeholder
#.copy() makes df_final an independent frame, so assigning columns to it later
#does not write into a slice of df_merge (which can trigger SettingWithCopyWarning)
df_final = df_merge[df_merge.ne("na").all(axis=1)].copy()
print(df_final.head())

         Word  ShapeRating  LENGTH      Freq  Valence  Arousal  \
0    necklace     0.630586       6  0.309377     6.85     3.52   
1  watermelon     0.732784       9  0.206202     6.75     4.64   
2        rind     0.516273       4  0.222485      NaN      NaN   
3     handsaw     0.252606       6  0.015352      NaN      NaN   
4     pumpkin     0.741543       6  1.320069     7.00     3.43   

   Brys_Concreteness  image_m  image_sd  Compound  Proper   AoA   Size  \
0               4.96    6.543     0.840         0       0  5.00  2.771   
1               4.89      NaN       NaN         1       0  4.22    NaN   
2               4.48      NaN       NaN         0       0  8.95    NaN   
3               5.00      NaN       NaN         1       0  8.56    NaN   
4               4.90    6.849     0.359         0       0  4.78  3.206   

   Nmorph  CW_Concreteness  SoundScore  PerWin_Iconicity  Derived_Iconicity  \
0     2.0         1.123024   -1.080652         -0.200000          -1.178247   


In [18]:
#Spearman correlations for SoundScore
from scipy.stats import spearmanr

cols = ['ShapeScoreAvg', 'SoundScore']

# Convert columns to numeric, coercing errors to NaN.
# df_final is rebound to the new frame returned by .assign() rather than
# assigned into column by column: it was produced by boolean filtering, and
# writing into such a slice can trigger SettingWithCopyWarning.
df_final = df_final.assign(
    **{col: pd.to_numeric(df_final[col], errors='coerce') for col in cols})

# Drop rows that now contain NaN in the specified columns
merged_df_cleaned = df_final.dropna(subset=cols)

#create empty DataFrame for formatted results
combined_matrix = pd.DataFrame(index=cols, columns=cols, dtype=object)

#compute correlation and p-value for each pair
#(both columns are numeric after the conversion above, so no dtype check is needed)
for i in cols:
    for j in cols:
        corr, pval = spearmanr(merged_df_cleaned[i], merged_df_cleaned[j])
        # three decimals: p-values below 0.0005 are shown as 0.000
        combined_matrix.loc[i, j] = f"{corr:.3f} ({pval:.3f})"


print("Spearman Correlation Coefficient (p-value) Matrix for SoundScore:")
print(combined_matrix)

Spearman Correlation Coefficient (p-value) Matrix for SoundScore:
               ShapeScoreAvg     SoundScore
ShapeScoreAvg  1.000 (0.000)  0.043 (0.069)
SoundScore     0.043 (0.069)  1.000 (0.000)


In [20]:
#Spearman correlations for ShapeRating
cols = ['ShapeScoreAvg', 'ShapeRating']

# Convert columns to numeric, coercing errors to NaN.
# df_final is rebound to the new frame returned by .assign() rather than
# assigned into column by column: it was produced by boolean filtering, and
# writing into such a slice can trigger SettingWithCopyWarning.
df_final = df_final.assign(
    **{col: pd.to_numeric(df_final[col], errors='coerce') for col in cols})

# Drop rows that now contain NaN in the specified columns
merged_df_cleaned = df_final.dropna(subset=cols)

# Create empty DataFrame for formatted results
combined_matrix = pd.DataFrame(index=cols, columns=cols, dtype=object)

# Compute Spearman correlation and p-value for each pair
# (both columns are numeric after the conversion above, so no dtype check is needed)
for i in cols:
    for j in cols:
        corr, pval = spearmanr(merged_df_cleaned[i], merged_df_cleaned[j])
        # three decimals: p-values below 0.0005 are shown as 0.000
        combined_matrix.loc[i, j] = f"{corr:.3f} ({pval:.3f})"

print("Spearman Correlation Coefficient (p-value) Matrix for ShapeRating:")
print(combined_matrix)

Spearman Correlation Coefficient (p-value) Matrix for ShapeRating:
               ShapeScoreAvg    ShapeRating
ShapeScoreAvg  1.000 (0.000)  0.111 (0.000)
ShapeRating    0.111 (0.000)  1.000 (0.000)


In [21]:
# Columns to include in the combined matrix
cols = ['ShapeScoreAvg', 'SoundScore', 'ShapeRating']

# Convert columns to numeric (coerce errors to NaN).
# df_final is rebound to the new frame returned by .assign() rather than
# assigned into column by column: it was produced by boolean filtering, and
# writing into such a slice can trigger SettingWithCopyWarning.
df_final = df_final.assign(
    **{col: pd.to_numeric(df_final[col], errors='coerce') for col in cols})

# Drop rows with ANY missing values in these columns, so every pair below is
# computed on the same set of rows
df_corr = df_final.dropna(subset=cols)

# Create empty result DataFrame
combined_matrix = pd.DataFrame(index=cols, columns=cols, dtype=object)

# Compute Spearman correlations for all pairs
# (all columns are numeric after the conversion above, so no dtype check is needed)
for i in cols:
    for j in cols:
        corr, pval = spearmanr(df_corr[i], df_corr[j])
        # three decimals: p-values below 0.0005 are shown as 0.000
        combined_matrix.loc[i, j] = f"{corr:.3f} ({pval:.3f})"

print("Combined Spearman Correlation Coefficient (p-value) Matrix:")
print(combined_matrix)


Combined Spearman Correlation Coefficient (p-value) Matrix:
               ShapeScoreAvg     SoundScore    ShapeRating
ShapeScoreAvg  1.000 (0.000)  0.043 (0.069)  0.111 (0.000)
SoundScore     0.043 (0.069)  1.000 (0.000)  0.213 (0.000)
ShapeRating    0.111 (0.000)  0.213 (0.000)  1.000 (0.000)
