### Sound symbolism in real words.

To what extent are the meanings of words influenced by their sounds? This project looks specifically at "shape sound symbolism", i.e. whether the sounds of a word are associated with round versus spiky shapes.



In [None]:
!pip install gensim
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
from gensim.downloader import load

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
# Load the pretrained "word2vec-google-news-300" embeddings through gensim's downloader.
# NOTE(review): presumably a large download on a fresh runtime — confirm before relying on Run All.
model = load("word2vec-google-news-300")  # the larger word2vec model



In [None]:
#functions to be used later
def normalize(vec):
  """Scale ``vec`` so that its norm is 1.

  Args:
    vec: numeric array; its norm is taken with ``np.linalg.norm`` using the
      default arguments.

  Returns:
    ``vec`` divided by its norm.

  Raises:
    ValueError: if the norm of ``vec`` is zero. Dividing by it would fill the
      result with NaN/inf and silently corrupt every score computed from it.
  """
  norm = np.linalg.norm(vec)
  if norm == 0:
    raise ValueError("cannot normalize a vector with zero norm")
  return vec/norm

def distance_with_concept(word, concept1, concept2, model):
  """Project a word onto the embedding axis running from concept1 to concept2.

  The word vector and the axis (vector of concept2 minus vector of concept1)
  are both scaled to unit length before taking their dot product, so positive
  values lean towards concept2 and negative values towards concept1.

  Args:
    word: vocabulary key of the word to score.
    concept1: vocabulary key for the start of the axis.
    concept2: vocabulary key for the end of the axis.
    model: object exposing ``key_to_index`` for membership tests and
      ``model[key]`` for vector lookup.

  Returns:
    Dot product of the unit word vector with the unit axis.

  Raises:
    ValueError: if any of the three words is missing from ``model.key_to_index``.
  """
  missing = [w for w in (word, concept1, concept2) if w not in model.key_to_index]
  if missing:
    # Report the first missing word, in argument order.
    w = missing[0]
    raise ValueError(f"{w} not in vocabulary")

  unit_word = normalize(model[word])
  # Direction from concept1 towards concept2, scaled to unit length.
  unit_axis = normalize(model[concept2] - model[concept1])
  return np.dot(unit_word, unit_axis)

#extended version with multiple anchor pairs
def distance_with_concepts_multi_averaged(word, low_anchors, high_anchors, model):
  """Project a word onto an axis built from several low/high anchor words.

  The axis is mean(high anchor vectors) - mean(low anchor vectors), scaled to
  unit length. The word vector is scaled to unit length as well, so the
  result is the cosine between the word and the axis.

  Args:
    word: vocabulary key of the word to score.
    low_anchors: vocabulary keys marking the low (negative) end of the axis;
      any iterable of keys, or a single key given as a string.
    high_anchors: vocabulary keys marking the high (positive) end of the axis;
      any iterable of keys, or a single key given as a string.
    model: object exposing ``key_to_index`` for membership tests and
      ``model[key]`` for vector lookup.

  Returns:
    Dot product of the unit word vector with the unit axis.

  Raises:
    ValueError: if either anchor collection is empty, or if any word is
      missing from ``model.key_to_index``.
  """
  # A bare string is one anchor, not a sequence of characters.
  if isinstance(low_anchors, str):
    low_anchors = [low_anchors]
  if isinstance(high_anchors, str):
    high_anchors = [high_anchors]
  # Materialise so tuples, sets and generators work as well as lists.
  low_anchors = list(low_anchors)
  high_anchors = list(high_anchors)
  if not low_anchors or not high_anchors:
    # np.mean over an empty list would quietly return NaN.
    raise ValueError("low_anchors and high_anchors must each contain at least one word")

  for w in [word, *low_anchors, *high_anchors]:
    if w not in model.key_to_index:
      raise ValueError(f"{w} not in vocabulary")

  v_word = normalize(model[word])
  v_high_mean = np.mean([model[w] for w in high_anchors], axis=0)
  v_low_mean = np.mean([model[w] for w in low_anchors], axis=0)
  avg_axis = normalize(v_high_mean - v_low_mean)

  return np.dot(v_word, avg_axis)

def distance_with_concepts_multi_pairs(word, low_anchors, high_anchors, model):
    """Project a word onto the average of all high-minus-low anchor directions.

    The axis is the mean of (high - low) over every high/low anchor pair,
    scaled to unit length; the word vector is scaled to unit length too.
    That mean equals mean(high) - mean(low), so this returns the same score
    as distance_with_concepts_multi_averaged up to floating-point rounding.

    Args:
        word: vocabulary key of the word to score.
        low_anchors: vocabulary keys marking the low (negative) end of the
            axis; any iterable of keys, or a single key given as a string.
        high_anchors: vocabulary keys marking the high (positive) end of the
            axis; any iterable of keys, or a single key given as a string.
        model: object exposing ``key_to_index`` for membership tests and
            ``model[key]`` for vector lookup.

    Returns:
        Dot product of the unit word vector with the unit axis.

    Raises:
        ValueError: if either anchor collection is empty, or if any word is
            missing from ``model.key_to_index``.
    """
    # A bare string is one anchor, not a sequence of characters.
    if isinstance(low_anchors, str):
        low_anchors = [low_anchors]
    if isinstance(high_anchors, str):
        high_anchors = [high_anchors]
    # Materialise so tuples, sets and generators work as well as lists.
    low_anchors = list(low_anchors)
    high_anchors = list(high_anchors)
    if not low_anchors or not high_anchors:
        # Averaging over zero pairs would quietly return NaN.
        raise ValueError("low_anchors and high_anchors must each contain at least one word")

    for w in [word, *low_anchors, *high_anchors]:
        if w not in model.key_to_index:
            raise ValueError(f"{w} not in vocabulary")

    v_word = normalize(model[word])

    # In the mean over all pairs, each high vector appears len(low_anchors)
    # times and each low vector len(high_anchors) times before dividing by
    # len(high_anchors) * len(low_anchors). That reduces to the difference of
    # the two group means, so the pair list never has to be built.
    v_high_mean = np.mean([model[high] for high in high_anchors], axis=0)
    v_low_mean = np.mean([model[low] for low in low_anchors], axis=0)
    avg_axis = normalize(v_high_mean - v_low_mean)

    return np.dot(v_word, avg_axis)

In [None]:
#Research on shape sound symbolism has primarily used "round", "curved", "smooth", "spiky", "sharp", and "pointy" as anchor words,
#so I use the same words as anchors here.

In [None]:
#examples
# Anchor words for the multi-anchor demo, defined once so that every call
# below scores against exactly the same axis. (The "cake" calls previously
# passed "curve" instead of "curved", i.e. a different axis from the others.)
demo_low_anchors = ["spiky", "sharp", "pointy"]
demo_high_anchors = ["round", "curved", "smooth"]

print("example 1:")
print("cactus on spiky-round axis:", distance_with_concept("cactus", "spiky", "round", model))
print("balloon on spiky-round axis:", distance_with_concept("balloon", "spiky", "round", model))

print("\nexample 2 (multi-anchor):")
# Score the same three words with both multi-anchor functions, in the same
# order as before: all words with the averaged version, then all with the
# pairwise version.
for scorer in (distance_with_concepts_multi_averaged, distance_with_concepts_multi_pairs):
    for demo_word in ("cactus", "balloon", "cake"):
        print(f"{demo_word} on shape (spiky-round) axis:",
              scorer(demo_word, low_anchors=demo_low_anchors, high_anchors=demo_high_anchors, model=model))

print("\nexample 3 (gender axis):")
print("king on female-male axis:", distance_with_concept("king", "female", "male", model))
print("queen on female-male axis:", distance_with_concept("queen", "female", "male", model))

example 1:
cactus on spiky-round axis: -0.24701288
balloon on spiky-round axis: -0.075282946

example 2 (multi-anchor):
cactus on shape (spiky-round) axis: -0.20043871
balloon on shape (spiky-round) axis: -0.033829622
cake on shape (spiky-round) axis: -0.020619528
cactus on shape (spiky-round) axis: -0.20043871
balloon on shape (spiky-round) axis: -0.03382962
cake on shape (spiky-round) axis: -0.02061953

example 3 (gender axis):
king on female-male axis: 0.096518114
queen on female-male axis: -0.13230057


In [None]:
from google.colab import files
# Ask the user for a file; the result is indexed by file name below.
uploaded = files.upload()  # choose the "words" file

import io
# Wrap the uploaded content in BytesIO so pandas can read it like a file.
# NOTE(review): the lookup assumes the upload is stored under exactly 'words.xlsx';
# presumably any other name raises KeyError here — confirm.
df = pd.read_excel(io.BytesIO(uploaded['words.xlsx']))
print(df.head())

Saving words.xlsx to words.xlsx
         Word
0    necklace
1  watermelon
2        rind
3     handsaw
4     pumpkin


In [None]:
# Anchor words for the two ends of the shape axis. These are the words that
# are commonly used in sound symbolism research.
shape_low  = ["spiky", "sharp", "pointy"]   # low (spiky) end of the axis
shape_high = ["round", "curved", "smooth"]  # high (round) end of the axis

In [None]:
# Score every word of the uploaded list on the shape axis with
# distance_with_concepts_multi_averaged. Words that the model does not know
# are reported and given the placeholder "na".
shape_scores = []

for word in df['Word']:
    if word not in model.key_to_index:
        print(f"'{word}' not in vocabulary, assigning 'na'.")
        shape_scores.append("na")
        continue
    shape_scores.append(distance_with_concepts_multi_averaged(word, shape_low, shape_high, model))

# Pair each word with its score in a new frame (one row per input word).
df_scores = pd.DataFrame(dict(Word=df['Word'], shape=shape_scores))

# Write the scores to an Excel workbook.
df_scores.to_excel("word_shape_scores.xlsx", index=False)

'oldsmobile' not in vocabulary, assigning 'na'.


In [None]:
#upload file
# Reuses `files` and `io` imported in an earlier cell and rebinds `uploaded`.
uploaded = files.upload() #choose "Sidhu_et_al._(2021)_cleaned" file

#import file
# Rebinds `df`: from here on it holds the table read from this workbook
# instead of the word list loaded earlier.
df = pd.read_excel(io.BytesIO(uploaded['Sidhu_et_al._(2021)_cleaned.xlsx']))
print(df.head())

Saving Sidhu_et_al._(2021)_cleaned.xlsx to Sidhu_et_al._(2021)_cleaned.xlsx
         Word  ShapeRating  LENGTH      Freq  Valence  Arousal  \
0    necklace     0.630586       6  0.309377     6.85     3.52   
1  watermelon     0.732784       9  0.206202     6.75     4.64   
2        rind     0.516273       4  0.222485      NaN      NaN   
3     handsaw     0.252606       6  0.015352      NaN      NaN   
4     pumpkin     0.741543       6  1.320069     7.00     3.43   

   Brys_Concreteness  image_m  image_sd  Compound  Proper   AoA   Size  \
0               4.96    6.543     0.840         0       0  5.00  2.771   
1               4.89      NaN       NaN         1       0  4.22    NaN   
2               4.48      NaN       NaN         0       0  8.95    NaN   
3               5.00      NaN       NaN         1       0  8.56    NaN   
4               4.90    6.849     0.359         0       0  4.78  3.206   

   Nmorph  CW_Concreteness  SoundScore  PerWin_Iconicity  Derived_Iconicity  
0   

In [None]:
# Left-join the embedding scores onto the ratings table by word, so every
# row of the ratings table is kept.
merged_df = df.merge(df_scores, how='left', on='Word')

# Preview the first rows of the joined table.
display(merged_df.head())

# Save the joined table as an Excel workbook.
merged_df.to_excel("combined.xlsx", index=False)

Unnamed: 0,Word,ShapeRating,LENGTH,Freq,Valence,Arousal,Brys_Concreteness,image_m,image_sd,Compound,Proper,AoA,Size,Nmorph,CW_Concreteness,SoundScore,PerWin_Iconicity,Derived_Iconicity,shape
0,necklace,0.630586,6,0.309377,6.85,3.52,4.96,6.543,0.84,0,0,5.0,2.771,2.0,1.123024,-1.080652,-0.2,-1.178247,-0.10693
1,watermelon,0.732784,9,0.206202,6.75,4.64,4.89,,,1,0,4.22,,2.0,1.112606,0.584514,,1.145951,-0.090577
2,rind,0.516273,4,0.222485,,,4.48,,,0,0,8.95,,1.0,1.111874,0.083942,,0.009816,-0.059618
3,handsaw,0.252606,6,0.015352,,,5.0,,,1,0,8.56,,2.0,1.100676,0.083942,,-0.178646,-0.181465
4,pumpkin,0.741543,6,1.320069,7.0,3.43,4.9,6.849,0.359,0,0,4.78,3.206,1.0,1.098661,-0.477923,-0.583333,-0.972626,-0.07271


In [None]:
# Keep only rows in which no cell holds the "na" placeholder written for
# out-of-vocabulary words. .copy() makes the result an independent frame, so
# new columns can be assigned to it later without SettingWithCopyWarning.
merged_df = merged_df[merged_df.ne("na").all(axis=1)].copy()
print(len(df))         # rows before filtering (1756 in the recorded run)
print(len(merged_df))  # rows after filtering (1755 in the recorded run)

1756
1755


In [None]:
#normalize human ratings
from sklearn.preprocessing import MinMaxScaler

# Rescale each human rating column onto [-1, 1] and store it next to the
# original as "<column>_normalized". The scaler is re-fitted for every column.
scaler = MinMaxScaler(feature_range=(-1, 1))
for rating_col in ('SoundScore', 'ShapeRating'):
    merged_df[f'{rating_col}_normalized'] = scaler.fit_transform(merged_df[[rating_col]])

In [None]:
from scipy.stats import pearsonr

cols = ['shape', 'SoundScore_normalized']

# Convert columns to numeric, coercing errors to NaN
for col in cols:
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

# Drop rows that now contain NaN in the specified columns
merged_df_cleaned = merged_df.dropna(subset=cols)

# Table of "r (p)" strings, one cell per pair of columns.
combined_matrix = pd.DataFrame(index=cols, columns=cols, dtype=object)

# Every column in `cols` is numeric after the coercion above, so pearsonr can
# be called directly without a dtype check.
for i in cols:
    for j in cols:
        corr, pval = pearsonr(merged_df_cleaned[i], merged_df_cleaned[j])
        # Three decimals would print very small p-values as "0.000", which
        # reads as exactly zero; report them as "<0.001" instead.
        pval_text = "<0.001" if pval < 0.001 else f"{pval:.3f}"
        combined_matrix.loc[i, j] = f"{corr:.3f} ({pval_text})"


print("Correlation Coefficient (p-value) Matrix for SoundScore:")
print(combined_matrix)

Correlation Coefficient (p-value) Matrix:
                                shape SoundScore_normalized
shape                   1.000 (0.000)        -0.019 (0.416)
SoundScore_normalized  -0.019 (0.416)         1.000 (0.000)


In [None]:
cols = ['shape', 'ShapeRating_normalized']

# Convert columns to numeric, coercing errors to NaN
for col in cols:
    merged_df[col] = pd.to_numeric(merged_df[col], errors='coerce')

# Drop rows that now contain NaN in the specified columns
merged_df_cleaned = merged_df.dropna(subset=cols)

# Table of "r (p)" strings, one cell per pair of columns.
combined_matrix = pd.DataFrame(index=cols, columns=cols, dtype=object)

# Every column in `cols` is numeric after the coercion above, so pearsonr can
# be called directly without a dtype check.
for i in cols:
    for j in cols:
        corr, pval = pearsonr(merged_df_cleaned[i], merged_df_cleaned[j])
        # Three decimals would print very small p-values as "0.000", which
        # reads as exactly zero; report them as "<0.001" instead.
        pval_text = "<0.001" if pval < 0.001 else f"{pval:.3f}"
        combined_matrix.loc[i, j] = f"{corr:.3f} ({pval_text})"


print("Correlation Coefficient (p-value) Matrix for ShapeRating:")
print(combined_matrix)

Correlation Coefficient (p-value) Matrix for ShapeRating:
                                shape ShapeRating_normalized
shape                   1.000 (0.000)          0.083 (0.000)
ShapeRating_normalized  0.083 (0.000)          1.000 (0.000)


In [None]:
#SoundScore: the correlation with the embedding-based shape score is very low (r = -0.019) and not significant (p = 0.416).
#This may mean that the associations people make based on the sounds of words are not related to the actual meanings of those words.

#ShapeRating: the correlation is also very low (r = 0.083). Although it is statistically significant, it may not be meaningful because the coefficient is so small.
#This may mean that the perceived shape of an object does not depend on how the word for it is used alongside other words in the lexicon.
#People may use other cues to make these associations. It is also possible that the anchor words chosen in sound symbolism research are not consistent with the shape definitions people have in mind.