<a href="https://colab.research.google.com/github/martinbremm/embedding-comparison/blob/main/BERT_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [19]:
!pip install transformers



In [3]:
import torch
from transformers import BertTokenizer, BertModel

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd /content/drive/MyDrive/Datasets

/content/drive/MyDrive/Datasets


# Importing and preprocessing data

In [51]:
import pandas as pd

df = pd.read_csv("data_frame_processed_labeled.csv")
# filtering out missing values in descriptions column
df = df[["clean_description", "clean_fn"]].dropna()

In [7]:
# examples of semantically- and acoustically-similar sound-producing mechanisms

# acoustically similar sound-producing mechanism
acoustic_dict = {
    "impact": ["bump", "drop", "fall", "bounce", "clap", "slam", "clang", "step", "punch"],
    "explosion": ["blast", "explode", "bang", "blowout", "burst"],
    "vocal": ["talk", "scream", "hiss", "yell", "chant", "conversation", "murmur", "shout"],
    "nose": ["wheeze", "sneeze", "snort", "snore", "sniff"],
}

# semantically similar sound-producing mechanism
semantic_dict = {
    "locomotion": ["walk", "gallop", "run" , "march", "jump", "move"],
    "dog": ["howl", "bark", "growl", "whimper", "scratch", "pant"],
    "vehicle": ["rev", "drive", "pass", "accelerate", "brake"],
    "complex_sound" : ["fight", "box", "destruction", "accident", "wrestle", "bowl", "exercise"]
}

In [8]:
def create_tuples(dictionary):
    dictionary_words = [word for words in dictionary.values() for word in words]  # list of words used as index
        
    category = []
    for key, val in dictionary.items():
        category.extend([key]*len(val)) # list of category memberships

    similarity = ["acoustic_sim" if dictionary=="acoustic_sim" else "semantic_sim" for word in dictionary_words]
        
    tuples = list(zip(similarity, category, dictionary_words))
    return tuples

In [10]:
acoustic_df = pd.DataFrame(data=create_tuples(acoustic_dict), columns = ["Similarity", "Category", "Word"])
semantic_df = pd.DataFrame(data=create_tuples(semantic_dict), columns = ["Similarity", "Category", "Word"])


combined_df = pd.concat([acoustic_df, semantic_df], ignore_index = True)

In [11]:
combined_df

Unnamed: 0,Similarity,Category,Word
0,semantic_sim,impact,bump
1,semantic_sim,impact,drop
2,semantic_sim,impact,fall
3,semantic_sim,impact,bounce
4,semantic_sim,impact,clap
5,semantic_sim,impact,slam
6,semantic_sim,impact,clang
7,semantic_sim,impact,step
8,semantic_sim,impact,punch
9,semantic_sim,explosion,blast


In [49]:
# tokenizing the sentences into single words
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [52]:
tokens = df["clean_description"].apply(word_tokenize)
content = []
for word in combined_df["Word"]:
  word_df = df[tokens.apply(lambda sentence: word in sentence)]
  word_df.insert(0, "Word", [word]*len(word_df))
  word_df = word_df.reset_index(drop=True)
  content.append(word_df)
examples1 = pd.concat(content, ignore_index = True)

In [37]:
examples1

Unnamed: 0,Word,clean_description,clean_fn
0,bump,"[underwater, bump, bridge, mono, drops, falls,...",hit_concrete
1,bump,"[pinball, machine, hitting, knock, bump, amuse...",pinball_machine
2,bump,"[pinball, machine, hitting, knock, bump, hard,...",pinball_machine
3,bump,"[auto, crash, light, bump, hard, object, car, ...",car_crash
4,bump,"[auto, crash, light, bump, hard, object, car, ...",car_crash
...,...,...,...
74523,exercise,"[metal, drop, weight, plate, exercise, sport, ...",metal_drop
74524,exercise,"[metal, drop, weight, plate, exercise, sport, ...",metal_drop
74525,exercise,"[metal, drop, weight, plate, exercise, sport, ...",metal_drop
74526,exercise,"[metal, drop, weight, plate, exercise, sport, ...",metal_drop


In [53]:
content = []
for word in combined_df["Word"]:
  word_df = df[df["clean_description"].str.contains(" " + word + " ")]
  word_df.insert(0, "Word", [word]*len(word_df))
  word_df = word_df.reset_index(drop=True)
  content.append(word_df)
examples2 = pd.concat(content, ignore_index = True)

In [56]:
content = []
for word in combined_df["Word"]:
  word_df = df[df["clean_fn"].str.contains(word)]
  word_df.insert(0, "Word", [word]*len(word_df))
  word_df = word_df.reset_index(drop=True)
  content.append(word_df)
print(f"Contains examples of {len(content)} words!")

Contains examples of 51 words!


In [57]:
examples = pd.concat(content, ignore_index = True)
examples.to_csv("BERT_example_sentences.csv")

In [58]:
examples

Unnamed: 0,Word,clean_description,clean_fn
0,bump,wood en plank skid drag along bumpy surface,skid_wood_wooden_plank_drag_along_bumpy
1,bump,wood en plank skid drag along bumpy surface,skid_wood_wooden_plank_drag_along_bumpy
2,bump,video game electronic bump,video_game_electronic_bump
3,bump,luggage rolling bumps stop,luggage_rolling_bump
4,bump,body bumps wall various,body_bumps_wall_var
...,...,...,...
57617,exercise,exercise equipment weight lifting competition ...,exercise_equipment
57618,exercise,exercise equipment weight lifting competition ...,exercise_equipment
57619,exercise,exercise weight room training lifting bars wei...,exercise_weight_room
57620,exercise,exercise weight room training lifting bars wei...,exercise_weight_room


Creating Dataframe with example sentences obtained from out dataset (randomly chosen sentences including the words defined in the dictionaries)

In [None]:
content = [(df[df["clean_description"].str.contains(" " + word + " ")].sample(n=1, ignore_index=True)[0]) for word in combined_df["Word"]]

combined_df["example_sentence"] = content

In [None]:
combined_df.head(n=5)

Unnamed: 0,Similarity,Category,Word,example_sentence
0,semantic_sim,impact,bump,hits low bump impact
1,semantic_sim,impact,drop,truck door drop ground
2,semantic_sim,impact,fall,body falls fall human weeds
3,semantic_sim,impact,bounce,theater type folding seat bounce open close
4,semantic_sim,impact,clap,applause crowd small studio audience female cl...


# BERT model preprocessing

In [None]:
# Adding BERT tokens
marked_text = combined_df["example_sentence"].apply(lambda sen: "[CLS] " + sen + " [SEP]")
marked_text

0                      [CLS] hits low bump impact [SEP]
1                    [CLS] truck door drop ground [SEP]
2               [CLS] body falls fall human weeds [SEP]
3     [CLS] theater type folding seat bounce open cl...
4     [CLS] applause crowd small studio audience fem...
5                    [CLS] porsche hood slam shut [SEP]
6     [CLS] metal thuds clang ringing knife ching qu...
7                 [CLS] ladder step climb ladders [SEP]
8     [CLS] classic old movie face punch single hit ...
9     [CLS] auto bmw horn short blast close horns ca...
10    [CLS] background ds munich new years celebrati...
11    [CLS] german precision metal ratchet clink cru...
12    [CLS] noisemaker paper blowout horn party comi...
13                     [CLS] zaps short burst zap [SEP]
14    [CLS] animals horses horse vocals stallion stu...
15    [CLS] human scream teenage girl screams female...
16    [CLS] tape noise microcassette hiss high speed...
17    [CLS] siren english police constant stop n

In [None]:
## Splitting sentence into tokens

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenized_text = marked_text.apply(lambda sen: tokenizer.tokenize(sen))
tokenized_text

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

0               [[CLS], hits, low, bump, impact, [SEP]]
1             [[CLS], truck, door, drop, ground, [SEP]]
2       [[CLS], body, falls, fall, human, weeds, [SEP]]
3     [[CLS], theater, type, folding, seat, bounce, ...
4     [[CLS], applause, crowd, small, studio, audien...
5             [[CLS], porsche, hood, slam, shut, [SEP]]
6     [[CLS], metal, thud, ##s, clan, ##g, ringing, ...
7      [[CLS], ladder, step, climb, ladder, ##s, [SEP]]
8     [[CLS], classic, old, movie, face, punch, sing...
9     [[CLS], auto, bmw, horn, short, blast, close, ...
10    [[CLS], background, ds, munich, new, years, ce...
11    [[CLS], german, precision, metal, rat, ##chet,...
12    [[CLS], noise, ##maker, paper, blow, ##out, ho...
13      [[CLS], za, ##ps, short, burst, za, ##p, [SEP]]
14    [[CLS], animals, horses, horse, vocals, stalli...
15    [[CLS], human, scream, teenage, girl, screams,...
16    [[CLS], tape, noise, micro, ##cas, ##sett, ##e...
17    [[CLS], siren, english, police, constant, 

In [None]:
# Mapping token to BERT vocabulary index
indexed_tokens = tokenized_text.apply(lambda sen: tokenizer.convert_tokens_to_ids(sen))

In [None]:
# Zip words together with their indexes
token_id_tup = []
for ind, sen in enumerate(tokenized_text):
  token_id_tup.append(list(zip(sen, indexed_tokens[ind])))

In [None]:
# Zip words together with their indexes
for tup in token_id_tup[0]:
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))


[CLS]           101
hits          4,978
low           2,659
bump         16,906
impact        4,254
[SEP]           102


In [None]:
# Creating segment IDs for BERT to distinguish between sentences
segment_ids = []
for ind, sen in enumerate(tokenized_text):
  segment_ids.append([ind] * len(sen))

In [None]:
tokens_tensor = [torch.tensor([token]) for token in indexed_tokens]
tokens_tensor[0]

tensor([[  101,  4978,  2659, 16906,  4254,   102]])

In [None]:
segments_tensor = [torch.tensor([id]) for id in segment_ids]
segments_tensor[0]

tensor([[0, 0, 0, 0, 0, 0]])

In [None]:
zipped_text_token_segment = list(zip(tokenized_text, tokens_tensor, segments_tensor))

for tup in zipped_text_token_segment:
  print(f"Sentence: {tup[0]}")
  print(f"Token_tensor: {tup[1]}")
  print(f"Segment_id: {tup[2]}")
  

Sentence: ['[CLS]', 'hits', 'low', 'bump', 'impact', '[SEP]']
Token_tensor: tensor([[  101,  4978,  2659, 16906,  4254,   102]])
Segment_id: tensor([[0, 0, 0, 0, 0, 0]])
Sentence: ['[CLS]', 'truck', 'door', 'drop', 'ground', '[SEP]']
Token_tensor: tensor([[ 101, 4744, 2341, 4530, 2598,  102]])
Segment_id: tensor([[1, 1, 1, 1, 1, 1]])
Sentence: ['[CLS]', 'body', 'falls', 'fall', 'human', 'weeds', '[SEP]']
Token_tensor: tensor([[  101,  2303,  4212,  2991,  2529, 20777,   102]])
Segment_id: tensor([[2, 2, 2, 2, 2, 2, 2]])
Sentence: ['[CLS]', 'theater', 'type', 'folding', 'seat', 'bounce', 'open', 'close', '[SEP]']
Token_tensor: tensor([[  101,  4258,  2828, 12745,  2835, 17523,  2330,  2485,   102]])
Segment_id: tensor([[3, 3, 3, 3, 3, 3, 3, 3, 3]])
Sentence: ['[CLS]', 'applause', 'crowd', 'small', 'studio', 'audience', 'female', 'clap', 'unison', 'slow', 'tempo', 'clapping', 'crowd', '##fe', '##mal', '##e', 'crowd', '##chee', '##ring', 'indoor', '[SEP]']
Token_tensor: tensor([[  101, 20

# Evaluating the BERT model

In [None]:
# suppressing the output with the magic command "capture"
%%capture

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )

# setting model in eval mode (i.e., feed-forward operation)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# apply BERT to text and collect all hidden states form the 12 BERT layers

with torch.no_grad():

  # evaluating the model only on the first pair of token tensor and segments tensor
  outputs = model(tokens_tensor[0], segments_tensor[0])


  # in our pretrained model, third item contains hidden states form all layers
  hidden_states = outputs[2]

In [None]:
print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")

layer_i = 0
print ("Number of batches:", len(hidden_states[layer_i]))

batch_i = 0
print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))

token_i = 0
print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 6
Number of hidden units: 768


We want to group the values by tokens (i.e., parts of the words we want the embeddings for) and not by layer. Thus we stack all the layers together and extract the information across layers per token

In [None]:
# Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 1, 6, 768])

Current dimensions: [# layers, # batches, # tokens, # features] <br>
Desired dimensions: [# tokens, # layers, # features]

In [None]:
# Changing the order of dimensions by permutation
# Numbers in the permute function refer to the spot in the original list, i.e., the original index in the dimension ordering in token_embedding 
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings = token_embeddings.permute(1,0,2)

token_embeddings.size()

torch.Size([6, 13, 768])

First, let’s concatenate the last four layers, giving us a single word vector per token. Each vector will have length 4 x 768 = 3,072.

In [None]:
# stores token vectors
token_vecs_cat = []

for token in token_embeddings:

  # concatinating vectors from the last 4 model layers
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)

    token_vecs_cat.append(cat_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))


Shape is: 6 x 3072


# Putting it all together 
Creating word embedding for entire DataFrame

In [None]:
# apply BERT to text and collect all hidden states form the 12 BERT layers
token_vecs_cat = []


with torch.no_grad():
  # each tuple represents all the data for one of the 10 sentences
  for tup in list(zip(tokenized_text, tokens_tensor, segments_tensor)):

    # model gets tokens_tensor and segments_tensor as input, to create word embedding
    out = model(tup[1], tup[2])

  # in our pretrained model, third item contains hidden states form all layers
    hidden_states = out[2]


    # Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)

    # stores token vectors
    for token in token_embeddings:

      # concatinating vectors from the last 4 model layers 
      # for dim=0 means only the token embeddings?
      cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)

      # adding vector as a numpy array for visualization
      token_vecs_cat.append(cat_vec.numpy())

In [None]:
print(f"Number of all word embeddings: {len(token_vecs_cat)}")

Number of all word embeddings: 637


In [None]:
token_embeddings.size()

torch.Size([7, 13, 768])

In [None]:
token.size()

torch.Size([13, 768])

In [None]:
# getting all the tokens of all sentences into one list
tokens = []
for sentence in tokenized_text:
  tokens.extend(sentence)
print(f"Number of all tokens: {len(tokens)}")

Number of all tokens: 637


Showing the format of the output: The model gives us a single vector for each token, for which we printed only the 5 first dimensions

In [None]:
token_vecs_cat[0][:5]

array([-0.56564415,  0.1800897 , -0.0054876 ,  0.01520055, -0.17841163],
      dtype=float32)

In [None]:
# creating df form dictionary with the dictionary keys (tokens) being the rows and the values (vector dimensions) being columns
data=dict(zip(tokens, token_vecs_cat))

vocab = pd.DataFrame.from_dict(data, orient="index")
vocab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
[CLS],-0.523729,-0.023604,-0.436968,-0.150918,-0.107386,-0.078668,0.383332,0.333220,-0.557351,-0.203414,...,-0.296992,-0.440729,-0.323404,-0.092424,0.322114,-0.276464,0.594268,-0.222647,-0.234435,0.416800
hits,0.174171,0.179587,-0.443125,0.125111,-0.167968,-0.228450,0.398193,0.109855,-0.341891,-0.413290,...,-0.487683,-0.475588,0.285095,0.742009,0.552637,-0.244662,0.547457,-0.378209,-0.592485,-0.766604
low,-0.583295,0.291757,-0.145314,-0.531971,-0.337157,-0.218252,-0.428303,0.634338,0.079402,0.290770,...,1.313356,-1.173494,0.376477,-0.689564,0.450377,1.014516,-0.370163,-0.304821,-0.312838,-0.624800
bump,-0.619284,-0.092625,0.661758,0.305355,-0.072371,0.239656,-0.190960,0.055673,-0.451952,-0.601128,...,0.584293,-0.548999,0.801620,0.125676,0.558971,0.385285,-0.989848,0.431406,-0.901355,-1.037519
impact,-0.155124,0.504677,0.484727,-0.379080,-0.408341,-0.173658,0.647599,0.064324,-0.317842,-0.462156,...,0.723333,-0.388881,0.953135,-0.271467,-0.623187,-0.257033,0.124551,0.111083,0.835148,-0.781466
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
set,0.633760,-0.448184,0.606236,0.136826,-0.117163,0.538670,0.215221,0.004477,-0.265273,-0.446544,...,0.214514,0.116414,-1.224972,0.661390,-1.191625,1.110739,0.016523,-0.768467,-0.617608,-0.490599
cardiovascular,0.633049,0.419266,-0.724354,-0.330886,0.024074,0.349078,0.184914,0.185382,-0.323995,-0.279720,...,0.196417,0.333169,1.041248,0.450708,-0.531994,0.897100,-1.207147,-0.279430,-0.089463,-1.801007
exercise,1.045656,0.567426,-0.702968,-0.444921,0.170371,-0.532313,0.893823,-0.248770,-0.703019,-0.486027,...,0.356673,0.033936,0.798126,0.345628,0.689080,1.708850,-0.392543,-1.179833,-0.396895,-1.413379
jumping,0.357920,0.035868,-0.289538,0.073204,0.054859,-0.741909,0.135693,0.173989,-1.151840,0.090500,...,-0.104864,-0.067548,1.485576,-0.697580,-0.526476,0.688336,0.017144,-0.372590,0.513206,-0.690702


In [None]:
selected_vocab = vocab[vocab.index.isin(combined_df["Word"])]
selected_vocab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
bump,-0.619284,-0.092625,0.661758,0.305355,-0.072371,0.239656,-0.19096,0.055673,-0.451952,-0.601128,...,0.584293,-0.548999,0.80162,0.125676,0.558971,0.385285,-0.989848,0.431406,-0.901355,-1.037519
drop,0.132756,-0.078574,0.493739,-0.174659,0.200522,0.098005,-0.278988,0.157009,-0.383873,0.53764,...,0.447757,-0.411715,0.862398,-0.387089,0.61019,0.973861,-0.025759,-0.657093,-0.167273,-0.747788
fall,-0.693701,-0.082736,0.373599,-0.01957,0.029574,0.01814,0.405328,0.170272,0.147669,-0.103168,...,0.972817,-0.482178,1.313909,-0.351264,-0.319324,1.720897,-0.42501,-1.166955,-0.169317,-0.350666
bounce,-0.195936,0.243751,0.424547,-0.606902,0.119777,-0.563721,0.338499,0.525346,-0.366406,0.031399,...,0.673074,-0.923456,-0.378863,0.823578,-0.195569,0.109558,-0.307408,-1.030033,0.484827,-1.272684
clap,0.472439,0.065688,0.378112,-0.47491,-0.014752,-0.1647,-0.130745,-0.06928,-1.055036,0.498156,...,0.711284,-0.48558,-1.095746,0.854713,-0.534181,-0.084851,-1.420375,-1.232147,-0.074456,-0.734888
slam,0.346325,0.181858,0.245401,-0.350039,0.378883,-0.128553,0.431583,-0.007962,0.018592,-0.025222,...,1.734722,-0.772507,-0.613951,0.199534,-0.154212,0.341405,-0.078764,-0.133446,0.181042,-0.792536
step,0.249,0.087933,0.762515,-0.118922,0.205954,0.448756,0.801775,-0.299156,0.105323,-0.314606,...,1.751666,-0.19612,-0.508474,-0.405685,-1.106469,0.460434,-1.687619,-0.238548,0.823814,-1.467742
punch,-0.558978,0.183725,0.510703,-0.056801,-1.051879,-0.117136,1.034639,-0.061824,-0.39849,-0.394997,...,0.220752,-0.165842,-0.128405,0.680235,-0.120511,1.504295,-0.923547,-1.108415,0.062661,-0.82082
blast,0.089465,0.437574,0.461643,-0.016037,-0.390155,-0.285903,-0.227241,0.73349,0.00323,0.053316,...,1.019733,-0.574076,-0.274089,-1.01925,-0.125812,1.077561,-0.227078,-0.951394,-0.045399,-1.092445
explode,0.3625,0.521522,0.31104,-0.057078,0.254603,-0.538439,0.122729,-0.172447,-0.282075,0.107,...,0.364242,-0.73781,0.050291,1.062922,0.057205,-0.130714,-0.5022,-1.50788,0.564019,-1.712415


In [None]:
tuples = [tuple(combined_df[combined_df["Word"]==i][["Category", "Word"]].values[0]) for i in selected_vocab.index]
multi_index = pd.MultiIndex.from_tuples(tuples, names=["Category", "Word"])

In [None]:
selected_vocab = selected_vocab.set_index(multi_index)
selected_vocab.sort_index(inplace=True)

In [None]:
selected_vocab

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
Category,Word,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
complex_sound,accident,-0.634334,-0.166236,0.136587,-0.245089,0.583748,0.537719,-0.499238,0.591444,-0.457259,0.614421,...,0.853197,-0.783014,0.658167,0.655837,-0.307554,-0.39591,0.341497,-0.965275,-0.085516,-0.230077
complex_sound,bowl,-0.293736,-0.248055,0.481227,-0.10188,-0.008253,0.39866,-0.235137,0.595045,-0.069802,-0.446045,...,0.093816,-0.082791,-0.031476,0.140879,0.765872,0.563783,-0.51388,-0.397026,0.082734,-0.118632
complex_sound,box,0.157816,0.394526,0.487151,-0.319059,0.633125,-0.445856,-0.036684,0.246228,-0.081143,-0.227508,...,-0.406521,-0.401084,-0.785821,-0.745298,0.339088,0.249294,0.330971,-1.130047,-0.278086,0.3444
complex_sound,destruction,0.384657,0.724283,-0.238662,0.095912,-0.15326,0.08457,0.291827,-0.088914,-0.607385,-0.353503,...,0.901534,-0.687527,1.112759,-0.27863,0.464696,-0.129212,0.116072,-0.402315,-0.093726,-0.971605
complex_sound,exercise,1.045656,0.567426,-0.702968,-0.444921,0.170371,-0.532313,0.893823,-0.24877,-0.703019,-0.486027,...,0.356673,0.033936,0.798126,0.345628,0.68908,1.70885,-0.392543,-1.179833,-0.396895,-1.413379
complex_sound,fight,-0.65965,-0.24667,0.601151,0.038464,-0.59962,0.211324,0.639224,-0.142668,0.322838,-0.349722,...,1.031829,-0.445765,-0.212548,0.37003,-0.9783,1.611544,0.45247,-1.257588,-0.155871,-0.040785
complex_sound,wrestle,-0.451489,1.022307,-0.052919,-0.040613,-0.182234,0.121713,1.127993,0.28085,0.228434,-0.589067,...,1.0277,-0.618358,0.215254,0.397466,-0.632952,0.71026,-0.13765,-1.534597,-0.066654,0.370382
dog,bark,-0.10778,0.767434,0.0264,-0.327552,-0.129495,-0.001513,0.492951,0.138415,-0.383684,-0.105719,...,-0.557799,-0.140281,-1.241829,1.463568,0.065796,-0.714575,-0.885796,-0.634543,0.260312,-0.392424
dog,growl,0.326138,0.074804,0.051558,-0.858733,0.854365,0.17235,0.433626,0.585014,-0.257993,0.236949,...,0.883962,-0.139761,0.077236,0.203087,-0.257843,0.430597,0.222444,-0.420155,-0.096562,-0.797544
dog,howl,0.04427,0.521488,-0.53124,-0.802673,0.140558,-0.408496,0.839163,0.310329,-0.231589,-0.314768,...,-0.064252,-0.804048,-0.943207,0.642109,0.498901,-1.154078,0.089857,-1.084929,0.241302,-0.991394


In [None]:
from scipy.spatial.distance import pdist, squareform
from scipy.stats import zscore

# cosine similarity matrix
# 1-distance matrix with the cosine distance as a metric = similarity matrix
def create_similarity_matrix_df(df, metric):
    
    matrix = 1-squareform(pdist(zscore(df, axis=0), metric))

    matrix_df = pd.DataFrame(data=matrix, columns=multi_index, index=multi_index)
    
    # matrix_df.style.set_properties(**{'text-align': 'left'})
    return matrix_df.style.background_gradient(axis=None, cmap='RdBu') # changing color

In [None]:
create_similarity_matrix_df(selected_vocab, metric="cosine")


Unnamed: 0_level_0,Category,impact,impact,impact,impact,impact,impact,impact,impact,explosion,explosion,explosion,explosion,vocal,vocal,vocal,vocal,vocal,vocal,vocal,vocal,dog,nose,nose,locomotion,vehicle,locomotion,locomotion,locomotion,vehicle,locomotion,dog,dog,dog,dog,vehicle,vehicle,vehicle,complex_sound,complex_sound,complex_sound,complex_sound,complex_sound,complex_sound,complex_sound
Unnamed: 0_level_1,Word,bump,drop,fall,bounce,clap,slam,step,punch,blast,explode,bang,burst,talk,scream,hiss,yell,chant,conversation,murmur,shout,growl,snort,sniff,walk,rev,run,march,jump,pass,move,howl,bark,whimper,scratch,drive,accelerate,brake,fight,box,destruction,accident,wrestle,bowl,exercise
Category,Word,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2
impact,bump,1.0,0.015787,0.038242,0.075094,-0.015989,-0.039802,-0.087169,-0.004358,-0.116761,-0.069086,-0.042636,-0.002162,-0.048663,-0.081877,-0.057362,0.037463,-0.063875,0.07294,-0.10171,0.079024,-0.002641,-0.153011,0.004367,-0.083132,-0.019053,-0.097779,0.09134,-0.081326,-0.121246,0.013581,0.026512,-0.009139,0.134596,0.051477,-0.043461,0.010736,-0.178486,0.060906,-0.090758,-0.172734,-0.065377,-0.032644,-0.002708,0.01189
impact,drop,0.015787,1.0,0.290507,-0.074204,-0.05272,0.071331,-0.106949,-0.035916,0.066913,-0.151095,-0.047851,-0.128528,-0.013623,-0.083683,-0.049746,-0.0588,0.11719,-0.04837,-0.13579,-0.022908,0.059462,0.091893,0.058102,0.003937,0.073251,-0.054724,-0.190798,0.023435,0.076731,-0.13881,-0.14854,-0.167398,0.025742,0.015071,0.025808,-0.125562,-0.000487,-0.00644,-0.076334,-0.005605,0.008993,-0.080534,-0.037014,-0.115712
impact,fall,0.038242,0.290507,1.0,-0.034018,-0.033136,-0.117781,-0.063043,-0.069736,0.014431,-0.080133,-0.056559,-0.11992,-0.01078,-0.03108,-0.035233,-0.064564,-0.094814,-0.168144,-0.04988,0.039539,-0.081642,-0.000162,-0.065064,-0.049702,0.161028,-0.074758,0.033918,0.027998,-0.065999,-0.150553,-0.163507,-0.052958,0.146737,-0.030567,0.061896,-0.052142,-0.087412,-0.017565,0.086184,-0.037864,0.057247,0.003085,-0.005097,-0.074542
impact,bounce,0.075094,-0.074204,-0.034018,1.0,0.067335,-0.001552,0.061984,-0.031951,-0.0487,0.024543,0.039431,-0.032799,0.024493,-0.075511,-0.002408,0.09515,-0.146379,0.045127,-0.11047,-0.012659,0.017274,0.013095,0.050017,-0.118466,-0.147659,0.011781,0.084881,-0.126643,-0.164196,0.142625,0.040881,-0.112467,-0.042573,-0.085072,-0.042046,-0.059674,0.014749,-0.049838,0.030719,-0.196684,0.112908,-0.136755,-0.083175,0.011967
impact,clap,-0.015989,-0.05272,-0.033136,0.067335,1.0,0.104281,0.173162,-0.119356,-0.044362,-0.088052,-0.094489,-0.061256,-0.125986,-0.177905,-0.092305,-0.059336,-0.118342,-0.056605,-0.050574,-0.07827,0.062521,0.093972,-0.152467,0.067492,0.31728,0.06012,-0.018457,-0.039305,-0.054047,-0.056869,-0.032931,0.007372,0.103053,-0.026564,-0.094537,-0.02807,0.020742,-0.01264,-0.03769,-0.140744,-0.003078,-0.086512,-0.123477,-0.164956
impact,slam,-0.039802,0.071331,-0.117781,-0.001552,0.104281,1.0,0.398215,-0.061256,-0.092638,-0.067206,-0.110463,-0.098027,-0.024207,-0.116836,-0.102503,-0.107366,0.011896,-0.00075,-0.164178,-0.039798,0.461092,0.45324,-0.012825,-0.055789,-0.016082,-0.109383,-0.066829,-0.05545,0.026042,-0.087307,-0.118543,-0.093829,-0.088429,-0.014679,-0.118542,-0.106589,-0.046791,0.066584,-0.214038,-0.017687,-0.104557,-0.092742,-0.014331,-0.091539
impact,step,-0.087169,-0.106949,-0.063043,0.061984,0.173162,0.398215,1.0,-0.080995,-0.12733,-0.022547,-0.054275,-0.138021,-0.017095,-0.110507,-0.083534,0.042078,-0.076033,-0.020988,-0.034155,-0.034094,0.280263,0.302123,-0.094314,-0.148167,0.086621,-0.049453,0.053326,-0.068601,-0.069265,-0.077991,-0.078069,0.088294,-0.074082,-0.10232,-0.11421,-0.08162,-0.074767,-0.056373,-0.000984,-0.07517,-0.033486,-0.068685,-0.052303,-0.13896
impact,punch,-0.004358,-0.035916,-0.069736,-0.031951,-0.119356,-0.061256,-0.080995,1.0,0.045817,0.312388,-0.000305,0.176184,-0.084958,0.001727,-0.007728,0.056088,-0.022243,0.057109,0.003533,-0.142024,-0.126805,-0.030035,-0.143262,-0.018692,-0.212313,-0.060237,-0.127425,-0.091426,-0.038005,0.12265,-0.000488,-0.059921,0.021257,0.006596,0.000694,-0.020679,-0.125684,0.013293,-0.08709,-0.108337,-0.046641,0.034842,-0.046676,0.089824
explosion,blast,-0.116761,0.066913,0.014431,-0.0487,-0.044362,-0.092638,-0.12733,0.045817,1.0,0.220767,-0.090499,0.049769,-0.020351,0.013982,-0.094976,-0.149192,0.016475,-0.188305,0.008789,-0.148136,-0.110375,-0.084099,0.028032,-0.100286,-0.016888,-0.042358,-0.195686,-0.020256,-0.017548,0.041716,0.010085,-0.155189,-0.029512,-0.108057,-0.022556,-0.125659,0.054414,0.069703,0.123659,0.095818,0.251489,0.15153,-0.009753,0.033522
explosion,explode,-0.069086,-0.151095,-0.080133,0.024543,-0.088052,-0.067206,-0.022547,0.312388,0.220767,1.0,-0.075069,0.164261,-0.077831,-0.024817,-0.067557,0.009795,0.024949,0.009429,-0.028654,-0.264377,-0.115514,-0.085289,-0.019116,-0.182132,-0.15821,-0.045162,-0.080938,-0.132076,-0.013577,0.082526,0.112798,-0.144827,0.006988,-0.108126,-0.103419,-0.044693,0.011197,-0.003834,-0.065111,0.008418,0.115095,0.104937,-0.063459,0.161296
