<a href="https://colab.research.google.com/github/martinbremm/embedding-comparison/blob/main/BERT_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install dependencies

In [59]:
!pip install transformers



In [60]:
import torch
from transformers import BertTokenizer, BertModel

In [61]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [62]:
cd /content/drive/MyDrive/Datasets

/content/drive/MyDrive/Datasets


# Importing and preprocessing data

In [63]:
import pandas as pd

df = pd.read_csv("data_frame_processed_labeled.csv")
# filtering out missing values in descriptions column
df = df[["clean_description", "clean_fn"]].dropna()

In [64]:
# examples of semantically- and acoustically-similar sound-producing mechanisms

# acoustically similar sound-producing mechanism
acoustic_dict = {
    "impact": ["bump", "drop", "fall", "bounce", "clap", "slam", "clang", "step", "punch"],
    "explosion": ["blast", "explode", "bang", "blowout", "burst"],
    "vocal": ["talk", "scream", "hiss", "yell", "chant", "conversation", "murmur", "shout"],
    "nose": ["wheeze", "sneeze", "snort", "snore", "sniff"],
}

# semantically similar sound-producing mechanism
semantic_dict = {
    "locomotion": ["walk", "gallop", "run" , "march", "jump", "move"],
    "dog": ["howl", "bark", "growl", "whimper", "scratch", "pant"],
    "vehicle": ["rev", "drive", "pass", "accelerate", "brake"],
    "complex_sound" : ["fight", "box", "destruction", "accident", "wrestle", "bowl", "exercise"]
}

In [65]:
def create_tuples(dictionary):
    dictionary_words = [word for words in dictionary.values() for word in words]  # list of words used as index
        
    category = []
    for key, val in dictionary.items():
        category.extend([key]*len(val)) # list of category memberships

    similarity = ["acoustic_sim" if dictionary=="acoustic_sim" else "semantic_sim" for word in dictionary_words]
        
    tuples = list(zip(similarity, category, dictionary_words))
    return tuples

In [66]:
acoustic_df = pd.DataFrame(data=create_tuples(acoustic_dict), columns = ["Similarity", "Category", "Word"])
semantic_df = pd.DataFrame(data=create_tuples(semantic_dict), columns = ["Similarity", "Category", "Word"])


combined_df = pd.concat([acoustic_df, semantic_df], ignore_index = True)

In [67]:
combined_df

Unnamed: 0,Similarity,Category,Word
0,semantic_sim,impact,bump
1,semantic_sim,impact,drop
2,semantic_sim,impact,fall
3,semantic_sim,impact,bounce
4,semantic_sim,impact,clap
5,semantic_sim,impact,slam
6,semantic_sim,impact,clang
7,semantic_sim,impact,step
8,semantic_sim,impact,punch
9,semantic_sim,explosion,blast


In [68]:
# tokenizing the sentences into single words
from nltk.tokenize import word_tokenize
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [69]:
tokens = df["clean_description"].apply(word_tokenize)
content = []
for word in combined_df["Word"]:
  word_df = df[tokens.apply(lambda sentence: word in sentence)]
  word_df.insert(0, "Word", [word]*len(word_df))
  word_df = word_df.reset_index(drop=True)
  content.append(word_df)
examples1 = pd.concat(content, ignore_index = True)

In [70]:
examples1

Unnamed: 0,Word,clean_description,clean_fn
0,bump,underwater bump bridge mono drops falls hits i...,hit_concrete
1,bump,pinball machine hitting knock bump amusement p...,pinball_machine
2,bump,pinball machine hitting knock bump hard amusem...,pinball_machine
3,bump,auto crash light bump hard object car crashes,car_crash
4,bump,auto crash light bump hard object car crashes,car_crash
...,...,...,...
74523,exercise,metal drop weight plate exercise sport impacte...,metal_drop
74524,exercise,metal drop weight plate exercise sport impacte...,metal_drop
74525,exercise,metal drop weight plate exercise sport impacte...,metal_drop
74526,exercise,metal drop weight plate exercise sport impacte...,metal_drop


In [71]:
content = []
for word in combined_df["Word"]:
  word_df = df[df["clean_description"].str.contains(" " + word + " ")]
  word_df.insert(0, "Word", [word]*len(word_df))
  word_df = word_df.reset_index(drop=True)
  content.append(word_df)
examples2 = pd.concat(content, ignore_index = True)

In [72]:
content = []
for word in combined_df["Word"]:
  word_df = df[df["clean_fn"].str.contains(word)]
  word_df.insert(0, "Word", [word]*len(word_df))
  word_df = word_df.reset_index(drop=True)
  content.append(word_df)
print(f"Contains examples of {len(content)} words!")

Contains examples of 51 words!


In [73]:
examples = pd.concat(content, ignore_index = True)
#examples.to_csv("BERT_example_sentences.csv")

In [74]:
examples

Unnamed: 0,Word,clean_description,clean_fn
0,bump,wood en plank skid drag along bumpy surface,skid_wood_wooden_plank_drag_along_bumpy
1,bump,wood en plank skid drag along bumpy surface,skid_wood_wooden_plank_drag_along_bumpy
2,bump,video game electronic bump,video_game_electronic_bump
3,bump,luggage rolling bumps stop,luggage_rolling_bump
4,bump,body bumps wall various,body_bumps_wall_var
...,...,...,...
57617,exercise,exercise equipment weight lifting competition ...,exercise_equipment
57618,exercise,exercise equipment weight lifting competition ...,exercise_equipment
57619,exercise,exercise weight room training lifting bars wei...,exercise_weight_room
57620,exercise,exercise weight room training lifting bars wei...,exercise_weight_room


Creating Dataframe with example sentences obtained from out dataset (randomly chosen sentences including the words defined in the dictionaries)

In [84]:
descriptions = df["clean_description"]
content = [descriptions[descriptions.str.contains(" " + word + " ")].sample(n=1, ignore_index=True)[0] for word in combined_df["Word"]]

combined_df["example_sentence"] = content

In [86]:
combined_df.head(n=5)

Unnamed: 0,Similarity,Category,Word,example_sentence
0,semantic_sim,impact,bump,metal drum bump dull
1,semantic_sim,impact,drop,plastic cap drop bounce medium sized
2,semantic_sim,impact,fall,metal fall stairs
3,semantic_sim,impact,bounce,car vw beetle turbo driver door various open c...
4,semantic_sim,impact,clap,several speed skaters wearing clap skates skat...


# BERT model preprocessing

In [87]:
# Adding BERT tokens
marked_text = combined_df["example_sentence"].apply(lambda sen: "[CLS] " + sen + " [SEP]")
marked_text

0                      [CLS] metal drum bump dull [SEP]
1      [CLS] plastic cap drop bounce medium sized [SEP]
2                         [CLS] metal fall stairs [SEP]
3     [CLS] car vw beetle turbo driver door various ...
4     [CLS] several speed skaters wearing clap skate...
5     [CLS] telephone domestic old dial phone slam r...
6              [CLS] impact metal ic clang reverb [SEP]
7     [CLS] footstep dress shoe single step shoes ha...
8     [CLS] vehicle motorcycle motorcycles honda val...
9     [CLS] aviation jet various take series engine ...
10    [CLS] science fiction various crunch take rock...
11    [CLS] door doors screen open close stretching ...
12    [CLS] sizzle match strike ignite flame blowout...
13          [CLS] low slow ed air bag burst heavy [SEP]
14    [CLS] sport soccer game medium distant predomi...
15    [CLS] animal scream hollow resonant like lowes...
16    [CLS] science fiction machines motors pass tak...
17    [CLS] siren emergency sirens police wail y

In [88]:
## Splitting sentence into tokens

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

tokenized_text = marked_text.apply(lambda sen: tokenizer.tokenize(sen))
tokenized_text

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

0               [[CLS], metal, drum, bump, dull, [SEP]]
1     [[CLS], plastic, cap, drop, bounce, medium, si...
2                   [[CLS], metal, fall, stairs, [SEP]]
3     [[CLS], car, v, ##w, beetle, turbo, driver, do...
4     [[CLS], several, speed, skaters, wearing, clap...
5     [[CLS], telephone, domestic, old, dial, phone,...
6     [[CLS], impact, metal, ic, clan, ##g, rev, ##e...
7     [[CLS], foot, ##ste, ##p, dress, shoe, single,...
8     [[CLS], vehicle, motorcycle, motorcycles, hond...
9     [[CLS], aviation, jet, various, take, series, ...
10    [[CLS], science, fiction, various, crunch, tak...
11    [[CLS], door, doors, screen, open, close, stre...
12    [[CLS], si, ##zzle, match, strike, ign, ##ite,...
13    [[CLS], low, slow, ed, air, bag, burst, heavy,...
14    [[CLS], sport, soccer, game, medium, distant, ...
15    [[CLS], animal, scream, hollow, res, ##ona, ##...
16    [[CLS], science, fiction, machines, motors, pa...
17    [[CLS], siren, emergency, sirens, police, 

In [89]:
# Mapping token to BERT vocabulary index
indexed_tokens = tokenized_text.apply(lambda sen: tokenizer.convert_tokens_to_ids(sen))

In [90]:
# Zip words together with their indexes
token_id_tup = []
for ind, sen in enumerate(tokenized_text):
  token_id_tup.append(list(zip(sen, indexed_tokens[ind])))

In [91]:
# Zip words together with their indexes
for tup in token_id_tup[0]:
    print('{:<12} {:>6,}'.format(tup[0], tup[1]))


[CLS]           101
metal         3,384
drum          6,943
bump         16,906
dull         10,634
[SEP]           102


In [92]:
# Creating segment IDs for BERT to distinguish between sentences
segment_ids = []
for ind, sen in enumerate(tokenized_text):
  segment_ids.append([ind] * len(sen))

In [93]:
tokens_tensor = [torch.tensor([token]) for token in indexed_tokens]
tokens_tensor[0]

tensor([[  101,  3384,  6943, 16906, 10634,   102]])

In [94]:
segments_tensor = [torch.tensor([id]) for id in segment_ids]
segments_tensor[0]

tensor([[0, 0, 0, 0, 0, 0]])

In [95]:
zipped_text_token_segment = list(zip(tokenized_text, tokens_tensor, segments_tensor))

for tup in zipped_text_token_segment:
  print(f"Sentence: {tup[0]}")
  print(f"Token_tensor: {tup[1]}")
  print(f"Segment_id: {tup[2]}")
  

Sentence: ['[CLS]', 'metal', 'drum', 'bump', 'dull', '[SEP]']
Token_tensor: tensor([[  101,  3384,  6943, 16906, 10634,   102]])
Segment_id: tensor([[0, 0, 0, 0, 0, 0]])
Sentence: ['[CLS]', 'plastic', 'cap', 'drop', 'bounce', 'medium', 'sized', '[SEP]']
Token_tensor: tensor([[  101,  6081,  6178,  4530, 17523,  5396,  7451,   102]])
Segment_id: tensor([[1, 1, 1, 1, 1, 1, 1, 1]])
Sentence: ['[CLS]', 'metal', 'fall', 'stairs', '[SEP]']
Token_tensor: tensor([[ 101, 3384, 2991, 5108,  102]])
Segment_id: tensor([[2, 2, 2, 2, 2]])
Sentence: ['[CLS]', 'car', 'v', '##w', 'beetle', 'turbo', 'driver', 'door', 'various', 'open', 'close', 'occasional', 'bounce', 'back', 'liter', 'hp', 'cy', '##l', 'adaptive', 'trans', '[SEP]']
Token_tensor: tensor([[  101,  2482,  1058,  2860,  7813, 15386,  4062,  2341,  2536,  2330,
          2485,  8138, 17523,  2067, 23675,  6522, 22330,  2140, 19293,  9099,
           102]])
Segment_id: tensor([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]])

# Evaluating the BERT model

In [96]:
# suppressing the output with the magic command "capture"
%%capture

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True,
                                  )

# setting model in eval mode (i.e., feed-forward operation)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [97]:
# apply BERT to text and collect all hidden states form the 12 BERT layers

with torch.no_grad():

  # evaluating the model only on the first pair of token tensor and segments tensor
  outputs = model(tokens_tensor[0], segments_tensor[0])


  # in our pretrained model, third item contains hidden states form all layers
  hidden_states = outputs[2]

In [98]:
print ("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")

layer_i = 0
print ("Number of batches:", len(hidden_states[layer_i]))

batch_i = 0
print ("Number of tokens:", len(hidden_states[layer_i][batch_i]))

token_i = 0
print ("Number of hidden units:", len(hidden_states[layer_i][batch_i][token_i]))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1
Number of tokens: 6
Number of hidden units: 768


We want to group the values by tokens (i.e., parts of the words we want the embeddings for) and not by layer. Thus we stack all the layers together and extract the information across layers per token

In [99]:
# Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor
token_embeddings = torch.stack(hidden_states, dim=0)

token_embeddings.size()

torch.Size([13, 1, 6, 768])

Current dimensions: [# layers, # batches, # tokens, # features] <br>
Desired dimensions: [# tokens, # layers, # features]

In [100]:
# Changing the order of dimensions by permutation
# Numbers in the permute function refer to the spot in the original list, i.e., the original index in the dimension ordering in token_embedding 
token_embeddings = torch.squeeze(token_embeddings, dim=1)
token_embeddings = token_embeddings.permute(1,0,2)

token_embeddings.size()

torch.Size([6, 13, 768])

First, let’s concatenate the last four layers, giving us a single word vector per token. Each vector will have length 4 x 768 = 3,072.

In [101]:
# stores token vectors
token_vecs_cat = []

for token in token_embeddings:

  # concatinating vectors from the last 4 model layers
    cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)

    token_vecs_cat.append(cat_vec)

print ('Shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))


Shape is: 6 x 3072


# Putting it all together 
Creating word embedding for entire DataFrame

In [102]:
# apply BERT to text and collect all hidden states form the 12 BERT layers
token_vecs_cat = []


with torch.no_grad():
  # each tuple represents all the data for one of the 10 sentences
  for tup in list(zip(tokenized_text, tokens_tensor, segments_tensor)):

    # model gets tokens_tensor and segments_tensor as input, to create word embedding
    out = model(tup[1], tup[2])

  # in our pretrained model, third item contains hidden states form all layers
    hidden_states = out[2]


    # Concatenate the tensors for all layers. We use `stack` here to create a new dimension in the tensor
    token_embeddings = torch.stack(hidden_states, dim=0)
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    token_embeddings = token_embeddings.permute(1,0,2)

    # stores token vectors
    for token in token_embeddings:

      # concatinating vectors from the last 4 model layers 
      # for dim=0 means only the token embeddings?
      cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)

      # adding vector as a numpy array for visualization
      token_vecs_cat.append(cat_vec.numpy())

In [103]:
print(f"Number of all word embeddings: {len(token_vecs_cat)}")

Number of all word embeddings: 837


In [104]:
token_embeddings.size()

torch.Size([12, 13, 768])

In [105]:
token.size()

torch.Size([13, 768])

In [106]:
# getting all the tokens of all sentences into one list
tokens = []
for sentence in tokenized_text:
  tokens.extend(sentence)
print(f"Number of all tokens: {len(tokens)}")

Number of all tokens: 837


Showing the format of the output: The model gives us a single vector for each token, for which we printed only the 5 first dimensions

In [107]:
token_vecs_cat[0][:5]

array([-0.584233  ,  0.1512002 , -0.32292116, -0.04860656, -0.37233365],
      dtype=float32)

In [108]:
# creating df form dictionary with the dictionary keys (tokens) being the rows and the values (vector dimensions) being columns
data=dict(zip(tokens, token_vecs_cat))

vocab = pd.DataFrame.from_dict(data, orient="index")
vocab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
[CLS],-0.385412,-0.169683,-0.122460,-0.285790,-0.188592,-0.052672,0.460083,0.186863,-0.518355,-0.160871,...,0.029798,-0.754809,-0.671109,-0.298449,0.553228,-0.415125,0.126625,0.135120,0.064915,0.228330
metal,0.526864,0.448068,0.339596,-0.115749,-0.187839,0.422706,0.492667,-0.261456,-0.046720,-0.407582,...,0.094306,-0.477550,1.421478,-0.274952,-0.303977,-0.500042,-1.314984,-0.962641,-0.074827,-0.493122
drum,0.480667,-0.239346,-0.138328,-0.254441,0.341284,0.460979,0.003544,0.598176,-0.872558,0.216899,...,0.641722,-0.622773,-0.153900,0.267731,0.585714,0.516981,-1.826138,0.697307,-0.144111,-1.094720
bump,-0.766358,0.150438,0.796784,-0.063850,0.813698,0.093535,-0.717201,0.465621,-0.285165,-0.140569,...,0.151924,-0.056756,1.593096,-0.314698,0.866845,0.262177,-1.890318,-0.288487,-0.182730,-0.318980
dull,-0.139497,-0.309761,-0.298775,0.184986,0.245184,0.126921,-0.054579,0.484003,-0.360832,-0.068242,...,0.206621,0.075714,0.165896,-0.041510,0.903641,-0.589952,0.644796,-0.068902,-0.902343,-0.279244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
electronic,0.043955,0.181946,0.234153,0.015311,1.420227,0.616279,0.644040,0.287112,-1.041873,-0.366492,...,0.441504,-0.521269,0.118978,0.249031,0.808547,-0.055141,-1.160134,-0.173069,0.224569,-0.349597
adjustment,-0.058595,-0.005818,0.014575,-0.067759,0.739163,0.284493,0.261438,0.282190,-1.255742,-0.069211,...,0.788132,-0.601480,0.218532,-0.531921,0.745811,0.050028,-0.653203,-0.762557,0.206812,-0.221271
start,-0.240616,0.112974,0.120422,-0.465654,1.167789,0.541881,0.388298,0.345745,-0.186948,0.051988,...,0.771203,-0.964499,-0.444635,0.125779,0.491638,1.040698,-0.225602,-0.737498,0.254134,-1.238674
climb,0.024471,0.046539,-0.066035,-0.178076,0.842693,0.365531,0.715804,-0.433530,-0.211332,-0.004604,...,0.385436,-0.326130,0.347607,-0.328896,0.484009,1.199531,-1.323713,-0.391217,-0.429910,0.235619


In [109]:
selected_vocab = vocab[vocab.index.isin(combined_df["Word"])]
selected_vocab

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
bump,-0.766358,0.150438,0.796784,-0.06385,0.813698,0.093535,-0.717201,0.465621,-0.285165,-0.140569,...,0.151924,-0.056756,1.593096,-0.314698,0.866845,0.262177,-1.890318,-0.288487,-0.18273,-0.31898
drop,0.378486,0.019752,0.233416,-0.287537,0.28533,-0.400916,-0.110548,0.206543,-0.343183,0.263094,...,0.430886,-0.84011,0.212373,-0.257885,0.25114,0.898735,-0.803699,-0.771321,-0.214084,-0.699663
bounce,0.267411,-0.189824,0.530648,-0.342058,0.242042,-0.760891,0.588256,0.157697,-0.273386,0.470276,...,-0.433407,0.305226,0.038115,0.492026,0.117041,0.213291,-1.355862,-0.976634,0.258117,-1.248368
fall,0.273359,0.546011,-0.097099,-0.239508,0.53161,0.064574,0.458318,0.27006,-0.10014,-0.03743,...,0.591706,-0.411014,0.885333,-0.731136,0.224104,0.372102,-0.872668,-1.180806,-0.397377,-0.526515
clap,0.724024,-0.546845,0.809024,-0.145585,0.463678,0.059112,-0.124074,0.556058,-0.791176,0.191562,...,0.528075,-0.391435,-0.63809,0.159265,0.887274,0.19844,-0.58836,0.144224,-0.079607,-0.97886
slam,0.862601,0.140036,0.585218,0.057647,0.243517,0.407262,0.092975,0.216761,-0.291751,-0.135581,...,1.281948,-0.580921,0.076436,0.297564,-0.302769,0.735943,0.020178,0.145076,0.509913,-1.625615
rev,0.354782,0.010428,0.660586,0.233403,0.211054,-0.327274,0.103515,0.096231,-0.059674,0.249406,...,0.738442,-0.392206,-1.60001,0.175493,0.517157,0.588347,-0.446151,-1.721137,-0.13216,-0.945002
step,0.231193,-1.113249,0.903277,0.11776,0.438051,-0.028522,-0.197499,-0.071361,0.192139,-0.470036,...,1.154288,-0.407733,-0.218205,-0.158187,0.286432,0.134104,-1.272135,-1.155807,0.275304,-1.45161
punch,-0.35743,0.348607,0.196897,-0.020377,-0.324706,-0.107266,0.274669,0.059066,0.236337,-0.55654,...,-0.136593,-0.089502,-0.951255,-0.571707,0.491681,0.790147,0.318136,-0.598648,0.766209,-0.546699
blast,0.275185,0.692628,0.199813,-0.300574,-0.14149,-0.268455,0.152974,0.471435,-0.14405,0.188913,...,0.274444,-0.607903,-0.789978,-0.318114,0.085406,-0.251474,-0.730113,-1.106835,-1.080569,-0.477772


In [110]:
tuples = [tuple(combined_df[combined_df["Word"]==i][["Category", "Word"]].values[0]) for i in selected_vocab.index]
multi_index = pd.MultiIndex.from_tuples(tuples, names=["Category", "Word"])

In [111]:
selected_vocab = selected_vocab.set_index(multi_index)
selected_vocab.sort_index(inplace=True)

In [112]:
selected_vocab

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
Category,Word,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
complex_sound,accident,-0.071864,-0.43098,0.565145,-0.394851,0.830021,0.248474,-0.261721,1.118823,-0.413547,0.568947,...,0.88644,-0.656647,1.155925,0.357796,-0.207467,0.010212,-0.524442,-0.774472,0.388681,-0.782016
complex_sound,bowl,0.007823,0.084034,-0.283718,-0.260682,-0.047279,0.364171,-0.124721,0.093506,-0.155283,0.133953,...,-0.421039,0.051822,-0.118683,-0.372926,0.523111,0.2231,-0.520723,0.13545,0.167771,-0.284786
complex_sound,box,0.309398,0.243736,0.37198,-0.47857,-0.042897,-0.038703,0.087477,-0.148705,0.242344,0.636851,...,-0.080729,-0.296453,0.074212,-0.582956,1.05864,0.617334,-0.44714,0.09277,-0.487773,0.426127
complex_sound,destruction,0.425721,0.617503,-0.185585,0.166921,-0.005523,0.061782,0.21802,-0.030943,-0.381028,-0.4631,...,0.969082,-0.693655,0.931601,-0.187135,0.498744,-0.003393,0.167953,-0.355956,-0.129725,-0.939647
complex_sound,exercise,0.518139,-0.042294,-0.14133,0.037188,0.567031,-0.242316,0.781135,-0.221104,-0.632925,-0.117545,...,0.419904,0.067802,0.364691,0.23909,0.305038,0.68069,-0.35975,-0.523707,-0.502155,-1.084766
complex_sound,fight,-0.181834,0.196767,-0.048378,0.199709,-0.129111,-0.187882,0.601242,-0.230929,0.510867,-0.485927,...,0.255946,-1.189746,-0.581307,0.57612,-0.135239,0.716419,0.016403,-0.990139,-0.932442,0.239492
complex_sound,wrestle,-0.451489,1.022307,-0.052919,-0.040613,-0.182234,0.121713,1.127993,0.28085,0.228434,-0.589067,...,1.0277,-0.618358,0.215254,0.397466,-0.632952,0.71026,-0.13765,-1.534597,-0.066654,0.370382
dog,bark,0.009542,0.60197,-0.229793,-0.277801,-0.281056,-0.07729,-0.179243,0.354741,-0.538877,0.179805,...,-0.566642,-0.34342,-1.171387,1.324257,-1.037255,-0.568469,-0.507639,-1.415266,-0.340378,-0.522451
dog,growl,0.665008,0.31646,0.985501,-0.824503,0.330797,0.339858,0.922228,-0.203599,-0.014278,0.118476,...,1.208879,-0.824763,-0.002563,-0.401344,-0.70278,-0.305863,-0.558397,-0.740571,0.725616,-0.607732
dog,howl,0.326336,0.1901,-0.145694,-0.267369,0.343547,0.026875,0.788314,0.397109,-0.321884,-0.154574,...,0.527211,-0.315388,-0.897168,-0.323397,-0.196635,0.910172,0.014271,-0.368028,0.294423,-1.421019


In [117]:
from scipy.spatial.distance import pdist, squareform
from scipy.stats import zscore

# cosine similarity matrix
# 1-distance matrix with the cosine distance as a metric = similarity matrix
def create_similarity_matrix_df(df, metric):
    
    matrix = 1-squareform(pdist(zscore(df, axis=0), metric))

    matrix_df = pd.DataFrame(data=matrix, columns=multi_index, index=multi_index)
    matrix_df.sort_index(axis=0, inplace=True)
    matrix_df.sort_index(axis=1, inplace=True)

    # matrix_df.style.set_properties(**{'text-align': 'left'})
    return matrix_df.style.background_gradient(axis=None, cmap='RdBu') # changing color

In [118]:
create_similarity_matrix_df(selected_vocab, metric="cosine")

Unnamed: 0_level_0,Category,complex_sound,complex_sound,complex_sound,complex_sound,complex_sound,complex_sound,complex_sound,dog,dog,dog,dog,dog,explosion,explosion,explosion,explosion,impact,impact,impact,impact,impact,impact,impact,impact,locomotion,locomotion,locomotion,locomotion,locomotion,nose,nose,vehicle,vehicle,vehicle,vehicle,vehicle,vocal,vocal,vocal,vocal,vocal,vocal,vocal,vocal
Unnamed: 0_level_1,Word,accident,bowl,box,destruction,exercise,fight,wrestle,bark,growl,howl,scratch,whimper,bang,blast,burst,explode,bounce,bump,clap,drop,fall,punch,slam,step,jump,march,move,run,walk,sniff,snort,accelerate,brake,drive,pass,rev,chant,conversation,hiss,murmur,scream,shout,talk,yell
Category,Word,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2
complex_sound,accident,1.0,0.412301,0.309242,0.261004,0.06655,-0.107813,0.194933,-0.088256,-0.065813,-0.100634,-0.023618,-0.063687,0.040709,0.000265,0.128839,0.133548,-0.068914,-0.046283,-0.110121,-0.125535,-0.143164,0.000488,-0.110227,0.059932,-0.022222,-0.040948,-0.150532,-0.015605,-0.138801,-0.02707,-0.115528,-0.063199,0.045257,-0.022296,-0.17032,-0.06937,-0.113706,-0.154113,-0.139249,-0.071076,-0.00763,-0.08794,-0.05845,0.065725
complex_sound,bowl,0.412301,1.0,0.237196,0.308672,-0.09749,-0.19319,0.141461,-0.043812,-0.062679,-0.108729,0.008735,-0.073345,0.221959,0.001673,0.192659,0.050707,-0.036116,0.083176,-0.119949,-0.198286,-0.181356,0.151894,-0.118919,-0.035212,-0.083147,0.067166,-0.112321,0.00497,-0.199989,-0.05975,-0.206867,-0.156449,-0.076104,-0.0066,-0.171975,0.004794,-0.104196,-0.017401,-0.162394,-0.100647,0.091011,-0.024839,0.036684,0.032774
complex_sound,box,0.309242,0.237196,1.0,0.218649,-0.028216,-0.025015,0.147907,0.019699,-0.025894,-0.158108,0.043404,-0.074429,0.090238,-0.107745,0.08681,0.075526,-0.093701,0.005922,-0.107778,-0.138834,-0.139841,0.014026,-0.116857,-7.5e-05,0.004739,0.130068,-0.063665,-0.080243,-0.086975,-0.018142,-0.138135,-0.076327,0.150793,-0.02751,-0.101912,-0.033382,-0.057113,-0.043016,-0.147712,-0.129385,-0.02827,-0.169495,0.006715,-0.120418
complex_sound,destruction,0.261004,0.308672,0.218649,1.0,-0.040694,0.046405,0.047434,-0.069926,-0.041458,-0.126421,-0.034558,-0.010856,0.061377,0.061768,0.218842,0.176703,-0.034715,-0.021544,-0.238942,-0.062776,0.075492,0.23489,-0.156176,-0.038154,0.130499,-0.00947,-0.162004,-0.127131,-0.189727,-0.016489,-0.20932,-0.106872,-0.096669,0.023466,-0.101002,-0.088718,-0.048436,-0.103204,-0.135178,-0.087114,0.02581,-0.11733,0.078761,-0.104975
complex_sound,exercise,0.06655,-0.09749,-0.028216,-0.040694,1.0,0.011468,-0.117285,-0.141733,-0.097434,0.014777,-0.101133,-0.110372,0.043766,-0.042463,-0.00566,0.230354,-0.074555,-0.076976,-0.038279,-0.051288,-0.083932,0.04012,-0.14889,0.078701,0.26574,-0.189498,-0.039612,0.279642,-0.134568,-0.125558,-0.119896,-0.102072,-0.093783,-0.010585,0.101771,-0.070321,-0.146159,-0.103384,0.132786,-0.112249,0.00475,-0.048849,-0.025519,0.071046
complex_sound,fight,-0.107813,-0.19319,-0.025015,0.046405,0.011468,1.0,-0.0056,-0.059406,-0.063881,-0.072493,-0.007241,0.089495,-0.184225,0.042838,0.084147,-0.020895,0.033819,-0.176757,-0.128314,0.074081,0.233849,-0.029891,0.145007,-0.030717,0.040719,-0.145945,0.07836,-0.11331,-0.06605,-0.056892,-0.134433,0.032277,0.053756,0.043544,-0.098848,-0.035173,-0.08254,-0.045198,-0.035265,0.23583,-0.006336,-0.025676,-0.076829,-0.098939
complex_sound,wrestle,0.194933,0.141461,0.147907,0.047434,-0.117285,-0.0056,1.0,0.070393,0.028781,-0.140013,-0.01452,-0.080964,-0.028019,-0.07083,-0.009053,0.026584,0.024889,-0.057179,0.120738,-0.033418,-0.066645,-0.045204,-0.021662,-0.005241,-0.154928,0.100286,-0.096901,-0.170539,-0.055298,0.121826,-0.004763,-0.084893,0.126842,0.088565,-0.108587,-0.052608,-0.088757,-0.028,-0.116245,-0.097229,-0.058205,-0.075923,-0.131729,-0.058361
dog,bark,-0.088256,-0.043812,0.019699,-0.069926,-0.141733,-0.059406,0.070393,1.0,0.023475,0.061744,0.103472,0.19803,-0.146715,0.102493,-0.190428,-0.110808,0.027847,0.159588,-0.003129,0.011796,-0.076876,0.040631,-0.181308,-0.104025,-0.0534,0.176362,0.147713,-0.081029,-0.041873,0.045616,-0.000803,0.033753,-0.056033,-0.077357,-0.060253,-0.114191,-0.070016,-0.075843,-0.10817,-0.078074,-0.059156,-0.01901,-0.023624,-0.116363
dog,growl,-0.065813,-0.062679,-0.025894,-0.041458,-0.097434,-0.063881,0.028781,0.023475,1.0,0.003237,0.01892,-0.09788,-0.097137,-0.012435,-0.073238,-0.186572,-0.038691,0.093515,0.063094,-0.076429,-0.0947,-0.059675,-0.1014,-0.083402,-0.079145,0.074217,-0.008906,-0.120623,0.190387,0.078777,0.175339,0.044319,-0.060623,-0.111763,-0.102625,-0.004217,0.017396,0.139386,-0.067406,-0.106309,-0.061292,-0.09549,-0.104397,-0.110776
dog,howl,-0.100634,-0.108729,-0.158108,-0.126421,0.014777,-0.072493,-0.140013,0.061744,0.003237,1.0,0.170766,0.076869,-0.073036,-0.098866,-0.069061,-0.064983,0.095352,0.099035,0.072357,0.026796,-0.018936,-0.030608,-0.04936,-0.026621,-0.043834,-0.081365,0.210086,-0.001651,-0.049458,-0.08418,-0.017156,0.018208,-0.038897,-0.07957,-0.030789,-0.04403,-0.072228,-0.149452,-0.054257,-0.07186,-0.029476,-0.049057,-0.036845,-0.020878
