In [49]:
import json
from pathlib import Path

import pandas as pd
import plotly.express as px

from src.dataset import TuneDataset

# Load Embeddings

In [50]:
# Locate the embeddings file relative to this notebook. The notebook is assumed
# to be executed from its own directory, so resolving its file name against the
# working directory and going up two levels yields the project root.
# NOTE: kept in a regular variable instead of overwriting the built-in
# `__name__`, which the interpreter and libraries rely on.
notebook_path = Path('visualizing.ipynb').resolve()
embeddings_path = notebook_path.parents[1] / 'data' / 'processed' / 'embeddings.json'
# `is_file()` is already False for a missing path, so one check is enough.
assert embeddings_path.is_file(), f"embeddings file not found: {embeddings_path}"

# JSON is UTF-8 by specification; an explicit encoding keeps non-ASCII tune
# names readable regardless of the platform's default encoding.
with embeddings_path.open('r', encoding='utf-8') as file:
    embeddings = json.load(file)

embeddings

{'19. The Silly Pink Rabbit!.npy': [4.736776828765869, -61.894935607910156],
 '13. Fortune Teller.npy': [4.467158317565918, -57.90912628173828],
 '14. Soldiers of Kakariko Village.npy': [4.97233247756958,
  -64.22405242919922],
 '02. Beginning of the Journey.npy': [4.707441329956055, -60.27317810058594],
 '21. Forest of Mystery.npy': [5.128002166748047, -64.98580169677734],
 '26. Black Mist (Storm).npy': [5.312802791595459, -67.84202575683594],
 '24. Dark Golden Land.npy': [5.500285625457764, -68.65293884277344],
 '12. Guessing-Game House.npy': [5.246336936950684, -66.38778686523438],
 '03. Seal of Seven Maidens.npy': [5.482187747955322, -69.21408081054688],
 '05. Time of the Falling Rain (Storm).npy': [5.38216495513916,
  -69.05252075195312],
 '07. Majestic Castle (Storm).npy': [5.157629013061523, -66.75651550292969],
 '09. Safety in the Sanctuary.npy': [4.786135196685791, -60.95587921142578],
 '06. Majestic Castle.npy': [4.927262306213379, -63.45075607299805],
 '04. Time of the Falli

In [51]:
# One row per tune: the two embedding coordinates, followed by the file name
# the embedding was stored under (the keys of the `embeddings` mapping).
embedding_rows = list(embeddings.values())
df = pd.DataFrame(embedding_rows, columns=['emb1', 'emb2'])
df['name'] = list(embeddings)
df

Unnamed: 0,emb1,emb2,name
0,4.736777,-61.894936,19. The Silly Pink Rabbit!.npy
1,4.467158,-57.909126,13. Fortune Teller.npy
2,4.972332,-64.224052,14. Soldiers of Kakariko Village.npy
3,4.707441,-60.273178,02. Beginning of the Journey.npy
4,5.128002,-64.985802,21. Forest of Mystery.npy
5,5.312803,-67.842026,26. Black Mist (Storm).npy
6,5.500286,-68.652939,24. Dark Golden Land.npy
7,5.246337,-66.387787,12. Guessing-Game House.npy
8,5.482188,-69.214081,03. Seal of Seven Maidens.npy
9,5.382165,-69.052521,05. Time of the Falling Rain (Storm).npy


Let's create an index expressing how much the content was changed during preprocessing. That is:
```python
index = (current_length - previous_length) / previous_length
```

Thus, if previous_length = k and current_length = 2k, we have index = 1. This means we copied 100% of the original content.
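If previous_length = 2k and current_length = k, we have index = -0.5. This means we deleted 50% of the original content. (The index only reaches -1 when current_length = 0, i.e. when all of the original content was deleted.)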
If previous_length = k and current_length = k, we have index = 0. This means we didn't change the original content.


In [52]:
# Load files

# Processed tunes: stored in 'padded-soundscapes', next to the embeddings file.
processed_ds = TuneDataset().load_tunes(embeddings_path.parent / 'padded-soundscapes')
assert len(processed_ds.file_paths) == df.shape[0], (
    "number of processed tunes does not match the number of embeddings"
)
# NOTE(review): presumably populates `.tunes` / `time_length`; the meaning of
# the `False` flag is not visible from this notebook - confirm in TuneDataset.
processed_ds.extract_tune(False)

# Original tunes: stored under '<data dir>/raw/wav'.
original_ds = TuneDataset().load_tunes(
    embeddings_path.parents[1] / 'raw' / 'wav'
)
original_ds.extract_tune(False)


# Let's compute the indexes:
#   index = (processed_length - original_length) / original_length
# Originals are looked up by file name through a dict instead of rescanning
# the whole original dataset for every processed tune.
# (Assumes file names are unique within the original dataset.)
original_lengths = {
    tune.file_path.name: tune.time_length for tune in original_ds.tunes
}

indexes = {
    'name': [],
    'index': []
}
for processed_tune in processed_ds.tunes:
    tune_name = processed_tune.file_path.name
    if tune_name not in original_lengths:
        # No original counterpart: no index can be computed for this tune.
        continue
    original_length = original_lengths[tune_name]
    indexes['name'].append(tune_name)
    indexes['index'].append(
        (processed_tune.time_length - original_length) / original_length
    )
indexes_df = pd.DataFrame(indexes)

# And add to the dataset.
# - Any 'index' column left by a previous run of this cell is dropped first, so
#   re-running does not produce 'index_x' / 'index_y' columns.
# - Only a trailing '.npy' extension is swapped for '.wav'; the same text
#   elsewhere in a tune name is left alone.
# - The merge is an inner join on 'name', as before.
df = (
    df
    .drop(columns='index', errors='ignore')
    .assign(name=lambda frame: frame['name'].str.replace(r'\.npy$', '.wav', regex=True))
    .merge(indexes_df, on='name')
)

In [53]:
# Display the frame; it is expected to hold emb1, emb2, name and index here.
df

Unnamed: 0,emb1,emb2,name,index
0,4.736777,-61.894936,19. The Silly Pink Rabbit!.wav,1.159678
1,4.467158,-57.909126,13. Fortune Teller.wav,1.837358
2,4.972332,-64.224052,14. Soldiers of Kakariko Village.wav,4.663792
3,4.707441,-60.273178,02. Beginning of the Journey.wav,1.295067
4,5.128002,-64.985802,21. Forest of Mystery.wav,1.697153
5,5.312803,-67.842026,26. Black Mist (Storm).wav,0.102054
6,5.500286,-68.652939,24. Dark Golden Land.wav,-0.092502
7,5.246337,-66.387787,12. Guessing-Game House.wav,1.565757
8,5.482188,-69.214081,03. Seal of Seven Maidens.wav,0.0
9,5.382165,-69.052521,05. Time of the Falling Rain (Storm).wav,1.296823


# Scatter Plot

In [54]:
# 2-D scatter of the embeddings: one point per tune, coloured by the
# length-change index, with the tune name shown on hover.
fig = px.scatter(
    data_frame=df,
    x="emb1",
    y="emb2",
    color='index',
    hover_data=['name'],
    title="Embeddings",
)
fig.show()