In [34]:
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch
import librosa
import os
from tqdm import tqdm
import pandas as pd
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import umap
import plotly.express as px
import numpy as np
import openl3
import soundfile as sf

In [43]:
# Pick the compute device once: the GPU if PyTorch can see one, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [2]:
# Load the pretrained wav2vec2 base checkpoint once. The processor turns raw
# waveforms into model inputs; the model produces the hidden states that are
# pooled into embeddings further down.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)



In [47]:
# os.listdir returns directory entries in arbitrary order. Sort them so the file
# list, and every embedding/label list derived from it, is identical across runs.
files = sorted(os.listdir('data/audio'))

In [8]:
# Extract one fixed-size wav2vec2 embedding per audio file by averaging the
# last hidden layer over time.
model = model.to(device)  # use the GPU when one is available (`device` is set in the setup cell)
model.eval()              # inference only
embeddings = []
for file in tqdm(files):
    # Load and resample to 16 kHz; the same rate is passed to the processor below.
    audio, sr = librosa.load(os.path.join('data/audio', file), sr=16000)
    inputs = processor(audio, sampling_rate=sr, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        # Average over dim 1 (time frames), then drop the batch axis of size one.
        embedding = outputs.last_hidden_state.mean(dim=1)[0]
    # Keep plain CPU NumPy vectors so np.array(embeddings) can stack them
    # directly and no GPU memory is held by the list.
    embeddings.append(embedding.cpu().numpy())

100%|██████████| 7442/7442 [20:22<00:00,  6.09it/s]


In [None]:
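# Disabled: the extraction loop above already stores one 1-D vector per file,
# so no further unwrapping of the batch axis is needed.
# embeddings = [embedding[0] for embedding in embeddings]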

In [9]:
# One label per file: the third underscore-separated field of the file name,
# used below as the emotion label for colouring the plots.
emos = [name.split('_')[2] for name in files]

In [50]:
def plot_embeddings(embeddings, color_by_data, files, algorithm='tsne', color_by='Emocje'):
    """Project embeddings to 2-D and show them as an interactive scatter plot.

    Parameters
    ----------
    embeddings : sequence of equal-length 1-D vectors
        Anything ``np.array`` can stack into a 2-D array (one row per item).
    color_by_data : sequence
        One label per embedding; determines the point colour.
    files : sequence
        One name per embedding; shown when hovering over a point.
    algorithm : {'tsne', 'pca', 'umap'}
        Dimensionality-reduction method used for the projection.
    color_by : str
        Legend title for the colour labels; also shown in the figure title.

    Raises
    ------
    ValueError
        If ``algorithm`` is not supported, or if ``embeddings``,
        ``color_by_data`` and ``files`` do not all have the same length.
    """
    algorithm_names = {'tsne': 't-SNE', 'pca': 'PCA', 'umap': 'UMAP'}

    # Validate cheaply up front, before the (potentially slow) projection runs.
    if algorithm not in algorithm_names:
        raise ValueError(
            f"Unknown algorithm {algorithm!r}; expected one of {sorted(algorithm_names)}"
        )
    n_points = len(embeddings)
    if len(color_by_data) != n_points or len(files) != n_points:
        raise ValueError(
            'embeddings, color_by_data and files must have the same length, '
            f'got {n_points}, {len(color_by_data)} and {len(files)}'
        )

    if algorithm == 'tsne':
        reducer = TSNE(n_components=2, random_state=42)
    elif algorithm == 'pca':
        reducer = PCA(n_components=2)
    else:
        reducer = umap.UMAP(n_components=2, random_state=42)
    results = reducer.fit_transform(np.array(embeddings))

    xy_df = pd.DataFrame(results, columns=['x', 'y'])
    xy_df['emo'] = color_by_data
    xy_df['file'] = files

    # The title names the method and colouring actually used, not a fixed "t-SNE".
    fig = px.scatter(
        xy_df, x='x', y='y', color='emo',
        title=f'Wizualizacja osadzeń przy użyciu {algorithm_names[algorithm]} ({color_by})',
        labels={'emo': color_by},
        hover_name='file',
        color_discrete_sequence=px.colors.qualitative.Vivid,
    )

    fig.update_layout(
        width=1600,
        height=1000
    )

    fig.update_traces(marker=dict(size=10),
                      selector=dict(mode='markers'))

    fig.show()

In [20]:
# t-SNE projection of the wav2vec2 embeddings, coloured by emotion label.
plot_embeddings(embeddings, emos, files, algorithm='tsne')

In [25]:
# PCA projection of the same embeddings, coloured by emotion label.
plot_embeddings(embeddings, emos, files, algorithm='pca')

In [26]:
# UMAP projection of the same embeddings, coloured by emotion label.
plot_embeddings(embeddings, emos, files, algorithm='umap')


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



# Speaker embeddings

In [48]:
def get_embeddings(model, file_list=None, audio_dir='data/audio'):
    """Run ``model`` on every audio file and collect the returned embeddings.

    Parameters
    ----------
    model : callable
        Called with a single file path; whatever it returns is stored as that
        file's embedding.
    file_list : sequence of str, optional
        File names to process. Defaults to the notebook-level ``files`` list.
    audio_dir : str
        Directory the file names are relative to.

    Returns
    -------
    list
        Embeddings of the files that were processed successfully, in input
        order. Files whose processing raised are reported and skipped, so the
        result can be shorter than ``file_list``; a warning is emitted when
        that happens because per-file labels would no longer line up.
    """
    import warnings

    if file_list is None:
        file_list = files  # fall back to the notebook-level directory listing

    embeddings = []
    failed = []
    for file in tqdm(file_list):
        try:
            embedding = model(f'{audio_dir}/{file}')
            embeddings.append(embedding)
        except Exception as e:
            # Best effort: report the failing file and carry on with the rest.
            failed.append(file)
            print(f'{file}: {e}')

    if failed:
        warnings.warn(
            f'{len(failed)} of {len(file_list)} files failed and were skipped; '
            'the returned list is not aligned with the input file list'
        )
    return embeddings

In [45]:
from pyannote.audio import Model, Inference

# Never commit access tokens to a notebook: read the Hugging Face token from the
# HF_TOKEN environment variable instead.
# NOTE(review): the token that used to be hard-coded here has been exposed in the
# notebook and should be revoked.
# If HF_TOKEN is unset, None is passed through; presumably the Hub client then
# falls back to locally stored credentials — confirm before relying on it.
hf_token = os.environ.get("HF_TOKEN")

speaker_embedding_model = Model.from_pretrained("pyannote/embedding", use_auth_token=hf_token)
# Wrap the model for inference on `device`; window="whole" is passed so the whole
# file is processed as a single window.
speaker_model = Inference(speaker_embedding_model, window="whole", device=device)

c:\Studia\.conda\Lib\site-packages\pytorch_lightning\utilities\migration\migration.py:208: You have multiple `ModelCheckpoint` callback states in this checkpoint, but we found state keys that would end up colliding with each other after an upgrade, which means we can't differentiate which of your checkpoint callbacks needs which states. At least one of your `ModelCheckpoint` callbacks will not be able to reload the state.
Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\wikto\.cache\torch\pyannote\models--pyannote--embedding\snapshots\4db4899737a38b2d618bbd74350915aa10293cb2\pytorch_model.bin`


Lightning automatically upgraded your loaded checkpoint from v1.2.7 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\wikto\.cache\torch\pyannote\models--pyannote--embe

Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.
Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.8.1+cu102, yours is 2.5.1+cu124. Bad things might happen unless you revert torch to 1.x.


In [49]:
# Run the pyannote inference wrapper on every file to get speaker embeddings.
speaker_embeddings = get_embeddings(model=speaker_model)

100%|██████████| 7442/7442 [01:18<00:00, 95.22it/s] 


In [51]:
# One label per file: the first underscore-separated field of the file name,
# used below as the speaker label for colouring the plot.
speakers = [name.split('_')[0] for name in files]

In [52]:
# Default (t-SNE) projection of the speaker embeddings, coloured by speaker label.
plot_embeddings(
    speaker_embeddings,
    speakers,
    files,
    color_by='Mówca',
)