In [35]:
import os, glob
import soundfile as sf
import numpy as np
import h5py
import tqdm
import IPython
import fairseq
import torch
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE

2021-11-19 16:08:23 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [20]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
fairseq.__version__

'1.0.0a0+7fd6435'

In [25]:
from fairseq.models.wav2vec.wav2vec2 import Wav2Vec2Model

In [36]:
wav2vec2_checkpoint_path = './pretrained_checkpoints/wav2vec_small_960h.pt'
checkpoint = torch.load(wav2vec2_checkpoint_path)

In [38]:
list(checkpoint['model'].keys())

['w2v_encoder.proj.weight',
 'w2v_encoder.proj.bias',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.0.0.weight',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.0.2.weight',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.0.2.bias',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.1.0.weight',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.2.0.weight',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.3.0.weight',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.4.0.weight',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.5.0.weight',
 'w2v_encoder.w2v_model.feature_extractor.conv_layers.6.0.weight',
 'w2v_encoder.w2v_model.encoder.pos_conv.0.bias',
 'w2v_encoder.w2v_model.encoder.pos_conv.0.weight_g',
 'w2v_encoder.w2v_model.encoder.pos_conv.0.weight_v',
 'w2v_encoder.w2v_model.encoder.layers.0.self_attn.k_proj.weight',
 'w2v_encoder.w2v_model.encoder.layers.0.self_attn.k_proj.bias',
 'w2v_encoder.w2v_model.encoder.layers.0.self_attn.v_pr

In [7]:
# # 0.10.2
# # Check this file: ~/miniconda/lib/python3.8/site-packages/fairseq/models/wav2vec/wav2vec2.py
# wav2vec2 = fairseq.models.wav2vec.Wav2Vec2Model.build_model(checkpoint['args'])
# wav2vec2.load_state_dict(checkpoint['model'])
# audio = torch.randn(1,10000)
# output = wav2vec2(audio)

In [27]:
# fairseq hydra-core-1.0.7 (pip install --editable ./)
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
wav2vec2 = Wav2Vec2Model.build_model(convert_namespace_to_omegaconf(checkpoint['args']).model, task=None)
wav2vec2.load_state_dict(checkpoint['model'])
wav2vec2.cuda()
wav2vec2.eval()
# audio = torch.randn(1,10000)
# features = wav2vec2(audio, features_only=True, mask=False)

Wav2Vec2Model(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
        (3): GELU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (4): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (5

In [9]:
features['x'].shape

torch.Size([1, 31, 768])

In [38]:
"""
Inference function of pretrained wav2vec2 to extract intermediate representations
Ref: https://github.com/pytorch/fairseq/blob/89ec6e7efff867d258947acafc57189b257212d0/fairseq/models/wav2vec/wav2vec2.py
"""
def wav2vec2forward(model, source, aggregation=True, hidden_layer=None):
    with torch.no_grad():
        cnn_features = model.feature_extractor(source)
    
        cnn_features = cnn_features.transpose(1, 2)
        features = model.layer_norm(cnn_features)

        if model.quantizer: # this is not None in pretrained w2v
            q = model.quantizer(features, produce_targets=False)
            quantized_features = q["x"]
            quantized_features = model.project_q(quantized_features)

        if model.post_extract_proj is not None: # this is not None in pretrained w2v
            features = model.post_extract_proj(features)

        if model.input_quantizer is not None: # this is None in pretrained w2v
            q = model.input_quantizer(features, produce_targets=False)
            features = q['x']
            features = model.project_inp(features)
            
        encoder_outputs, encoder_layers_features = model.encoder(features, padding_mask=None, layer=hidden_layer)
            
        context_vectors = model.final_proj(encoder_outputs)
        
        ret = dict()
        ret['cnn_features'] = cnn_features[0]
        ret['quantized_features'] = quantized_features[0]
        ret['encoder_outputs'] = encoder_outputs[0]
        ret['context_vectors'] = context_vectors[0]
        if len(encoder_layers_features) > 0:
            ret['encoder_hiddens'] = [h[0][0] for h in encoder_layers_features]
        
        if aggregation:
            ret['cnn_features'] = torch.mean(ret['cnn_features'], dim=0)
            ret['quantized_features'] = torch.mean(ret['quantized_features'], dim=0)
            ret['encoder_outputs'] = torch.mean(ret['encoder_outputs'], dim=0)
            ret['context_vectors'] = torch.mean(ret['context_vectors'], dim=0)
            if len(encoder_layers_features) > 0:
                ret['encoder_hiddens'] = [torch.mean(h, dim=0) for h in ret['encoder_hiddens']]
        
        return ret

In [16]:
ret = wav2vec2.extract_features(audio, padding_mask=None, layer=-1)

In [22]:
ret['layer_results'][0][0].shape

torch.Size([31, 1, 768])

In [31]:
ret = wav2vec2forward(wav2vec2, audio)

In [49]:
ret['encoder_hidden'][0][1]

tensor([[[0.0372, 0.0372, 0.0303, 0.0267, 0.0456, 0.0267, 0.0324, 0.0379,
          0.0212, 0.0233, 0.0339, 0.0322, 0.0386, 0.0427, 0.0257, 0.0272,
          0.0343, 0.0381, 0.0258, 0.0306, 0.0269, 0.0226, 0.0294, 0.0419,
          0.0312, 0.0362, 0.0423, 0.0244, 0.0369, 0.0200, 0.0328],
         [0.0310, 0.0363, 0.0340, 0.0299, 0.0324, 0.0255, 0.0277, 0.0477,
          0.0399, 0.0302, 0.0434, 0.0193, 0.0252, 0.0398, 0.0256, 0.0296,
          0.0298, 0.0433, 0.0244, 0.0333, 0.0365, 0.0193, 0.0315, 0.0399,
          0.0335, 0.0405, 0.0383, 0.0272, 0.0299, 0.0314, 0.0316],
         [0.0286, 0.0397, 0.0383, 0.0314, 0.0473, 0.0199, 0.0264, 0.0415,
          0.0272, 0.0362, 0.0421, 0.0205, 0.0353, 0.0392, 0.0300, 0.0308,
          0.0330, 0.0320, 0.0311, 0.0323, 0.0302, 0.0251, 0.0351, 0.0372,
          0.0282, 0.0298, 0.0436, 0.0238, 0.0260, 0.0242, 0.0295],
         [0.0244, 0.0387, 0.0303, 0.0323, 0.0356, 0.0309, 0.0331, 0.0450,
          0.0382, 0.0374, 0.0249, 0.0303, 0.0328, 0.0293, 0

## VTCK

In [10]:
corpus_dir = '/mnt/scratch09/rifat/InterSpeech21/datasets/VCTK-Corpus/wav16k'

In [24]:
!rm -rf ./outputs/wav2vec2_small/VTCK.h5

In [25]:
## Create a h5py file
## https://www.christopherlovell.co.uk/blog/2016/04/27/h5py-intro.html
hf = h5py.File('./outputs/wav2vec2_small/VTCK.h5', 'w')

In [23]:
hf.close()

In [13]:
list_speakers = glob.glob(f"{corpus_dir}/*")

In [19]:
hf.keys()

<KeysViewHDF5 []>

In [22]:
wav2vec2.eval()

Wav2Vec2Model(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
        (3): GELU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (4): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (5

In [None]:
for speaker_dir in tqdm.tqdm(list_speakers):
    speaker_id = os.path.basename(speaker_dir)
    
    for wav_path in glob.glob(f"{speaker_dir}/*.wav"):
        wav_id, ext = os.path.basename(wav_path).split('.')
        audio, sr = sf.read(wav_path, dtype='float32')
        audio = torch.from_numpy(audio).unsqueeze(0).cuda()
        output = wav2vec2forward(wav2vec2, audio)
        
        hf.create_dataset(f"{wav_id}-cnn_features", data=output['cnn_features'].cpu())
        hf.create_dataset(f"{wav_id}-quantized_features", data=output['quantized_features'].cpu())
        hf.create_dataset(f"{wav_id}-encoder_outputs", data=output['encoder_outputs'].cpu())
        hf.create_dataset(f"{wav_id}-context_vectors", data=output['context_vectors'].cpu())
        hf.create_dataset(f"{wav_id}-encoder_hiddens", data=torch.stack([h[0].cpu() for h in output['encoder_hidden']]))

  3%|████▊                                                                                                                                                                        | 3/109 [00:52<28:47, 16.30s/it]

In [40]:
h = torch.stack([h[0] for h in output['encoder_hidden']])

In [41]:
h.shape

torch.Size([12, 102, 1, 768])

In [13]:
hf.create_dataset(speaker_id, data=output['cnn_features'])

<HDF5 dataset "p225": shape (1, 512, 264), type "<f4">

In [124]:
outputs['p225_001'].keys()

dict_keys(['cnn_features', 'quantized_features', 'encoder_outputs', 'encoder_hidden', 'context_vectors'])

In [88]:
IPython.display.Audio(data=audio, rate=sr)

In [89]:
audio = torch.from_numpy(audio).unsqueeze(0)

In [90]:
wav2vec2forward(wav2vec2, audio)

{'x': tensor([[[ 4.3756e-01,  2.5585e-01,  2.4960e-01,  ...,  3.3016e-01,
            7.3597e-02,  3.7483e-04],
          [ 4.3118e-01,  1.6547e-01,  2.9478e-01,  ...,  4.6898e-01,
            3.2812e-04,  2.1461e-01],
          [ 4.7800e-01,  2.0000e-01,  1.4638e-01,  ...,  4.5802e-01,
            2.0314e-01,  3.3044e-01],
          ...,
          [ 4.1868e-01,  2.5780e-01,  3.5004e-01,  ...,  3.0688e-01,
            2.4999e-01, -9.4147e-02],
          [ 4.5850e-01,  3.0685e-02,  1.9284e-01,  ...,  3.0985e-01,
            3.6935e-01, -2.1539e-01],
          [ 4.1362e-01,  5.7176e-02, -1.0161e-01,  ...,  3.3481e-01,
            3.5005e-01,  5.7238e-02]]], grad_fn=<TransposeBackward0>),
 'padding_mask': None,
 'features': tensor([[[ 0.1920, -0.0000, -0.0000,  ..., -0.2362, -0.2156,  0.7165],
          [-0.2198, -0.1763, -0.0983,  ...,  0.4217,  0.1528,  0.1813],
          [-0.1373, -0.1312, -0.2700,  ..., -0.0000, -0.2006,  0.5771],
          ...,
          [ 0.2807,  0.1390, -0.2463,  

In [130]:
!ls outputs/

wav2vec2_small


In [None]:
hf.create_dataset('dataset_1', data=d1)

## TIMIT

In [None]:
## wget https://data.deepai.org/timit.zip

In [40]:
hf = h5py.File('./outputs/wav2vec2_small/TIMIT_train_averaged.h5', 'w')

FileNotFoundError: [Errno 2] Unable to create file (unable to open file: name = './outputs/wav2vec2_small/TIMIT_train_averaged.h5', errno = 2, error message = 'No such file or directory', flags = 13, o_flags = 242)

In [35]:
for wav_path in tqdm.tqdm(glob.glob("./data/data/TRAIN/*/*/*.WAV.wav")):
    tmp = wav_path.split(os.sep)
    wav_id = '_'.join(tmp[-3:])[:-4]
    audio, sr = sf.read(wav_path, dtype='float32')
    audio = torch.from_numpy(audio).unsqueeze(0).cuda()
    output = wav2vec2forward(wav2vec2, audio)

    hf.create_dataset(f"{wav_id}-cnn_features", data=output['cnn_features'].cpu())
    hf.create_dataset(f"{wav_id}-quantized_features", data=output['quantized_features'].cpu())
    hf.create_dataset(f"{wav_id}-encoder_outputs", data=output['encoder_outputs'].cpu())
    hf.create_dataset(f"{wav_id}-context_vectors", data=output['context_vectors'].cpu())
    if 'encoder_hiddens' in output:
        hf.create_dataset(f"{wav_id}-encoder_hiddens", data=torch.stack(output['encoder_hiddens'].cpu()))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4620/4620 [02:08<00:00, 36.01it/s]


In [36]:
hf.close()

In [19]:
"./data/data/TRAIN/DR2/FAEM0".split(os.sep)

['.', 'data', 'data', 'TRAIN', 'DR2', 'FAEM0']

In [16]:
IPython.display.Audio(filename='./data/data/TRAIN/DR2/MMAA0/SA1.WAV.wav')

In [10]:
IPython.display.Audio(filename=sa1s[1])

In [14]:
IPython.display.Audio(filename=sa1s[0])

In [99]:
!cat "./data/data/TEST/DR1/FAKS0/SA1.WRD"

9640 12783 she
12783 17103 had
17103 18760 your
18760 24104 dark
24104 29179 suit
29179 31880 in
31880 38568 greasy
38568 45119 wash
45624 51033 water
52378 55461 all
55461 60600 year


In [None]:
## Apply t-SNE for visualization
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(embeddings)

In [None]:
data = {'x': tsne_results[:,0],
        'y': tsne_results[:,1],
       'speaker': speakers}
  
# Create DataFrame
df = pd.DataFrame(data)

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(12, 7))
ax.set_xticks([])
ax.set_yticks([])
g = sns.scatterplot(x="x", y="y",
                  hue="speaker",
                  data=df,s=100, legend=False, ax=ax,
                   palette=sns.color_palette('bright',n_colors=40));

In [14]:
import pandas as pd
md = pd.read_csv("/mnt/scratch09/vnguyen/datasets/LibriSpeech/dev-clean/1272/128104/")

In [20]:
!ls /mnt/scratch09/vnguyen/datasets/LibriSpeech/dev-clean

1272  1919  2078  2428	3000  3576  5338  6241	6345  7976
1462  1988  2086  251	3081  3752  5536  6295	652   8297
1673  1993  2277  2803	3170  3853  5694  6313	777   84
174   2035  2412  2902	3536  422   5895  6319	7850  8842


In [26]:
import glob, os
file_idx = []
file_paths = []
speakers = []
books = []
for file_path in glob.glob("/mnt/scratch09/vnguyen/datasets/LibriSpeech/dev-clean/*/*/*.wav"):
    tmp = file_path.split(os.sep)
    file_idx.append(tmp[-1][:-4])
    file_paths.append(file_path)
    speakers.append(tmp[-3])
    books.append(tmp[-2])

In [30]:
df = pd.DataFrame(list(zip(file_idx, file_paths, speakers, books)), columns =['wav_id', 'wav_path', 'speaker', 'book'])
df.to_csv("./data/LibriSpeech_dev_clean.csv", index=False)

In [31]:
import h5py
hf = h5py.File('./outputs/extracted_features/wav2vec2_small-random_init/LibriSpeech_devclean_averaged.h5', 'r')

In [33]:
"./data/data/TEST/DR1/FAKS0/"

<HDF5 dataset "1272-128104-0000-vq": shape (256,), type "<f4">

In [None]:
"./data/data/TRAIN/DR1/FCJF0/

In [13]:
hf['DR1_FCJF0_SA1.WAV-quantized_features']

<HDF5 dataset "DR1_FCJF0_SA1.WAV-quantized_features": shape (256,), type "<f4">