In [1]:
import os, glob

import fairseq
import h5py
import IPython
import numpy as np
import pandas as pd
import pytorch_lightning as pl
import seaborn as sns
import soundfile as sf
import torch
import torch.nn as nn
import torchaudio
import tqdm
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
from fairseq.modules import Fp32LayerNorm,  TransposeLast
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn import svm
from sklearn.manifold import TSNE

import utils

2021-11-24 22:28:32 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [3]:
class vec2wavModel(torch.nn.Module):
    """Transposed-convolution decoder from feature frames to a single-channel signal.

    Seven bias-free ``ConvTranspose1d`` layers are applied in sequence; their
    strides (2, 2, 2, 2, 2, 2, 5) multiply to 320. The first two layers are each
    followed by a layer norm over the channel axis, and every layer except the
    last is followed by GELU.

    Args:
        in_channel: Number of channels of the input features.
    """

    def __init__(self, in_channel=256):
        super().__init__()
        conv_t = torch.nn.ConvTranspose1d
        self.deconv0 = conv_t(in_channel, 512, kernel_size=2, stride=2, bias=False)
        self.layer_norm1 = Fp32LayerNorm(512, elementwise_affine=True)
        self.layer_norm2 = Fp32LayerNorm(256, elementwise_affine=True)
        self.deconv1 = conv_t(512, 256, kernel_size=2, stride=2, bias=False)
        self.deconv2 = conv_t(256, 128, kernel_size=3, stride=2, bias=False)
        self.deconv3 = conv_t(128, 64, kernel_size=3, stride=2, bias=False)
        self.deconv4 = conv_t(64, 32, kernel_size=3, stride=2, bias=False)
        self.deconv5 = conv_t(32, 16, kernel_size=3, stride=2, bias=False)
        self.deconv6 = conv_t(16, 1, kernel_size=10, stride=5, bias=False)

        self.activation_func = torch.nn.GELU()

    def _norm_over_channels(self, x, layer_norm):
        """Apply ``layer_norm`` with the last two axes swapped, then swap them back."""
        # The layer norms are sized by channel count, so the channel axis has to be
        # last while they run; the convolutions need it back in second position.
        x = TransposeLast()(x)
        x = layer_norm(x)
        return TransposeLast()(x)

    def forward(self, x):
        """Decode ``x`` (channels-first, as ConvTranspose1d expects) to one channel."""
        x = self.activation_func(self._norm_over_channels(self.deconv0(x), self.layer_norm1))
        x = self.activation_func(self._norm_over_channels(self.deconv1(x), self.layer_norm2))
        for deconv in (self.deconv2, self.deconv3, self.deconv4, self.deconv5):
            x = self.activation_func(deconv(x))
        # No activation after the final layer.
        return self.deconv6(x)

In [339]:
# Test-split metadata table and the HDF5 store of features extracted for it.
df = pd.read_csv(filepath_or_buffer="./data/TIMIT_test.csv")
hf = h5py.File(name="./outputs/extracted_features/wav2vec2_small_960h//TIMIT_test.h5", mode='r')

In [7]:
# 768 input channels — presumably the width of the stored encoder outputs (confirm).
vec2wav = vec2wavModel(in_channel=768)

In [53]:
# Pick one utterance: read its waveform and fetch its stored encoder output.
idx = 11
sample_row = df.iloc[idx]
x, sr = sf.read(sample_row['wav_path'], dtype='float32')
features = hf[sample_row['wav_id'] + '-' + 'encoder_output'][:]

In [117]:
# Checkpoint holding the vec2wav weights under the 'state_dict' key.
# vec2wav stays on the CPU in this notebook (it is never moved with .cuda() and its
# output is converted with .numpy() directly), so map the stored tensors to the CPU
# instead of the device they were saved from; this also lets the cell run on a
# machine without that GPU.
checkpoint = torch.load(
    "/mnt/scratch09/vnguyen/SpeakerRecognition/exp/tmp/checkpoints/epoch=44-step=6524.ckpt",
    map_location="cpu",
)

In [118]:
# strict=True is the default: every key must match, which is what the output reports.
vec2wav.load_state_dict(checkpoint['state_dict'], strict=True)

<All keys matched successfully>

In [119]:
# Transpose the stored features and add a leading batch axis of size 1 before decoding;
# [0] then drops that batch axis from the decoder output.
feats_batch = torch.tensor(features).T.unsqueeze(0)
with torch.no_grad():
    x_recon = vec2wav(feats_batch)[0]

In [120]:
# Play the original clip, then the reconstruction, both at 16 kHz.
for clip in (x, x_recon.detach().numpy()):
    IPython.display.display(IPython.display.Audio(data=clip, rate=16000))

# Mel-spectrogram reconstruction

In [2]:
class Spec2Wav(pl.LightningModule):
    """Residual MLP that projects feature vectors to ``out_feature`` values.

    Three ``Linear`` layers of width ``in_channel`` are applied in turn, each with
    a skip connection followed by ReLU; a final ``Linear`` layer maps the result
    to ``out_feature`` outputs. No training hooks are defined in this class.

    Args:
        in_channel: Size of the last axis of the input (also the hidden width).
        out_feature: Size of the last axis of the output.
    """

    def __init__(self, in_channel=256, out_feature=128):
        super().__init__()
        hidden_channel = in_channel
        # Build the layers in the same order as before so parameter names
        # (linear_layers.0/1/2, final_linear) are unchanged.
        blocks = [nn.Linear(in_features=in_channel, out_features=hidden_channel)]
        blocks.extend(
            nn.Linear(in_features=hidden_channel, out_features=hidden_channel)
            for _ in range(2)
        )
        self.linear_layers = torch.nn.ModuleList(blocks)
        self.final_linear = nn.Linear(in_features=hidden_channel, out_features=out_feature)

        self.activation_func = torch.nn.ReLU()

    def forward(self, x):
        """Run the residual stack and the output projection on ``x``."""
        for layer in self.linear_layers:
            # The skip connection works because every hidden layer keeps the input width.
            x = self.activation_func(layer(x) + x)
        return self.final_linear(x)

In [128]:
# 768-wide inputs mapped to 128 outputs per frame (used below as mel bins); lives on the GPU.
model = Spec2Wav(in_channel=768, out_feature=128).cuda()

In [129]:
# Checkpoint from the "recon_melspec" run, plus the feature store and metadata it is evaluated on.
ckpt_path = "/mnt/scratch09/vnguyen/SpeakerRecognition/exp/reconstruct_input/wav2vec2_small/recon_melspec/from_encoder_output/all_trainset_resnet/checkpoints/epoch=57-step=6901.ckpt"
checkpoint = torch.load(ckpt_path)
# NOTE: this rebinds hf; a handle opened by an earlier cell is not closed here.
hf = h5py.File(name="./outputs/extracted_features/wav2vec2_small/TIMIT_test.h5", mode='r')
df = pd.read_csv(filepath_or_buffer="./data/TIMIT_test.csv")

In [142]:
# Checkpoint from the "recon_melspec_newmel" run; same feature store and metadata as the previous cell.
ckpt_path = "/mnt/scratch09/vnguyen/SpeakerRecognition/exp/reconstruct_input/wav2vec2_small/recon_melspec_newmel/from_encoder_output/all_trainset_resnet/checkpoints/epoch=59-step=7139.ckpt"
checkpoint = torch.load(ckpt_path)
# NOTE: this rebinds hf; a handle opened by an earlier cell is not closed here.
hf = h5py.File(name="./outputs/extracted_features/wav2vec2_small/TIMIT_test.h5", mode='r')
df = pd.read_csv(filepath_or_buffer="./data/TIMIT_test.csv")

In [132]:
# strict=True is the default: every key must match, which is what the output reports.
model.load_state_dict(checkpoint['state_dict'], strict=True)

<All keys matched successfully>

In [140]:
import torchaudio
# Forward mel settings kept for reference (presumably those used for the training targets):
#   MelSpectrogram(sample_rate=16000, n_fft=1024, win_length=720, hop_length=320)

idx = 112
sample_row = df.iloc[idx]
x, sr = sf.read(sample_row['wav_path'], dtype='float32')
features = hf[sample_row['wav_id'] + '-' + 'encoder_output'][:]

# Predict a mel-spectrogram from the stored features.
with torch.no_grad():
    melspec = model(torch.tensor(features).cuda())

# Mel -> linear-frequency spectrogram (513 bins = 1024 // 2 + 1); the model output is
# transposed so the mel axis comes before the time axis.
inverse_mel = torchaudio.transforms.InverseMelScale(sample_rate=16000, n_stft=513, n_mels=128).cuda()
melspec = inverse_mel(melspec.T)
# Linear spectrogram -> waveform with Griffin-Lim phase estimation.
griffin_lim = torchaudio.transforms.GriffinLim(n_fft=1024, win_length=720, hop_length=320).cuda()
x_recon = griffin_lim(melspec)

In [125]:
# Forward mel settings kept for reference (presumably those used for the training targets):
#   MelSpectrogram(sample_rate=16000, n_fft=1024, win_length=720, hop_length=320)

idx = 112
x, sr = sf.read(df.iloc[idx]['wav_path'], dtype='float32')
features =  hf[df.iloc[idx]['wav_id'] + '-' + 'encoder_output'][:]

# Predict a mel-spectrogram from the stored features.
with torch.no_grad():
    melspec = model(torch.tensor(features).cuda())

# Take the number of mel bins from the model output instead of hard-coding it: the
# previous literal (64) did not match the 128-output model instantiated in this
# notebook, so the inverse mel transform received an input of the wrong height.
n_mels = melspec.shape[-1]
melspec = torchaudio.transforms.InverseMelScale(sample_rate=16000, n_stft=513, n_mels=n_mels).cuda()(melspec.T)
# Linear spectrogram -> waveform with Griffin-Lim phase estimation.
x_recon = torchaudio.transforms.GriffinLim(n_fft=1024, win_length=720, hop_length=320).cuda()(melspec)

In [175]:
# Round trip on the real audio (no model involved): waveform -> mel -> linear spectrogram -> waveform.
mel = utils.convert_to_mel(x, sr=16000, nfft=1024, win_length=1024, hop_length=320, power=2, n_mels=128, normalized=False)
mel_to_linear = torchaudio.transforms.InverseMelScale(sample_rate=16000, n_stft=513, n_mels=128).cuda()
melspec = mel_to_linear(mel.cuda())
linear_to_wave = torchaudio.transforms.GriffinLim(n_fft=1024, win_length=1024, hop_length=320).cuda()
x_recon = linear_to_wave(melspec)

In [176]:
# Play the original clip, then the reconstruction (moved to the CPU first), both at 16 kHz.
for clip in (x, x_recon.cpu().detach().numpy()):
    IPython.display.display(IPython.display.Audio(data=clip, rate=16000))

In [141]:
# Play the original clip, then the reconstruction (moved to the CPU first), both at 16 kHz.
for clip in (x, x_recon.cpu().detach().numpy()):
    IPython.display.display(IPython.display.Audio(data=clip, rate=16000))

In [372]:
# Play the original clip, then the reconstruction (moved to the CPU first), both at 16 kHz.
for clip in (x, x_recon.cpu().detach().numpy()):
    IPython.display.display(IPython.display.Audio(data=clip, rate=16000))

In [368]:
# Play the original clip, then the reconstruction (moved to the CPU first), both at 16 kHz.
for clip in (x, x_recon.cpu().detach().numpy()):
    IPython.display.display(IPython.display.Audio(data=clip, rate=16000))

# Complex spectrogram reconstruction

In [301]:
class Spec2Wav(pl.LightningModule):
    """Residual MLP that projects feature vectors to a flattened complex spectrum.

    Three ``Linear`` layers of width ``in_channel`` are applied in turn, each with
    a skip connection followed by ReLU; a final ``Linear`` layer maps the result
    to ``out_feature`` outputs. No training hooks are defined in this class.

    Args:
        in_channel: Size of the last axis of the input (also the hidden width).
        out_feature: Size of the last axis of the output. The default, ``513 * 2``,
            is the layout the inference cell in this notebook relies on: columns
            ``[:513]`` are read as the real part and ``[513:]`` as the imaginary
            part of a 513-bin spectrum.
    """

    def __init__(self, in_channel=256, out_feature=513 * 2):
        super().__init__()
        self.linear_layers = torch.nn.ModuleList()
        hidden_channel = in_channel
        self.linear_layers.append(nn.Linear(in_features=in_channel, out_features=hidden_channel))
        for _ in range(2):
            self.linear_layers.append(nn.Linear(in_features=hidden_channel, out_features=hidden_channel))
        # Output width is now a parameter; the default keeps the original 1026 outputs.
        self.final_linear = nn.Linear(in_features=hidden_channel, out_features=out_feature)

        self.activation_func = torch.nn.ReLU()

    def forward(self, x):
        """Run the residual stack and the output projection on ``x``."""
        for layer in self.linear_layers:
            # The skip connection works because every hidden layer keeps the input width.
            x = layer(x) + x
            x = self.activation_func(x)

        return self.final_linear(x)

In [302]:
# Spectrogram-predicting variant with 768-wide inputs, placed on the GPU.
model = Spec2Wav(in_channel=768).cuda()

In [319]:
# Load the "recon_spec" run's weights into the model; strict=True (the default) requires every key to match.
ckpt_path = "/mnt/scratch09/vnguyen/SpeakerRecognition/exp/reconstruct_input/wav2vec2_small/recon_spec/from_encoder_output/all_trainset_resnet/checkpoints/epoch=8-step=1070.ckpt"
checkpoint = torch.load(ckpt_path)
model.load_state_dict(checkpoint['state_dict'], strict=True)

<All keys matched successfully>

In [320]:
import torchaudio
# Inverse STFT configured with the same n_fft / hop / window length as the forward spectrogram cells.
spec2wav_converter = torchaudio.transforms.InverseSpectrogram(n_fft=1024, hop_length=320, win_length=720).cuda()

idx = 16
sample_row = df.iloc[idx]
x, sr = sf.read(sample_row['wav_path'], dtype='float32')
features = hf[sample_row['wav_id'] + '-' + 'encoder_output'][:]

with torch.no_grad():
    spec = model(torch.tensor(features).cuda())

# The model emits 1026 values per frame: the first 513 are the real part and the
# last 513 the imaginary part of the spectrum.
real_part, imag_part = spec[:, :513], spec[:, 513:]
spec = real_part + 1j*imag_part

# Transposed so the frequency axis comes before the time axis.
x_recon = spec2wav_converter(spec.T)

In [321]:
# Play the original clip, then the reconstruction (moved to the CPU first), both at 16 kHz.
for clip in (x, x_recon.cpu().detach().numpy()):
    IPython.display.display(IPython.display.Audio(data=clip, rate=16000))

In [158]:
# Metadata table for the training split.
df_train = pd.read_csv(filepath_or_buffer="./data/TIMIT_train.csv")

In [189]:
# Distinct sentence file names, in sorted order.
sorted(set(df['sentence']))

['SA1.WAV',
 'SA2.WAV',
 'SI1000.WAV',
 'SI1001.WAV',
 'SI1002.WAV',
 'SI1003.WAV',
 'SI1004.WAV',
 'SI1009.WAV',
 'SI1010.WAV',
 'SI1013.WAV',
 'SI1015.WAV',
 'SI1019.WAV',
 'SI1024.WAV',
 'SI1026.WAV',
 'SI1029.WAV',
 'SI1030.WAV',
 'SI1033.WAV',
 'SI1037.WAV',
 'SI1038.WAV',
 'SI1039.WAV',
 'SI1040.WAV',
 'SI1043.WAV',
 'SI1084.WAV',
 'SI1085.WAV',
 'SI1088.WAV',
 'SI1089.WAV',
 'SI1090.WAV',
 'SI1091.WAV',
 'SI1092.WAV',
 'SI1093.WAV',
 'SI1094.WAV',
 'SI1099.WAV',
 'SI1100.WAV',
 'SI1103.WAV',
 'SI1105.WAV',
 'SI1108.WAV',
 'SI1109.WAV',
 'SI1114.WAV',
 'SI1116.WAV',
 'SI1124.WAV',
 'SI1127.WAV',
 'SI1128.WAV',
 'SI1129.WAV',
 'SI1130.WAV',
 'SI1133.WAV',
 'SI1144.WAV',
 'SI1146.WAV',
 'SI1174.WAV',
 'SI1175.WAV',
 'SI1178.WAV',
 'SI1179.WAV',
 'SI1180.WAV',
 'SI1181.WAV',
 'SI1182.WAV',
 'SI1183.WAV',
 'SI1184.WAV',
 'SI1189.WAV',
 'SI1190.WAV',
 'SI1193.WAV',
 'SI1195.WAV',
 'SI1198.WAV',
 'SI1199.WAV',
 'SI1204.WAV',
 'SI1213.WAV',
 'SI1214.WAV',
 'SI1217.WAV',
 'SI1218.WAV',
 

In [187]:
# Every sentence file name (duplicates kept), in sorted order.
sorted(df['sentence'])

['SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA1.WAV',
 'SA

In [179]:
def convert_to_mel(x, sr=16000, nfft=1024, win_length=720, hop_length=320,
                   n_mels=128, power=2.0, normalized=False):
    """Compute a mel-spectrogram of a waveform with torchaudio.

    Args:
        x: Waveform samples (array-like or tensor). A leading axis of size 1 is
            added before the transform is applied.
        sr: Sample rate in Hz.
        nfft: FFT size.
        win_length: Analysis window length in samples.
        hop_length: Hop between successive frames in samples.
        n_mels: Number of mel filterbanks.
        power: Exponent applied to the magnitude spectrogram.
        normalized: Forwarded to ``MelSpectrogram(normalized=...)``.

    The last three parameters are new; their defaults equal torchaudio's own
    defaults, so calls that omit them behave as before. They are the keywords
    another cell in this notebook already passes to ``utils.convert_to_mel``.

    Returns:
        The mel-spectrogram tensor, with the added leading axis kept.
    """
    transformer = torchaudio.transforms.MelSpectrogram(sample_rate=sr,
                                                       n_fft=nfft,
                                                       win_length=win_length,
                                                       hop_length=hop_length,
                                                       n_mels=n_mels,
                                                       power=power,
                                                       normalized=normalized)
    # as_tensor accepts NumPy arrays and tensors alike, without the copy (and the
    # warning) that torch.tensor() incurs when given an existing tensor.
    x = torch.as_tensor(x).unsqueeze(0)
    melspec = transformer(x)

    return melspec

In [252]:
# Read the first utterance listed in the metadata table as float32 samples.
first_row = df.iloc[0]
x, sr = sf.read(first_row['wav_path'], dtype='float32')

In [253]:
# power=None makes the transform return the complex-valued spectrogram rather than
# a magnitude or power spectrogram.
wav2spec_converter = torchaudio.transforms.Spectrogram(
    n_fft=1024,
    win_length=720,
    hop_length=320,
    power=None,
)

In [256]:
# Preview the real part of the complex spectrogram. The original line ended in a
# dangling "." (a syntax error); ".real" is what the recorded output is consistent
# with: it is real-valued and its first row is non-zero.
wav2spec_converter(torch.tensor(x)).real

tensor([[ 0.0323,  0.0314,  0.0343,  ...,  0.0163,  0.0176,  0.0218],
        [-0.0226, -0.0224, -0.0255,  ..., -0.0111, -0.0132, -0.0155],
        [ 0.0060,  0.0063,  0.0092,  ...,  0.0015,  0.0048,  0.0040],
        ...,
        [ 0.0009,  0.0003,  0.0007,  ...,  0.0015, -0.0002, -0.0001],
        [ 0.0010,  0.0006, -0.0006,  ..., -0.0013,  0.0006,  0.0007],
        [-0.0019, -0.0010,  0.0006,  ...,  0.0012, -0.0008, -0.0010]])

In [261]:
# Complex spectrogram of the currently loaded waveform.
waveform = torch.tensor(x)
tmp = wav2spec_converter(waveform)

In [271]:
# Delete any previous spectrogram file before it is recreated (IPython shell escape).
!rm -rf ./data/TIMIT/train/spectrogram.h5

In [272]:
# Open the output store for writing; mode 'w' creates the file and truncates an existing one.
h5out = h5py.File(name="./data/TIMIT/train/spectrogram.h5", mode='w')

In [273]:
# Write one dataset per utterance, keyed by its wav_id. Each dataset stacks the
# transposed real part and the transposed imaginary part of the complex spectrogram
# along a new leading axis of size 2.
# NOTE(review): `df` is read from TIMIT_test.csv in the cells above, while the output
# file lives under data/TIMIT/train/ — confirm which split is intended here.
for i, row in df.iterrows():
    # Use the row yielded by iterrows(): `i` is an index *label*, so the previous
    # positional lookup df.iloc[i] was only correct for a default 0..n-1 index.
    x, sr = sf.read(row['wav_path'], dtype='float32')
    tmp = wav2spec_converter(torch.tensor(x))
    # Hand h5py a NumPy array explicitly instead of relying on implicit tensor conversion.
    h5out.create_dataset(row['wav_id'], data=torch.stack([tmp.real.T, tmp.imag.T]).numpy())

In [274]:
# Close the HDF5 output file once all datasets have been written.
h5out.close()

In [40]:
# (kernel_size, stride) of each convolution layer, listed from the first layer to the last.
conv_layers = [(10, 5), (3, 2), (3, 2), (3, 2), (3, 2), (2, 2), (2, 2)]
ksizes = [kernel for kernel, _stride in conv_layers]
strides = [step for _kernel, step in conv_layers]

In [41]:
# Starting from 2 positions at the output of the last layer, walk the layers from
# last to first and print how many positions each layer's input must span to
# produce them. The final value, 720, equals the win_length used for the
# spectrogram transforms in this notebook.
r = 2
for ksize, stride in zip(reversed(ksizes), reversed(strides)):
    r = stride * (r - 1) + ksize
    print(r)

4
8
17
35
71
143
720
