In [1]:
import torch
import pickle
from src.models.components.lstm_autoencoder import LSTMAutoencoder

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Shape sanity check: push one random batch through an
# LSTM -> Linear -> Linear -> LSTM stack and print the shape after each stage.
# The widths are named once so every stage's output width is guaranteed to
# match the next stage's input width.
demo_features = 1  # values per time step
demo_hidden = 128  # encoder LSTM hidden size, also the decoder LSTM input size
demo_latent = 64  # bottleneck width between the two Linear layers
demo_layers = 3  # stacked LSTM layers on each side

lstm_enc = torch.nn.LSTM(demo_features, demo_hidden, demo_layers, batch_first=True)
fc_enc = torch.nn.Linear(demo_hidden, demo_latent)
fc_dec = torch.nn.Linear(demo_latent, demo_hidden)
lstm_dec = torch.nn.LSTM(demo_hidden, demo_features, demo_layers, batch_first=True)

# batch_first=True, so the layout is (batch, time, feature).
t = torch.randn(16, 100, demo_features)

# nn.LSTM returns (output, (h_n, c_n)). Index the output instead of unpacking
# into `_`: assigning `_` at notebook top level stops IPython from refreshing
# its own "last result" variable of the same name.
out = lstm_enc(t)[0]
print("lstm enc out shape:", out.shape)
out = fc_enc(out)
print("fc enc out shape:", out.shape)
out = fc_dec(out)
print("fc dec out shape:", out.shape)
out = lstm_dec(out)[0]
print("lstm dec out shape:", out.shape)

lstm enc out shape: torch.Size([16, 100, 128])
fc enc out shape: torch.Size([16, 100, 64])
fc dec out shape: torch.Size([16, 100, 128])
lstm dec out shape: torch.Size([16, 100, 1])


In [17]:
# Hyperparameters
input_size = 1  # Number of input features
hidden_size = 256  # Hidden units in LSTM layers
latent_size = 128  # Size of the per-step encoding
num_layers = 1  # Number of LSTM layers
dropout = 0.0  # Dropout probability

# Create the LSTM autoencoder.
# NOTE(review): arguments are passed positionally; the class lives in
# src.models.components.lstm_autoencoder and its signature is not visible
# here — confirm the order matches.
model = LSTMAutoencoder(input_size, hidden_size, latent_size, num_layers, dropout)

# Input params
bs = 8
seq_len = 100
# Tied to input_size so the random batch always has the feature width the
# model was built for (previously a second, independent literal).
n_features = input_size

# Example usage: one random batch laid out as (batch, time, feature).
time_series = torch.randn(bs, seq_len, n_features)
reconstructed_time_series, encoding = model(time_series)

# An autoencoder's reconstruction must line up with its input element-wise.
assert reconstructed_time_series.shape == time_series.shape, (
    reconstructed_time_series.shape,
    time_series.shape,
)

print("Original time-series shape:", time_series.shape)
print("Reconstructed time-series shape:", reconstructed_time_series.shape)
print("Encoding shape:", encoding.shape)

Input shape: torch.Size([8, 100, 1])
Original time-series shape: torch.Size([8, 100, 1])
Reconstructed time-series shape: torch.Size([8, 100, 1])
Encoding shape: torch.Size([8, 100, 128])


### Dataloader

In [4]:
from src.utils.old_features import (
    get_features_from_lab_root,
    F0Extractor,
    read_lab_file,
    get_features_from_lab_wav_path,
    get_features_from_lab_root,
)
import scipy.io.wavfile as wav
import numpy as np
from src.utils.prosody_tools.f0_processing import _interpolate
from src.data.components.feature_extractors import LibriTTSFeatureExtractor

[nltk_data] Downloading package punkt to /Users/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lukas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
import os

# Root of the local data checkout. Set LIBRITTS_DATA_DIR to run the notebook on
# another machine; the fallback is the original author's directory, so the four
# constants below keep their previous values when the variable is unset.
DATA_DIR = os.environ.get(
    "LIBRITTS_DATA_DIR", "/Users/lukas/Desktop/projects/MIT/data"
).rstrip("/")

# Directory roots of the debug subset (audio and label files respectively).
WAV_ROOT = f"{DATA_DIR}/LibriTTS/debug"
LAB_ROOT = f"{DATA_DIR}/LibriTTSCorpusLabel/debug"

# One utterance, addressed by the same relative stem under both roots.
SAMPLE_UTTERANCE = "237/126133/237_126133_000001_000000"
WAV_PATH = f"{WAV_ROOT}/{SAMPLE_UTTERANCE}.wav"
LAB_PATH = f"{LAB_ROOT}/{SAMPLE_UTTERANCE}.lab"

In [6]:
# F0Extractor configured with the single mode "curve".
# NOTE(review): `f0_extractor` is not referenced again in the visible cells —
# confirm it is still needed.
f0_extractor = F0Extractor(modes=["curve"])

In [7]:
# Build the LibriTTS feature extractor over the debug label and wav roots.
# Per the recorded output, constructing it runs the extraction itself (about a
# minute for 99 samples), so re-running this cell is expensive.
extractor = LibriTTSFeatureExtractor(
    lab_root=LAB_ROOT,
    wav_root=WAV_ROOT,
)

Start extracting features from /Users/lukas/Desktop/projects/MIT/data/LibriTTSCorpusLabel/debug


Readers: 100%|██████████| 1/1 [01:04<00:00, 64.06s/it]

Finished extracting 99 samples.





In [8]:
# F0 curves as returned by the extractor. The recorded length (99) equals the
# number of extracted samples; each entry is itself iterated over when the
# list is flattened in the following cell.
curves = extractor.get_all_f0_curve()
len(curves)

99

In [9]:
# Flatten one nesting level: every per-sample entry contributes its individual
# curves to a single flat list.
# The list is rebuilt from the extractor instead of from `curves` itself, so
# re-running this cell cannot flatten an already-flat list a second time.
# NOTE(review): assumes get_all_f0_curve() is cheap and returns the same data
# on every call — confirm.
curves = [
    segment
    for sample_curves in extractor.get_all_f0_curve()
    for segment in sample_curves
]
len(curves)

1683

In [22]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


class TimeSeriesDataset(Dataset):
    """Map-style dataset over a collection of numeric sequences.

    Each item is the indexed sequence converted to a ``float32`` tensor.
    When ``texts`` is supplied, items are ``(tensor, texts[index])`` pairs
    instead of bare tensors.
    """

    def __init__(self, data, texts=None):
        # Both containers are stored as given; nothing is copied or checked.
        self.data = data
        self.texts = texts

    def __len__(self):
        # The dataset size is defined by `data` alone.
        return len(self.data)

    def __getitem__(self, index):
        sample = torch.tensor(self.data[index], dtype=torch.float32)
        if self.texts is None:
            return sample
        return sample, self.texts[index]


def rnn_collate_fn(batch, pad_value=-999):
    """Pad a batch of variable-length sequences and build a validity mask.

    Args:
        batch: list of tensors whose first dimension is time, or list of
            ``(sequence, text)`` tuples as produced by a dataset that also
            carries texts.
        pad_value: value written into the padded tail of shorter sequences.

    Returns:
        ``(padded, lengths, mask)`` where ``padded`` is the batch-first padded
        tensor with an extra dimension inserted at position 2, ``lengths``
        holds each sequence's original length, and ``mask`` has the same shape
        as ``padded`` with 1.0 on real time steps and 0.0 on padding.
        When the batch items are ``(sequence, text)`` tuples, the list of
        texts is appended as a fourth element.
    """
    has_texts = bool(batch) and isinstance(batch[0], tuple)
    if has_texts:
        sequences = [item[0] for item in batch]
        texts = [item[1] for item in batch]
    else:
        sequences = list(batch)

    lengths = torch.tensor([len(seq) for seq in sequences])
    padded_sequences = pad_sequence(
        sequences, batch_first=True, padding_value=pad_value
    )

    # Build the mask from the known lengths rather than by comparing against
    # pad_value: a genuine sample that happens to equal the sentinel must
    # still count as valid.
    max_len = padded_sequences.shape[1]
    valid = torch.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)
    # Broadcast over any trailing feature dims so the mask keeps the same
    # shape as the padded batch.
    trailing_dims = padded_sequences.dim() - 2
    valid = valid.reshape(*valid.shape, *([1] * trailing_dims))
    mask = valid.to(padded_sequences.device).expand_as(padded_sequences).float()

    # unsqueeze(2) adds a dimension for the features
    collated = (padded_sequences.unsqueeze(2), lengths, mask.unsqueeze(2))
    if has_texts:
        return collated + (texts,)
    return collated

In [23]:
# Inspect the first flattened curve; the recorded output shows a 1-D float
# array of 93 values.
curves[0]

array([ 1.676,  1.728,  1.715,  1.715,  1.715,  1.29 ,  1.29 ,  1.29 ,
        1.264,  1.367,  1.367,  1.431,  1.483,  1.483,  1.509,  1.522,
        1.522,  1.612,  1.625,  1.637, -0.435, -0.409, -0.396, -0.371,
       -0.371, -0.371, -0.383, -0.383, -0.383, -0.371, -0.345, -0.345,
       -0.358, -0.358, -0.358, -0.371, -0.396, -0.409, -0.409, -0.474,
       -0.525, -0.589, -0.396, -0.396, -0.396, -0.383, -0.383, -0.383,
       -0.396, -0.409, -0.435, -0.448, -0.461, -0.474, -0.474, -0.474,
       -0.499, -0.512, -0.525, -0.538, -0.551, -0.564, -0.577, -0.602,
       -0.602, -0.602, -0.602, -0.615, -0.615, -0.615, -0.615, -0.628,
       -0.628, -0.641, -0.641, -0.641, -0.654, -0.654, -0.68 , -0.68 ,
       -0.692, -0.731, -0.744, -0.808, -0.203, -0.126, -0.139, -0.1  ,
        0.067,  0.08 ,  0.222,  0.157,  0.157])

In [24]:
# Wrap the flattened curves in the Dataset; no texts are passed, so each item
# is a bare float32 tensor rather than a (tensor, text) pair.
dataset = TimeSeriesDataset(curves)

In [25]:
# Show the first item; subscripting dispatches to TimeSeriesDataset.__getitem__.
dataset[0]

tensor([ 1.6761,  1.7276,  1.7147,  1.7147,  1.7147,  1.2899,  1.2899,  1.2899,
         1.2642,  1.3671,  1.3671,  1.4315,  1.4830,  1.4830,  1.5087,  1.5216,
         1.5216,  1.6117,  1.6246,  1.6374, -0.4350, -0.4092, -0.3964, -0.3706,
        -0.3706, -0.3706, -0.3835, -0.3835, -0.3835, -0.3706, -0.3449, -0.3449,
        -0.3577, -0.3577, -0.3577, -0.3706, -0.3964, -0.4092, -0.4092, -0.4736,
        -0.5251, -0.5894, -0.3964, -0.3964, -0.3964, -0.3835, -0.3835, -0.3835,
        -0.3964, -0.4092, -0.4350, -0.4478, -0.4607, -0.4736, -0.4736, -0.4736,
        -0.4993, -0.5122, -0.5251, -0.5380, -0.5508, -0.5637, -0.5766, -0.6023,
        -0.6023, -0.6023, -0.6023, -0.6152, -0.6152, -0.6152, -0.6152, -0.6281,
        -0.6281, -0.6409, -0.6409, -0.6409, -0.6538, -0.6538, -0.6795, -0.6795,
        -0.6924, -0.7310, -0.7439, -0.8083, -0.2033, -0.1260, -0.1389, -0.1003,
         0.0670,  0.0799,  0.2215,  0.1571,  0.1571])

In [26]:
# Sequences per batch; adjust as needed.
batch_size = 2

# Batches follow dataset order (no shuffling) and are padded by rnn_collate_fn.
dataloader = DataLoader(
    dataset,
    batch_size=batch_size,
    collate_fn=rnn_collate_fn,
    shuffle=False,
)

In [27]:
# Pull only the first batch. next(iter(...)) says so directly and fails loudly
# on an empty loader, where a for/break loop would silently print nothing.
batch = next(iter(dataloader))
padded_sequences, lengths, mask = batch

# The mask is meant to be applied element-wise to the padded batch, so the two
# shapes must agree.
assert mask.shape == padded_sequences.shape, (mask.shape, padded_sequences.shape)

print("Padded sequences shape:", padded_sequences.shape)
print("Lengths shape:", lengths.shape)
print("Mask shape:", mask.shape)

Padded sequences shape: torch.Size([2, 93, 1])
Lengths shape: torch.Size([2])
Mask shape: torch.Size([2, 93, 1])
