In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch  as th
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import pickle
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from transformers import Wav2Vec2Model
from torch.utils.data import TensorDataset, DataLoader

In [2]:
class RandomProjection:
    def __init__(self, input_dim, quantizer_dim, random_state=None):
        self.random_state = random_state
        self.input_dim = input_dim
        self.quantizer_dim = quantizer_dim
        self.projection_matrix = None
        self.codebook = None

    def initialize_quantizer(self):
        # Initialize projection matrix with Xavier initialization
        self.projection_matrix = nn.init.xavier_uniform_(th.empty(self.quantizer_dim, self.input_dim))

        # Initialize codebook with standard normal distribution
        self.codebook = th.randn(self.input_dim // 10, self.quantizer_dim)

    def project(self, X):
        th.manual_seed(self.random_state)

        self.initialize_quantizer()

        #Normalize data
        X_normalized = X / th.norm(X, dim=1, keepdim=True)

        # Project input data using projection matrix
        X_projected = th.matmul(X_normalized, self.projection_matrix.T)
        return X_projected

    def quantize(self, X):

        X_projected = self.project(X)
        expanded_codebook = codebook.repeat(X_projected.shape[0],1,1)

        # Compute distances to codebook vectors
        distances = th.norm(th.norm(X_projected) - th.norm(self.codebook), dim=2)

        # Find nearest vector index
        labels = th.argmin(distances, dim=1)

        return labels


class ASREncoder(nn.Module):
    def __init__(self, input_dim, num_classes, hidden_dim=256, num_layers=5, dropout=0.1):
        super(ASREncoder, self).__init__()

        # Feature extractor (Wav2Vec2 model)
        self.feature_extractor = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
        self.feature_extractor.freeze_feature_extractor()  # Freeze feature extractor

        # Conformer layers
        conformer_layers = []
        for _ in range(num_layers):
            conformer_layers.append(nn.TransformerEncoderLayer(
                d_model=input_dim,
                nhead=4,
                dim_feedforward=hidden_dim,
                dropout=dropout,
                activation='gelu'
            ))
        self.conformer_encoder = nn.TransformerEncoder(nn.ModuleList(conformer_layers), num_layers=num_layers)

        # Classifier head
        self.classifier = nn.Linear(input_dim, num_classes)

    def forward(self, x):
        # Extract features using the pre-trained Wav2Vec2 model
        features = self.feature_extractor(x).last_hidden_state

        # Transpose features for Transformer input format (seq_len, batch_size, hidden_dim)
        features = features.permute(1, 0, 2)

        # Apply Conformer layers
        features = self.conformer_encoder(features)

        # Average pooling over time
        pooled_features = torch.mean(features, dim=0)

        # Classification
        output = self.classifier(pooled_features)

        return output

class SelfSupervisedRandomProjectionQuantizer:
    def __init__(self, input_dim, quantizer_dim, num_classes, hidden_dim, batch_size):
        self.quantizer = RandomProjection(input_dim, quantizer_dim)
        self.asr_encoder = ASREncoder(quantizer_dim, hidden_dim, num_classes)
        self.loss_function = nn.CrossEntropyLoss()
        self.batch_size = batch_size
        self.optimizer = None

    def initialize_optimizer(self, learning_rate):
        self.optimizer = optim.Adam(self.asr_encoder.parameters(), lr=learning_rate)


    def split_batch(self, X, y):
        dataset = TensorDataset(X, y)
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
        return data_loader




    def pretrain(self, xtrain, ytrain, xvalid, yvalid, epochs=10, batch_size=32):
        # Initialize quantizer
        self.quantizer.initialize_quantizer()
        trainloader = self.splitbatch(xtrain, ytrain)
        validloader = self.splitbatch(xvalid, yvalid)

        # Iterate over epochs
        for epoch in range(epochs):
            epoch_loss = 0.0

            for inputs, labels in trainloader:

                # Zero gradients
                self.optimizer.zero_grad()

                # Forward pass
                labels = self.quantizer.quantize(inputs,)
                predictions = self.asr_encoder(labels)

                # Compute loss
                loss = self.loss_function(predictions, mini_batch_y)

                # Backward pass
                loss.backward()

                # Update weights
                self.optimizer.step()

                epoch_loss += loss.item()

            # Print epoch loss
            print(f"Pretraining Epoch {epoch+1}/{epochs}, Loss: {epoch_loss / len(X_tensor)}")


In [3]:
filepath = 'data/cassette-th-data.pck'
xtrain, xvalid, ytrain, yvalid = np.load(filepath, allow_pickle = True)
input_dim = xtrain[0].shape
input_dim[0]

600

In [4]:
tmp = RandomProjection(input_dim= 600, quantizer_dim= 100, random_state= 23)
tmp.quantize(xtrain)

NameError: name 'codebook' is not defined

In [51]:
tmp = ASREncoder(8, 4)
tmp


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ASREncoder(
  (feature_extractor): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
      )
    )
    (feature_projection): Wav2Vec2FeatureProjection(
      (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (projection): Linear(in_features=512, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (enc

In [70]:
test = SelfSupervisedRandomProjectionQuantizer(input_dim= 600, quantizer_dim = 100, hidden_dim= 10, num_classes=5, batch_size=200)

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [79]:
mini_batchs = test.split_batch(xtrain)
trainloader = DataLoader(mini_batchs, batch_size = 200, shuffle = False)


195 items not taken


In [82]:
test.split_batch(ytrain)

195 items not taken


IndexError: too many indices for tensor of dimension 1

In [102]:
dataset = TensorDataset(xtrain, ytrain)

# Define batch size
batch_size = 100  # Choose your desired batch size

# Create a DataLoader for your dataset
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)  # You can set shuffle=True to shuffle the data during training


In [103]:
for inputs, labels in data_loader:
    a = 1
inputs, labels

(tensor([[ 7.2484e+00, -2.4982e-01,  3.7492e+00,  ...,  7.7482e+00,
           9.2479e+00,  2.0245e+01],
         [-1.2451e+01, -1.2903e+01, -1.2903e+01,  ..., -7.0176e+00,
          -2.2637e-01,  5.6593e+00],
         [ 2.8723e+02,  2.1895e+02,  1.4922e+02,  ..., -7.1949e+00,
          -1.8333e+01, -5.2230e+01],
         ...,
         [ 2.5690e-01, -2.6603e+00, -3.6327e+00,  ...,  8.5223e+00,
           3.1741e+00,  2.6879e+00],
         [-7.9414e-01,  6.1758e-01,  2.9705e+00,  ..., -1.3500e+01,
          -9.7350e+00, -1.8676e+01],
         [ 9.9558e+00,  9.5160e+00,  1.6113e+01,  ...,  1.8312e+01,
           5.1179e+00, -5.9951e-01]]),
 tensor([2, 2, 0, 1, 0, 1, 0, 2, 1, 4, 1, 2, 1, 2, 0, 2, 0, 2, 2, 2, 1, 2, 2, 2,
         2, 4, 0, 0, 3, 2, 2, 4, 2, 4, 1, 4, 0, 0, 4, 2, 0, 2, 0, 2, 4, 2, 2, 1,
         2, 2, 4, 1, 2, 1, 0, 2, 2, 2, 3, 0, 2, 1, 2, 0, 3, 2, 0, 0, 0, 0, 2, 2,
         1, 0, 2, 2, 1, 0, 0, 0, 2, 1, 0, 2, 1, 0, 2, 0, 2, 0, 2, 0, 2, 4, 1],
        dtype=torch.int32))

In [38]:
proj = tmp.project(xtrain)
codebook = tmp.codebook
th.norm(xtrain[0]) - th.norm(codebook, dim = 1)


tensor([191.7634, 191.6759, 191.7279, 191.8734, 192.4562, 191.9494, 192.5126,
        193.1053, 192.0806, 191.3367, 192.6963, 192.1805, 192.6700, 191.8028,
        191.7176, 191.1822, 191.8556, 192.2178, 192.5416, 191.4962, 190.9630,
        192.2687, 191.8800, 191.9859, 192.4312, 191.9797, 193.7579, 191.1921,
        192.7931, 192.0800, 192.2573, 191.1365, 191.9663, 191.7607, 192.5780,
        192.9385, 191.2001, 191.8400, 192.1129, 192.2557, 191.3062, 192.0537,
        191.7319, 191.1817, 192.3630, 192.1825, 193.0748, 191.0130, 192.0197,
        191.8678, 190.7955, 191.7078, 192.3392, 192.4276, 193.3549, 193.8392,
        191.8078, 192.5172, 191.7083, 192.0896])

In [39]:
codebook = codebook.repeat(200, 1,1)
th.norm(xtrain[:200], dim = 1) - th.norm(codebook, dim = 2)

RuntimeError: The size of tensor a (200) must match the size of tensor b (60) at non-singleton dimension 1

In [40]:
a = th.norm(xtrain[:200], dim = 1)
b = th.norm(codebook, dim = 1)


In [41]:
th.argmin(a -  th.transpose(th.norm(codebook, dim = 2), 0, 1), dim = 0)

tensor([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50])

In [14]:
th.argmin(a - th.transpose(b, 0, 1), dim =0)

tensor([50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50,
        50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50])