In [1]:
#Importing necessary libraries
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import json
import numpy as np
import pandas as pd
from datasets import load_dataset
import torch
from torch import nn
from scipy.stats import pearsonr
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split

In [2]:
# Load your fine-tuned local model 
model_path = '/home/mendu/Thesis/data/musiccaps/new_embedding_model2'
model = SentenceTransformer(model_path)

In [3]:
# Loading the MusicCaps dataset from HuggingFace
msd_dataset = load_dataset('seungheondoh/LP-MusicCaps-MSD')

In [4]:
# Only using the trainingn set
train = pd.DataFrame(msd_dataset['train'])

In [5]:
# Converting the captions to a list of size 44865
sentences = train['caption_writing'].to_list()

In [6]:
len(sentences)

444865

## Training a PCA

In [7]:
'''By calling model.encode() we are converting the list of entences into encoded vectors'''

# Generate embeddings for your sentences using the fine-tuned model
embedded_sentences = model.encode(sentences, show_progress_bar=True, convert_to_numpy=True)

# Number of PCA components (e.g., reduce to 50 dimensions)
# num_components = 50
# obj = PCA(n_components=num_components)

# Fit the PCA model to the embedded sentences (this will find the principal components)
# pca_embeddings = obj.fit_transform(embedded_sentences)

Batches:   0%|          | 0/13903 [00:00<?, ?it/s]

In [8]:
embedded_sentences.shape #these are our roberta encodings

(444865, 768)

In [9]:
# # Project the PCA embeddings back to the original space
# projected_embeddings = obj.inverse_transform(pca_embeddings)

# # Initialize an empty list to store the Pearson correlation coefficients
# pearsons_correlations = []

# # Calculate Pearson's correlation for each pair of original and projected embeddings
# for original, projected in zip(embedded_sentences, projected_embeddings):
#     # Compute Pearson's r
#     corr, _ = pearsonr(original, projected)
#     pearsons_correlations.append(corr)

# # If you want to compute a single Pearson's correlation coefficient for all data
# # Concatenate all embeddings and compute the correlation
# flat_original = embedded_sentences.flatten()
# flat_projected = projected_embeddings.flatten()
# overall_corr, _ = pearsonr(flat_original, flat_projected)

# # print("Pearsons correlation for each embedding pair:", pearsons_correlations)
# print("Overall Pearson's correlation:", overall_corr)

In [10]:
# print("Original embeddings shape:", embedded_sentences.shape)
# print("PCA-reduced embeddings shape:", pca_embeddings.shape)

## Training an autoencoder

In [11]:
# Defining the autoencoder class and architecture
class Autoencoder(nn.Module):
    def __init__(self, input_size, encoding_size):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(True),
            nn.BatchNorm1d(512),
            nn.Linear(512, 256),
            nn.ReLU(True),
            nn.BatchNorm1d(256),  # Added batch normalization
            nn.Linear(256, 128),  # Added another layer
            nn.ReLU(True),
            nn.BatchNorm1d(128),
            nn.Linear(128, encoding_size),  # Adjusted the size of the encoding layer
            nn.Sigmoid()
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(encoding_size, 128),  # Adjusted the size of the decoding layer
            nn.ReLU(True),
            nn.Linear(128, 256),  # Added another layer
            nn.ReLU(True),
            nn.BatchNorm1d(256),  # Added batch normalization
            nn.Linear(256, 512),
            nn.ReLU(True),
            nn.Linear(512, input_size),
            nn.Sigmoid()  # Using Sigmoid because embeddings are likely normalized
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [12]:
# Convert the NumPy array of embeddings to a PyTorch tensor
embedded_sentences_tensor = torch.tensor(embedded_sentences, dtype=torch.float32)

# Create a dataset and a dataloader
dataset = TensorDataset(embedded_sentences_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, drop_last=True)

In [13]:
# Instantiate the autoencoder
input_size = embedded_sentences.shape[1]
encoding_size = 128  # change this to whatever size you want to encode down to
autoencoder = Autoencoder(input_size=input_size, encoding_size=encoding_size)

# Define loss function and optimizer
# criterion = nn.MSELoss()
criterion = nn.L1Loss()  # MAE
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)

In [14]:
# Early Stopping Class
class EarlyStopping:
    def __init__(self, patience=7, verbose=False, delta=0):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.train_loss_min = np.Inf
        self.delta = delta

    def __call__(self, train_loss, model):
        score = -train_loss
        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(train_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(train_loss, model)
            self.counter = 0

    def save_checkpoint(self, train_loss, model):
        if self.verbose:
            print(f'Training loss decreased ({self.train_loss_min:.6f} --> {train_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), '/home/mendu/Thesis/data/musiccaps/auto_encoder/saved_checkpoints128/checkpoint.pt')
        self.train_loss_min = train_loss

In [15]:
epochs = 50  # Set this to the number of epochs to train for
early_stopping = EarlyStopping(patience=10, verbose=True)

for epoch in range(epochs):
    autoencoder.train()
    train_loss = 0.0
    for data in dataloader:
        inputs = data[0]
        # Forward pass
        outputs = autoencoder(inputs)
        loss = criterion(outputs, inputs)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    train_loss = train_loss / len(dataloader)
        
    print(f'Epoch {epoch+1}, Training Loss: {train_loss:.6f}')

    early_stopping(train_loss, autoencoder)

    if early_stopping.early_stop:
        print("Early stopping")
        break


Epoch 1, Training Loss: 0.402292
Training loss decreased (inf --> 0.402292).  Saving model ...
Epoch 2, Training Loss: 0.387970
Training loss decreased (0.402292 --> 0.387970).  Saving model ...
Epoch 3, Training Loss: 0.385184
Training loss decreased (0.387970 --> 0.385184).  Saving model ...
Epoch 4, Training Loss: 0.383412
Training loss decreased (0.385184 --> 0.383412).  Saving model ...
Epoch 5, Training Loss: 0.382088
Training loss decreased (0.383412 --> 0.382088).  Saving model ...
Epoch 6, Training Loss: 0.381156
Training loss decreased (0.382088 --> 0.381156).  Saving model ...
Epoch 7, Training Loss: 0.380418
Training loss decreased (0.381156 --> 0.380418).  Saving model ...
Epoch 8, Training Loss: 0.379833
Training loss decreased (0.380418 --> 0.379833).  Saving model ...
Epoch 9, Training Loss: 0.379356
Training loss decreased (0.379833 --> 0.379356).  Saving model ...
Epoch 10, Training Loss: 0.378942
Training loss decreased (0.379356 --> 0.378942).  Saving model ...
Epoc

In [16]:
# Load the last checkpoint with the best model
autoencoder.load_state_dict(torch.load('/home/mendu/Thesis/data/musiccaps/auto_encoder/saved_checkpoints128/checkpoint.pt'))

<All keys matched successfully>

In [17]:
# Check the range of the output
with torch.no_grad():
    sample_output = autoencoder(embedded_sentences_tensor)
    output_min = sample_output.min().item()
    output_max = sample_output.max().item()

print(f"Output range of the autoencoder: [{output_min}, {output_max}]")

Output range of the autoencoder: [8.926408544998589e-38, 1.0]


In [18]:
# Save the encoder's state_dict
torch.save(autoencoder.encoder.state_dict(), '/home/mendu/Thesis/data/musiccaps/auto_encoder/encoder_state_dict128.pth')

In [19]:
# # Switch the autoencoder to evaluation mode
# autoencoder.eval()

# # Process the entire dataset to obtain the decoded (projected) embeddings
# encoded_embeddings = autoencoder.encoder(embedded_sentences_tensor).detach().numpy()
# decoded_embeddings = autoencoder.decoder(torch.from_numpy(encoded_embeddings)).detach().numpy()

In [20]:
# Save the encoder's state_dict
# torch.save(autoencoder.encoder.state_dict(), '/home/mendu/Thesis/data/musiccaps/auto_encoder/encoder_state_dict128.pth')

Change the metric, reconstruction loss

In [21]:
# print("Original embeddings shape:", embedded_sentences_tensor.shape)

# # Switch autoencoder to evaluation mode
# autoencoder.eval()

# # Process the entire dataset to obtain the encoded embeddings
# encoded_embeddings = autoencoder.encoder(embedded_sentences_tensor).detach()

# print("Entire dataset encoded embeddings shape:", encoded_embeddings.shape)

## Trying to get the word embeddings of the 8 class labels

In [22]:
# # Assuming you have your SentenceTransformer model loaded as `model`
# new_sentence = "Your new sentence."
# new_sentence_embedding = model.encode(new_sentence, convert_to_tensor=True)

# # Create a new Autoencoder instance and load the trained encoder
# autoencoder = Autoencoder(input_size=768, encoding_size=64)
# encoder_state_dict = torch.load('/home/mendu/Thesis/data/musiccaps/auto_encoder/encoder_state_dict.pth')
# autoencoder.encoder.load_state_dict(encoder_state_dict)

# # You may need to ensure the new sentence embedding is on the same device (CPU/GPU) as the model
# # e.g., if the autoencoder is on the GPU, you need to do: new_sentence_embedding = new_sentence_embedding.to('cuda')

# # Pass your new sentence embedding through the encoder
# autoencoder.eval()  # Important: set the model to evaluation mode
# with torch.no_grad():
#     new_encoded_embedding = autoencoder.encoder(new_sentence_embedding.unsqueeze(0))  # Add dummy batch dimension

# # Convert to numpy array if needed
# new_encoded_embedding = new_encoded_embedding.cpu().numpy()  # Call .cpu() if model is on GPU

# # The variable 'new_encoded_embedding' now contains the 64-dimensional vector for the new sentence

In [23]:
# import os

# os.listdir('/home/mendu/Thesis/data/musiccaps/new_embedding_model')

In [24]:
# #Chnage this, dont use the tokeniser use model


# import numpy as np
# from transformers import AutoTokenizer

# # Load the fine-tuned tokenizer
# tokenizer = AutoTokenizer.from_pretrained('/home/mendu/Thesis/data/musiccaps/new_embedding_model/')  # Update the path accordingly

# # Load the 64-dimensional word embeddings
# embeddings_reduced = np.load('/home/mendu/Thesis/data/musiccaps/auto_encoder/encoded_embeddings.npy')  # Update with the correct .npy file path

# # Function to get the embedding of a specific word
# def get_word_embedding(word):
#     # Tokenize the word to get its ID
#     token_id = tokenizer.encode(word, add_special_tokens=False)
#     if not token_id:
#         raise ValueError(f"The word '{word}' was not found in the tokenizer's vocabulary.")
#     elif len(token_id) > 1:
#         raise ValueError(f"The input text '{word}' corresponds to multiple tokens.")
#     token_id = token_id[0]  # We only expect one token ID for a single word input
    
#     # Fetch the corresponding embedding
#     word_embedding = embeddings_reduced[token_id]
#     return word_embedding

# # Example usage:
# word = "rock"  # Replace with the word you're interested in
# embedding_of_music = get_word_embedding(word)
# print(f"The embedding for the word '{word}' is:", embedding_of_music)

In [25]:
## Older autoencoder code
# import torch
# from torch import nn

# class Autoencoder(nn.Module):
#     def __init__(self, input_size, encoding_size):
#         super(Autoencoder, self).__init__()
#         # Encoder
#         self.encoder = nn.Sequential(
#             nn.Linear(input_size, encoding_size),
#             nn.ReLU(True)
#         )
#         # Decoder
#         self.decoder = nn.Sequential(
#             nn.Linear(encoding_size, input_size),
#             nn.Sigmoid()  # Using Sigmoid because embeddings are likely normalized
#         )

#     def forward(self, x):
#         encoded = self.encoder(x)
#         decoded = self.decoder(encoded)
#         return decoded
    