In [1]:
#Importing necessary libraries
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import json
import numpy as np
import pandas as pd
from datasets import load_dataset
import torch
from torch import nn
from scipy.stats import pearsonr
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split

In [2]:
# Load your fine-tuned local model 
model_path = '/home/mendu/Thesis/data/musiccaps/new_embedding_model'
model = SentenceTransformer(model_path)

In [3]:
# Loading the MusicCaps dataset from HuggingFace
msd_dataset = load_dataset('seungheondoh/LP-MusicCaps-MSD')

In [4]:
# Only using the trainingn set
train = pd.DataFrame(msd_dataset['train'])
# test = pd.DataFrame(msd_dataset['test'])
# valid = pd.DataFrame(msd_dataset['valid'])

In [5]:
# Converting the captions to a list of size 44865
sentences = train['caption_writing'].to_list()

## Training a PCA

In [6]:
# Generate embeddings for your sentences using the fine-tuned model
embedded_sentences = model.encode(sentences, show_progress_bar=True, convert_to_numpy=True)

# Number of PCA components (e.g., reduce to 50 dimensions)
num_components = 50
# obj = PCA(n_components=num_components)

# Fit the PCA model to the embedded sentences (this will find the principal components)
# pca_embeddings = obj.fit_transform(embedded_sentences)

Batches:   0%|          | 0/13903 [00:00<?, ?it/s]

In [None]:
# # Project the PCA embeddings back to the original space
# projected_embeddings = obj.inverse_transform(pca_embeddings)

# # Initialize an empty list to store the Pearson correlation coefficients
# pearsons_correlations = []

# # Calculate Pearson's correlation for each pair of original and projected embeddings
# for original, projected in zip(embedded_sentences, projected_embeddings):
#     # Compute Pearson's r
#     corr, _ = pearsonr(original, projected)
#     pearsons_correlations.append(corr)

# # If you want to compute a single Pearson's correlation coefficient for all data
# # Concatenate all embeddings and compute the correlation
# flat_original = embedded_sentences.flatten()
# flat_projected = projected_embeddings.flatten()
# overall_corr, _ = pearsonr(flat_original, flat_projected)

# # print("Pearsons correlation for each embedding pair:", pearsons_correlations)
# print("Overall Pearson's correlation:", overall_corr)

In [None]:
# print("Original embeddings shape:", embedded_sentences.shape)
# print("PCA-reduced embeddings shape:", pca_embeddings.shape)

## Training an autoencoder

In [7]:
# Defining the autoencoder class and architecture
class Autoencoder(nn.Module):
    def __init__(self, input_size, encoding_size):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_size, 512),
            nn.ReLU(True),
            nn.BatchNorm1d(512),
            nn.Linear(512, 256),
            nn.ReLU(True),
            nn.BatchNorm1d(256),  # Added batch normalization
            nn.Linear(256, 128),  # Added another layer
            nn.ReLU(True),
            nn.Linear(128, encoding_size),  # Adjusted the size of the encoding layer
            nn.ReLU(True)
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(encoding_size, 128),  # Adjusted the size of the decoding layer
            nn.ReLU(True),
            nn.Linear(128, 256),  # Added another layer
            nn.ReLU(True),
            nn.BatchNorm1d(256),  # Added batch normalization
            nn.Linear(256, 512),
            nn.ReLU(True),
            nn.Linear(512, input_size),
            nn.Sigmoid()  # Using Sigmoid because embeddings are likely normalized
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [8]:
# Convert the NumPy array of embeddings to a PyTorch tensor
embedded_sentences_tensor = torch.tensor(embedded_sentences, dtype=torch.float32)

# Create a dataset and a dataloader
dataset = TensorDataset(embedded_sentences_tensor)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, drop_last=True)

In [11]:
# Instantiate the autoencoder
input_size = embedded_sentences.shape[1]
encoding_size = 64  # change this to whatever size you want to encode down to
autoencoder = Autoencoder(input_size=input_size, encoding_size=encoding_size)

# Define loss function and optimizer
# criterion = nn.MSELoss()
criterion = nn.L1Loss()  # MAE
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=1e-3)

In [12]:
epochs = 20  # Set this to the number of epochs to train for
for epoch in range(epochs):
    for data in dataloader:
        inputs = data[0]
        # Forward pass
        outputs = autoencoder(inputs)
        loss = criterion(outputs, inputs)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}')

Epoch 1/20, Loss: 0.3479149341583252
Epoch 2/20, Loss: 0.3466016352176666
Epoch 3/20, Loss: 0.34476661682128906
Epoch 4/20, Loss: 0.33979666233062744
Epoch 5/20, Loss: 0.3394850790500641
Epoch 6/20, Loss: 0.3356765806674957
Epoch 7/20, Loss: 0.3363005220890045
Epoch 8/20, Loss: 0.3406217098236084
Epoch 9/20, Loss: 0.3356150686740875
Epoch 10/20, Loss: 0.34118881821632385
Epoch 11/20, Loss: 0.3356809616088867
Epoch 12/20, Loss: 0.3333381414413452
Epoch 13/20, Loss: 0.33752670884132385
Epoch 14/20, Loss: 0.33646360039711
Epoch 15/20, Loss: 0.34646520018577576
Epoch 16/20, Loss: 0.33549603819847107
Epoch 17/20, Loss: 0.3361603319644928
Epoch 18/20, Loss: 0.33716142177581787
Epoch 19/20, Loss: 0.3304450213909149
Epoch 20/20, Loss: 0.33382365107536316


In [13]:
# Switch the autoencoder to evaluation mode
autoencoder.eval()

# Process the entire dataset to obtain the decoded (projected) embeddings
encoded_embeddings = autoencoder.encoder(embedded_sentences_tensor).detach().numpy()
decoded_embeddings = autoencoder.decoder(torch.from_numpy(encoded_embeddings)).detach().numpy()

In [14]:
# Save the encoder's state_dict
torch.save(autoencoder.encoder.state_dict(), '/home/mendu/Thesis/data/musiccaps/auto_encoder/encoder_state_dict.pth')

Change the metric, reconstruction loss

In [None]:
print("Original embeddings shape:", embedded_sentences_tensor.shape)

# Switch autoencoder to evaluation mode
autoencoder.eval()

# Process the entire dataset to obtain the encoded embeddings
encoded_embeddings = autoencoder.encoder(embedded_sentences_tensor).detach()

print("Entire dataset encoded embeddings shape:", encoded_embeddings.shape)

## Trying to get the word embeddings of the 8 class labels

In [None]:
# Assuming you have your SentenceTransformer model loaded as `model`
new_sentence = "Your new sentence."
new_sentence_embedding = model.encode(new_sentence, convert_to_tensor=True)

# Create a new Autoencoder instance and load the trained encoder
autoencoder = Autoencoder(input_size=768, encoding_size=64)
encoder_state_dict = torch.load('/home/mendu/Thesis/data/musiccaps/auto_encoder/encoder_state_dict.pth')
autoencoder.encoder.load_state_dict(encoder_state_dict)

# You may need to ensure the new sentence embedding is on the same device (CPU/GPU) as the model
# e.g., if the autoencoder is on the GPU, you need to do: new_sentence_embedding = new_sentence_embedding.to('cuda')

# Pass your new sentence embedding through the encoder
autoencoder.eval()  # Important: set the model to evaluation mode
with torch.no_grad():
    new_encoded_embedding = autoencoder.encoder(new_sentence_embedding.unsqueeze(0))  # Add dummy batch dimension

# Convert to numpy array if needed
new_encoded_embedding = new_encoded_embedding.cpu().numpy()  # Call .cpu() if model is on GPU

# The variable 'new_encoded_embedding' now contains the 64-dimensional vector for the new sentence

In [None]:
import os

os.listdir('/home/mendu/Thesis/data/musiccaps/new_embedding_model')

In [None]:
#Chnage this, dont use the tokeniser use model


import numpy as np
from transformers import AutoTokenizer

# Load the fine-tuned tokenizer
tokenizer = AutoTokenizer.from_pretrained('/home/mendu/Thesis/data/musiccaps/new_embedding_model/')  # Update the path accordingly

# Load the 64-dimensional word embeddings
embeddings_reduced = np.load('/home/mendu/Thesis/data/musiccaps/auto_encoder/encoded_embeddings.npy')  # Update with the correct .npy file path

# Function to get the embedding of a specific word
def get_word_embedding(word):
    # Tokenize the word to get its ID
    token_id = tokenizer.encode(word, add_special_tokens=False)
    if not token_id:
        raise ValueError(f"The word '{word}' was not found in the tokenizer's vocabulary.")
    elif len(token_id) > 1:
        raise ValueError(f"The input text '{word}' corresponds to multiple tokens.")
    token_id = token_id[0]  # We only expect one token ID for a single word input
    
    # Fetch the corresponding embedding
    word_embedding = embeddings_reduced[token_id]
    return word_embedding

# Example usage:
word = "rock"  # Replace with the word you're interested in
embedding_of_music = get_word_embedding(word)
print(f"The embedding for the word '{word}' is:", embedding_of_music)

In [None]:
## Older autoencoder code
# import torch
# from torch import nn

# class Autoencoder(nn.Module):
#     def __init__(self, input_size, encoding_size):
#         super(Autoencoder, self).__init__()
#         # Encoder
#         self.encoder = nn.Sequential(
#             nn.Linear(input_size, encoding_size),
#             nn.ReLU(True)
#         )
#         # Decoder
#         self.decoder = nn.Sequential(
#             nn.Linear(encoding_size, input_size),
#             nn.Sigmoid()  # Using Sigmoid because embeddings are likely normalized
#         )

#     def forward(self, x):
#         encoded = self.encoder(x)
#         decoded = self.decoder(encoded)
#         return decoded
    