In [60]:
import pandas as pd
from pathlib import Path
from datetime import datetime
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import tqdm
from sklearn.model_selection import KFold
from random import randint

In [2]:
BASE_DATE = datetime.strptime("2025-01-03", '%Y-%m-%d').timestamp()
print(BASE_DATE)

1735858800.0


In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda


# DATA

In [4]:
tracks_raw_data = pd.read_json(Path('../data_v2/tracks_artists.jsonl'), lines=True)
artists_raw_data = pd.read_json(Path('../data_v2/artists.jsonl'), lines=True)

# NORMALIZE DATES
tracks_raw_data["release_date"] = (pd.to_datetime(tracks_raw_data["release_date"], format='mixed').apply(lambda x: x.timestamp())).div(BASE_DATE)

# NORMALIZE DURATION
tracks_raw_data["duration_ms"] = tracks_raw_data["duration_ms"].div(tracks_raw_data["duration_ms"].max())

# NORMALIZE TEMPO
tracks_raw_data["tempo"] = tracks_raw_data["tempo"].div(tracks_raw_data["tempo"].max())

# EXPLICITE ENCODING
tracks_raw_data["explicit"] = tracks_raw_data["explicit"].apply(lambda x: [0, 1] if x else [1, 0])

# processs ARTIST HASH

def postprocess_hash_to_list(x):
    str_x = str(x)
    if len(str_x) < 8:
        str_x = "0" * (8 - len(str_x)) + str_x
    return [int(x) for x in str_x]

tracks_raw_data["id_artist_hash"] = tracks_raw_data["id_artist_hash"].apply(postprocess_hash_to_list)


In [5]:
class TracksDataset(Dataset):
    def __init__(self, tracks_data: pd.DataFrame):
        self.data = tracks_data

    def __len__(self):
        return len(self.data)

    def get_item(self, idx):
        return self.data.iloc[idx].values

    def __getitem__(self, idx):
        unpacked_data = []
        for data in self.data.iloc[idx].drop("id_track").values:
            if type(data) != list:
                unpacked_data.append(data)
            else:
                unpacked_data += data
        return torch.Tensor(unpacked_data)

In [None]:
BATCH_SIZE = 1024
NUMBER_OF_INPUTS = 47

In [58]:
tracks_raw_dataset = TracksDataset(tracks_raw_data)
tracks_raw_dataloader = DataLoader(tracks_raw_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Code

In [120]:
def cross_validate_model(model_class: nn.Module, embeding_dim, learning_rate, loss_module, dataset: Dataset, number_of_epochs: int = 10, number_of_splits: int = 5):
    kfold = KFold(n_splits=number_of_splits, shuffle=True)
    for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
        model = model_class(embeding_dim).to(device)
        print(f"\nFold {fold}")

        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
        train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, sampler=train_ids)
        test_loader = DataLoader(dataset, batch_size=BATCH_SIZE, sampler=test_ids)
        progress_bar = tqdm.tqdm(range(number_of_epochs), total=number_of_epochs, desc="Epoch")

        for _ in progress_bar:
            agregated_loss = 0
            for data in train_loader:

                data = data.to(device)

                preds = model(data)
                loss = loss_module(preds, data)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                agregated_loss += loss.item()
            progress_bar.set_postfix({"Train loss": agregated_loss / len(train_loader)})
            model.train()
            scheduler.step()

        model.eval()
        test_loss = 0
        for data in test_loader:
            data = data.to(device)
            preds = model(data)
            loss = loss_module(preds, data)
            test_loss += loss.item()
        print(f"Test loss: {test_loss / len(test_loader)}")

# Modele

In [112]:
class Encoder(nn.Module):

    def __init__(self, hidden_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(NUMBER_OF_INPUTS, 2048)
        self.act1 = nn.ReLU6()
        self.fc5 = nn.Linear(2048, hidden_dim)

    def forward(self, x):
        x = self.act1(self.fc1(x))
        x = self.fc5(x)
        return x

In [113]:
class Decoder(nn.Module):

    def __init__(self, hidden_dim: int):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim, 2048)
        self.act1 = nn.ReLU6()
        self.fc5 = nn.Linear(2048, NUMBER_OF_INPUTS)

    def forward(self, x):
        x = self.act1(self.fc1(x))
        x = self.fc5(x)
        return x

In [115]:
class Autoencoder(nn.Module):

    def __init__(self,
                 latent_dim: int,
                 encoder_class : object = Encoder,
                 decoder_class : object = Decoder):
        super().__init__()
        self.encoder = encoder_class(latent_dim)
        self.decoder = decoder_class(latent_dim)

    def forward(self, x):
        """
        The forward function takes in an image and returns the reconstructed image
        """
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

# Training

In [124]:
EMBEDING_DIM = 64
model = Autoencoder(EMBEDING_DIM).to(device)

In [125]:
NUMBER_OF_EPOCHS = 10
LEARNING_RATE = 0.004


optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_module = nn.L1Loss()
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

In [126]:
cross_validate_model(Autoencoder, EMBEDING_DIM, LEARNING_RATE, nn.L1Loss(), TracksDataset(tracks_raw_data), NUMBER_OF_EPOCHS, 5)


Fold 0


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch:  90%|█████████ | 9/10 [00:36<00:04,  4.29s/it, Train loss=0.0978]

In [56]:
model.train()
agregated_loss = 0
for epoch in tqdm.tqdm(range(NUMBER_OF_EPOCHS), total=NUMBER_OF_EPOCHS, desc="Epoch"):
    agregated_loss = 0
    for data in tracks_raw_dataloader:

        data = data.to(device)

        preds = model(data)
        loss = loss_module(preds, data)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        agregated_loss += loss.item()
    print(f"Train loss: {agregated_loss / len(tracks_raw_dataloader)}")
    model.train()
    scheduler.step()

Epoch:  10%|█         | 1/10 [00:05<00:50,  5.56s/it]

Train loss: 0.5779829269105737


Epoch:  20%|██        | 2/10 [00:11<00:44,  5.58s/it]

Train loss: 0.1515811627561396


Epoch:  30%|███       | 3/10 [00:17<00:42,  6.02s/it]

Train loss: 0.0956356143578887


Epoch:  40%|████      | 4/10 [00:23<00:35,  5.91s/it]

Train loss: 0.08426837072792379


Epoch:  50%|█████     | 5/10 [00:29<00:29,  5.88s/it]

Train loss: 0.07515660241584886


Epoch:  60%|██████    | 6/10 [00:34<00:23,  5.82s/it]

Train loss: 0.07025317712263628


Epoch:  70%|███████   | 7/10 [00:40<00:17,  5.74s/it]

Train loss: 0.06585435751317577


Epoch:  80%|████████  | 8/10 [00:46<00:11,  5.78s/it]

Train loss: 0.061564283360811795


Epoch:  90%|█████████ | 9/10 [00:52<00:05,  5.72s/it]

Train loss: 0.05840270123867826


Epoch: 100%|██████████| 10/10 [00:58<00:00,  5.81s/it]

Train loss: 0.05677685094997287





In [63]:
for data in tracks_raw_dataloader:
    data = data.to(device)
    preds = model(data)
    loss = loss_module(preds, data)
    for i in range(len(data)):
        print(f"Original: {data[i]}")
        print(f"Reconstructed: {preds[i]}")
    break


Original: tensor([0.6900, 0.0494, 1.0000, 0.0000, 0.7793, 0.5380, 0.8190, 4.0000, 0.0444,
        0.0435, 0.0445, 0.0000, 0.2150, 0.4780, 0.3729, 0.0000, 0.0000, 0.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 7.0000, 8.0000, 9.0000, 2.0000, 7.0000,
        0.0000, 9.0000, 6.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000,
        0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
        0.0000, 1.0000], device='cuda:0')
Reconstructed: tensor([ 6.2001e-01,  4.9972e-02,  9.3611e-01,  2.6582e-02,  7.6266e-01,
         5.6796e-01,  7.1847e-01,  4.0644e+00,  5.7608e-02,  4.0242e-02,
         1.6750e-01,  1.0620e-02,  1.4215e-01,  5.0163e-01,  5.8552e-01,
         8.7414e-04,  9.5695e-03,  1.2135e-02,  9.3252e-03,  7.8363e-03,
        -8.8138e-03, -1.1103e-03,  7.0102e+00,  7.9715e+00,  8.9901e+00,
         1.9733e+00,  7.0216e+00, -1.2047e-02,  9.0316e+00,  6.0043e+00,
         1.6058e-02, -7.8310e-03,  5.1405e-03,  2.0336e-03,  1.8048e-02,
         1.0845e-01,  

In [82]:
random_element = randint(0, len(tracks_raw_data)-1)
element_to_compare = randint(0, len(tracks_raw_data)-1)

print(f"Distance: {torch.dist(model.encoder(tracks_raw_dataset[random_element].to(device)), model.encoder(tracks_raw_dataset[element_to_compare].to(device)))}")
print(f"First element: {tracks_raw_dataset.get_item(random_element)}")
print(f"Second element: {tracks_raw_dataset.get_item(element_to_compare)}")

Distance: 16.531755447387695
First element: ['70p3HYq9iHZisJqpDmWd1U' 0.75 0.07540304514911444 list([1, 0])
 0.8544639690739823 0.455 0.9520000000000001 1 0.1063666667 0.0787
 0.023200000000000002 5.65e-05 0.06620000000000001 0.332
 0.41056524563946223 list([0, 0, 0, 0, 0, 0, 0])
 list([1, 8, 4, 1, 4, 2, 3, 7])
 list([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])]
Second element: ['3dPtXHP0oXQ4HCWHsOA9js' 0.79 0.06334870292103068 list([1, 0])
 0.9081199461615196 0.67 0.874 8 0.08701666670000001 0.030500000000000003
 0.00231 1.7199999999999998e-05 0.30000000000000004 0.789
 0.5908295812338993 list([0, 0, 0, 0, 0, 0, 0])
 list([9, 0, 1, 0, 9, 1, 7, 0])
 list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1])]


In [19]:
model_scripted = torch.jit.script(model)
model_scripted.save('embeding_models/embeding_v0.pt')