In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

import sklearn.manifold as skm
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import MultiLabelBinarizer, scale, minmax_scale

from pytorch_lightning import Trainer
from torch.utils.data import DataLoader

from project.mlp import MLP, CategMLP, AttentionMLP
from project.autoencoder import  AutoEncoder, DAE
from project.custom_dataset import CustomDataset, CategoricalDataset

from collections import defaultdict
from tqdm import tqdm

import torch
import pytorch_lightning as pl
from torch import nn


import logging
logging.getLogger("pytorch_lightning").setLevel(0)

In [3]:
df = pd.read_csv("data/playlist_2010to2022.csv").dropna()
df = df.drop(columns=['playlist_url', 'track_id', 'artist_id'], axis=1)
mlb = MultiLabelBinarizer(sparse_output=True)
genres = pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(df.artist_genres.str.strip("[]").str.split(', ')),
                                           columns=mlb.classes_, index=df.index).drop("", axis=1)
genres.columns = genres.columns.str.strip("'")
df = df.drop(columns=['artist_genres'], axis=1)
ohe_year = pd.get_dummies(df.year).astype(int)
target = df.pop("track_popularity")
df = df.drop(columns=['track_name', 'year', 'artist_name', 'artist_popularity', 'album'], axis=1)
df.columns = df.columns.astype(str)
#split in train and test 0.8 t 0.2 

full = pd.concat([df, genres, ohe_year], axis=1)
full.columns = full.columns.astype(str)

train = full.sample(frac=0.8, random_state=0)
test = full.drop(train.index) 
train_idx = train.index
train_target = target[train.index].values
test_target = target[test.index].values

train, test = scale(train), scale(test)


feature_year_no_genre = pd.concat([df, ohe_year], axis=1)
feature_year_no_genre.columns = feature_year_no_genre.columns.astype(str)

train_no_genre = feature_year_no_genre.loc[train_idx].values
test_no_genre = feature_year_no_genre.drop(train_idx).values
train_no_genre, test_no_genre = scale(train_no_genre), scale(test_no_genre)


train_no_genre_no_ohe = df.loc[train_idx].values
test_no_genre_no_ohe = df.drop(train_idx).values
train_no_genre_no_ohe, test_no_genre_no_ohe = scale(train_no_genre_no_ohe), scale(test_no_genre_no_ohe)

train_genre_no_year = pd.concat([df, genres], axis=1)
train_genre_no_year.columns = train_genre_no_year.columns.astype(str)
train_genre_no_year = train_genre_no_year.loc[train_idx]
test_genre_no_year = pd.concat([df, genres], axis=1).drop(train_idx).values
train_genre_no_year, test_genre_no_year = scale(train_genre_no_year.values), scale(test_genre_no_year)



In [None]:
def test_train_error(model, train, train_target, test, test_target):
    train_error = ((model.predict(train) - train_target)**2).sum()
    test_error = ((model.predict(test) - test_target)**2).sum()
    print("Train error : ", train_error, " ; test error : ", test_error)
    return train_error, test_error


In [None]:
print("Linear Ridge and Lasso regression using the year and genre as one hot encoded vector")
#use test_train_error and store result in a dictionnary
train_dict = defaultdict(lambda: defaultdict(dict))
test_dict = defaultdict(lambda: defaultdict(dict))
train_dict["Linear"]["ohe_genre"]["ohe_year"], test_dict["Linear"]["ohe_genre"]["ohe_year"] = test_train_error(LinearRegression().fit(train, train_target), train, train_target, test, test_target)
train_dict["Ridge"]["ohe_genre"]["ohe_year"], test_dict["Ridge"]["ohe_genre"]["ohe_year"] = test_train_error(Ridge().fit(train, train_target), train, train_target, test, test_target)
train_dict["Lasso"]["ohe_genre"]["ohe_year"], test_dict["Lasso"]["ohe_genre"]["ohe_year"] = test_train_error(Lasso().fit(train, train_target), train, train_target, test, test_target)


In [None]:
print("Linear Ridge and Lasso regression using the year as one hot encoded vector, but no genre")
train_dict["Linear"]["no_genre"]["ohe_year"], test_dict["Linear"]["no_genre"]["ohe_year"] = test_train_error(LinearRegression().fit(train_no_genre, train_target), train_no_genre, train_target, test_no_genre, test_target)
train_dict["Ridge"]["no_genre"]["ohe_year"], test_dict["Ridge"]["no_genre"]["ohe_year"] = test_train_error(Ridge().fit(train_no_genre, train_target), train_no_genre, train_target, test_no_genre, test_target)
train_dict["Lasso"]["no_genre"]["ohe_year"], test_dict["Lasso"]["no_genre"]["ohe_year"] = test_train_error(Lasso().fit(train_no_genre, train_target), train_no_genre, train_target, test_no_genre, test_target)


In [None]:
print("Linear Ridge and Lasso regression using the year as int, and no genre")
train_dict["Linear"]["no_genre"]["no_ohe_year"], test_dict["Linear"]["no_genre"]["no_ohe_year"] = test_train_error(LinearRegression().fit(train_no_genre_no_ohe, train_target), train_no_genre_no_ohe, train_target, test_no_genre_no_ohe, test_target)
train_dict["Ridge"]["no_genre"]["no_ohe_year"], test_dict["Ridge"]["no_genre"]["no_ohe_year"] = test_train_error(Ridge().fit(train_no_genre_no_ohe, train_target), train_no_genre_no_ohe, train_target, test_no_genre_no_ohe, test_target)
train_dict["Lasso"]["no_genre"]["no_ohe_year"], test_dict["Lasso"]["no_genre"]["no_ohe_year"] = test_train_error(Lasso().fit(train_no_genre_no_ohe, train_target), train_no_genre_no_ohe, train_target, test_no_genre_no_ohe, test_target)


In [None]:
print("Linear Ridge and Lasso regression using the year as int, and one hot encoed genre")
train_dict["Linear"]["ohe_genre"]["no_ohe_year"], test_dict["Linear"]["ohe_genre"]["no_ohe_year"] = test_train_error(LinearRegression().fit(train_genre_no_year, train_target), train_genre_no_year, train_target, test_genre_no_year, test_target)
train_dict["Ridge"]["ohe_genre"]["no_ohe_year"], test_dict["Ridge"]["ohe_genre"]["no_ohe_year"] = test_train_error(Ridge().fit(train_genre_no_year, train_target), train_genre_no_year, train_target, test_genre_no_year, test_target)
train_dict["Lasso"]["ohe_genre"]["no_ohe_year"], test_dict["Lasso"]["ohe_genre"]["no_ohe_year"] = test_train_error(Lasso().fit(train_genre_no_year, train_target), train_genre_no_year, train_target, test_genre_no_year, test_target)

In [None]:
# format train and test dict as a dataframe
train_df = pd.DataFrame(train_dict)
test_df = pd.DataFrame(test_dict)
train_df = train_df.stack(level=0).reset_index().rename(columns={"level_0": "genre", "level_1": "model", "level_2": "year", 0: "train_error"})
train_df = pd.concat([train_df, pd.DataFrame(list(train_df.train_error))], axis=1).drop(columns=["train_error"]).melt(["model", "genre"]).rename(columns={"variable": "encoded_year"})

test_df = test_df.stack(level=0).reset_index().rename(columns={"level_0": "genre", "level_1": "model", "level_2": "year", 0: "test_error"})
test_df = pd.concat([test_df, pd.DataFrame(list(test_df.test_error))], axis=1).drop(columns=["test_error"]).melt(["model", "genre"]).rename(columns={"variable": "encoded_year"})

# merge two df
error_df = pd.merge(train_df, test_df, on=["model", "genre", "encoded_year"]).rename(columns={"value_x": "train_error", "value_y": "test_error"})
error_df = error_df.melt(["model", "genre", "encoded_year"], ["train_error", "test_error"], "error_type").rename(columns={"error_type": "error_type", "value": "error"})

error_df

In [None]:
def signif(x, p):
    x = np.asarray(x)
    x_positive = np.where(np.isfinite(x) & (x != 0), np.abs(x), 10**(p-1))
    mags = 10 ** (p - 1 - np.floor(np.log10(x_positive)))
    return np.round(x * mags) / mags

error_df["error"] = signif(error_df.error, 2)

In [None]:
fig = px.bar(error_df, x="model", y="error", color="error_type", facet_row="genre", facet_col="encoded_year", barmode="group", title="Train and test error for different models and different encoding of the year and genre", log_y=True, text_auto=True)
#round values to two significant digits 
fig.update_traces(textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
#make figure taller
fig.update_layout(height=1200)


We obtain the best result by on hot encoding the genres and the year, with a Ridge regression. Here it is interesting to see that one hot encoding the year helps the model, as it removes information by removing the ordinality of the feature (the year 2020 is a higher number than the year 2019) 
The genre seems to be quite important here as it the feature the has the most impact on lowering the error rate.

For the remainder of this notebook we will use an MLP to predict track popularity and will observe the impact of the "categorical embedding" technique on the error rate. 
Factorization machines are also an interesting technique to use for this kind of problem, and might use them later

In [13]:
# create test and train dataloader from the dataset
train, test = train.astype(np.float32), test.astype(np.float32)
train_dataset = CustomDataset(train, train_target)
test_dataset =  CustomDataset(test, test_target)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=8)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=8)


In [None]:
# initialize mlp and create a gridsearch to find the best parameters recording test and train error for each combination of parameters
lr = [0.001, 0.01, 0.1]
hidden_layer_width = [32, 64, 128]
hidden_layer_depth = [1, 2, 4, 8]
epochs = [20, 50,100]
dropout = [0.0, 0.2, 0.5]
grid = np.array(np.meshgrid(lr, hidden_layer_width, hidden_layer_depth, epochs, dropout)).T.reshape(-1, 5)
grid = pd.DataFrame(grid, columns=["lr", "hidden_layer_width", "hidden_layer_depth", "epochs", "dropout"])
grid["train_error"] = np.nan
grid["test_error"] = np.nan
grid[["hidden_layer_width", "hidden_layer_depth", "epochs"]] = grid[["hidden_layer_width", "hidden_layer_depth", "epochs"]].astype(int)



In [None]:
# go through the grid and train the model for each combination of parameters
for i in tqdm(range(len(grid))):
    layers = [grid.hidden_layer_width[i]]*grid.hidden_layer_depth[i]
    model = MLP(input_dim=train.shape[1], output_dim=1, hidden_layers=layers, lr=grid.lr[i], dropout=grid.dropout[i])
    trainer = Trainer(max_epochs=int(grid.epochs[i]), enable_progress_bar=False, enable_model_summary=False)
    trainer.fit(model, train_dataloader, test_dataloader)
    grid.train_error[i] = trainer.callback_metrics["train_loss"].item()
    grid.test_error[i] = trainer.callback_metrics["val_loss"].item()


In [None]:
grid.sort_values(by="test_error").head(10)

##### We can see that the best parameters for a normal feed forward neural network are : 
- learning rate : 0.001
- hidden layer width : 32
- hidden layer depth : 8
- epochs : 100
- dropout : 0.2

##

### Using an autoencoder as a pretraining step for the MLP and check if the performance improves

In [None]:
# model = MLP(input_dim=train.shape[1], output_dim=1, hidden_layers=[32]*7, lr=0.001, dropout=0.2)
model = AutoEncoder(input_dim=train.shape[1], output_dim=473, hidden_layers=[32]*8, lr=0.001, dropout=0.2)
trainer = Trainer(max_epochs=150, enable_progress_bar=False, enable_model_summary=False)
trainer.fit(model, train_dataloader, test_dataloader)
print(trainer.callback_metrics["train_loss"].item(), trainer.callback_metrics["val_loss"].item())

In [None]:
finetuned_model = MLP(input_dim=train.shape[1], output_dim=473, hidden_layers=[32]*8, lr=0.001, dropout=0.2)
finetuned_model.load_state_dict(model.state_dict())
trainer = Trainer(max_epochs=100, enable_progress_bar=True, enable_model_summary=False)
finetuned_model.layers = finetuned_model.layers[:-1].append(nn.Linear(32,1))

trainer.fit(finetuned_model, train_dataloader, test_dataloader)
print(trainer.callback_metrics["train_loss"].item(), trainer.callback_metrics["val_loss"].item())

In [None]:
unpretrained_model = MLP(input_dim=train.shape[1], output_dim=1, hidden_layers=[32]*8, lr=0.001, dropout=0.2)
trainer = Trainer(max_epochs=100, enable_progress_bar=False, enable_model_summary=False)
trainer.fit(unpretrained_model, train_dataloader, test_dataloader)
print(trainer.callback_metrics["train_loss"].item(), trainer.callback_metrics["val_loss"].item())

##### We can see that the autoencoder does not improve the performance of the MLP, and that the performance of the MLP is better than the linear regression

### Using a DAE as a pretraining step for the MLP and check if the performance improves

In [None]:
model = DAE(input_dim=train.shape[1], output_dim=473, hidden_layers=[32]*8, noise_mean=0.1, noise_std=0.1, lr=0.001, dropout=0.2)
trainer = Trainer(max_epochs=150, enable_progress_bar=False, enable_model_summary=False)
trainer.fit(model, train_dataloader, test_dataloader)
print(trainer.callback_metrics["train_loss"].item(), trainer.callback_metrics["val_loss"].item())

In [None]:
finetuned_model = MLP(input_dim=train.shape[1], output_dim=473, hidden_layers=[32]*8, lr=0.001, dropout=0.2)
finetuned_model.load_state_dict(model.state_dict())
trainer = Trainer(max_epochs=100, enable_progress_bar=False, enable_model_summary=False)
finetuned_model.layers = finetuned_model.layers[:-1].append(nn.Linear(32,1))
trainer.fit(finetuned_model, train_dataloader, test_dataloader)
print(trainer.callback_metrics["train_loss"].item(), trainer.callback_metrics["val_loss"].item())

### Using categorical embedding for the genre

In [None]:
genres.loc[train_idx].values

In [4]:
# create test and train dataloader from the dataset
train, test = train_no_genre.astype(np.float32), test_no_genre.astype(np.float32)
train_dataset = CategoricalDataset(data=train, target=train_target, genre=genres.loc[train_idx].values)
test_dataset =  CategoricalDataset(data=test, target=test_target, genre=genres.drop(train_idx).values)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=8)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=8)


In [None]:
# initialize mlp and create a gridsearch to find the best parameters recording test and train error for each combination of parameters
lr = [0.001, 0.01, 0.1]
hidden_layer_width = [32, 64, 128]
hidden_layer_depth = [1, 2, 4, 8]
embedding_dim = [16, 32, 64]
epochs = [20, 50, 100]
dropout = [0.1, 0.2, 0.3, 0.4, 0.5]
grid = np.array(np.meshgrid(lr, hidden_layer_width, hidden_layer_depth, embedding_dim, epochs, dropout)).T.reshape(-1, 6)
grid = pd.DataFrame(grid, columns=["lr", "hidden_layer_width", "hidden_layer_depth", "embedding_dim", "epochs", "dropout"])
grid["train_error"] = np.nan
grid["test_error"] = np.nan
grid[["hidden_layer_width", "hidden_layer_depth", "embedding_dim", "epochs"]] = grid[["hidden_layer_width", "hidden_layer_depth", "embedding_dim", "epochs"]].astype(int)



In [None]:
# go through the grid and train the model for each combination of parameters
for i in tqdm(range(len(grid))):
    layers = [grid.hidden_layer_width[i]]*grid.hidden_layer_depth[i]
    model = CategMLP(input_dim=train.shape[1], output_dim=1, categ_dim=genres.shape[1], embedding_dim=grid.embedding_dim[i], hidden_layers=layers,
                     lr=grid.lr[i], dropout=grid.dropout[i], )
    trainer = Trainer(max_epochs=int(grid.epochs[i]), enable_progress_bar=False, enable_model_summary=False)
    trainer.fit(model, train_dataloader, test_dataloader)
    grid.train_error[i] = trainer.callback_metrics["train_loss"].item()
    grid.test_error[i] = trainer.callback_metrics["val_loss"].item()

In [45]:
attention_mlp = AttentionMLP(input_dim=train.shape[1], output_dim=1, hidden_layers=[32]*8, lr=0.0001, dropout=0.5)
trainer = Trainer(max_epochs=200, enable_progress_bar=False, enable_model_summary=False)
trainer.fit(attention_mlp, train_dataloader, test_dataloader)


In [46]:
trainer.callback_metrics["train_loss"].item(), trainer.callback_metrics["val_loss"].item()

(128.95205688476562, 3994.33349609375)

In [47]:
trainer.callback_metrics

{'train_loss': tensor(128.9521), 'val_loss': tensor(3994.3335)}