In [1]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
from pprint import pprint
import numpy as np

2022-10-24 10:35:12.702640: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-24 10:35:12.952653: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-24 10:35:12.952693: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-24 10:35:13.006088: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-24 10:35:14.380473: W tensorflow/stream_executor/platform/de

In [197]:
# The database is small enough to fit in memory, so it is loaded with pandas
# first and converted to tf.data objects in a later cell.
# Rows containing any missing value are discarded right after loading.
df = pd.read_csv('./data/database_EDA.csv').dropna()
# Distinct 'artist&track' keys; these become the retrieval candidates.
all_tracks = df['artist&track'].unique()
pprint(df.dtypes)
df.head()

artist           object
track_id         object
track            object
owner            object
popularity      float64
artist&track     object
dtype: object


Unnamed: 0,artist,track_id,track,owner,popularity,artist&track
0,Romanthony,1iwZVgKv3FKc0dqhnSG9uW,Get It 2 Getta,kv718oiku8q612q0zi4iaovzb,22.0,Romanthony/Get It 2 Getta
1,Glenn Underground,2JFeAyJLMPPsBkklQEet6t,H-Dance,kv718oiku8q612q0zi4iaovzb,7.0,Glenn Underground/H-Dance
2,DJ BORING,3ilkEyg6OCtd9qCnOJkPzU,Winona,kv718oiku8q612q0zi4iaovzb,52.0,DJ BORING/Winona
3,DJ BORING,3ilkEyg6OCtd9qCnOJkPzU,Winona,botetauw3lk6anvygizm8wolf,52.0,DJ BORING/Winona
4,DJ BORING,3ilkEyg6OCtd9qCnOJkPzU,Winona,thebootlegboy,52.0,DJ BORING/Winona


In [3]:
# Display the unique 'artist&track' keys extracted from the dataframe.
all_tracks

array(['Romanthony/Get It 2 Getta', 'Glenn Underground/H-Dance',
       'DJ BORING/Winona', ..., 'Joni Mitchell/Edith and the Kingpin',
       "Joni Mitchell/Don't Interrupt the Sorrow",
       'Joni Mitchell/The Hissing of Summer Lawns'], dtype=object)

In [199]:
# One dataset element per dataframe row, exposed as a dict keyed by column name.
user_data = tf.data.Dataset.from_tensor_slices(
    {column: df[column] for column in df.columns}
)
# One dataset element per unique track, kept as a single-key dict for now.
tracks = tf.data.Dataset.from_tensor_slices({'artist&track': all_tracks})

In [200]:
# Show a single example row; take(1) already caps the iterator at one element.
for first_row in user_data.take(1).as_numpy_iterator():
    print(first_row)

{'artist': b'Romanthony', 'track_id': b'1iwZVgKv3FKc0dqhnSG9uW', 'track': b'Get It 2 Getta', 'owner': b'kv718oiku8q612q0zi4iaovzb', 'popularity': 22.0, 'artist&track': b'Romanthony/Get It 2 Getta'}


In [6]:
# Show a single candidate element; take(1) already caps the iterator at one element.
for first_track in tracks.take(1).as_numpy_iterator():
    print(first_track)

{'artist&track': b'Romanthony/Get It 2 Getta'}


### Feature selection

In [7]:
# Keep only the features the retrieval model uses: the playlist owner
# (exposed under the key 'user') and the 'artist&track' key.
user_songs = user_data.map(lambda x: {
    'user': x['owner'],
    'artist&track': x['artist&track'],
})

# Build the flat candidate dataset directly from `all_tracks` instead of
# re-mapping `tracks` onto itself. The previous `tracks = tracks.map(...)`
# only worked the first time the cell ran, because afterwards `tracks` no
# longer yielded dicts. The elements produced are the same string keys.
tracks = tf.data.Dataset.from_tensor_slices(all_tracks)

In [8]:
tf.random.set_seed(42)
buffersize = df.shape[0]
# The shuffle buffer spans the whole dataset, and reshuffle_each_iteration=False
# keeps the order fixed across iterations so take()/skip() yield disjoint splits.
shuffled = user_songs.shuffle(buffersize, seed=42, reshuffle_each_iteration=False)

# 80/20 split. The test set is everything after the training prefix, so no row
# is lost to rounding (int(0.8*n) + int(0.2*n) can be n - 1).
train_size = int(0.8 * buffersize)
train = shuffled.take(train_size)
test = shuffled.skip(train_size)

track_names = tracks.batch(1_000)
user_ids = user_songs.batch(1_000_000).map(lambda x: x["user"])

# Vocabularies consumed by the StringLookup layers of the two towers.
unique_track_names = np.unique(np.concatenate(list(track_names)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_track_names[:10]

array([b' Sentinel/Toulepleu', b'!!!/Fine Fine Fine', b'!!!/Myth Takes',
       b'!!!/NRGQ',
       b'#1 Movie Favorites/The Winner Is (From Little Miss Sunshine)',
       b'#TocoParaVos/Se Pic\xc3\xb3', b'$MOKE OG/$MOKE OG',
       b'$NOT/"Life"', b'$NOT/5AM', b'$NOT/BENZO'], dtype=object)

## Retrieval model

Following the [TensorFlow Recommenders basic retrieval tutorial](https://www.tensorflow.org/recommenders/examples/basic_retrieval).

### Query and candidate towers

In [9]:
embedding_dimension = 32


def _build_embedding_tower(vocabulary):
    """Return a Sequential model mapping raw string ids to embedding vectors.

    A StringLookup layer encodes each string in `vocabulary` as an integer
    index, and an Embedding layer turns that index into a vector of size
    `embedding_dimension`. The embedding table has one row more than the
    vocabulary has entries.
    """
    return tf.keras.Sequential(
        [
            tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
            tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension),
        ]
    )


# Query tower: user ids -> embeddings.
user_model = _build_embedding_tower(unique_user_ids)

# Candidate tower: the same process applied to the 'artist&track' keys.
track_model = _build_embedding_tower(unique_track_names)

### Metric

In [10]:
# Embed every candidate track (in batches of 128); the top-K metrics are
# computed against these candidate embeddings.
candidate_embeddings = tracks.batch(128).map(track_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

# The retrieval task bundles the loss with the metrics defined above.
task = tfrs.tasks.Retrieval(metrics=metrics)

### Full retrieval model

In [11]:
class TrackRetrievalModel(tfrs.Model):
    """Two-tower retrieval model pairing a user tower with a track tower.

    Args:
        usr_model: model that turns the "user" feature into query embeddings.
        track_model: model that turns the "artist&track" feature into
            candidate embeddings.
        retrieval_task: optional task layer that computes the loss and metrics.
            When omitted, the notebook-level `task` is used, which matches the
            previous behaviour of always reading that global.
    """

    def __init__(self, usr_model, track_model, retrieval_task=None):
        super().__init__()
        self.track_model: tf.keras.Model = track_model
        self.user_model: tf.keras.Model = usr_model
        # Allow the task to be injected so the class does not depend on a
        # global; fall back to the notebook-level `task` for existing callers.
        self.task: tf.keras.layers.Layer = (
            task if retrieval_task is None else retrieval_task
        )

    def compute_loss(self, features, training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch of (user, track) pairs.

        `features` must provide the keys "user" and "artist&track".
        `training` is accepted for API compatibility and is not used here.
        """
        # We pick out the user features and pass them into the user model.
        user_embeddings = self.user_model(features["user"])
        # And pick out the track features and pass them into the track model,
        # getting embeddings back.
        positive_track_embeddings = self.track_model(features["artist&track"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_track_embeddings)

### Fitting and evaluating the model

In [12]:
# Build the retrieval model from the two towers and train it with Adagrad.
model = TrackRetrievalModel(user_model, track_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Both the train and test splits are drawn from the (user, artist&track)
# interaction pairs only.
# NOTE(review): cache() comes after shuffle(), so presumably the batch order of
# the first epoch is reused in later epochs - confirm this is intended.
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()

model.fit(cached_train, epochs=3)

Epoch 1/3


2022-10-24 10:37:13.531958: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.
2022-10-24 10:37:13.726722: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.


 1/16 [>.............................] - ETA: 11:21 - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_50_categorical_accuracy: 6.1035e-04 - factorized_top_k/top_100_categorical_accuracy: 9.7656e-04 - loss: 73817.8750 - regularization_loss: 0.0000e+00 - total_loss: 73817.8750

2022-10-24 10:37:55.061680: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.
2022-10-24 10:37:55.199816: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.


 2/16 [==>...........................] - ETA: 10:00 - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 3.6621e-04 - factorized_top_k/top_10_categorical_accuracy: 3.6621e-04 - factorized_top_k/top_50_categorical_accuracy: 0.0012 - factorized_top_k/top_100_categorical_accuracy: 0.0018 - loss: 73817.8789 - regularization_loss: 0.0000e+00 - total_loss: 73817.8789        

2022-10-24 10:38:37.975515: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fec05119f70>

The chance of a positive example appearing in the top 100 is not especially high, so the model is not learning the data well enough. This is perhaps unsurprising given the limited features that have been fed into it.

In [13]:
# Evaluate the retrieval model on the held-out split; metrics come back as a dict.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0011135513195767999,
 'factorized_top_k/top_5_categorical_accuracy': 0.009465185925364494,
 'factorized_top_k/top_10_categorical_accuracy': 0.0227968692779541,
 'factorized_top_k/top_50_categorical_accuracy': 0.11961396783590317,
 'factorized_top_k/top_100_categorical_accuracy': 0.1605369746685028,
 'loss': 27237.46875,
 'regularization_loss': 0,
 'total_loss': 27237.46875}

Quite similar results for the test set.

### Making predictions (for retrieval of queries)

In [14]:
# Approximate top-100 retrieval layer that uses the trained user tower for queries.
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model, k=100)

# Index (identifier, embedding) pairs: the raw track keys alongside the
# embeddings the trained track tower produces for them.
track_batches = tracks.batch(100)
scann_index.index_from_dataset(
    tf.data.Dataset.zip((track_batches, track_batches.map(model.track_model)))
)

# Query the index for a single example user.
result = scann_index(np.array(['kv718oiku8q612q0zi4iaovzb']))

2022-10-24 11:31:48.871540: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 100044
2022-10-24 11:31:49.161391: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 289.253589ms.


In [15]:
# Display the raw output of the index query made in the previous cell.
result

(<tf.Tensor: shape=(1, 100), dtype=float32, numpy=
 array([[1.699753 , 1.653338 , 1.606923 , 1.5787424, 1.575427 , 1.565481 ,
         1.5555348, 1.5472465, 1.5373003, 1.5306697, 1.5306697, 1.5273542,
         1.5256966, 1.524039 , 1.5223813, 1.5223813, 1.5190659, 1.5174083,
         1.5008314, 1.4991738, 1.4908854, 1.4792817, 1.4709932, 1.4610472,
         1.4511011, 1.4477856, 1.441155 , 1.4394972, 1.4361819, 1.4278935,
         1.4245782, 1.4245782, 1.4179474, 1.409659 , 1.4048991, 1.404686 ,
         1.4013706, 1.3997129, 1.3997129, 1.3997129, 1.3947399, 1.3947399,
         1.3947399, 1.3881092, 1.3864515, 1.3814785, 1.3814785, 1.3781631,
         1.3781631, 1.3715324, 1.3698747, 1.363244 , 1.3615863, 1.3615863,
         1.3615863, 1.3599286, 1.3551688, 1.353298 , 1.353298 , 1.3483249,
         1.3483249, 1.3450096, 1.3416941, 1.3416941, 1.3400365, 1.3367212,
         1.3334057, 1.3251173, 1.3251173, 1.3251173, 1.3234596, 1.3234596,
         1.3201443, 1.316829 , 1.3151712, 1.31351

In [16]:
# Export the ScaNN index as a SavedModel. The 'Scann' op namespace is
# whitelisted in the save options so the index's ops can be serialised.
save_options = tf.saved_model.SaveOptions(namespace_whitelist=['Scann'])
tf.saved_model.save(
    scann_index,
    './models/spotify_retrieval_v1',
    options=save_options,
)



INFO:tensorflow:Assets written to: ./models/spotify_retrieval_v1/assets


INFO:tensorflow:Assets written to: ./models/spotify_retrieval_v1/assets


# Ignore!!!

In [133]:
# result[1] holds the retrieved track identifiers as bytes; decode them to str.
retrieved_keys = [key.decode('UTF-8') for key in result[1].numpy().reshape(-1)]

# Look artist and track up by their 'artist&track' key rather than splitting
# the key on '/': a name that itself contains '/' would split into more than
# two parts and break the two-column frame.
track_lookup = (
    df.drop_duplicates(subset='artist&track')
    .set_index('artist&track')[['artist', 'track']]
)
recs = track_lookup.loc[retrieved_keys].reset_index(drop=True)

actual_playlist = df[df['owner'] == 'kv718oiku8q612q0zi4iaovzb'][['artist','track']]

# Keep only the recommendations the user does not already have. The previous
# concat + drop_duplicates(keep=False) was a symmetric difference, so it also
# kept playlist tracks that were never recommended.
already_owned = set(
    df.loc[df['owner'] == 'kv718oiku8q612q0zi4iaovzb', 'artist&track']
)
new_recs = recs[[key not in already_owned for key in retrieved_keys]]

In [164]:
#new_recs.merge(df[['artist','track','track_id']], how='inner').drop_duplicates(subset=['artist', 'track'],keep='first')['track_id'].to_list()

## Ranking Model

In [201]:
# Features for the ranking model: the same (user, track) pair as retrieval,
# plus the track's popularity, which is used as the regression label.
user_songs_ranking = user_data.map(lambda x: {
    'user': x['owner'],
    'artist&track': x['artist&track'],
    'popularity': x['popularity'],
})

tf.random.set_seed(42)

# Fixed-order shuffle so that take()/skip() below yield disjoint splits.
shuffled_ranking = user_songs_ranking.shuffle(buffersize, seed=42, reshuffle_each_iteration=False)

# 80/20 split. The test set is everything after the training prefix, so no row
# is lost to rounding (int(0.8*n) + int(0.2*n) can be n - 1).
train_size_ranking = int(0.8 * buffersize)
train_ranking = shuffled_ranking.take(train_size_ranking)
test_ranking = shuffled_ranking.skip(train_size_ranking)

In [202]:
class RankingModel(tf.keras.Model):
    """Predict a score for a (user, track) pair from the two embeddings.

    Args:
        usr_model: model that maps user ids to embeddings.
        film_model: model that maps 'artist&track' keys to embeddings. The
            parameter keeps its historical name for backward compatibility.
            It is now actually used; previously the notebook-level
            `track_model` was read and this argument was ignored.
    """

    def __init__(self, usr_model, film_model):
        super().__init__()

        # Compute embeddings for users.
        self.user_embeddings = usr_model

        # Compute embeddings for tracks.
        self.track_embeddings = film_model

        # Compute predictions.
        self.ratings = tf.keras.Sequential([
            # Learn multiple dense layers.
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            # Make the score prediction in the final layer.
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """Return the predicted score for each (user_id, track_name) pair.

        `inputs` is a 2-tuple `(user_id, track_name)`.
        """
        user_id, track_name = inputs

        user_embedding = self.user_embeddings(user_id)
        track_embedding = self.track_embeddings(track_name)

        # Concatenate the two embeddings along the feature axis before the
        # dense stack.
        return self.ratings(tf.concat([user_embedding, track_embedding], axis=1))


class TrackRankingModel(tfrs.models.Model):
    """TFRS model wrapping RankingModel with an MSE ranking task.

    Args:
        usr_model: model that maps user ids to embeddings.
        track_model: model that maps 'artist&track' keys to embeddings.
    """

    def __init__(self, usr_model, track_model):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel(usr_model, track_model)
        # Mean squared error as the loss, with RMSE reported as the metric.
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features) -> tf.Tensor:
        """Return predictions using the "user" and "artist&track" features."""
        return self.ranking_model(
            (features["user"], features["artist&track"]))

    def compute_loss(self, features, training=False) -> tf.Tensor:
        """Return the ranking loss, using "popularity" as the label.

        `training` is accepted for API compatibility and is not used here.
        """
        labels = features["popularity"]

        # Pass the model a copy without the label instead of pop()-ing it, so
        # the caller's feature dict is left untouched.
        model_inputs = {
            name: value for name, value in features.items() if name != "popularity"
        }
        rating_predictions = self(model_inputs)

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)


In [218]:
# Build the ranking model and train it with Adagrad.
# NOTE(review): `user_model` and `track_model` are the same objects that were
# passed to TrackRetrievalModel earlier in the notebook, so presumably fitting
# here also updates the weights used by the retrieval model and by the query
# tower of the ScaNN index - confirm this sharing is intended.
ranking_model = TrackRankingModel(user_model, track_model)
ranking_model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

# Shuffle, batch and cache the ranking splits.
cached_train_ranking = train_ranking.shuffle(buffersize).batch(8192).cache()
cached_test_ranking = test_ranking.batch(4096).cache()

ranking_model.fit(cached_train_ranking, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7febb617f5e0>

In [219]:
# Evaluate the ranking model on the held-out split; metrics come back as a dict.
ranking_model.evaluate(cached_test_ranking, return_dict=True)



{'root_mean_squared_error': 16.916194915771484,
 'loss': 286.89654541015625,
 'regularization_loss': 0,
 'total_loss': 286.89654541015625}

## Combining the two models

In [222]:
user_id = np.array(['kv718oiku8q612q0zi4iaovzb'])

# Use the retrieval model to produce the top candidate tracks for user_id.
# reshape(-1) flattens whatever number of candidates the index returns rather
# than hard-coding 100.
top100tracks = scann_index(user_id)[1].numpy().reshape(-1)

# Score every candidate in a single batched forward pass instead of calling
# the ranking model once per track.
candidate_scores = ranking_model({
    'user': tf.constant(np.repeat(user_id, len(top100tracks))),
    'artist&track': tf.constant(top100tracks),
})

# Keep one (1, 1)-shaped score per track so the printed format is unchanged.
predicted_tracks = {
    track: candidate_scores[position:position + 1]
    for position, track in enumerate(top100tracks)
}


print('Ratings: ')
# Sort on the plain float value rather than comparing tensors directly.
for track, score in sorted(
    predicted_tracks.items(),
    key=lambda item: float(tf.squeeze(item[1])),
    reverse=True,
):
    print(f'track: {track}, score: {score}')





Ratings: 
track: b'Peggy Gou/Starry Night - Edit', score: [[51.789352]]
track: b'Tell/Cool Bananas', score: [[48.4002]]
track: b'V.I.C.A.R.I./Pasci\xc3\xa0 - Dub', score: [[46.881058]]
track: b'COMPUTER DATA/Healing', score: [[46.52957]]
track: b'No_4mat/1992', score: [[46.102833]]
track: b'Tom Jarmey/Beach Jazz', score: [[43.92224]]
track: b'Tell/I Lost 200\xe2\x82\xac in a Club', score: [[42.54987]]
track: b"\xeb\xb0\x95\xed\x98\x9c\xec\xa7\x84 Park Hye Jin/I DON'T CARE", score: [[41.658184]]
track: b'Korn\xc3\xa9l Kov\xc3\xa1cs/Szikra', score: [[41.56623]]
track: b'Adryiano/On My Side', score: [[41.220066]]
track: b'Pachanga Boys/Time', score: [[39.990913]]
track: b'V.I.C.A.R.I./Pasci\xc3\xa0', score: [[39.51571]]
track: b'Coeo/Like It Is - Mixed', score: [[38.96499]]
track: b'Demuja/Do You Want My Love', score: [[38.038883]]
track: b'Leo Pol/This must be illegal', score: [[37.847115]]
track: b'Pacific Coliseum/Ocean City', score: [[37.49597]]
track: b"dj poolboi/don't be so hard on