In [21]:
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd
from pprint import pprint
import numpy as np

In [52]:
# The CSV is small enough to load fully into memory with pandas; it is turned
# into a tf.data pipeline in a later cell. Every column is cast to string.
df = pd.read_csv('./data/database_model_ready.csv').astype(str)
# Distinct "artist/track" keys found in the data.
all_tracks = df['artist&track'].unique()
pprint(df.dtypes)
df.head()

artist          object
track_id        object
track           object
owner           object
artist&track    object
dtype: object


Unnamed: 0,artist,track_id,track,owner,artist&track
0,Romanthony,1iwZVgKv3FKc0dqhnSG9uW,Get It 2 Getta,kv718oiku8q612q0zi4iaovzb,Romanthony/Get It 2 Getta
1,Glenn Underground,2JFeAyJLMPPsBkklQEet6t,H-Dance,kv718oiku8q612q0zi4iaovzb,Glenn Underground/H-Dance
2,DJ BORING,3ilkEyg6OCtd9qCnOJkPzU,Winona,kv718oiku8q612q0zi4iaovzb,DJ BORING/Winona
3,Josh Wink,7sBAYj9DLDYz8lFMV89cF6,Meditation Will Manifest,kv718oiku8q612q0zi4iaovzb,Josh Wink/Meditation Will Manifest
4,Lazarus,5Nlm87K2iVmNh2hBnIxmxf,Harbinger,kv718oiku8q612q0zi4iaovzb,Lazarus/Harbinger


In [50]:
# Inspect the unique "artist/track" keys extracted from the dataframe.
all_tracks

array(['Romanthony/Get It 2 Getta', 'Glenn Underground/H-Dance',
       'DJ BORING/Winona', ..., 'Joni Mitchell/Edith and the Kingpin',
       "Joni Mitchell/Don't Interrupt the Sorrow",
       'Joni Mitchell/The Hissing of Summer Lawns'], dtype=object)

In [92]:
# One dataset element per dataframe row, keyed by column name.
row_features = dict(df)
user_data = tf.data.Dataset.from_tensor_slices(row_features)

# One dataset element per unique "artist/track" key.
track_features = {'artist&track' : all_tracks}
tracks = tf.data.Dataset.from_tensor_slices(track_features)

In [54]:
# Peek at a single example; take(1) already limits the iteration to one row.
for example in user_data.take(1).as_numpy_iterator():
    print(example)

{'artist': b'Romanthony', 'track_id': b'1iwZVgKv3FKc0dqhnSG9uW', 'track': b'Get It 2 Getta', 'owner': b'kv718oiku8q612q0zi4iaovzb', 'artist&track': b'Romanthony/Get It 2 Getta'}


In [55]:
# Peek at a single candidate; take(1) already limits the iteration to one row.
for example in tracks.take(1).as_numpy_iterator():
    print(example)

{'artist&track': b'Romanthony/Get It 2 Getta'}


### Feature selection

In [93]:
# Keep only the two features the retrieval model consumes: the playlist owner
# (used as the user id) and the combined "artist/track" key.
user_songs = user_data.map(lambda x: {
    'user': x['owner'],
    'artist&track': x['artist&track'],
})

# Candidate dataset of bare track keys (plain strings, no dict wrapper).
# It is built directly from `all_tracks` rather than by mapping `tracks` onto
# itself, so the cell is idempotent: re-mapping `tracks` in place fails on a
# second run because its elements are no longer dicts.
tracks = tf.data.Dataset.from_tensor_slices(all_tracks)

In [94]:
tf.random.set_seed(42)
buffersize = df.shape[0]
# The shuffle buffer covers the whole dataset, and with
# reshuffle_each_iteration=False the shuffled order is the same on every pass,
# so the take/skip split below is disjoint and reproducible.
shuffled = user_songs.shuffle(buffersize, seed=42, reshuffle_each_iteration=False)

# 80/20 split. The test set is everything after the training rows; taking
# int(0.2 * n) separately could silently drop a row, because
# int(0.8 * n) + int(0.2 * n) can be smaller than n.
train_size = int(0.8*buffersize)
train = shuffled.take(train_size)
test = shuffled.skip(train_size)

track_names = tracks.batch(1_000)
user_ids = user_songs.batch(1_000_000).map(lambda x: x["user"])

# Sorted unique ids, used as the vocabularies of the StringLookup layers.
unique_track_names = np.unique(np.concatenate(list(track_names)))
unique_user_ids = np.unique(np.concatenate(list(user_ids)))

unique_track_names[:10]

array([b' Sentinel/Toulepleu', b'!!!/Fine Fine Fine', b'!!!/Myth Takes',
       b'!!!/NRGQ',
       b'#1 Movie Favorites/The Winner Is (From Little Miss Sunshine)',
       b'#TocoParaVos/Se Pic\xc3\xb3', b'$MOKE OG/$MOKE OG',
       b'$NOT/"Life"', b'$NOT/5AM', b'$NOT/BENZO'], dtype=object)

## Retrieval model

Following the basic retrieval tutorial from [TensorFlow Recommenders](https://www.tensorflow.org/recommenders/examples/basic_retrieval).

### Query and candidate towers

In [95]:
embedding_dimension = 32


def _build_embedding_tower(vocabulary):
    """Return a Sequential model that maps raw string ids to embeddings.

    StringLookup encodes every string in `vocabulary` as a contiguous integer
    index, and the Embedding layer looks up a trainable vector of size
    `embedding_dimension` for that index. The embedding table has one row
    more than the vocabulary (as in the TFRS tutorial, to leave room for
    unknown tokens).
    """
    return tf.keras.Sequential(
        [
            tf.keras.layers.StringLookup(vocabulary=vocabulary, mask_token=None),
            tf.keras.layers.Embedding(len(vocabulary) + 1, embedding_dimension),
        ]
    )


# Query tower: playlist owner ids -> embeddings.
user_model = _build_embedding_tower(unique_user_ids)

# Candidate tower: "artist/track" keys -> embeddings.
track_model = _build_embedding_tower(unique_track_names)

### Metric

In [96]:
# Embed every candidate track in batches of 128; these embeddings form the
# candidate set for the factorized top-K metrics.
candidate_embeddings = tracks.batch(128).map(track_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

# The retrieval task bundles the loss with the metrics above.
task = tfrs.tasks.Retrieval(metrics=metrics)

### Full retrieval model

In [99]:
class TrackRetrievalModel(tfrs.Model):
    """Two-tower retrieval model pairing a user tower with a track tower.

    Args:
        usr_model: Keras model mapping user ids to embeddings.
        track_model: Keras model mapping "artist&track" keys to embeddings.
        retrieval_task: Optional task layer that computes the loss and the
            metrics. When omitted, the notebook-level `task` is used, so
            existing two-argument calls keep working.
    """

    def __init__(self, usr_model, track_model, retrieval_task=None):
        super().__init__()
        self.track_model: tf.keras.Model = track_model
        self.user_model: tf.keras.Model = usr_model
        # Accept the task explicitly instead of always capturing the global
        # `task`; the global is only a fallback for backward compatibility.
        self.task: tf.keras.layers.Layer = (
            task if retrieval_task is None else retrieval_task
        )

    def compute_loss(self, features, training=False) -> tf.Tensor:
        """Return the retrieval loss for one batch.

        Args:
            features: dict with "user" and "artist&track" entries.
            training: unused here; part of the tfrs.Model interface.
        """
        # Embed the users through the query tower.
        user_embeddings = self.user_model(features["user"])
        # Embed the tracks each user actually has through the candidate tower.
        positive_track_embeddings = self.track_model(features["artist&track"])

        # The task computes the loss and the metrics.
        return self.task(user_embeddings, positive_track_embeddings)

### Fitting and evaluating the model

In [100]:
model = TrackRetrievalModel(user_model, track_model)
retrieval_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=retrieval_optimizer)

# Both splits come from the shuffled user/track interaction dataset.
# Batches are cached so later passes do not recompute the pipeline.
train_batches = train.shuffle(100_000).batch(8192)
cached_train = train_batches.cache()
test_batches = test.batch(4096)
cached_test = test_batches.cache()

model.fit(cached_train, epochs=3)

Epoch 1/3


2022-10-21 11:30:08.200332: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.
2022-10-21 11:30:08.424753: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.


 1/17 [>.............................] - ETA: 15:43 - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 1.2207e-04 - factorized_top_k/top_10_categorical_accuracy: 2.4414e-04 - factorized_top_k/top_50_categorical_accuracy: 7.3242e-04 - factorized_top_k/top_100_categorical_accuracy: 0.0011 - loss: 73817.3594 - regularization_loss: 0.0000e+00 - total_loss: 73817.3594

2022-10-21 11:31:02.226231: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.
2022-10-21 11:31:02.447722: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.


 2/17 [==>...........................] - ETA: 13:35 - factorized_top_k/top_1_categorical_accuracy: 6.1035e-05 - factorized_top_k/top_5_categorical_accuracy: 5.4932e-04 - factorized_top_k/top_10_categorical_accuracy: 8.5449e-04 - factorized_top_k/top_50_categorical_accuracy: 0.0019 - factorized_top_k/top_100_categorical_accuracy: 0.0022 - loss: 73817.2148 - regularization_loss: 0.0000e+00 - total_loss: 73817.2148    

2022-10-21 11:31:56.540478: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 268435456 exceeds 10% of free system memory.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f9829e5b640>

The chance of the positive example appearing in the top 100 is not especially high, so the model is not learning the data well. This may not be surprising given how few features have been fed into it.

In [161]:
# Evaluate the loss and the top-K metrics on the cached test batches.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0014806040562689304,
 'factorized_top_k/top_5_categorical_accuracy': 0.012792419642210007,
 'factorized_top_k/top_10_categorical_accuracy': 0.02984897792339325,
 'factorized_top_k/top_50_categorical_accuracy': 0.1583654135465622,
 'factorized_top_k/top_100_categorical_accuracy': 0.20432336628437042,
 'loss': 5892.62158203125,
 'regularization_loss': 0,
 'total_loss': 5892.62158203125}

Quite similar results for the test set.

### Making predictions (for retrieval of queries)

In [163]:
# ScaNN index: queries go through the trained user tower and the top 100
# candidates are returned.
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model, k=100)

# Index (identifier, embedding) pairs; both sides use the same batch size so
# each batch of keys lines up with its batch of embeddings.
track_key_batches = tracks.batch(100)
track_embedding_batches = tracks.batch(100).map(model.track_model)
scann_index.index_from_dataset(
    tf.data.Dataset.zip((track_key_batches, track_embedding_batches))
)

# Query the index for one example playlist owner.
example_user = np.array(['kv718oiku8q612q0zi4iaovzb'])
result = scann_index(example_user)

2022-10-21 16:51:17.928073: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 100044
2022-10-21 16:51:18.302408: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 374.261723ms.




In [165]:
# Export the ScaNN index as a SavedModel; the 'Scann' namespace is whitelisted
# in the save options.
scann_save_options = tf.saved_model.SaveOptions(namespace_whitelist=['Scann'])
tf.saved_model.save(
    scann_index,
    './models/spotify_retrieval_v1',
    options=scann_save_options,
)



INFO:tensorflow:Assets written to: ./models/spotify_retrieval_v1/assets


INFO:tensorflow:Assets written to: ./models/spotify_retrieval_v1/assets


# Ignore!!!

In [133]:
# "artist/track" keys returned by the index for the example user
# (result[1] holds the identifiers as byte strings).
rec_keys = [rec.decode('UTF-8') for rec in result[1].numpy().reshape(-1)]

# Map each key back to its artist and track through the dataframe instead of
# splitting on '/': a name that itself contains a slash would yield more than
# two fields and break the two-column frame.
track_lookup = (
    df[['artist&track', 'artist', 'track']]
    .drop_duplicates(subset='artist&track')
)
recs = (
    pd.DataFrame({'artist&track': rec_keys})
    .merge(track_lookup, on='artist&track', how='left')
    [['artist', 'track']]
)

actual_playlist = df[df['owner'] == 'kv718oiku8q612q0zi4iaovzb'][['artist','track']]

# Anti-join: keep only recommendations the user does not already have.
# concat + drop_duplicates(keep=False) is a symmetric difference, so it also
# kept playlist tracks that were never recommended.
new_recs = (
    recs
    .merge(
        actual_playlist.drop_duplicates(),
        on=['artist', 'track'],
        how='left',
        indicator=True,
    )
    .loc[lambda merged: merged['_merge'] == 'left_only']
    .drop(columns='_merge')
    .reset_index(drop=True)
)

In [164]:
#new_recs.merge(df[['artist','track','track_id']], how='inner').drop_duplicates(subset=['artist', 'track'],keep='first')['track_id'].to_list()

## Ranking Model

In [None]:
# NOTE(review): the dataframe displayed earlier has no 'popularity' column, so
# this map fails unless the CSV is extended with one. Confirm the data source.
user_songs_ranking = user_data.map(lambda x: {
    'user': x['owner'],
    'artist&track': x['artist&track'],
    # Every dataframe column was cast to str, so parse the label back into a
    # float before it is used as a regression target.
    'popularity': tf.strings.to_number(x['popularity'], out_type=tf.float32),
})

tf.random.set_seed(42)

shuffled_ranking = user_songs_ranking.shuffle(buffersize, seed=42, reshuffle_each_iteration=False)

# Split the ranking dataset itself. Using the retrieval `shuffled` dataset
# here would yield examples without the 'popularity' label.
# The test set takes every row after the training rows so none is dropped by
# integer truncation.
train_size_ranking = int(0.8*buffersize)
train_ranking = shuffled_ranking.take(train_size_ranking)
test_ranking = shuffled_ranking.skip(train_size_ranking)

In [None]:
class RankingModel(tf.keras.Model):
    """Scores a (user, track) pair with dense layers on top of two embeddings.

    Args:
        usr_model: Keras model mapping user ids to embeddings.
        film_model: Keras model mapping track keys to embeddings. The
            parameter name is kept for backward compatibility.
    """

    def __init__(self, usr_model, film_model):
        super().__init__()

        # Compute embeddings for users.
        self.user_embeddings = usr_model

        # Compute embeddings for tracks. Use the model that was passed in
        # rather than the notebook-level `track_model`, which would silently
        # ignore the constructor argument.
        self.track_embeddings = film_model

        # Compute predictions.
        self.ratings = tf.keras.Sequential([
            # Learn multiple dense layers.
            tf.keras.layers.Dense(256, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            # Make rating predictions in the final layer.
            tf.keras.layers.Dense(1)
        ])

    def call(self, inputs):
        """Return one predicted score per (user_id, track_name) pair.

        Args:
            inputs: tuple `(user_id, track_name)` of batched string tensors.
        """
        user_id, track_name = inputs

        user_embedding = self.user_embeddings(user_id)
        track_embedding = self.track_embeddings(track_name)

        # Concatenate the two embeddings along the feature axis before the
        # dense layers.
        return self.ratings(tf.concat([user_embedding, track_embedding], axis=1))


class TrackRankingModel(tfrs.models.Model):
    """TFRS ranking model that regresses a track's 'popularity' label.

    Feature keys match the ranking dataset built in this notebook:
    "user", "artist&track" and the label "popularity".

    Args:
        usr_model: Keras model mapping user ids to embeddings.
        track_model: Keras model mapping track keys to embeddings.
    """

    # Key of the regression label inside the feature dict.
    LABEL_KEY = "popularity"

    def __init__(self, usr_model, track_model):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel(usr_model, track_model)
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss = tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()]
        )

    def call(self, features) -> tf.Tensor:
        """Predict a score for every (user, track) pair in `features`."""
        # The dataset uses "user" / "artist&track"; the former "user_id" /
        # "track" keys do not exist in it and raised KeyError.
        return self.ranking_model(
            (features["user"], features["artist&track"]))

    def compute_loss(self, features, training=False) -> tf.Tensor:
        """Return the ranking loss for one batch.

        Args:
            features: dict with "user", "artist&track" and "popularity".
            training: unused here; part of the tfrs.Model interface.
        """
        labels = features[self.LABEL_KEY]
        # Build the model inputs without the label instead of popping it, so
        # the caller's feature dict is not mutated.
        model_inputs = {
            key: value for key, value in features.items()
            if key != self.LABEL_KEY
        }

        rating_predictions = self(model_inputs)

        # The task computes the loss and the metrics.
        return self.task(labels=labels, predictions=rating_predictions)


In [None]:
# The ranking model is built on the same user and track towers that were
# passed to the retrieval model.
ranking_model = TrackRankingModel(user_model, track_model)
ranking_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
ranking_model.compile(optimizer=ranking_optimizer)

# Shuffle and batch the training split; cache both splits after batching.
train_ranking_batches = train_ranking.shuffle(buffersize).batch(8192)
cached_train_ranking = train_ranking_batches.cache()
test_ranking_batches = test_ranking.batch(4096)
cached_test_ranking = test_ranking_batches.cache()

ranking_model.fit(cached_train_ranking, epochs=3)