In [1]:
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['OMP_NUM_THREADS'] = '1'

In [2]:
import pyspark
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

In [3]:
# Initialize pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .appName("LightFM Text") \
                    .getOrCreate()

In [4]:
# Load dataframes

import joblib

path_to_df = 'gs://thesis_spotify_apc_bucket/df_data'

df_tracks = spark.read.orc(path_to_df + '/df_tracks.orc')
df_playlists = spark.read.orc(path_to_df + '/df_playlists.orc')
df_playlists_metadata = spark.read.orc(path_to_df + '/df_playlists_metadata.orc')
df_test_playlists = spark.read.orc(path_to_df + '/df_test_playlists.orc')
df_test_metadata = spark.read.orc(path_to_df + '/df_test_metadata.orc')

df_train = spark.read.orc(path_to_df + '/df_train.orc')
df_val = spark.read.orc(path_to_df + '/df_val1.orc')
# val1_pids = joblib.load(path_to_df + '/val1_pids.pkl')

In [6]:
# Setup LightFM configuration

pid_max = df_playlists_metadata.agg({'pid': 'max'}).collect()[0]['max(pid)']
tid_max = df_tracks.agg({'tid': 'max'}).collect()[0]['max(tid)']

config = {
    'num_playlists': pid_max + 1,
    'num_tracks': tid_max + 1,
}

In [8]:
# df_zeros is a dataframe of df_val.pid - df_train.pid.unique()

df_zeros = df_val.join(df_train, on=['pid'], how='leftanti')
# df_zeros.count()

In [9]:
# df_no_zeros is a dataframe of df_val.pid - df_zeros.pid

df_no_zeros = df_val.join(df_zeros, on=['pid'], how='leftanti').limit(1000)
# df_no_zeros.count()

In [10]:
# Prepare playlist names

train_playlist_names = df_playlists_metadata.select(['pid', 'name'])
playlist_names = train_playlist_names

In [11]:
# Join playlist with names with a dataframe of all playlists

from pyspark.sql.functions import col

playlists_to_add = spark.range(config['num_playlists'])
playlist_names = playlists_to_add.join(playlist_names, how='left', on= playlist_names.pid == playlists_to_add.id)\
                                 .select([col('id').alias('pid'), col('name')])\
                                 .fillna('')
#                                  .withColumn('name', split(trim('name'), ' '))

In [12]:
import pandas as pd

playlist_names_pd = playlist_names.toPandas().set_index('pid').name
# playlist_names_pd

In [13]:
# Vectorize using scikit-learn's CountVectorizser

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=2000)
user_features = vectorizer.fit_transform(playlist_names_pd)

In [14]:
user_features = sp.hstack([sp.eye(config['num_playlists']), user_features])

In [15]:
# Setup sparse matrix for training

train_row_indices = df_train.select('pid').rdd.flatMap(lambda x: x).collect()
train_col_indices = df_train.select('tid').rdd.flatMap(lambda x: x).collect()

X_train = sp.coo_matrix(
    (np.ones(df_train.count()), (train_row_indices, train_col_indices)),
    shape=(config['num_playlists'], config['num_tracks'])
)

In [16]:
# Setup sparse matrix for validation using df_zeros

zeros_row_indices = df_zeros.select('pid').rdd.flatMap(lambda x: x).collect()
zeros_col_indices = df_zeros.select('tid').rdd.flatMap(lambda x: x).collect()

X_zeros = sp.coo_matrix(
    (np.ones(df_zeros.count()), (zeros_row_indices, zeros_col_indices)),
    shape=(config['num_playlists'], config['num_tracks'])
)

In [17]:
# Setup sparse matrix for validation using df_no_zeros

no_zeros_row_indices = df_no_zeros.select('pid').rdd.flatMap(lambda x: x).collect()
no_zeros_col_indices = df_no_zeros.select('tid').rdd.flatMap(lambda x: x).collect()

X_no_zeros = sp.coo_matrix(
    (np.ones(df_no_zeros.count()), (no_zeros_row_indices, no_zeros_col_indices)),
    shape=(config['num_playlists'], config['num_tracks'])
)

In [18]:
config['model_path'] = 'models/lightfm_model_text.pkl'

In [19]:
# Declare model

model = LightFM(
    no_components=200, 
    loss='warp', 
    learning_rate=0.03, 
    max_sampled=400, 
    random_state=1,
    user_alpha=1e-05,
)

In [20]:
num_threads = 16

In [None]:
%%time

# Train and save the best model

from lightfm.evaluation import precision_at_k

best_score = 0
# epochs_without_change = 0

for i in range(10):  # range(10)
    
    print('Epoch: {}'.format(i))
    
    model.fit_partial(X_train, epochs=5, num_threads=num_threads, user_features=user_features) # num_threads=50
    
    score_zeros = []
    score_no_zeros = []
    
    # Calculate scores
    precision_zeros = precision_at_k(model, 
                                     test_interactions=X_zeros, 
                                     train_interactions=X_train, 
                                     user_features=user_features,
                                     k=600, check_intersections=False,
                                     num_threads=num_threads)
    score_zeros.append(precision_zeros)
    precision_no_zeros = precision_at_k(model, 
                                        test_interactions=X_no_zeros, 
                                        train_interactions=X_train, 
                                        user_features=user_features,
                                        k=600, check_intersections=False,
                                        num_threads=num_threads)
    score_no_zeros.append(precision_no_zeros)
    
    mean_score_zeros = np.mean(score_zeros)
    mean_score_no_zeros = np.mean(score_no_zeros)
    
    print(mean_score_zeros, mean_score_no_zeros)
    if mean_score_zeros > best_score:
        joblib.dump(model, config['model_path'])
        best_score = mean_score_zeros

Epoch: 0
0.013668333 0.0014111366
Epoch: 1
0.013288334 0.0014912282
Epoch: 2
0.013033333 0.001460717
Epoch: 3
0.012835 0.001495042
Epoch: 4
0.012721666 0.0015141114
Epoch: 5
0.0126 0.0015026696
Epoch: 6
0.012578333 0.0014683448
Epoch: 7
0.012516666 0.0014149505
Epoch: 8
0.012379999 0.0014111366
CPU times: user 1d 8h 27min 30s, sys: 56.6 s, total: 1d 8h 28min 26s
Wall time: 4h 12min 19s


In [22]:
joblib.dump(user_features, open('models/user_features.pkl', 'wb'))

In [23]:
# Upload file to GCS

! gsutil cp models/* gs://thesis_spotify_apc_bucket/models

Copying file://models/lightfm_model.pkl [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

Copying file://models/lightfm_model_text.pkl [Content-Type=application/octet-stream]...
Copying file://models/lightfm_model_text_pd.pkl [Content-Type=application/octet-stream]...
Copying file://models/user_features.pkl [Content-Typ

In [None]:
# model = joblib.load(open(config['model_path'], 'rb'))