In [1]:
import os
# Restrict OpenBLAS and OpenMP to one thread each. This cell runs before the
# numeric libraries are imported (presumably so they pick the limits up at load time).
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'OMP_NUM_THREADS': '1',
})

In [2]:
# findspark.init() is deliberately called before `import pyspark`.
# NOTE(review): presumably it locates the local Spark installation and makes
# pyspark importable — confirm against the findspark documentation before
# reordering these lines.
import findspark
findspark.init()
import pyspark
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM



In [3]:
# Initialize pyspark

from pyspark.sql import SparkSession

# Spark session used by every later cell: 12g of driver memory, ORC filter
# pushdown switched on, application name "LightFM Model".
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "12g")
    .config("spark.sql.orc.filterPushdown", "true")
    .appName("LightFM Model")
    .getOrCreate()
)

In [4]:
# Load dataframes

import joblib

# Directory that holds all ORC datasets read by this notebook
path_to_df = '/Volumes/T5/PROJECTS/U. THESIS/Dataset/df_data'


def _read_orc(relative_path):
    """Load the ORC dataset at ``path_to_df + relative_path`` as a Spark DataFrame."""
    return spark.read.orc(path_to_df + relative_path)


# Track and playlist tables
df_tracks = _read_orc('/df_tracks.orc')
df_playlists = _read_orc('/df_playlists.orc')
df_playlists_metadata = _read_orc('/df_playlists_metadata.orc')

# Interaction splits; note the validation split is read from df_val1.orc
df_train = _read_orc('/df_train.orc')
df_val = _read_orc('/df_val1.orc')

In [5]:
# Setup LightFM configuration

# Each aggregate yields a single Row holding the column maximum
playlist_max_row = df_playlists_metadata.agg({'pid': 'max'}).collect()[0]
track_max_row = df_tracks.agg({'tid': 'max'}).collect()[0]

pid_max = playlist_max_row['max(pid)']
tid_max = track_max_row['max(tid)']

# Matrix dimensions are one past the largest id, since pid/tid are used
# directly as row/column indices of the interaction matrices built below.
config = dict(
    num_playlists=pid_max + 1,
    num_tracks=tid_max + 1,
)

In [6]:
# Setup sparse matrix for training

# Fetch every (pid, tid) training interaction in ONE Spark job, as a flat list
# [pid0, tid0, pid1, tid1, ...].
# - Collecting both columns together keeps each playlist id paired with its own
#   track id; two independent collect() calls only line up if both jobs happen
#   to return rows in the same order.
# - The interaction count is taken from the collected data instead of calling
#   df_train.count(), which launched one more full job just to get a length.
train_pairs_flat = (
    df_train.select('pid', 'tid')
    .rdd
    .flatMap(lambda row: (row['pid'], row['tid']))
    .collect()
)
train_row_indices = train_pairs_flat[0::2]  # playlist ids -> matrix rows
train_col_indices = train_pairs_flat[1::2]  # track ids    -> matrix columns
del train_pairs_flat  # the interleaved copy is no longer needed

# Binary playlist x track interaction matrix: a 1.0 for every observed pair
X_train = sp.coo_matrix(
    (np.ones(len(train_row_indices)), (train_row_indices, train_col_indices)),
    shape=(config['num_playlists'], config['num_tracks'])
)

In [7]:
# Setup sparse matrix for validation

# Fetch every (pid, tid) validation interaction in ONE Spark job, as a flat list
# [pid0, tid0, pid1, tid1, ...].
# - Collecting both columns together keeps each playlist id paired with its own
#   track id; two independent collect() calls only line up if both jobs happen
#   to return rows in the same order.
# - The interaction count is taken from the collected data instead of calling
#   df_val.count(), which launched one more full job just to get a length.
val_pairs_flat = (
    df_val.select('pid', 'tid')
    .rdd
    .flatMap(lambda row: (row['pid'], row['tid']))
    .collect()
)
val_row_indices = val_pairs_flat[0::2]  # playlist ids -> matrix rows
val_col_indices = val_pairs_flat[1::2]  # track ids    -> matrix columns
del val_pairs_flat  # the interleaved copy is no longer needed

# Binary playlist x track interaction matrix with the same shape as the
# training matrix, so both can be passed to the evaluation routine together.
X_val = sp.coo_matrix(
    (np.ones(len(val_row_indices)), (val_row_indices, val_col_indices)),
    shape=(config['num_playlists'], config['num_tracks'])
)

In [8]:
# File the training loop writes the model to whenever the validation score improves
config.update(model_path='/Volumes/T5/PROJECTS/U. THESIS/models/lightfm_model.pkl')

In [9]:
# Declare model

# Hyperparameters for the LightFM model (WARP loss, fixed random_state)
lightfm_params = dict(
    no_components=200,
    loss='warp',
    learning_rate=0.02,
    max_sampled=400,
    random_state=1,
    user_alpha=1e-05,
)
model = LightFM(**lightfm_params)

# Highest validation score seen so far; the training loop only saves the model
# when a new score exceeds this value.
best_score = 0

In [12]:
%%time

# Fit the model
# Note: Here fit_partial is used, meaning multiple calls to the function will continue from the previous state

from lightfm.evaluation import precision_at_k

# Alternate training and validation. Each outer iteration resumes training from
# the model's current state (fit_partial) for 5 epochs, scores the model on the
# validation interactions, and saves it when the score beats the best so far.
# The trailing notes on the lines below record alternative settings left by the
# author (60 iterations, 50 threads).
for i in range(2): # range=60

    print('Epoch: {}'.format(i))

    model.fit_partial(X_train, epochs=5, num_threads=1) # epochs=5, num_threads=50

    # precision_at_k yields an array of precision@600 values (per the LightFM
    # docs: one per evaluated user, with known training interactions left out
    # of the ranking); the checkpoint criterion is their mean.
    precision = precision_at_k(model, X_val,
                               train_interactions=X_train,
                               k=600, check_intersections=False, num_threads=1)
    score = np.mean(precision)
    print(score)

    if score > best_score:
        # Write through a context manager so the file handle is flushed and
        # closed; the previous bare open(...) was never closed.
        with open(config['model_path'], 'wb') as model_file:
            joblib.dump(model, model_file)
        best_score = score

Epoch: 0
0.04026227
Epoch: 1
0.042764384
CPU times: user 1h 38min 16s, sys: 28 s, total: 1h 38min 44s
Wall time: 1h 39min 17s


In [14]:
# best_score