In [1]:
import pyspark
import pandas as pd
import numpy as np
import scipy.sparse as sp
from lightfm import LightFM

In [2]:
# Initialize pyspark

from pyspark.sql import SparkSession

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("LightFM Features")
    .getOrCreate()
)

In [3]:
# Function to calculate representation and bias between objects and features

def precompute_representation(features, feature_embeddings, feature_biases):
    """Project a feature matrix through feature-level embeddings and biases.

    Args:
        features: Matrix-like object exposing ``.dot`` (the notebook passes
            the loaded ``user_features`` object); one row per object.
        feature_embeddings: Embedding array with one row per feature.
        feature_biases: Bias array with one entry per feature.

    Returns:
        A ``(representation, representation_bias)`` tuple holding
        ``features.dot(feature_embeddings)`` and
        ``features.dot(feature_biases)`` respectively.
    """
    return features.dot(feature_embeddings), features.dot(feature_biases)

In [4]:
# Load both pickled models and the user-feature object, then precompute the
# text model's user-side representation.
#
# NOTE(security): joblib.load unpickles its input and can therefore execute
# arbitrary code -- only point path_to_models at trusted artifacts.

import joblib

path_to_models = 'models2'

# Pass the path itself instead of an open() handle: the handles were never
# closed, so each load leaked a file descriptor. Given a path, joblib opens
# and closes the file on its own.
model = joblib.load(path_to_models + '/lightfm_model.pkl')
model_text = joblib.load(path_to_models + '/lightfm_model_text.pkl')
user_features = joblib.load(path_to_models + '/user_features.pkl')

# One embedding row and one bias value per row of user_features, derived from
# the text model's feature-level parameters.
_user_repr, _user_repr_biases = precompute_representation(
    features=user_features,
    feature_embeddings=model_text.user_embeddings,
    feature_biases=model_text.user_biases,
)

In [5]:
# Load the candidate sets (ORC) for the three splits

import pandas as pd

path_to_df = 'gs://thesis_apc_bucket/df_data2'

# Read order matches the unpacking order: train, validation, test.
df_train, df_val, df_test = (
    spark.read.orc(path_to_df + candidate_file)
    for candidate_file in (
        '/ii_candidate.orc',
        '/iii_candidate.orc',
        '/test_candidate.orc',
    )
)

In [6]:
%%time

# from pyspark.sql.types import StructType, StructField, FloatType, ArrayType, IntegerType

# # Get bias and embeddings from both models for playlists
# playlist_results = pd.DataFrame({'pid_bias': model.user_biases, 'pid_embedding': list(model.user_embeddings),
#                                  'pid_bias_text': _user_repr_biases, 'pid_embedding_text': list(_user_repr)})
# playlist_results['pid'] = playlist_results.index

# # Convert into Spark dataframe
# playlist_schema = StructType([
#     StructField('pid_bias', FloatType(), True),
#     StructField('pid_embedding', ArrayType(FloatType()), True),
#     StructField('pid_bias_text', FloatType(), True),
#     StructField('pid_embedding_text', ArrayType(FloatType()), True),
#     StructField('pid', IntegerType(), True),
#   ])
# df_playlist_results = spark.createDataFrame(playlist_results, schema=playlist_schema)
    
# # Get bias and embeddings from both models for tracks
# track_results = pd.DataFrame({'tid_bias': model.item_biases, 'tid_embedding': list(model.item_embeddings.tolist()),
#                               'tid_bias_text': model_text.item_biases, 'tid_embedding_text': list(model_text.item_embeddings.tolist())})
# track_results['tid'] = track_results.index

# # Convert into Spark dataframe
# track_schema = StructType([
#     StructField('tid_bias', FloatType(), True),
#     StructField('tid_embedding', ArrayType(FloatType()), True),
#     StructField('tid_bias_text', FloatType(), True),
#     StructField('tid_embedding_text', ArrayType(FloatType()), True),
#     StructField('tid', IntegerType(), True),
#   ])
# df_track_results = spark.createDataFrame(track_results, schema=track_schema)

CPU times: user 1e+03 ns, sys: 2 µs, total: 3 µs
Wall time: 4.77 µs


In [8]:
def add_embeddings_and_biases(df):
    """Attach bias and embedding dot-product columns from both models.

    The Spark frame is converted to pandas with ``toPandas()`` so the model
    arrays can be indexed directly with NumPy, then turned back into a Spark
    DataFrame.

    Args:
        df: Spark DataFrame with ``pid`` and ``tid`` columns whose values can
            be converted with ``int()`` and used as row indices into the
            model arrays.

    Returns:
        Spark DataFrame with the input columns plus ``pid_bias``,
        ``tid_bias``, ``lightfm_dot_product``, ``pid_bias_text``,
        ``tid_bias_text`` and ``lightfm_dot_product_text``.
    """
    candidates = df.toPandas()

    # Plain Python ints so the ids can serve as positional indices.
    playlist_idx = list(map(int, candidates.pid))
    track_idx = list(map(int, candidates.tid))

    def rowwise_dot(left, right):
        # Element-wise product summed along axis 1: one dot product per row.
        return (left * right).sum(axis=1)

    # Columns taken from `model`.
    candidates['pid_bias'] = model.user_biases[playlist_idx]
    candidates['tid_bias'] = model.item_biases[track_idx]
    candidates['lightfm_dot_product'] = rowwise_dot(
        model.user_embeddings[playlist_idx],
        model.item_embeddings[track_idx],
    )

    # Columns taken from `model_text`; the playlist side uses the precomputed
    # `_user_repr` / `_user_repr_biases` arrays instead of raw model attributes.
    candidates['pid_bias_text'] = _user_repr_biases[playlist_idx]
    candidates['tid_bias_text'] = model_text.item_biases[track_idx]
    candidates['lightfm_dot_product_text'] = rowwise_dot(
        _user_repr[playlist_idx],
        model_text.item_embeddings[track_idx],
    )

    return spark.createDataFrame(candidates)

In [9]:
# UDF for calculating dot product

from pyspark.sql.functions import pandas_udf, array
from pyspark.sql.types import DoubleType

@pandas_udf('float')
def dotProduct(arr1: pd.Series, arr2: pd.Series) -> pd.Series:
    """Return the dot product of each pair of aligned elements.

    Args:
        arr1: Series whose elements are array-likes accepted by ``np.dot``.
        arr2: Series of array-likes paired positionally with ``arr1``.

    Returns:
        Series holding ``np.dot(x, y)`` for every ``(x, y)`` pair.
    """
    return pd.Series(list(map(np.dot, arr1, arr2)))

In [10]:
from pyspark.sql import Window
from pyspark.sql.functions import col, dense_rank, desc

def create_lightfm_features(df):
    """Build the score and rank features for a candidate frame.

    Args:
        df: Spark DataFrame of candidates, passed to
            ``add_embeddings_and_biases``.

    Returns:
        Spark DataFrame with the columns added by
        ``add_embeddings_and_biases`` plus ``lightfm_prediction``,
        ``lightfm_prediction_text``, ``lightfm_rank`` and
        ``lightfm_rank_text``.
    """
    enriched = add_embeddings_and_biases(df)

    # Overall score per model = embedding dot product + both bias terms.
    score = col('lightfm_dot_product') + col('pid_bias') + col('tid_bias')
    score_text = (
        col('lightfm_dot_product_text') + col('pid_bias_text') + col('tid_bias_text')
    )
    scored = enriched.withColumn('lightfm_prediction', score)
    scored = scored.withColumn('lightfm_prediction_text', score_text)

    # Rank candidates within each pid, highest score first (ties share a rank).
    per_playlist = Window.partitionBy('pid')
    rank_window = per_playlist.orderBy(desc('lightfm_prediction'))
    rank_window_text = per_playlist.orderBy(desc('lightfm_prediction_text'))
    ranked = scored.withColumn('lightfm_rank', dense_rank().over(rank_window))
    ranked = ranked.withColumn('lightfm_rank_text', dense_rank().over(rank_window_text))

    # Progress marker for the notebook output.
    print('Done')

    return ranked

In [13]:
%%time

# Replace the candidate frame bound to df_train with its featurized version,
# then print the first 5 rows as a sanity check.
df_train = create_lightfm_features(df_train)
df_train.show(5)

Done
+----+------+------+-----------+----------+-------------------+-------------------+-------------+------------------------+------------------+-----------------------+------------+-----------------+
| pid|   tid|target|   pid_bias|  tid_bias|lightfm_dot_product|      pid_bias_text|tid_bias_text|lightfm_dot_product_text|lightfm_prediction|lightfm_prediction_text|lightfm_rank|lightfm_rank_text|
+----+------+------+-----------+----------+-------------------+-------------------+-------------+------------------------+------------------+-----------------------+------------+-----------------+
|26.0|519034| false|-0.08734558| 0.9021774|          0.5924262|-23.697782546281815|    0.6261367|       4.862977839216418|          1.407258|    -18.208667986884883|         557|                1|
|26.0|523329| false|-0.08734558| 0.5229688|          1.4899194|-23.697782546281815|   0.45235774|       5.007797977901631|         1.9255426|    -18.237626829170054|          61|                2|
|26.0|5281

In [15]:
%%time

# Replace the candidate frame bound to df_val with its featurized version,
# then print the first 5 rows as a sanity check.
df_val = create_lightfm_features(df_val)
df_val.show(5)

Done
+----+------+------+-----------+-------------+-------------------+-------------------+-------------+------------------------+------------------+-----------------------+------------+-----------------+
| pid|   tid|target|   pid_bias|     tid_bias|lightfm_dot_product|      pid_bias_text|tid_bias_text|lightfm_dot_product_text|lightfm_prediction|lightfm_prediction_text|lightfm_rank|lightfm_rank_text|
+----+------+------+-----------+-------------+-------------------+-------------------+-------------+------------------------+------------------+-----------------------+------------+-----------------+
|30.0|511391| false|-0.37722275|-0.0070896107|          1.3922765|-31.178849518299103|   0.35649538|       4.097497194110817|         1.0079641|    -26.724856943786676|         253|                1|
|30.0|589019| false|-0.37722275|   0.34427986|          1.4790885|-31.178849518299103|    0.5170483|       3.793315889032227|         1.4461457|    -26.868485329954282|          10|              

In [16]:
%%time

# Replace the candidate frame bound to df_test with its featurized version,
# then print the first 5 rows as a sanity check.
df_test = create_lightfm_features(df_test)
df_test.show(5)

Done
+-----+------+-----------+---------+-------------------+--------------------+-------------+------------------------+------------------+-----------------------+------------+-----------------+
|  pid|   tid|   pid_bias| tid_bias|lightfm_dot_product|       pid_bias_text|tid_bias_text|lightfm_dot_product_text|lightfm_prediction|lightfm_prediction_text|lightfm_rank|lightfm_rank_text|
+-----+------+-----------+---------+-------------------+--------------------+-------------+------------------------+------------------+-----------------------+------------+-----------------+
|180.0|511885|-0.06929762|1.4351802|         0.76276326|-0.22065338492393494|   0.87265104|        2.60760378886442|          2.128646|      3.259601444494532|          28|                1|
|180.0|511988|-0.06929762| 1.299129|          1.1761954|-0.22065338492393494|   0.88705814|       2.501355251344644|         2.4060268|       3.16776000526806|           5|                2|
|180.0|515885|-0.06929762|1.2555002|    

In [17]:
# Persist the featurized frames as ORC next to the candidate files,
# replacing any output left by a previous run.
df_train.write.orc(path_to_df + '/ii_lightfm_features.orc', mode='overwrite')
df_val.write.orc(path_to_df + '/iii_lightfm_features.orc', mode='overwrite')
df_test.write.orc(path_to_df + '/test_lightfm_features.orc', mode='overwrite')