# LightGCN
Reference: Microsoft Recommenders LightGCN deep-dive notebook — https://github.com/microsoft/recommenders/blob/main/examples/02_model_collaborative_filtering/lightgcn_deep_dive.ipynb

## Setup

In [1]:
import sys
import os
import papermill as pm
import scrapbook as sb
import pandas as pd
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.deeprec.models.graphrec.lightgcn import LightGCN
from recommenders.models.deeprec.DataModel.ImplicitCF import ImplicitCF
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.models.deeprec.deeprec_utils import prepare_hparams

# Report the interpreter and key library versions so recorded results can be
# tied to a concrete environment.
for template, version in (
    ("System version: {}", sys.version),
    ("Pandas version: {}", pd.__version__),
    ("Tensorflow version: {}", tf.__version__),
):
    print(template.format(version))

System version: 3.7.0 (default, Oct  9 2018, 10:31:47) 
[GCC 7.3.0]
Pandas version: 1.1.5
Tensorflow version: 1.15.5


In [31]:
import fun
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType, StructType, StructField
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation, SparkDiversityEvaluation

In [24]:
# Number of items recommended per user; also used as k for the ranking metrics
# and passed to the model hyper-parameters as top_k.
TOP_K = 20

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = '100k'

# Training hyper-parameters forwarded to prepare_hparams
EPOCHS = 50
BATCH_SIZE = 1024

SEED = DEFAULT_SEED  # Set None for non-deterministic results

# Hyper-parameter YAML consumed by prepare_hparams (path is relative to the
# notebook's working directory).
yaml_file = "lightgcn.yaml"
# NOTE(review): user_file / item_file are not referenced by any cell shown in
# this notebook; presumably output paths for exported embeddings — confirm or remove.
user_file = "../../tests/resources/deeprec/lightgcn/user_embeddings.csv"
item_file = "../../tests/resources/deeprec/lightgcn/item_embeddings.csv"

## Import and prepare Data

In [6]:
# Load MovieLens ratings together with each movie's title and genres; the
# extra columns are needed later for the content-based item features.
data_full = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    title_col='title',
    genres_col='genres',
)
data_full.head()

100%|██████████| 4.81k/4.81k [00:00<00:00, 14.7kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,196,242,3.0,881250949,Kolya (1996),Comedy
1,63,242,3.0,875747190,Kolya (1996),Comedy
2,226,242,5.0,883888671,Kolya (1996),Comedy
3,154,242,3.0,879138235,Kolya (1996),Comedy
4,306,242,5.0,876503793,Kolya (1996),Comedy


In [7]:
# Keep only the interaction columns (user, item, rating, timestamp); this
# 4-column frame is what gets split into train/test for LightGCN.
df = data_full[['userID', 'itemID', 'rating', 'timestamp']]
df.head()

Unnamed: 0,userID,itemID,rating,timestamp
0,196,242,3.0,881250949
1,63,242,3.0,875747190
2,226,242,5.0,883888671
3,154,242,3.0,879138235
4,306,242,5.0,876503793


In [9]:
# Convert the full pandas frame (ratings plus title/genres) into a Spark
# DataFrame via the project-local `fun` helpers; the helper prints the
# resulting schema and a 5-row preview.
data_full_spark = fun.movielens_to_spark(data_full, schema = fun.get_movielens_schema())

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+------+-------+------+---------+------------+------+
|UserId|MovieId|Rating|Timestamp|       title|genres|
+------+-------+------+---------+------------+------+
|   196|    242|   3.0|881250949|Kolya (1996)|Comedy|
|    63|    242|   3.0|875747190|Kolya (1996)|Comedy|
|   226|    242|   5.0|883888671|Kolya (1996)|Comedy|
|   154|    242|   3.0|879138235|Kolya (1996)|Comedy|
|   306|    242|   5.0|876503793|Kolya (1996)|Comedy|
+------+-------+------+---------+------------+------+
only showing top 5 rows



In [10]:
# Item feature data built from the full Spark frame; it is passed as
# item_feature_df to SparkDiversityEvaluation for the content-based metrics.
# NOTE(review): create_feature_data lives in the project-local `fun` module;
# presumably it derives per-item feature vectors from title/genres — confirm.
feature_data = fun.create_feature_data(data_full_spark)

In [15]:
# Stratified 75/25 split of the interaction frame.
# The notebook-level SEED is passed explicitly so the split obeys the same
# reproducibility knob as ImplicitCF and LightGCN; previously the split silently
# used the library default seed even when SEED was changed or set to None.
train, test = python_stratified_split(df, ratio=0.75, seed=SEED)
train.head()

Unnamed: 0,userID,itemID,rating,timestamp
10732,1,232,3.0,878543196
43023,1,66,4.0,878543030
30062,1,106,4.0,875241390
46315,1,97,3.0,875073128
69919,1,73,3.0,876892774


In [18]:
 # Spark schema for the train/test splits: the four interaction columns only
 # (no title/genres), using the UserId/MovieId/Rating/Timestamp names that the
 # Spark-side evaluation cells refer to.
 # The fields are given as a list, the documented StructType input; a tuple
 # would leave `.fields` as a tuple and break later `.add()` calls.
 traintestschema = StructType([
     StructField("UserId", IntegerType()),
     StructField("MovieId", IntegerType()),
     StructField("Rating", FloatType()),
     StructField("Timestamp", LongType()),
 ])

In [20]:
# Mirror both pandas splits as Spark DataFrames using the shared 4-column schema.
train_df_spark = fun.movielens_to_spark(
    train,
    schema=traintestschema,
)
test_df_spark = fun.movielens_to_spark(
    test,
    schema=traintestschema,
)

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)

+------+-------+------+---------+
|UserId|MovieId|Rating|Timestamp|
+------+-------+------+---------+
|     1|    232|   3.0|878543196|
|     1|     66|   4.0|878543030|
|     1|    106|   4.0|875241390|
|     1|     97|   3.0|875073128|
|     1|     73|   3.0|876892774|
+------+-------+------+---------+
only showing top 5 rows

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)

+------+-------+------+---------+
|UserId|MovieId|Rating|Timestamp|
+------+-------+------+---------+
|     1|     49|   3.0|878542478|
|     1|     69|   3.0|875072262|
|     1|    221|   5.0|887431921|
|     1|      5|   3.0|889751712|
|     1|    139|   3.0|878543216|
+------+-------+------+

## Train and predict LightGCN

In [25]:
# Wrap the pandas splits in the dataset object LightGCN trains on.
data = ImplicitCF(train=train, test=test, seed=SEED)

# Start from the YAML defaults and override the settings tuned in this notebook.
hparams = prepare_hparams(
    yaml_file,
    n_layers=3,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    learning_rate=0.005,
    eval_epoch=5,  # the training log reports evaluation metrics every 5th epoch
    top_k=TOP_K,
)

In [26]:
# Build the LightGCN model from the prepared hyper-parameters and dataset,
# seeded with the same SEED used for the dataset object.
model = LightGCN(hparams, data, seed=SEED)

Already create adjacency matrix.
Already normalize adjacency matrix.
Using xavier initialization.


In [27]:
# Train the model inside a Timer context so the wall-clock cost is recorded.
with Timer() as train_time:
    model.fit()

# train_time.interval is read after the context has exited.
print("Took {} seconds for training.".format(train_time.interval))

Epoch 1 (train)3.8s: train loss = 0.46603 = (mf)0.46579 + (embed)0.00025
Epoch 2 (train)3.2s: train loss = 0.27655 = (mf)0.27590 + (embed)0.00065
Epoch 3 (train)3.2s: train loss = 0.24837 = (mf)0.24756 + (embed)0.00081
Epoch 4 (train)3.4s: train loss = 0.23141 = (mf)0.23043 + (embed)0.00098
Epoch 5 (train)3.3s + (eval)0.3s: train loss = 0.22196 = (mf)0.22085 + (embed)0.00111, recall = 0.22025, ndcg = 0.29547, precision = 0.22450, map = 0.09192
Epoch 6 (train)3.4s: train loss = 0.21733 = (mf)0.21612 + (embed)0.00121
Epoch 7 (train)3.4s: train loss = 0.21148 = (mf)0.21018 + (embed)0.00130
Epoch 8 (train)3.4s: train loss = 0.20132 = (mf)0.19991 + (embed)0.00141
Epoch 9 (train)3.4s: train loss = 0.19043 = (mf)0.18887 + (embed)0.00156
Epoch 10 (train)3.3s + (eval)0.2s: train loss = 0.18149 = (mf)0.17975 + (embed)0.00173, recall = 0.25896, ndcg = 0.33969, precision = 0.26448, map = 0.11191
Epoch 11 (train)3.4s: train loss = 0.17338 = (mf)0.17149 + (embed)0.00189
Epoch 12 (train)3.3s: train l

In [28]:
# Score the users found in `test` and keep the TOP_K highest-scoring items each.
# NOTE(review): remove_seen=True presumably excludes items the user already
# interacted with in training — confirm against the LightGCN API docs.
topk_scores = model.recommend_k_items(test, top_k=TOP_K, remove_seen=True)
# Sanity check: the result is a pandas DataFrame (userID, itemID, prediction).
print(type(topk_scores))
topk_scores.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,userID,itemID,prediction
0,1,98,5.605151
1,1,423,5.21259
2,1,69,5.202886
3,1,919,5.160752
4,1,408,5.160678


## Evaluate

### Use the recommenders example approach for precision and recall

# Ranking quality of the top-K recommendations against the held-out test set.
eval_precision = precision_at_k(test, topk_scores, k=TOP_K)
eval_recall = recall_at_k(test, topk_scores, k=TOP_K)

# One metric per line, identical text to a print(..., sep='\n') call.
print("\n".join([
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
]))

### Use our methods for diversity metrics, leave ratings as NA

In [30]:
# Convert the top-K recommendations to a Spark DataFrame (UserId, MovieId,
# prediction) so they can be fed to SparkDiversityEvaluation as reco_df.
topk_spark = fun.movielens_to_spark(topk_scores, schema = fun.get_predictions_schema())

Spark df created, info: 

root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- prediction: float (nullable = true)

+------+-------+----------+
|UserId|MovieId|prediction|
+------+-------+----------+
|     1|     98| 5.6051507|
|     1|    423| 5.2125897|
|     1|     69|  5.202886|
|     1|    919|  5.160752|
|     1|    408|  5.160678|
+------+-------+----------+
only showing top 5 rows



In [33]:
def get_diversity_results(diversity_eval):
    """Collect the diversity-related metrics from an evaluator into one dict.

    Args:
        diversity_eval: Object exposing the zero-argument methods
            ``catalog_coverage``, ``distributional_coverage``, ``novelty``,
            ``diversity`` and ``serendipity`` (e.g. a SparkDiversityEvaluation).

    Returns:
        dict: Maps each metric name to the value returned by the method of the
        same name. Methods are invoked, and keys inserted, in the order listed
        above.
    """
    metric_names = (
        "catalog_coverage",
        "distributional_coverage",
        "novelty",
        "diversity",
        "serendipity",
    )
    # Every key is also the name of the evaluator method that produces it.
    return {name: getattr(diversity_eval, name)() for name in metric_names}

Collaborative

In [34]:
# Diversity evaluation with no item features supplied, so item similarity
# falls back to the evaluator's default measure.
collaborative_diversity_eval = SparkDiversityEvaluation(
    train_df=train_df_spark,
    reco_df=topk_spark,
    col_user="UserId",
    col_item="MovieId",
)
diversity_collaborative = get_diversity_results(collaborative_diversity_eval)

In [37]:
# Report the three headline collaborative metrics, one per line.
print("\n".join([
    "Collaborative diversity:\t%f" % diversity_collaborative['diversity'],
    "Collaborative serendipity:\t%f" % diversity_collaborative['serendipity'],
    "Collaborative novelty:\t%f" % diversity_collaborative['novelty'],
]))

Collaborative diversity:	0.673293
Collaborative serendipity:	0.725411
Collaborative novelty:	8.863385


Content

In [36]:
# Diversity evaluation where item similarity comes from the item feature
# vectors in feature_data rather than the evaluator's default measure.
content_diversity_eval = SparkDiversityEvaluation(
    train_df=train_df_spark,
    reco_df=topk_spark,
    item_feature_df=feature_data,
    item_sim_measure="item_feature_vector",
    col_user="UserId",
    col_item="MovieId",
)
diversity_content = get_diversity_results(content_diversity_eval)

In [38]:
# Report the three headline content-based metrics, one per line.
print("\n".join([
    "Content diversity:\t%f" % diversity_content['diversity'],
    "Content serendipity:\t%f" % diversity_content['serendipity'],
    "Content novelty:\t%f" % diversity_content['novelty'],
]))

Content diversity:	0.845239
Content serendipity:	0.857982
Content novelty:	8.863385
