In [2]:
%load_ext autoreload
%autoreload 2

import sys
import os
import numpy as np
import surprise
import papermill as pm
import scrapbook as sb
import pandas as pd


import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType, StructType, StructField
from pyspark.ml.feature import Tokenizer, RegexTokenizer, StopWordsRemover
from pyspark.ml.feature import HashingTF, CountVectorizer, VectorAssembler
from pyspark.sql.window import Window
import pyspark.sql.functions as F

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.python_splitters import python_random_split
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.models.surprise.surprise_utils import predict, compute_ranking_predictions
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation, SparkDiversityEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

In [12]:
import fun

In [5]:
# --- Configuration -----------------------------------------------------------
# Size identifier handed to the MovieLens loader below.
MOVIELENS_DATA_SIZE = '100k'

# Cut-off k for top-k recommendations (not referenced in the cells shown here;
# presumably consumed by later ranking/evaluation cells -- TODO confirm).
TOP_K = 10

# --- Spark session -----------------------------------------------------------
# Start (or reuse) a Spark session named "ALS PySpark" with 16g of memory, and
# turn on the Spark SQL setting that permits cross joins.
spark = start_or_get_spark("ALS PySpark", memory="16g")
spark.conf.set("spark.sql.crossJoin.enabled", "true")

# --- Load ratings ------------------------------------------------------------
# Load MovieLens as a pandas DataFrame with custom rating column names plus the
# movie title and genres columns.
data_full = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=['UserId', 'MovieId', 'Rating', 'Timestamp'],
    title_col='title',
    genres_col='genres'
)

# Last expression of the cell: rendered as the cell output (first rows preview).
data_full.head()

100%|██████████| 4.81k/4.81k [00:03<00:00, 1.33kKB/s]


Unnamed: 0,UserId,MovieId,Rating,Timestamp,title,genres
0,196,242,3.0,881250949,Kolya (1996),Comedy
1,63,242,3.0,875747190,Kolya (1996),Comedy
2,226,242,5.0,883888671,Kolya (1996),Comedy
3,154,242,3.0,879138235,Kolya (1996),Comedy
4,306,242,5.0,876503793,Kolya (1996),Comedy


In [9]:
# Hand the pandas ratings frame to the local `fun` helper, which builds the
# Spark DataFrame using the schema returned by `fun.get_movielens_schema()`.
# The helper's printed output (below) reports the resulting schema and a
# 5-row preview.
data_full_spark = fun.movielens_to_spark(
    data_full,
    schema=fun.get_movielens_schema(),
)

Spark df created, info: 
root
 |-- UserId: integer (nullable = true)
 |-- MovieId: integer (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Timestamp: long (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+------+-------+------+---------+------------+------+
|UserId|MovieId|Rating|Timestamp|       title|genres|
+------+-------+------+---------+------------+------+
|   196|    242|   3.0|881250949|Kolya (1996)|Comedy|
|    63|    242|   3.0|875747190|Kolya (1996)|Comedy|
|   226|    242|   5.0|883888671|Kolya (1996)|Comedy|
|   154|    242|   3.0|879138235|Kolya (1996)|Comedy|
|   306|    242|   5.0|876503793|Kolya (1996)|Comedy|
+------+-------+------+---------+------------+------+
only showing top 5 rows



In [10]:
# Round-trip check: pass the Spark DataFrame back through the `fun` helper
# (presumably yielding a pandas DataFrame, per the helper's name -- TODO confirm
# against fun.movielens_to_pandas).
data_full_reversed = fun.movielens_to_pandas(data_full_spark)

In [11]:
# Preview the first 5 rows of the round-tripped data for comparison with the
# preview of the originally loaded `data_full`.
data_full_reversed.head(5)

Unnamed: 0,UserId,MovieId,Rating,Timestamp,title,genres
0,196,242,3.0,881250949,Kolya (1996),Comedy
1,63,242,3.0,875747190,Kolya (1996),Comedy
2,226,242,5.0,883888671,Kolya (1996),Comedy
3,154,242,3.0,879138235,Kolya (1996),Comedy
4,306,242,5.0,876503793,Kolya (1996),Comedy


In [13]:
# Build the feature table from the Spark ratings DataFrame via the `fun`
# helper. Judging by the preview output, the result has a `MovieId` column and
# a `features` vector column; how the features are derived is defined in
# fun.create_feature_data (not visible here).
feature_data = fun.create_feature_data(data_full_spark)

In [14]:
# Print the first 5 rows of the feature table.
feature_data.show(5)

+-------+--------------------+
|MovieId|            features|
+-------+--------------------+
|    673|(1043,[169,690,10...|
|    879|(1043,[909,1026,1...|
|     66|(1043,[256,1025,1...|
|      9|(1043,[11,342,101...|
|    605|(1043,[754,848,94...|
+-------+--------------------+
only showing top 5 rows

