# Spark


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType

In [4]:
# Create (or reuse) the notebook's Spark session.
# Eager evaluation is switched on so a DataFrame left as the last expression
# of a cell is rendered as a table, capped at 10 rows; the driver gets 4g.
spark = (
    SparkSession.builder
    .appName("Book Recommender")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.repl.eagerEval.maxNumRows", 10)
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [5]:
# Read the working dataset from its Parquet file into a Spark DataFrame.
spark_df = spark.read.format("parquet").load("work_df.parquet")

                                                                                

In [6]:
# Inspect the column names and types exactly as they were read from Parquet.
spark_df.printSchema()

root
 |-- userId: short (nullable = true)
 |-- bookId: short (nullable = true)
 |-- title: string (nullable = true)
 |-- language: string (nullable = true)
 |-- rating: long (nullable = true)



In [9]:
# Cast the two id columns and the rating column to Spark's integer type;
# the remaining columns are left as they are.
spark_df = spark_df.withColumns(
    {
        column_name: F.col(column_name).cast(IntegerType())
        for column_name in ("userId", "bookId", "rating")
    }
)

In [10]:
# Re-print the schema to confirm userId, bookId and rating are now integers.
spark_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- language: string (nullable = true)
 |-- rating: integer (nullable = true)



In [13]:
# Keep only the (user, book, rating) triples; title and language are dropped
# from this view. The bare name on the last line displays a preview.
book_ratings = spark_df.select(["userId", "bookId", "rating"])
book_ratings

                                                                                

userId,bookId,rating
179,2872,10
277,2872,9
358,2872,7
382,2872,10
585,2872,6
611,2872,8
638,2872,8
648,2872,5
1009,2872,8
1024,2872,8


### Spark ML


In [14]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

#### Train-Test Split


In [15]:
# Randomly split the ratings with weights 0.8 / 0.2 (train / held-out),
# using a fixed seed for the sampling.
train, test = book_ratings.randomSplit([0.8, 0.2], seed=42)

# The held-out set was originally bound to the misspelled name `tesst`.
# Keep that name as an alias so any cell still referencing it keeps working;
# new code should use `test`.
tesst = test

#### Simple ALS Model


In [16]:
# Baseline ALS estimator: only the column mapping, the non-negativity flag,
# the cold-start strategy and the seed are set explicitly; every other
# hyperparameter is left at the library default.
als_params = {
    "userCol": "userId",
    "itemCol": "bookId",
    "ratingCol": "rating",
    "nonnegative": True,
    "coldStartStrategy": "drop",
    "seed": 42,
}
als = ALS(**als_params)

In [17]:
# RMSE evaluator comparing the "rating" column against the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)