# Spark


In [None]:
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType

In [None]:
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Book Recommender")
    # REPL eager-evaluation settings: enable it and cap the displayed rows at 10.
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.repl.eagerEval.maxNumRows", 10)
    # Request 4 GB of driver memory.
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [None]:
# Load the working dataset from Parquet; the path is relative, so it is
# resolved against whatever default location the Spark session uses.
spark_df = spark.read.parquet("work_df.parquet")

In [None]:
# Inspect the column names and types exactly as they were read from disk.
spark_df.printSchema()

In [None]:
# Cast the three columns later fed to ALS (user, item, rating) to integers,
# replacing each column in place under its existing name.
# NOTE(review): an integer cast would truncate fractional ratings, if the
# source data contains any — confirm ratings are whole numbers.
spark_df = spark_df.withColumns(
    {
        name: F.col(name).cast(IntegerType())
        for name in ("userId", "bookId", "rating")
    }
)

In [None]:
# Re-check the schema to confirm the integer casts took effect.
spark_df.printSchema()

In [None]:
# Keep only the (user, item, rating) triple; other columns are not used below.
book_ratings = spark_df.select(
    ["userId", "bookId", "rating"]
)
# Bare expression so the notebook renders a preview of the DataFrame.
book_ratings

### Spark ML


In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

#### Train-Test Split


In [None]:
# Random split with 0.8 / 0.2 weights (train / test); seeded for repeatability.
train, test = book_ratings.randomSplit(weights=[0.8, 0.2], seed=42)

#### Simple ALS Model


In [None]:
# Baseline ALS estimator. rank, maxIter and regParam are not set here, so the
# library defaults apply; the same estimator object is reused for the grid
# search further down.
als = ALS(
    # Column mapping into the ratings DataFrame.
    userCol="userId", itemCol="bookId", ratingCol="rating",
    # Model behaviour switches.
    nonnegative=True,
    coldStartStrategy="drop",
    # Fixed seed so repeated fits start from the same initialization.
    seed=42,
)

In [None]:
# RMSE between the true "rating" column and the model's "prediction" column.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [None]:
# Fit the baseline ALS estimator on the training split.
simple_model = als.fit(train)

In [None]:
# Score the held-out test split with the baseline model.
predictions = simple_model.transform(test)

In [None]:
# Bare expression so the notebook renders a preview of the predictions.
predictions

In [None]:
# RMSE of the baseline model on the test split; shown as the cell output.
rmse = evaluator.evaluate(predictions)
rmse

In [None]:
# Hyper-parameter grid for the ALS estimator: 2 values for each of three
# parameters, i.e. 2 * 2 * 2 = 8 candidate settings.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.maxIter, [20, 30])
    .addGrid(als.rank, [15, 50])
    .addGrid(als.regParam, [0.1, 1.0])
    .build()
)

In [None]:
# Sanity check: number of parameter combinations in the grid.
print(f"Num. Models: {len(param_grid)}")

In [None]:
# 3-fold cross-validation over every combination in `param_grid`, scored with
# `evaluator`; `parallelism=4` allows up to four candidate fits at a time.
# The seed pins the fold assignment so reruns are reproducible, matching the
# seed=42 already used for the train/test split and the ALS estimator.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    parallelism=4,
    seed=42,
)

In [None]:
# Run the cross-validated grid search on the training split only; the test
# split is not passed in here.
tuned_model = cv.fit(train)

In [None]:
# Model selected by the cross-validator as the best of the grid.
best_model = tuned_model.bestModel

In [None]:
# Score the same held-out test split with the tuned model.
predictions_2 = best_model.transform(test)

In [None]:
# Bare expression so the notebook renders a preview of the tuned predictions.
predictions_2

In [None]:
# RMSE of the tuned model on the test split, computed with the same evaluator
# as the baseline `rmse` so the two are directly comparable.
rmse_2 = evaluator.evaluate(predictions_2)
rmse_2

In [None]:
# Bare expression so the notebook shows the selected model's repr.
best_model

## Save the model

In [None]:
# Persist the tuned model. Going through the writer with overwrite() keeps the
# notebook re-runnable: a plain save() raises when the target path already
# exists from a previous run.
best_model.write().overwrite().save("alsrecommend.model")

In [None]:
# Shut down the Spark session; nothing below this cell uses it.
spark.stop()