# Spark


In [1]:
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType

In [2]:
# Create (or reuse) the Spark session for this notebook.
# Eager evaluation is switched on so that a DataFrame left as the last
# expression of a cell is rendered as a table, limited to 10 rows.
spark = (
    SparkSession.builder
    .appName("Book Recommender")
    .config("spark.sql.repl.eagerEval.enabled", True)
    .config("spark.sql.repl.eagerEval.maxNumRows", 10)
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/16 00:49:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Load the ratings data from a parquet file located relative to the
# notebook's working directory.
spark_df = spark.read.parquet("work_df.parquet")

In [4]:
# Inspect the column names and types exactly as they were read from parquet.
spark_df.printSchema()

root
 |-- userId: short (nullable = true)
 |-- bookId: short (nullable = true)
 |-- title: string (nullable = true)
 |-- language: string (nullable = true)
 |-- rating: long (nullable = true)



In [5]:
# Cast the id and rating columns to IntegerType in one pass. The parquet
# schema stores the ids as short and the rating as long; presumably the cast
# is there so ALS receives plain integer columns (TODO confirm intent).
spark_df = spark_df.withColumns(
    {
        name: F.col(name).cast(IntegerType())
        for name in ("userId", "bookId", "rating")
    }
)

In [6]:
# Confirm that userId, bookId and rating are now integer columns.
spark_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- bookId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- language: string (nullable = true)
 |-- rating: integer (nullable = true)



In [7]:
# Keep only the (user, item, rating) triple that the recommender is trained on;
# title and language are left out of this frame.
book_ratings = spark_df.select("userId", "bookId", "rating")
# Bare expression: rendered as a table because eager evaluation is enabled.
book_ratings

userId,bookId,rating
179,2872,10
277,2872,9
358,2872,7
382,2872,10
585,2872,6
611,2872,8
638,2872,8
648,2872,5
1009,2872,8
1024,2872,8


### Spark ML


In [8]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator

#### Train-Test Split


In [9]:
# 80/20 random split of the ratings; the fixed seed keeps the split repeatable.
train, test = book_ratings.randomSplit(weights=[0.8, 0.2], seed=42)

#### Simple ALS Model


In [10]:
# ALS estimator shared by the baseline fit and the cross-validated search.
als = ALS(
    # Column mapping into the ratings DataFrame.
    userCol="userId", itemCol="bookId", ratingCol="rating",
    # Constrain the learned factors to be non-negative.
    nonnegative=True,
    # Per the Spark ALS docs, "drop" removes rows whose prediction would be NaN
    # (users/items unseen during fitting) so they do not poison the metric.
    coldStartStrategy="drop",
    # Fixed seed for repeatable factor initialisation.
    seed=42,
)

In [11]:
# RMSE between the true "rating" column and the model's "prediction" column;
# used both for the hold-out evaluation and as the cross-validation metric.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [12]:
# Baseline: fit ALS on the training split using only the settings given to
# ALS(...) above; every other hyperparameter stays at its library default.
simple_model = als.fit(train)

23/04/16 00:49:30 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [13]:
# Score the held-out ratings; the result carries an extra "prediction" column.
predictions = simple_model.transform(test)

In [14]:
# Preview the baseline predictions next to the true ratings.
predictions

userId,bookId,rating,prediction
148,3473,10,10.002486
148,5909,7,8.571875
463,357,8,4.8718247
463,867,5,8.3573065
471,1290,10,8.063271
471,6153,10,6.145279
496,3655,9,9.084567
833,915,5,6.2888594
833,2056,5,4.594879
1088,3760,7,7.8322515


In [15]:
# RMSE of the baseline model on the test split.
rmse = evaluator.evaluate(predictions)
rmse

1.8919501331030917

In [16]:
# Hyperparameter grid for cross-validation: every combination of iteration
# count, latent-factor rank and regularisation strength (2 x 2 x 2 settings).
param_grid = (
    ParamGridBuilder()
    .addGrid(als.maxIter, [20, 30])
    .addGrid(als.rank, [1, 5])
    .addGrid(als.regParam, [0.1, 0.5])
    .build()
)

In [17]:
# Sanity check: number of parameter combinations the search will evaluate.
print(f"Num. Models: {len(param_grid)}")

Num. Models: 8


In [18]:
# 3-fold cross-validation over the ALS parameter grid, scored by RMSE.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=3,
    # Fit up to 4 candidate models concurrently.
    parallelism=4,
    # Pin the fold assignment so the tuning result is repeatable across kernel
    # restarts, consistent with the seed=42 used for randomSplit and ALS.
    seed=42,
)

In [19]:
# Run the grid search on the training split only; the test split stays unseen.
# This is the expensive cell: each grid entry is fitted once per fold.
tuned_model = cv.fit(train)

                                                                                

In [20]:
# Model for the best-scoring parameter combination found by the cross-validator.
best_model = tuned_model.bestModel

In [21]:
# Score the same held-out ratings with the tuned model.
predictions_2 = best_model.transform(test)

In [22]:
# Preview the tuned model's predictions next to the true ratings.
predictions_2

userId,bookId,rating,prediction
1395,6654,10,7.1930575
2563,463,7,7.2933846
2563,3749,8,6.7506123
1884,8389,10,8.389419
5140,7982,6,7.2989945
847,4519,7,8.884651
5055,2366,8,7.163855
2463,2366,10,8.956356
4443,5300,7,6.60134
4230,8638,10,8.017377


In [23]:
# RMSE of the tuned model on the test split, for comparison with `rmse`.
rmse_2 = evaluator.evaluate(predictions_2)
rmse_2

1.6161660104670172

In [24]:
# Show the selected model's summary (its uid and rank).
best_model

ALSModel: uid=ALS_4d5b53655bd6, rank=1

## Save the model

In [25]:
# Persist the tuned model. A plain save() raises if the target path already
# exists, which breaks "Restart & Run All" on every run after the first;
# write().overwrite() replaces any previous copy instead.
best_model.write().overwrite().save("alsrecommend.model")

In [26]:
# Stop the Spark session now that the model has been written out.
spark.stop()