### Step 1: Data Loading and Preparation

In [None]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

# Obtain the Spark entry point. The builder reuses an already-active session
# (for example when this cell is re-run) and only creates a new one when none
# exists. The previous `if sc is None` fallback was dead code because
# SparkContext.getOrCreate() always returns a context, so the intended
# application name was never applied. The master is intentionally not
# hard-coded here so that a launcher-provided master is not overridden.
spark = SparkSession.builder.appName("Collaborative Filtering").getOrCreate()

# Keep the raw context available under its original name for any later cell that uses it.
sc = spark.sparkContext

# Read the ratings file as an RDD of single-column rows (column name: `value`).
lines = spark.read.text("sample_movielens_ratings.txt").rdd

# Each line is split on "::" into four fields, read below as
# userId, movieId, rating, timestamp (in that order).
parts = lines.map(lambda row: row.value.split("::"))

# Convert the string fields to typed Row objects so Spark can infer the schema.
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=float(p[3])))
ratings = spark.createDataFrame(ratingsRDD)

# Preview the first rows of the parsed ratings.
ratings.show()

### Step 2: Machine Learning Pipeline

In [None]:
# Fixed seed shared by the train/test split and the ALS initialisation so that
# re-running the notebook on the same input yields the same split and model
# instead of a different RMSE on every run.
SEED = 42

# Hold out 20% of the ratings for evaluation.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SEED)

# Build the recommendation model using ALS on the training data.
# Cold start strategy is set to 'drop' so that rows whose prediction would be
# NaN (Not a Number) are removed and do not turn the evaluation metric into NaN.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", seed=SEED)
model = als.fit(training)

# Generate rating predictions for the held-out test set; these are the input
# to the RMSE evaluation.
predictions = model.transform(test)

### Step 3: Evaluation

In [None]:
# Compare the true `rating` column with the model's `prediction` column and
# report the root-mean-square error over the prediction rows.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

### Step 4: Making Movie Recommendations

In [None]:
# Generate top 10 movie recommendations for each user.
# DataFrame.show() prints and returns None, so the DataFrame is assigned first
# and displayed separately; otherwise `userRecs` would be None.
userRecs = model.recommendForAllUsers(10)
userRecs.show(truncate=False)

In [None]:
# Generate top 10 user recommendations for each movie.
# DataFrame.show() prints and returns None, so the DataFrame is assigned first
# and displayed separately; otherwise `movieRecs` would be None.
movieRecs = model.recommendForAllItems(10)
movieRecs.show(truncate=False)