In [1]:
# Set up AWS S3 access credentials.
# The keys are taken from the standard AWS environment variables when those
# are set, so real credentials never have to be saved inside the notebook.
# The original "KEY_HERE" placeholders are kept as the fallback values.
import os

ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "KEY_HERE")
SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "KEY_HERE")
# The secret key is embedded in an s3a:// URL by loadDataFrame, so every "/"
# in it is percent-encoded here (only "/" is escaped).
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")
AWS_BUCKET_NAME = "comp4651-movie-data"

In [2]:
def loadDataFrame(fileName, fileSchema):
  """Convert a CSV file stored in the S3 bucket into a Spark DataFrame.

  INPUT:
    fileName: the full file name (e.g. "file.csv") inside AWS_BUCKET_NAME
    fileSchema: the schema (StructType made of StructFields) applied to the file
  OUTPUT:
    Spark DataFrame

  The reader is configured with header="true" and mode="DROPMALFORMED".
  The access key and encoded secret key are placed directly in the s3a:// URL.
  """
  csvReader = spark.read.format("csv").schema(fileSchema)
  csvReader = csvReader.option("header", "true")
  csvReader = csvReader.option("mode", "DROPMALFORMED")
  s3Path = "s3a://%s:%s@%s/%s" % (ACCESS_KEY, ENCODED_SECRET_KEY, AWS_BUCKET_NAME, fileName)
  return csvReader.csv(s3Path)

In [3]:
from pyspark.sql.types import *

# Schema for the ratings CSV; every column is nullable and the timestamp is
# kept as a plain string.
movieRatingSchema = (StructType()
                     .add("userId", IntegerType(), True)
                     .add("movieId", IntegerType(), True)
                     .add("rating", FloatType(), True)
                     .add("timestamp", StringType(), True))

# Schema for the movies CSV; every column is nullable.
movieSchema = (StructType()
               .add("movieId", IntegerType(), True)
               .add("title", StringType(), True)
               .add("genres", StringType(), True))

# Load both small data sets and cache them, since later cells reuse them.
smallMovieRatingsDF = loadDataFrame("ratings-small.csv", movieRatingSchema).cache()
smallMoviesDF = loadDataFrame("movies-small.csv", movieSchema).cache()

In [4]:
# Print out each DataFrame's schema, and a few rows as examples.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
smallMovieRatingsDF.printSchema()
print(smallMovieRatingsDF.take(3))

smallMoviesDF.printSchema()
print(smallMoviesDF.take(3))

In [5]:
# Split the ratings 80/20 into training and testing sets, using a fixed seed
# for the random split.
(trainingSet, testingSet) = smallMovieRatingsDF.randomSplit([0.8, 0.2], seed=12345)
# (userId, movieId) pairs of the testing set: the RDD-based predictAll() takes
# (user, product) pairs as input, so the rating itself must not be part of the key.
testingForPrediction = testingSet.rdd.map(lambda x: (x.userId, x.movieId))


In [6]:
# Use ml instead of mllib for DataFrames
# http://spark.apache.org/docs/latest/api/python/pyspark.ml.html#pyspark.ml.recommendation.ALS
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import Row

# Build the recommendation model using ALS on the training data.
# The seed is fixed (same value as the train/test split) so the model's random
# initialisation does not change from one run to the next.
als = ALS(maxIter=10, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating",
          seed=12345)
model = als.fit(trainingSet)

# Evaluate the model by computing the RMSE on the test data.
# Note: ALS's cold start strategy is NOT set here; instead, rows containing
# null/NaN values are removed with dropna() so the evaluation metric is not NaN.
predictions = model.transform(testingSet).dropna()
print(predictions.take(4))
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))


In [7]:
#To be deleted below
# Collaborative Filtering - RDD-based API
# https://spark.apache.org/docs/2.1.0/mllib-collaborative-filtering.html
# NOTE: this import rebinds the name ALS, replacing the pyspark.ml ALS class
# imported in the previous cell; from here on ALS is the RDD-based trainer.
from pyspark.mllib.recommendation import ALS
import math

# NOTE(review): trainingSetRDD, validationSetRDD and validation_for_predict_RDD
# are not defined in any of the cells shown above; they must already exist
# when this cell runs -- confirm where they are created.

seed = 12345
iterations = 10
regularization_parameter = 0.1
ranks = [4, 8, 12]
# One RMSE slot per candidate rank (sized from ranks instead of a hard-coded 3).
errors = [0] * len(ranks)
err = 0
tolerance = 0.02  # not used in this cell

min_error = float('inf')
best_rank = -1
best_iteration = -1  # not used in this cell
for rank in ranks:
    # Train one model per candidate rank.
    model = ALS.train(trainingSetRDD, rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
    # Re-key the predictions as ((user, movie), predicted_rating) so they can
    # be joined with the actual ratings on the (user, movie) key.
    predictions = model.predictAll(validation_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
    rates_and_preds = validationSetRDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
    # After the join each value is (actual, predicted); RMSE is the square
    # root of the mean squared difference.
    error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())
    errors[err] = error
    err += 1
    print('For rank %s the RMSE is %s' % (rank, error))
    # Keep the rank with the lowest RMSE seen so far.
    if error < min_error:
        min_error = error
        best_rank = rank

print('The best model was trained with rank %s' % best_rank)

In [8]:
# Show a few predictions and a few joined (actual, predicted) rating pairs
# left over from the last executed training cell.
print(predictions.take(3))
print(rates_and_preds.take(3))

In [9]:
# Retrain with the best rank found by the rank search and compute the RMSE
# on the testing data.
# NOTE(review): trainingSetRDD, test_for_predict_RDD and testingSetRDD are not
# defined in any of the cells shown above -- confirm where they are created.
model = ALS.train(trainingSetRDD, best_rank, seed=seed, iterations=iterations,
                      lambda_=regularization_parameter)
# Re-key predictions as ((user, movie), predicted_rating) for the join below.
predictions = model.predictAll(test_for_predict_RDD).map(lambda r: ((r[0], r[1]), r[2]))
rates_and_preds = testingSetRDD.map(lambda r: ((int(r[0]), int(r[1])), float(r[2]))).join(predictions)
# Each joined value is (actual, predicted); RMSE = sqrt(mean squared difference).
error = math.sqrt(rates_and_preds.map(lambda r: (r[1][0] - r[1][1])**2).mean())

print('For testing data the RMSE is %s' % (error))

In [10]:
# Predict the rating for a single (user, movie) pair: user 2, movie 1029.
my_movie = sc.parallelize([(2, 1029)])
individual_movie_rating_RDD = model.predictAll(my_movie)
print(individual_movie_rating_RDD.collect())