In [1]:
import time

from pyspark import SparkContext, SparkConf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, DoubleType

In [2]:
conf = SparkConf()
conf.setMaster('local[*]')
conf.set('spark.executor.memory', '15G')
conf.set('spark.driver.memory', '15G')
conf.setAppName("hw41")
sc = SparkContext(conf=conf)
spark = SparkSession(sc)
spark

In [3]:
# Read in the ratings csv
ratings = spark.read.option("header", "true").csv('./data/ml-20m/ratings.csv')
ratings = ratings.withColumn('userId', F.col('userId').cast(IntegerType()))
ratings = ratings.withColumn('movieId', F.col('movieId').cast(IntegerType()))
ratings = ratings.withColumn('rating', F.col('rating').cast(DoubleType()))

In [4]:
ratings = ratings.select("userId", "movieId", "rating")
ratings.limit(10).toPandas()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
5,1,112,3.5
6,1,151,4.0
7,1,223,4.0
8,1,253,4.0
9,1,260,4.0


In [5]:
rank = 4  # number of features
als_model = ALS(itemCol='movieId',
                userCol='userId',
                ratingCol='rating',
                nonnegative=True,
                rank=rank)

In [6]:
print("Total dataset: ", ratings.count())
ratings = ratings.limit(1000) # total dataset is 20000263
trainTest = ratings.randomSplit([0.8, 0.2])

trainingDF = trainTest[0]
testDF = trainTest[1]

Total dataset:  20000263


In [7]:
time_start = time.time()
# Run cross-validation, and choose the best set of parameters.
als_model = als_model.fit(trainingDF)

# Make predictions on test documents. cvModel uses the best model found (lrModel).
test_prediction = als_model.transform(testDF)
# test_prediction.cache()
time_end = time.time()
print("ALS predictions are done!")
print("took ", time_end - time_start, " seconds for cross validation")
test_prediction.toPandas()

ALS predictions are done!
took  2.154177665710449  seconds for cross validation


Unnamed: 0,userId,movieId,rating,prediction
0,7,3175,2.0,
1,1,3997,3.5,
2,7,1721,5.0,2.827302
3,7,1270,4.0,3.867080
4,1,3000,3.5,
...,...,...,...,...
211,3,2643,1.0,
212,1,7482,3.0,
213,1,1009,3.5,
214,3,1882,4.0,2.490897


In [8]:
total = test_prediction.count()
test_prediction_with_na = test_prediction
test_prediction = test_prediction.na.drop()
non_na = test_prediction.count()
print("Total predictions: ", total, ", non-Nan predictions: ", non_na)
test_prediction.toPandas()

Total predictions:  216 , non-Nan predictions:  101


Unnamed: 0,userId,movieId,rating,prediction
0,7,1721,5.0,2.827302
1,7,1270,4.0,3.867080
2,8,597,5.0,3.559433
3,10,2797,4.0,2.820954
4,5,1198,5.0,4.325018
...,...,...,...,...
96,7,1674,4.0,2.857488
97,8,172,1.0,2.948676
98,1,1258,4.0,3.269257
99,3,2140,4.0,3.292553


In [9]:
rmse_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse")
score = rmse_evaluator.evaluate(test_prediction)
print("RMSE: ", score)

RMSE:  1.2554504086188185


In [10]:
mae_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="mae")
print("MAE: ", mae_evaluator.evaluate(test_prediction))

MAE:  1.0223650961819262


In [11]:
mse_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="mse")
print("MSE: ", mse_evaluator.evaluate(test_prediction))

MSE:  1.5761557285011585


Item-Item collaborative filtering
The idea here is to find a set of movies similar to a given movie, 
and rate the given movie based on how those similar movies have been rated by the user.

In [12]:
import math
import time
from statistics import mean

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
def get_Matrix(data):
    unique_users = data.userId.unique()
    unique_movies = data.movieId.unique()
    utility_matrix = {}
    for user in unique_users:
        # np array of len()
        col = dict.fromkeys(unique_movies, 0)
        this_user = data[data.userId == user]
        if not this_user.empty:
            for movieID in this_user.movieId:
                # instead of this put the actual ratings in
                rating = this_user[this_user.movieId == movieID].rating.iloc[0]
                if math.isnan(rating):
                    rating = 0
                col[movieID] = rating
        utility_matrix[user] = col

    return pd.DataFrame(utility_matrix)

In [14]:
train_df = get_Matrix(ratings.toPandas())
item_item_index = train_df.index
item_similarity = cosine_similarity(train_df)
item_similarity = pd.DataFrame(item_similarity)
item_similarity.index = item_item_index
item_similarity.columns = item_item_index

k = 10
test_data = testDF.toPandas()
item_item_collaborative_labels = []
for x in test_data[:].iterrows():
    userID = x[1]['userId']
    movieID = x[1]['movieId']
    # taking only those k users that have rated the movie
    this_item_distances = item_similarity[movieID]
    sorted_distances = this_item_distances.sort_values(ascending=False)[1:]
    # get the ratings by this user
    this_user = train_df[userID]

    ratings_this_user_this_movie = []
    for key in sorted_distances.keys():
        if len(ratings_this_user_this_movie) >= k:
            break
        this_user_this_movie = this_user[key]
        if this_user_this_movie > 0:
            ratings_this_user_this_movie.append(this_user_this_movie)
    item_rating = mean(ratings_this_user_this_movie)
    item_item_collaborative_labels.append(np.float16(item_rating))
test_data['prediction-item-item-cf'] = item_item_collaborative_labels
prediction_total = test_data.merge(test_prediction_with_na.toPandas(), on=['userId', 'movieId', 'rating'])
prediction_total['avg_prediction'] = prediction_total[['prediction-item-item-cf', 'prediction']].mean(axis=1)
prediction_total

Unnamed: 0,userId,movieId,rating,prediction-item-item-cf,prediction,avg_prediction
0,1,47,3.5,3.900391,,3.900391
1,1,367,3.5,3.699219,3.446421,3.572820
2,1,1009,3.5,3.900391,,3.900391
3,1,1036,4.0,3.699219,,3.699219
4,1,1193,3.5,3.849609,2.615405,3.232507
...,...,...,...,...,...,...
211,11,356,5.0,3.800781,3.023536,3.412159
212,11,377,4.0,4.300781,2.928336,3.614559
213,11,441,1.5,3.550781,,3.550781
214,11,480,5.0,4.398438,4.164185,4.281311
