In [2]:
# In Assignment-4 when we fit our model on smaller dataset, we saw that Global Meam model won over Item to Item collaborative filtering
# In this exercise we will use PySpark and distributed computing environment to fit the model on entire data set and see if Item-Item collaborative filtering works

In [3]:
#### In this assigmnment we will build two recommender systems and access their performance
#- Recomendation based on global mean and user/movie bias
#- Recommendation based on Item-Item collaborative filtering
#### We will use training set to build a recommender system and evaluate it's performance on the test set. We will use MSE (mean square error) as a measure of model efficiency. Model wil low RMSE will win over model with higher MSE

In [4]:
# Import required libraries

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
from scipy.spatial.distance import cosine
from functools import reduce
import pyspark.sql.types as types
from pyspark.sql.functions import *
from pyspark.mllib.linalg import Matrix, Matrices

In [6]:
# Read the input data

##### In this assignment we will use MovieLens dataset. It has been collected by the GroupLens Research Project at the University of Minnesota. It consists of:

# - 100,000 ratings (1-5) from 943 users on 1682 movies.
# - Each user has rated at least 20 movies.
# - Simple demographic info for the users (age, gender, occupation, zip)
# - Genre information of movies

In [7]:
#Reading ratings file:
schema = types.StructType([
    types.StructField("userid", types.IntegerType()),
    types.StructField("movieid", types.IntegerType()),
    types.StructField("rating", types.IntegerType()),
    types.StructField("unixtimestamp", types.DateType())
])
train = spark.read.csv('/mnt/data/forsachid/ua.base', sep='\t', header=True, schema=schema)
test = spark.read.csv('/mnt/data/forsachid/ua.test', sep='\t' , header=True, schema=schema)
globalmean = np.mean(train.select('rating').collect())


In [8]:
#### Model-1 Recommendation based on global mean, user bias and movie bias
#### Calculate global mean, userbias and movie bias

In [9]:
userbias =  spark.createDataFrame(train.groupby('userid').mean().collect())
userbias = userbias[['userid', 'avg(rating)']].withColumnRenamed('avg(rating)', 'rating')
userbias = userbias.withColumn('newrating', userbias.rating - globalmean ).drop('rating').withColumnRenamed('newrating', 'userbias')

moviebias =  spark.createDataFrame(train.groupby('movieid').mean().collect())
moviebias = moviebias[['movieid', 'avg(rating)']].withColumnRenamed('avg(rating)', 'rating')
moviebias = moviebias.withColumn('newrating', moviebias.rating - globalmean ).drop('rating').withColumnRenamed('newrating', 'moviebias')


In [10]:
# We can see the MSE for global mean model is 1.24

In [11]:
testjoin = test.join(userbias, test.userid == userbias.userid, 'left_outer').select(test.userid, test.movieid, test.rating, userbias.userbias)
testjoin = testjoin.join(moviebias, testjoin.movieid == moviebias.movieid, 'left_outer').select(testjoin.userid, testjoin.movieid, testjoin.rating, testjoin.userbias, moviebias.moviebias)
testjoin = testjoin.withColumn('recc_rating', globalmean + testjoin.userbias + testjoin.moviebias)
testjoin = testjoin.fillna(0)
ratarray = np.array(testjoin.select('rating').collect()).astype(float)
reccratarray = np.array(testjoin.select('recc_rating').collect()).astype(float)
mse = np.mean(((ratarray-reccratarray)**2))
print(mse)

In [12]:
#Use pyspark.mllip.recommendation library to make recommendation

In [13]:
train = train.drop('unixtimestamp')
test = test.drop('unixtimestamp')

from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
trainratings = train.rdd.map(lambda r: Rating((r[0]), (r[1]), (r[2])))
#Build the recommendation model using Alternating Least Squares
rank = 50
numIterations = 20
model = ALS.train(trainratings, rank, numIterations)

In [14]:
# MAke recommendations on test set. We can see that MSE for Item to Item collaborative filtering model is 1.42

In [15]:
testratings = test.rdd.map(lambda r: Rating(int(r[0]), int(r[1]), float(r[2])))
testdata = testratings.map(lambda p: (p[0], p[1]))
predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
ratesAndPreds = testratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
print("Mean Squared Error = " + str(MSE))


In [16]:
# Conclusion - We can see that global mean model is still outperforming Item to Item collaborative filtering model even when we fit our model on entire dataset. This is very interesting result. We can tweak the Item to Item collaborative filtering model parameters even further to improve the accuracy of the model. For now we will conclude our experiment with the fact that global mean model with user bias and movie bias factored in is also a very powerful model. Complex models lile content based recommendation and collaborative filtering based models needs to fine tuned in order for them to excel global mean based model