### Movie Recommendation System using MovieLens Dataset
In this notebook, we will explore two approaches to build a recommendation system using collaborative filtering algorithms: memory-based and model-based. Our analysis is based on a sampled MovieLens dataset with model training and inference implemented on Spark platform.

#### Table of Contents
1. [Data Import](#import)
2. [Sampling Ratings Dataset](#sampling)
3. [ALS Model Training](#alstrain)

In [325]:
import time
import pandas as pd

from pyspark import SparkContext, SQLContext

from pyspark.sql.functions import *
from pyspark.sql import functions as F

from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

#### 1. Data Import  <a id = import></a>
Something Something

In [5]:
sc = SparkContext()
sqlContext = SQLContext(sc)

csvf = 'com.databricks.spark.csv'
ratings = sqlContext.read.format(csvf).options(header='true', inferschema='true').load('data/raw/ratings.csv')

In [8]:
df=pd.read_csv('/Users/mohit/Documents/Columbia Data Science/Personalization Theory/MovieLens/MovieLens/data/raw/ratings.csv')

#### 2. Sampling Ratings Dataset <a id = sampling></a>
Something Something

In [3]:
ratings_count = ratings.groupby(['userId']).count()
quantile = ratings_count.approxQuantile('count', [0.25, 0.75], 0)

print("Ratings Count by User: 25th Percentile = "+str(quantile[0]))
print("Ratings Count by User: 75th Percentile = "+str(quantile[1]))

Ratings Count by User: 25th Percentile = 35.0
Ratings Count by User: 75th Percentile = 155.0


In [4]:
ratings_count = ratings_count.withColumn(
    'user_class', when(col('count') < quantile[0], 1).when(col('count') < quantile[1], 2).otherwise(3))
ratings_count = ratings_count.withColumnRenamed('userId', 'userId2')
ratings = ratings.join(ratings_count, ratings['userId'] == ratings_count['userId2'])
ratings = ratings.select(['userId', 'movieId', 'rating', 'timestamp', 'user_class'])

In [5]:
ratings_sampled = ratings.sampleBy('user_class', fractions = {1: 0, 2: 0.0001, 3: 0.005}, seed = 10)
print("Total Ratings in Sample = "+str(ratings_sampled.count()))
print("Distinct Users = "+str(ratings_sampled.select('userId').distinct().count())+
      " & Distinct Movies = "+str(ratings_sampled.select('movieId').distinct().count()))

Total Ratings in Sample = 69169
Distinct Users = 27074 & Distinct Movies = 8111


In [3]:
def data_sampling (df,item_nos=600,item_split=[0.90,0.10]):
    
    ##############################    Data Preprocessing from User Perspective   #########################
    
    #Frequency of movie rating by each user
    user_rtgs_cnt = (df.groupby(['userId']).count()).iloc[:,0:1].reset_index().rename(columns={"movieId":"rating_cnt"})
    print ("Original number of users in dataset : ",len(user_rtgs_cnt))
    
    quantile_user=user_rtgs_cnt.quantile([0.1,.25,.75,0.9], axis = 0).drop(["userId"],axis=1)
    print("Data distribution of frequency of movies rated by users : \n ", quantile_user)
    
    #Removing the lower 10% of the outliers.
    user_rtgs_cnt=user_rtgs_cnt[user_rtgs_cnt.rating_cnt>=quantile_user.iloc[0,0]]
    print ("Number of users in dataset post removal of bias based on user activity: ",len(user_rtgs_cnt))
    
    #These users are then removed from the dataset
    df=df.merge(user_rtgs_cnt[['userId']],on="userId", how="inner")   
    
    ##############################  Data Preprocessing from Item Perspective   #########################
    
    #Count of Ratings per movie
    item_count = (df[["movieId","rating"]].groupby(['movieId']).count()).reset_index().rename(columns={"rating":"rating_per_item"})
    print("Original number of movies in dataset :\n ",len(item_count))
    
    quantile_item=item_count.quantile([0.1,.25,.75,1], axis = 0).drop(["movieId"],axis=1)
    print("Data distribution of frequency of ratings per movie : \n ", quantile_item)
    
    #Removing all items which have less than 3 user counts i.e Q1 or based on a fixed number 
    #item_count=item_count[item_count.rating_per_item>=quantile_item.iloc[1,0]].reset_index(drop=True)
    item_count=item_count[item_count.rating_per_item>=5].reset_index(drop=True)
    item_count["item_subset"]=np.where(item_count.rating_per_item < quantile_item.iloc[2,0],1,2)
    print("Total number of movies in dataset post removal of low rated movies: ",len(item_count))
    
    
    ######################################################  Data Sampling   #########################
    
    sampled_ratings=pd.DataFrame()
    j=len(item_split)-1
    
    for i in item_count.item_subset.unique():
        sampled_ratings=sampled_ratings.append(item_count[item_count.item_subset==i].sample(n=int(item_split[j]*item_nos), random_state=10))
        j=j-1
        
    sampled_ratings.reset_index(drop=True, inplace=True)   
    print ("Sum of all the ratings for the selected movies : ",sampled_ratings['rating_per_item'].sum())
    
    
    #Select user rows for only those movies which have been sampled
    df=df.merge(sampled_ratings[['movieId']],on="movieId", how="inner")
    
    #Since not all items are selected it may happen that we again get items with only user frequency.
    #Removing single frequency users so as to reduce sparsity and enable item-item comparison between pairs
    
    user_rtgs_cnt_2=(df.groupby(['userId']).count()).iloc[:,0:1].reset_index().rename(columns={"movieId":"user_freq"})
    df=df.merge(user_rtgs_cnt_2,on="userId", how="inner")
    
    #For any personalized recommendation to a user, we are setting a rule that user should have watched 5 movies before. 
    #Before that only popular recommendations to him
    df=df[df.user_freq>7] 
    df.drop(['user_freq'],axis=1, inplace=True)
    df=df.reset_index(drop=True)
    print("Number of rows in total sampled dataset : ", len(df))
    
    #############################################   Train-Test Split   ###################################
    
    df_train=df.groupby(['userId']).apply(lambda x : x.sample(frac=0.8,random_state=10)).reset_index(drop=True)
    z=df.merge(df_train,how='outer',on=['userId','movieId','rating','timestamp'],indicator=True)
    df_test=z.query('_merge != "both"')
    df_test=df_test.drop(['_merge'],axis=1)
    df_test.reset_index(drop=True,inplace=True)
        
    return [df, df_train, df_test]

In [9]:
ratings_sampled, train, test=data_sampling(df)

Original number of users in dataset :  138493
Data distribution of frequency of movies rated by users : 
        rating_cnt
0.10        24.0
0.25        35.0
0.75       155.0
0.90       334.0
Number of users in dataset post removal of bias based on user activity:  125431
Original number of movies in dataset :
  26737
Data distribution of frequency of ratings per movie : 
        rating_per_item
0.10              1.0
0.25              3.0
0.75            204.0
1.00          65080.0
Total number of movies in dataset post removal of low rated movies:  18328
Sum of all the ratings for the selected movies :  250958
Number of rows in total sampled dataset :  90256


#### 3. ALS Model Training <a id = alstrain></a>
Something Something

In [14]:
(training, test) = ratings_sampled.randomSplit([0.8, 0.2])

In [12]:
ratings_sampled = sqlContext.createDataFrame(ratings_sampled)

In [13]:
type(ratings_sampled)

pyspark.sql.dataframe.DataFrame

In [334]:
def als_model_train(training):
    # Initializing implicit ALS with user, movie and ratings column
    als = ALS(userCol="userId", 
              itemCol="movieId", 
              ratingCol="rating",
              nonnegative=True,
              coldStartStrategy="drop")
    
    # We use a ParamGridBuilder to construct a grid of parameters to search over
    param_grid = ParamGridBuilder() \
        .addGrid(als.rank, [50, 75, 100]) \
        .addGrid(als.regParam, [0.01, 0.1, 1.0]) \
        .build()
    
    # Defining the evaluation criteria for choosing best set of hyperparameters
    evaluator = RegressionEvaluator(metricName="rmse", 
                                    labelCol="rating", 
                                    predictionCol="prediction")
    
    # To try all combinations of hyperparameters and determine best model using evaluator
    hypertuned = CrossValidator(estimator=als, 
                                estimatorParamMaps=param_grid, 
                                evaluator=evaluator,
                                numFolds=4)
    
    # Choosing the best set of hyperparameters from cross validation
    cvModel = hypertuned.fit(training)
    
    return cvModel

In [319]:
def als_model_predict(model, test):
    predictions = model.bestModel.transform(test)
    predictions = predictions.withColumn('prediction', 
                                         when(col('prediction') > 5, 5).otherwise(col('prediction')))
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    return rmse

In [278]:
def als_model_recommend(model, k = 10):
    user_recs = model.bestModel.recommendForAllUsers(ratings_sampled.select('movieId').distinct().count())
    user_recs_pd = userRecs.toPandas()
    user_rated = ratings_sampled.toPandas()
    
    user_rated_movies = user_rated.groupby('userId')['movieId'].apply(lambda x: x.values.tolist()).to_dict()
    user_movie_recs = pd.DataFrame(columns = ['userId', 'recommendations'])
    
    for i in range(len(user_recs_pd)):
        userID = user_recs_pd['userId'][i]
        user_movie_recs.loc[i, 'userId'] = userID
        rated_movies = user_rated_movies.get(userID)
        
        count = 0
        recommendations = []
        for j in range(len(user_recs_pd.loc[i, 'recommendations'])):
            if(user_recs_pd.loc[i, 'recommendations'][j][0] not in rated_movies):
                recommendations.append((user_recs_pd.loc[i, 'recommendations'][j][0], 
                                        user_recs_pd.loc[i, 'recommendations'][j][1]))
                count = count + 1
            if(count == k):
                user_movie_recs.loc[i, 'recommendations'] = recommendations
                break
    
    return user_movie_recs

In [274]:
x = model.bestModel.recommendForAllUsers(ratings_sampled.select('movieID').distinct().count())

In [322]:
temp = als_model_predict(model, test)

In [333]:
params = [{p.name: v for p, v in m.items()} for m in model.getEstimatorParamMaps()]
hyperparameter pd.DataFrame.from_dict([
    {model.getEvaluator().getMetricName(): metric, **ps} 
    for ps, metric in zip(params, model.avgMetrics)
])

Unnamed: 0,rank,regParam,rmse
0,50,0.1,0.99708
1,50,1.0,1.32292
2,70,0.1,0.993696
3,70,1.0,1.32292


In [281]:
predictions = model.bestModel.transform(test)

In [177]:
user_rated = ratings_sampled.toPandas()
user_rated_movies = user_rated.groupby('userId')['movieId'].apply(lambda x: x.values.tolist()).to_dict()

In [268]:
user_movie_recs = pd.DataFrame(columns = ['userId', 'recommendations'])
for i in range(len(userRecsPD)):
    userID = userRecsPD['userId'][i]
    user_movie_recs.loc[i, 'userId'] = userID
    rated_movies = user_rated_movies.get(userID)
    count = 0
    recommendations = []
    for j in range(len(userRecsPD.loc[i, 'recommendations'])):
        if(userRecsPD.loc[i, 'recommendations'][j][0] not in rated_movies):
            recommendations.append((userRecsPD.loc[i, 'recommendations'][j][0], 
                                    userRecsPD.loc[i, 'recommendations'][j][1]))
            count = count + 1
        if(count == 10):
            user_movie_recs.loc[i, 'recommendations'] = recommendations
            break

SyntaxError: can't assign to literal (<ipython-input-268-170d3d8eec74>, line 6)

In [17]:
start = timeit.timeit()
model = als_model_train(training)
# y = als_model_predict(x, test)
end = timeit.timeit()
print(end - start)

0.00043247300004622957


In [18]:
pred = als_model_predict(model, test)
# userRecs = model.recommendForAllUsers(10)

In [470]:
distinct_movies = ratings.select('movieId').distinct()

In [485]:
userRecsPD = userRecs.toPandas()

In [19]:
print(pred)

0.9286564163328874


In [460]:
predictions.show()

+------+-------+------+----------+----------+------------+
|userID|movieID|rating| timestamp|user_class|  prediction|
+------+-------+------+----------+----------+------------+
| 77156|   1580|   2.0|1210200237|         3|   1.1354338|
|137883|   1645|   4.5|1185243830|         3|  -0.5382513|
|  1779|   2366|   3.0| 918644611|         3|  -1.6133387|
| 80778|    858|   3.0|1329837137|         3|  -1.0161623|
|109934|    858|   5.0|1051051191|         3|   1.6370859|
| 23911|    858|   5.0|1097624530|         3|   1.1424176|
| 36884|    858|   5.0|1197993216|         3| -0.68622744|
| 80087|   1025|   4.0| 948333039|         3|   2.8063383|
|136497|   1127|   4.0|1086055105|         3|   1.5948832|
| 13285|   1483|   0.5|1107027300|         3|   -3.513239|
|135128|   1483|   4.5|1215795673|         3|  -0.8128172|
| 14700|   1721|   4.0|1111479834|         3|  0.68048674|
| 98464|   3698|   2.0|1072034871|         3|   0.6222136|
| 12954|  48780|   4.0|1390951222|         3|  -2.571923

In [411]:
# x.getEstimatorParamMaps()[ np.argmax(x.avgMetrics) ]

x.bestModel.transform(test).show()
# for x in range(len(bestModel.stages)):
#     print (bestModel.stages[x])


+------+-------+------+----------+----------+-------------+
|userID|movieID|rating| timestamp|user_class|   prediction|
+------+-------+------+----------+----------+-------------+
| 78308|   4344|   2.5|1111822036|         3|-2.894627E-12|
| 15278|   5528|   0.5|1111818810|         3| 3.9035367E-5|
|109258|  52722|   3.0|1332136180|         3| -4.156107E-4|
+------+-------+------+----------+----------+-------------+



In [316]:
als = ALS(maxIter=5, 
          regParam=0.01, 
          userCol="userID", 
          itemCol="movieID", 
          ratingCol="rating", 
          coldStartStrategy="drop")
model = als.fit(training)

In [320]:
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", 
                                labelCol="rating", 
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-Mean-Square-Error = " + str(rmse))

Root-Mean-Square-Error = 3.884697743067076


In [331]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)

# Generate top 10 movie recommendations for a specified set of users
users = ratings.select(als.getUserCol()).distinct().limit(3)
userSubsetRecs = model.recommendForUserSubset(users, 10)