# Import Packages & Data

In [18]:
import pandas as pd 
import numpy as np
from scipy import sparse
import pyspark as spark
from pyspark.sql import SparkSession

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder


from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import Reader, Dataset
from surprise import accuracy

# importing relevant libraries
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline
from surprise.prediction_algorithms import KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
from surprise.model_selection import GridSearchCV

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [158]:
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder, CrossValidator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [19]:
spark = SparkSession.builder.master('local').getOrCreate()

## active SparkSession
spark = SparkSession\
        .builder\
        .appName('ALSExample').config('spark.driver.host', 'localhost')\
        .getOrCreate()

In [20]:
#For reproducibility

import random
import numpy as np

my_seed = 24
random.seed(my_seed)
np.random.seed(my_seed)

In [None]:
pip install jupyter-spark
jupyter serverextension enable --py jupyter_spark
jupyter nbextension install --py jupyter_spark
jupyter nbextension enable --py jupyter_spark
jupyter nbextension enable --py widgetsnbextension

## Dataframe Imports

In [47]:
ratings = pd.read_csv('ratings')
gridsearch_models = pd.read_csv('gridsearch_models')

In [32]:
movie_ratings = spark.read.csv('ratings', header='true', inferSchema='true')

# Spark ALS Model

## First pass - vanilla

In [23]:
# Build the recommendation model using ALS on the training data

In [33]:
#Train-test split

(train, test) = movie_ratings.randomSplit([0.8, 0.2])

In [34]:
# Cold start strategy set to 'drop' to exclude NaN data from evaluation.

als = ALS(maxIter=5,rank=4, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating',
          coldStartStrategy='drop')

In [36]:
# fit the ALS model to the training set
model = als.fit(train)

In [37]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))


#RMSE is greater than that of our best SVDpp model

Root-mean-square error = 1.004732324130051


## Refining ALS Model

In [49]:
# initialize the ALS model
als_model = ALS(userCol='userId', itemCol='movieId', 
                ratingCol='rating', coldStartStrategy='drop')

# create the parameter grid                 
params = ParamGridBuilder()\
          .addGrid(als_model.regParam, [0.01, 0.05, 0.1,1.0])\
          .addGrid(als.maxIter, [5, 50,100, 250, 500])\
          .addGrid(als_model.rank, [4, 10,25, 50,100]).build()

# instantiating crossvalidator estimator
cv = CrossValidator(estimator=als_model, estimatorParamMaps=params,evaluator=evaluator,parallelism=4)
best_model = cv.fit(train)

In [50]:
# Evaluate the model by computing the RMSE on the test data

predictions = best_model.transform(test)
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating',predictionCol='prediction')
rmse = evaluator.evaluate(predictions)
print('Root-mean-square error = ' + str(rmse))


#RMSE is greater than that of our best SVDpp model, but it is improved over the initial model. 
#We will move forward with this model given efficient computation and similar RMSE score (~1 star off)

Root-mean-square error = 0.8854920503652149


In [56]:
#SVDpp_Gridsearch RMSE score below for reference

gridsearch_models[:1]

Unnamed: 0,Model,RMSE
0,SVDpp_Gridsearch,0.852285


In [368]:
best_model.bestModel.rank

100

# Making Recommendations

New users will be promopted to provide information regarding their pre-existing movie preferences. In order to do so, we will prompt them using the 250 most reviewed movies. We will use rating frequency to infer popularity, in the hopes that new users are likely to have already viewed these films.

In [196]:
#Creating list of movie genres

movie_genres = []

for genres in movie_info.genres:
    genre_list = genres.split('|')
    for x in genre_list:
        movie_genres.append(x)

movie_genres = set(movie_genres)
movie_genres.remove('(no genres listed)')
movie_genres

{'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [383]:
#Creating a formula to generate recommendations for new and existing users. 
#First, we ascertain whether or not a user is returning or new

user_id = input("Please enter your user Id, press n if you are a new user:")
print("")
if user_id == 'n':
    user_id = 0

else:
    user_id = int(user_id)
    
    
#If this is a returning user, 

if len(movie_info.rating[movie_info.userId == user_id])>0: #checking this user has input ratings
    print("Hi, user", user_id)
    print("")
    print("Welcome back.")
    print("")
    print("Recommended movies for you:")
    print("")
    
    #using model to make recommendations based on existing user
    
    movie_ids = []
    recommendations = model.recommendForAllUsers(5)
    recs_for_user = recommendations.where(recommendations.userId == user_id).take(1)
    
    for x in recs_for_user[0]['recommendations']:
        rec_id = x[0]
        movie_ids.append(rec_id)
        
    #getting move titles 
    
    recommended_movies = []
    
    for x in movie_ids:
        movie = list(movie_info.title[movie_info.movieId == x].unique())[0]
        print(movie)

#If this is a new user, 
        
if user_id == 0:
    
    print("Welcome new user!")
    print("")
    print("Help us figure out what kind of movies you enjoy by rating 5 titles")
    print("")
    
    num = 5
    rating_list = []

    while num > 0:
        if num == 0:
            False

        else: 

            rate_movie_id = list(movie_info.sample(1)['movieId'])[0]
            rate_movie_title = list(movie_info.title[movie_info.movieId == rate_movie_id].unique())[0]
            print(rate_movie_title)
            rating = rating = input("""How do you rate this movie on a scale of 1-5?
            Press n if you have not seen or already rated:""")
            print("")

            if rating == 'n':
                pass   
            else:
                rating_one_movie = (user_id,int(rate_movie_id),float(rating))
                rating_list.append(rating_one_movie) 
                num -= 1
    
    print("")
    print("Generating recommendations...")
    print("")
    
    new_data = spark.createDataFrame(rating_list,movie_ratings.columns)
    new_data_df = movie_ratings.union(new_data)
    
    als = ALS(maxIter=5,rank=100, regParam=0.01, userCol='userId', itemCol='movieId', ratingCol='rating',
          coldStartStrategy='drop')

    new_user_model = als.fit(new_data_df)
    
    print("")
    print("Recommended movies for you:")
    print("")
    
    movie_ids = []
    recommendations = new_user_model.recommendForAllUsers(5)
    recs_for_user = recommendations.where(recommendations.userId == user_id).take(1)
    
    for x in recs_for_user[0]['recommendations']:
        rec_id = x[0]
        movie_ids.append(rec_id)
        
    #getting move titles 
    
    recommended_movies = []
    
    for x in movie_ids:
        movie = list(movie_info.title[movie_info.movieId == x].unique())[0]
        print(movie)
    
    

Please enter your user Id, press n if you are a new user:n

Welcome new user!

Help us figure out what kind of movies you enjoy by rating 5 titles

Legend of Bagger Vance, The (2000)
How do you rate this movie on a scale of 1-5?
            Press n if you have not seen or already rated:n

Baby Driver (2017)
How do you rate this movie on a scale of 1-5?
            Press n if you have not seen or already rated:4

Ghost World (2001)
How do you rate this movie on a scale of 1-5?
            Press n if you have not seen or already rated:n

Ponyo (Gake no ue no Ponyo) (2008)
How do you rate this movie on a scale of 1-5?
            Press n if you have not seen or already rated:n

Dante's Peak (1997)
How do you rate this movie on a scale of 1-5?
            Press n if you have not seen or already rated:n

Zodiac (2007)
How do you rate this movie on a scale of 1-5?
            Press n if you have not seen or already rated:n

As Good as It Gets (1997)
How do you rate this movie on a scale of 1

# Conclusions and Next Steps

The final ALS model is the best model for our purposes, given the low RMSE (<0.9) and quick processing time.

The recommendation system has some room for improvement. A few ideas below:

- Ask new users to rate some of the most popular movies, to avoid drawing out the initial survey. The initial survey is necessary to resolve the cold start problem.

- Create a more sophisticated model that can generate recommendations based on genres or even age group (children only, for example).

- Our dataset has a high number of comedy, fantasy and thriller movies. It's possible that our recommendations won't be as strong for movies that fall into less popular genres. We could consider a larger dataset.

- We should be updating the dataset as new movies become available on our platform.