## Assignment 2 DAT410 - Design of AI systems
## Martin Hansson




### Import libraries

In [130]:
import pandas as pd
from pyspark.sql.functions import col, explode
from pyspark import SparkContext
import numpy as np
import pyspark

### Initiate spark session

In [131]:
from pyspark.sql import SparkSession
sc = SparkContext
# sc.setCheckpointDir('checkpoint')
spark = SparkSession.builder.appName('Recommendations').getOrCreate()

# 1. Load data

### User reviews

In [132]:
user = pd.read_csv("user_reviews.csv")  #reading data
user=user[['Unnamed: 0','User']] #selecting columns for use list
user.rename(columns={user.columns[0]: 'userId',user.columns[1]: 'Name'}, inplace=True) #renaming columns
user

Unnamed: 0,userId,Name
0,0,Vincent
1,1,Edgar
2,2,Addilyn
3,3,Marlee
4,4,Javier
...,...,...
595,595,Mariana
596,596,Ivy
597,597,Kevin
598,598,Nora


In [139]:
rating_pd = pd.read_csv("user_reviews.csv")  #reading data

utility = rating_pd.drop(['Unnamed: 0', 'User'], axis =1) #selecting columns for utility matrix
utility=utility.to_numpy()
index = np.where(utility>0) # Find already rated user/movies
a=np.transpose(np.asarray(index))

#Create a list with rated movies/users
rate=np.zeros((len(a),3))
for i in range(len(a)):
    rate[i,0]=a[i,0]
    rate[i,1]=a[i,1]
    rate[i,2]=utility[a[i,0],a[i,1]]

rating_pd = pd.DataFrame(rate,columns=['userId', 'movieId','rating'])
rating_pd.head(10)

Unnamed: 0,userId,movieId,rating
0,0.0,127.0,2.0
1,0.0,141.0,3.0
2,0.0,151.0,3.0
3,0.0,223.0,3.0
4,0.0,264.0,4.0
5,0.0,324.0,5.0
6,0.0,392.0,3.0
7,0.0,573.0,5.0
8,0.0,583.0,3.0
9,0.0,594.0,5.0


### Create list with unrated movies

In [141]:
index = np.where(utility==0)
b=np.transpose(np.asarray(index))
X_new_pd =np.zeros((len(b),2))
for i in range(len(b)):
    X_new_pd[i,0]=b[i,0]
    X_new_pd[i,1]=b[i,1]


X_new_pd = pd.DataFrame(X_new_pd,columns=['userId', 'movieId'])
X_new_pd.head(10)


Unnamed: 0,userId,movieId
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,3.0
4,0.0,4.0
5,0.0,5.0
6,0.0,6.0
7,0.0,7.0
8,0.0,8.0
9,0.0,9.0


### Reading movie genres data (NOTE: This data is not needed for prediction)

In [135]:
movie_genres = pd.read_csv("movie_genres.csv")  #reading data

for j in range(2,27):
  col_name = movie_genres.columns[j]
  movie_genres[col_name] = movie_genres[col_name].astype(str)
  for i in range(0,len(movie_genres)):
    if movie_genres.iat[i,j]=='1':
      movie_genres.iat[i,j]=movie_genres.columns[j].replace('genre_', '|')+'|'
    else:
      movie_genres.iat[i,j]=''
for i in range(0,len(movie_genres)):
  movie_genres.iat[i,2]=movie_genres.iloc[i,2:27].str.cat(sep='')
movie_genres.drop(movie_genres.columns[3:27],axis=1,inplace=True)
movie_genres.columns = ['movieId', 'title','genres']
movie_genres.head(10)

 

Unnamed: 0,movieId,title,genres
0,0,The Net,|action||crime||drama||mystery||thriller|
1,1,Happily N'Ever After,|adventure||animation||comedy||family||fantasy|
2,2,Tomorrowland,|action||adventure||family||mystery||sci-fi|
3,3,American Hero,|action||comedy||drama||sci-fi|
4,4,Das Boot,|adventure||drama||thriller||war|
5,5,Final Destination 3,|horror|
6,6,Licence to Kill,|action||adventure||thriller|
7,7,The Hundred-Foot Journey,|comedy||drama|
8,8,The Matrix,|action||sci-fi|
9,9,Creature,|horror||sci-fi||thriller|


### Converting to spark data frame

In [142]:
movies = spark.createDataFrame(movie_genres)
ratings = spark.createDataFrame(rating_pd)
X_new = spark.createDataFrame(X_new_pd)

### Calculate sparsity

In [143]:
# Count the total number of ratings in the dataset
numerator = ratings.select("rating").count()

# Count the number of distinct userIds and distinct movieIds
num_users = ratings.select("userId").distinct().count()
num_movies = ratings.select("movieId").distinct().count()

# Set the denominator equal to the number of users multiplied by the number of movies
denominator = num_users * num_movies

# Divide the numerator by the denominator
sparsity = (1.0 - (numerator *1.0)/denominator)*100
print("The ratings dataframe is ", "%.2f" % sparsity + "% empty.")

The ratings dataframe is  98.62% empty.


## Interpret ratings

In [144]:
# Group data by userId, count ratings
userId_ratings = ratings.groupBy("userId").count().orderBy('count', ascending=False)
userId_ratings.show()

+------+-----+
|userId|count|
+------+-----+
| 195.0|   46|
| 159.0|   44|
| 413.0|   44|
| 499.0|   43|
| 543.0|   43|
| 171.0|   43|
| 329.0|   42|
| 360.0|   42|
|  61.0|   40|
| 326.0|   40|
| 546.0|   40|
| 530.0|   40|
| 196.0|   40|
| 458.0|   40|
| 504.0|   40|
| 482.0|   40|
| 593.0|   40|
| 299.0|   39|
|   0.0|   39|
|  27.0|   39|
+------+-----+
only showing top 20 rows



In [145]:
# Group data by userId, count ratings
movieId_ratings = ratings.groupBy("movieId").count().orderBy('count', ascending=False)
movieId_ratings.show()

+-------+-----+
|movieId|count|
+-------+-----+
| 1178.0|   20|
|  143.0|   20|
| 1894.0|   20|
| 1011.0|   19|
| 1666.0|   19|
| 1944.0|   19|
| 1088.0|   19|
|  500.0|   19|
|  669.0|   18|
|  651.0|   18|
| 1143.0|   18|
|  116.0|   18|
| 1080.0|   18|
| 1375.0|   18|
|  448.0|   18|
| 1430.0|   17|
| 1812.0|   17|
|  282.0|   17|
| 1432.0|   17|
| 1694.0|   16|
+-------+-----+
only showing top 20 rows



## Build ALS Model

In [146]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [147]:
# Create test and train set
(train, test) = ratings.randomSplit([0.9, 0.1], seed = 10)

# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, implicitPrefs = False, coldStartStrategy="drop")

# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

## Tell Spark how to tune your ALS model

In [148]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10,30,50,75,100]) \
            .addGrid(als.regParam, [.1]) \
            .build()
            #             .addGrid(als.maxIter, [5, 50, 100, 200]) \

           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  5


## Build your cross validation pipeline

In [149]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_358236c5f3aa


## Best Model and Best Model Parameters

In [150]:
#Fit cross validator to the 'train' dataset
model = cv.fit(train)

#Extract best model from the cv model above
best_model = model.bestModel

In [151]:

# Print best_model
print(type(best_model))

# Complete the code below to extract the ALS model parameters
print("**Best Model**")

# # Print "Rank"
print("  Rank:", best_model._java_obj.parent().getRank())

# Print "MaxIter"
print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# Print "RegParam"
print("  RegParam:", best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**
  Rank: 50
  MaxIter: 10
  RegParam: 0.1


In [153]:
# Test model by predicting on test set and compute RMSE
test_predictions = best_model.transform(test)
RMSE = evaluator.evaluate(test_predictions)
print(RMSE)

1.3329411568262501


### Prediction of unrated movies

In [155]:
Y_pred = best_model.transform(X_new)

### Convert result to panda data frame print results

In [156]:
prediction = Y_pred.toPandas()
prediction = prediction.join(user.set_index('userId'), on='userId')
prediction = prediction.join(movie_genres.set_index('movieId'), on='movieId')

### Rearrange columns

In [157]:
cols=['userId', 'Name', 'movieId', 'title', 'genres', 'prediction']
prediction = prediction[cols]


### Print results for first 5 userId

In [158]:
prediction[prediction['userId']==0].sort_values(by='prediction', ascending=False)

Unnamed: 0,userId,Name,movieId,title,genres,prediction
1078780,0.0,Vincent,1011.0,Perrier's Bounty,|action||comedy||crime||drama||romance||thriller|,4.891662
853389,0.0,Vincent,1587.0,Street Fighter: The Legend of Chun-Li,|action||crime||drama||mystery||thriller|,4.847449
1119555,0.0,Vincent,172.0,The Magic Sword: Quest for Camelot,|adventure||animation||comedy||drama||family||...,4.720531
557541,0.0,Vincent,1072.0,The Pacifier,|action||comedy||drama||family||thriller|,4.634796
1066961,0.0,Vincent,1187.0,Kiss the Girls,|crime||drama||mystery||thriller|,4.616664
...,...,...,...,...,...,...
247958,0.0,Vincent,774.0,12 Rounds,|action||crime||thriller|,1.135820
741021,0.0,Vincent,589.0,Richard III,|drama||war|,1.131019
1074655,0.0,Vincent,1571.0,Pale Rider,|western|,1.068643
158577,0.0,Vincent,388.0,The Wolf of Wall Street,|biography||comedy||crime||drama|,0.905781


In [159]:
prediction[prediction['userId']==1].sort_values(by='prediction', ascending=False)

Unnamed: 0,userId,Name,movieId,title,genres,prediction
786761,1.0,Edgar,1049.0,Zodiac,|crime||drama||history||mystery||thriller|,5.063781
235093,1.0,Edgar,1542.0,Wild Things,|crime||drama||mystery||thriller|,5.031269
145704,1.0,Edgar,236.0,Seeking a Friend for the End of the World,|adventure||comedy||drama||romance||sci-fi|,4.950089
1078346,1.0,Edgar,1011.0,Perrier's Bounty,|action||comedy||crime||drama||romance||thriller|,4.916863
1032215,1.0,Edgar,1983.0,Force 10 from Navarone,|action||drama||war|,4.798882
...,...,...,...,...,...,...
1074209,1.0,Edgar,1571.0,Pale Rider,|western|,1.133976
158128,1.0,Edgar,388.0,The Wolf of Wall Street,|biography||comedy||crime||drama|,0.939729
724022,1.0,Edgar,1714.0,Eve's Bayou,|drama|,0.918976
568359,1.0,Edgar,742.0,The Innkeepers,|horror|,0.863868


In [160]:
prediction[prediction['userId']==2].sort_values(by='prediction', ascending=False)

Unnamed: 0,userId,Name,movieId,title,genres,prediction
540344,2.0,Addilyn,1211.0,Morning Glory,|comedy||drama||romance|,4.895284
512534,2.0,Addilyn,547.0,Astro Boy,|action||animation||comedy||family||sci-fi|,4.849523
778896,2.0,Addilyn,1918.0,88 Minutes,|crime||drama||mystery||thriller|,4.716270
1017246,2.0,Addilyn,1728.0,BrainDead,|comedy||drama||horror||sci-fi||thriller|,4.701271
370515,2.0,Addilyn,1847.0,Sinbad: Legend of the Seven Seas,|adventure||animation||comedy||drama||family||...,4.686368
...,...,...,...,...,...,...
568765,2.0,Addilyn,742.0,The Innkeepers,|horror|,1.282332
494179,2.0,Addilyn,383.0,Frenzy,|thriller|,1.158899
1074614,2.0,Addilyn,1571.0,Pale Rider,|western|,1.108552
182187,2.0,Addilyn,604.0,The Men Who Stare at Goats,|comedy||war|,0.765583


In [161]:
prediction[prediction['userId']==3].sort_values(by='prediction', ascending=False)

Unnamed: 0,userId,Name,movieId,title,genres,prediction
801555,3.0,Marlee,1126.0,Dutch Kills,|crime||drama||thriller|,4.550503
693271,3.0,Marlee,1388.0,The Hunting Party,|adventure||comedy||drama||romance||thriller||...,4.466138
234526,3.0,Marlee,1512.0,Red State,|action||crime||horror||thriller|,4.367875
197819,3.0,Marlee,225.0,How to Lose Friends & Alienate People,|comedy||drama||romance|,4.330592
1078367,3.0,Marlee,1011.0,Perrier's Bounty,|action||comedy||crime||drama||romance||thriller|,4.277447
...,...,...,...,...,...,...
647688,3.0,Marlee,244.0,The Final Destination,|horror|,1.058703
1074230,3.0,Marlee,1571.0,Pale Rider,|western|,1.057423
247530,3.0,Marlee,774.0,12 Rounds,|action||crime||thriller|,1.027408
158149,3.0,Marlee,388.0,The Wolf of Wall Street,|biography||comedy||crime||drama|,0.823058


In [162]:
prediction[prediction['userId']==4].sort_values(by='prediction', ascending=False)

Unnamed: 0,userId,Name,movieId,title,genres,prediction
557265,4.0,Javier,1072.0,The Pacifier,|action||comedy||drama||family||thriller|,4.271224
160076,4.0,Javier,602.0,Bad Company,|action||adventure||comedy||thriller|,4.247203
1100966,4.0,Javier,605.0,Sex and the City 2,|comedy||drama||romance|,4.097620
512299,4.0,Javier,547.0,Astro Boy,|action||animation||comedy||family||sci-fi|,4.065212
904623,4.0,Javier,630.0,Chill Factor,|action||adventure||comedy||drama||thriller|,4.024970
...,...,...,...,...,...,...
247679,4.0,Javier,774.0,12 Rounds,|action||crime||thriller|,0.988118
647837,4.0,Javier,244.0,The Final Destination,|horror|,0.951566
691638,4.0,Javier,821.0,This Is England,|crime||drama|,0.878433
158298,4.0,Javier,388.0,The Wolf of Wall Street,|biography||comedy||crime||drama|,0.844921
