In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import datetime
plt.style.use('dark_background')
import sys
sys.path.insert(1, '/home/mauricio/code/mcr')
from mcr.util import glimpse, plot_value_counts, plot_value_counts_timeseries, missing_report, plot_missing, plot_unique, plot_duplicates, size

from pyspark import SparkContext
# SparkContext.getOrCreate(conf: Optional[pyspark.conf.SparkConf] = None) -> 'SparkContext'
sc = SparkContext.getOrCreate()
# sc.setLogLevel('DEBUG')

from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local[*]').appName('spark_application').getOrCreate()
print(spark.version)

from pyspark.sql import functions as F
from pyspark.sql.types import *

23/04/28 11:38:03 WARN Utils: Your hostname, rig resolves to a loopback address: 127.0.1.1; using 192.168.0.105 instead (on interface enp6s0)
23/04/28 11:38:03 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/28 11:38:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
3.3.2


# Introduction to the MovieLens dataset

## MovieLens dataset
F. Maxwell Harper and Joseph A. Konstan. 2015  
The MovieLens Datasets: History and Context.  
ACM Transitions on Interactive Intelligent Systems (TiiS) 5, 4, Article 19 (December 2015), 19  
Pages. DOI=http://dx.doi.org/10.1145/2827872

## MovieLens summary stats
        Ratings: 20,000,000+
        Users: 138,493
        Movies: 27,278

## Explore the data

In [2]:
# Read data from CSV file
ratings = spark.read.csv('ratings.csv',
                         sep=',',
                         header=True,
                         inferSchema=True).drop('timestamp')
ratings.show(3)
ratings.printSchema()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|     31|   2.5|
|     1|   1029|   3.0|
|     1|   1061|   3.0|
+------+-------+------+
only showing top 3 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)



## MovieLens sparsity
$$Sparsity = 1 - \frac{Number\ of\ Ratings\ in\ Matrix}{Number\ of\ Users \times Number\ of\ Movies}$$

## Sparsity

In [3]:
# Number of ratings in matrix
number_of_ratings = ratings.count()
number_of_ratings

100004

In [4]:
# Distinct users
number_of_users = ratings.select("userId").distinct().count()
number_of_users

671

In [5]:
# Distinct movies
number_of_movies = ratings.select("movieId").distinct().count()
number_of_movies

9066

In [6]:
#Calculating sparsity
print("Sparsity: ", 1 - (number_of_ratings / (number_of_users * number_of_movies)))

Sparsity:  0.9835608583913366


## The .distinct() method

In [7]:
ratings.select("userId").distinct().count()

671

## GroupBy method

In [8]:
# Num of song plays by userId
ratings.groupBy("userId").count().show()

+------+-----+
|userId|count|
+------+-----+
|   148|  132|
|   463|  483|
|   471|  216|
|   496|  126|
|   243|  307|
|   392|   25|
|   540|   20|
|   623|  103|
|    31|   69|
|   516|  149|
|    85|  107|
|   137|   80|
|   251|  119|
|   451|   52|
|   580|  922|
|    65|   27|
|   458|   76|
|    53|   46|
|   255|  145|
|   481|  436|
+------+-----+
only showing top 20 rows



## GroupBy method min

In [9]:
#from pyspark.sql.functions import min, max, avg
# Min num of song plays by userId
ratings.groupBy("userId").count().select(F.min("count")).show()

+----------+
|min(count)|
+----------+
|        20|
+----------+



## GroupBy method max

In [10]:
# Max num of song plays by userId
ratings.groupBy("userId").count().select(F.max("count")).show()

+----------+
|max(count)|
+----------+
|      2391|
+----------+



## GroupBy method avg

In [11]:
# Avg num of song plays by userId
ratings.groupBy("userId").count().select(F.avg("count")).show()

+------------------+
|        avg(count)|
+------------------+
|149.03725782414307|
+------------------+



## Filter method

In [12]:
# Removes users with less than 20 ratings
ratings.groupBy("userId").count().filter(F.col("count") >= 20).show()

+------+-----+
|userId|count|
+------+-----+
|   148|  132|
|   463|  483|
|   471|  216|
|   496|  126|
|   243|  307|
|   392|   25|
|   540|   20|
|   623|  103|
|    31|   69|
|   516|  149|
|    85|  107|
|   137|   80|
|   251|  119|
|   451|   52|
|   580|  922|
|    65|   27|
|   458|   76|
|    53|   46|
|   255|  145|
|   481|  436|
+------+-----+
only showing top 20 rows



## Exercises

# ALS model buildout on MovieLens Data

## Fitting a basic model

In [13]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

# Split data
(training_data, test_data) = ratings.randomSplit([0.8, 0.2])
# Build ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          rank=25, maxIter=100, regParam=0.05,
          nonnegative=True, coldStartStrategy="drop", implicitPrefs=False)

# https://intellipaat.com/community/18452/spark-gives-a-stackoverflowerror-when-training-using-als
sc.setCheckpointDir('checkpoint/')
# ALS.checkpointInterval = 2
# Fit model to training data
model = als.fit(training_data)

# Generate predictions on test_data
predictions = model.transform(test_data)
# Tell Spark how to evaluate predictions
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
# Obtain and print RMSE
rmse = evaluator.evaluate(predictions)
print("RMSE: ", rmse)

RMSE:  0.9656497242107647


## Build generic ALS model without hyperparameters

In [14]:
# Build generic ALS model without hyperparameters
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating",
          coldStartStrategy="drop", nonnegative = True, implicitPrefs = False)

## Adding Hyperparameter Values to the ParamGridBuilder

In [15]:
# Imports ParamGridBuilder package
from pyspark.ml.tuning import ParamGridBuilder

# Creates a ParamGridBuilder, and adds hyperparameters and values
# param_grid = ParamGridBuilder()\
#     .addGrid(als.rank, [5, 40, 80, 120])\
#     .addGrid(als.maxIter, [5, 100, 250, 500])\
#     .addGrid(als.regParam, [.05, .1, 1.5])\
#     .build()
param_grid = ParamGridBuilder()\
    .addGrid(als.rank, [5, 10, 20])\
    .addGrid(als.maxIter, [5, 10, 20])\
    .addGrid(als.regParam, [.05, .1, 1.5])\
    .build()

## CrossValidator

In [16]:
from pyspark.ml.tuning import CrossValidator

# Build cross validation step using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid,
                    evaluator=evaluator, numFolds=5)

## Best model

In [17]:
# Tell Spark what values to try for each hyperparameter
# Run the cv on the training data
model = cv.fit(training_data)
# Extract best combination of values from cross validation
best_model = model.bestModel

**Last executed at 2023-04-26 15:50:24 in 1m 59.83s**

In [18]:
type(best_model)

pyspark.ml.recommendation.ALSModel

## Predictions and performance evaluation

In [19]:
# Generate test set predictions and evaluate using RMSE
predictions = best_model.transform(test_data)
rmse = evaluator.evaluate(predictions)

In [20]:
# Print evaluation metrics and model parameters
print("**Best Model**")
print("RMSE = ", rmse)
print(" Rank: ", best_model.rank)
print(" MaxIter: ", best_model._java_obj.parent().getMaxIter())
print(" RegParam: ", best_model._java_obj.parent().getRegParam())

**Best Model**
RMSE =  0.9113623121523982
 Rank:  20
 MaxIter:  5
 RegParam:  0.1


## Exercises

### Create test/train splits and build your ALS model

In [24]:
# Import the required functions
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create test and train set
(train, test) = ratings.randomSplit([0.8, 0.2], seed = 1234)

# Create ALS model
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", nonnegative = True, coldStartStrategy='drop', implicitPrefs = False)

# Confirm that a model called "als" was created
type(als)

pyspark.ml.recommendation.ALS

### Tell Spark how to tune your ALS model

In [25]:
# Import the requisite items
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Add hyperparameters and their respective values to param_grid
param_grid = ParamGridBuilder() \
            .addGrid(als.rank, [10, 50, 100, 150]) \
            .addGrid(als.maxIter, [5, 50, 100, 200]) \
            .addGrid(als.regParam, [.01, .05, .1, 1.5]) \
            .build()
           
# Define evaluator as RMSE and print length of evaluator
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction") 
print ("Num models to be tested: ", len(param_grid))

Num models to be tested:  64


### Build your cross validation pipeline

In [26]:
# Build cross validation using CrossValidator
cv = CrossValidator(estimator=als, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=5)

# Confirm cv was built
print(cv)

CrossValidator_3f639e395316


### Fit the best model

In [42]:
from os.path import isdir
from pyspark.ml.tuning import CrossValidatorModel

In [60]:
path = 'saved_model'
if not isdir(path):
    print(f'Saving {path}')
    #Fit cross validator to the 'train' dataset
    model = cv.fit(train)
    model.save(path)
else:
    print(f'Loading {path}')
    model = CrossValidatorModel.load(path)

Loading saved_model


In [61]:
#Extract best model from the cv model above
best_model = model.bestModel

**Last executed at 2023-04-26 18:31:13 in 2h 4m 20s**

### Best Model and Best Parameters

In [63]:
# # Print best_model
# print(type(best_model))

# # Complete the code below to extract the ALS model parameters
# print("**Best Model**")

# # Print "Rank"
# print("  Rank:", best_model._java_obj.parent().getRank())

# # Print "MaxIter"
# print("  MaxIter:", best_model._java_obj.parent().getMaxIter())

# # Print "RegParam"
# print("  RegParam:", best_model._java_obj.parent().getRegParam())

<class 'pyspark.ml.recommendation.ALSModel'>
**Best Model**


AttributeError: 'NoneType' object has no attribute 'getRank'

        <class 'pyspark.ml.recommendation.ALSModel'>
        **Best Model**
          Rank: 100
          MaxIter: 5
          RegParam: 0.1

# Model Performance Evaluation and Output Cleanup

## Exercises

### Generate predictions and calculate RMSE

In [66]:
# Generate predictions on test_data
predictions = best_model.transform(test)
# Tell Spark how to evaluate predictions
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
# Obtain and print RMSE
evaluator.evaluate(predictions)

0.8982861635120967

### Do recommendations make sense

In [71]:
# Look at user 60's ratings
print("User 60's Ratings:")
ratings.filter(F.col("userId") == 60).sort("rating", ascending = False).show()

# Look at the movies recommended to user 60
print("User 60s Recommendations:")
predictions.filter(F.col("userId") == 60).show()

# Look at user 63's ratings
print("User 63's Ratings:")
ratings.filter(F.col("userId") == 63).sort("rating", ascending = False).show()

# Look at the movies recommended to user 63
print("User 63's Recommendations:")
predictions.filter(F.col("userId") == 63).show()

User 60's Ratings:
+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    60|    541|   5.0|
|    60|   1653|   5.0|
|    60|   1732|   5.0|
|    60|   2324|   5.0|
|    60|   3949|   5.0|
|    60|    235|   5.0|
|    60|   5995|   5.0|
|    60|   6350|   5.0|
|    60|   7361|   5.0|
|    60|   8638|   5.0|
|    60|   8981|   5.0|
|    60|  27803|   5.0|
|    60|  30749|   5.0|
|    60|   5060|   5.0|
|    60|    858|   5.0|
|    60|   5690|   5.0|
|    60|   1221|   5.0|
|    60|   1673|   4.5|
|    60|   1080|   4.5|
|    60|   1208|   4.5|
+------+-------+------+
only showing top 20 rows

User 60s Recommendations:
+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    60|     21|   3.0|  3.548698|
|    60|    163|   4.0| 3.6403236|
|    60|    466|   4.0| 3.0087955|
|    60|    745|   3.5| 4.0264053|
|    60|   1080|   4.5| 4.1969285|
|    60|   1090|   4.0| 4.2307177|
|    60|   1221|   5.0| 4.4486976|
|  