# ALS Bagging Ensemble for Movie Recommendation
<br>  
This notebook shows how to use a bagging ensemble with PySpark ALS on [MovieLens](https://grouplens.org/datasets/movielens/) data.

In [5]:
import pyspark
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    FloatType,
    IntegerType,
    LongType
)

import pandas as pd
from bagging import RecoBagging

# Create the Spark session used by every cell below, or reuse one that is
# already running. The application name is what shows up in the Spark UI and
# logs, so it names this notebook's actual workload (ALS bagging) instead of
# the unrelated "SAR" label it previously carried.
spark = SparkSession.builder.appName("ALS Bagging pySpark").getOrCreate()

In [7]:
# Ensemble size: number of models to train and combine
# (passed as `num_models` to RecoBagging further down).
NUM_MODELS = 10

# Number of items to recommend for each user
# (passed as `top_k` to `recommend_k_items` further down).
TOP_K = 10

Load the MovieLens 100k data and randomly split it into training (70%) and test (30%) sets.

In [2]:
# Tab-separated ratings file of the MovieLens 100k dataset (no header row).
# Downloaded over HTTPS rather than plain HTTP so the transfer is
# integrity-protected in transit.
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
data_pd = pd.read_csv(
    url,
    sep='\t',
    names=['userId', 'movieId', 'rating', 'timestamp'],
    # Parse ratings as floats at read time instead of overwriting the column
    # afterwards, so re-running the cell never depends on earlier state.
    dtype={'rating': float},
)

# Guard against a truncated or otherwise unexpected download.
assert len(data_pd) == 100000, \
    "expected 100000 ratings, got {}".format(len(data_pd))

data_pd.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [3]:
# Explicit Spark schema for the ratings table; column names mirror the
# pandas frame it is created from.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ('userId', IntegerType()),
        ('movieId', IntegerType()),
        ('rating', FloatType()),
        ('timestamp', LongType()),
    )
])
data_df = spark.createDataFrame(data_pd, schema=schema)

# Random 70/30 split with a fixed seed.
train, test = data_df.randomSplit([0.7, 0.3], seed=123)

# Mark both splits for caching and count their rows (train first, then test).
train_count = train.cache().count()
test_count = test.cache().count()
print("Train vs test: {} vs {}".format(train_count, test_count))

Train vs test: 69854 vs 30146



Train multiple ALS models with bootstrap sampling. To add more diversity to the ensemble, some of the ALS hyperparameters are randomized too.


In [8]:
# ALS keyword arguments handed to RecoBagging.
# Tuple values (rank, alpha, regParam) appear to be (low, high) ranges from
# which a value is drawn per model, judging by the training log, which prints
# a single number inside each range -- TODO confirm against RecoBagging.
# NOTE(review): no random seed is passed here; check whether RecoBagging
# exposes one if reproducible ensembles are needed.
params = dict(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    rank=(20, 50),
    maxIter=15,
    implicitPrefs=True,
    alpha=(0.1, 40.0),
    regParam=(0.01, 0.2),
    coldStartStrategy='drop',
    nonnegative=True,
)

# Column names RecoBagging itself is told about (same columns as in `params`).
column_names = dict(user_col='userId', item_col='movieId', rating_col='rating')

bagging = RecoBagging(ALS, num_models=NUM_MODELS, **column_names, **params)

# Fit the ensemble on the training split.
bagging.fit(train)

Training model 0 {'userCol': 'userId', 'itemCol': 'movieId', 'ratingCol': 'rating', 'rank': 37, 'maxIter': 15, 'implicitPrefs': True, 'alpha': 27.390812878519107, 'regParam': 0.11073720135143822, 'coldStartStrategy': 'drop', 'nonnegative': True}
Training model 1 {'userCol': 'userId', 'itemCol': 'movieId', 'ratingCol': 'rating', 'rank': 37, 'maxIter': 15, 'implicitPrefs': True, 'alpha': 5.640445320280572, 'regParam': 0.06761005361189487, 'coldStartStrategy': 'drop', 'nonnegative': True}
Training model 2 {'userCol': 'userId', 'itemCol': 'movieId', 'ratingCol': 'rating', 'rank': 41, 'maxIter': 15, 'implicitPrefs': True, 'alpha': 1.2118704217201888, 'regParam': 0.1975057861034909, 'coldStartStrategy': 'drop', 'nonnegative': True}
Training model 3 {'userCol': 'userId', 'itemCol': 'movieId', 'ratingCol': 'rating', 'rank': 47, 'maxIter': 15, 'implicitPrefs': True, 'alpha': 16.44757889508595, 'regParam': 0.19468782404064525, 'coldStartStrategy': 'drop', 'nonnegative': True}
Training model 4 {'

Recommend the top-k movies for each user.

In [9]:
# Ask the ensemble for TOP_K items per user in the test split, merging the
# individual models' outputs with merge_by='sum' and scale=True.
recommendations = bagging.recommend_k_items(
    test,
    top_k=TOP_K,
    merge_by='sum',
    scale=True,
)

# Cache the result, then print a preview of it.
cached_recommendations = recommendations.cache()
cached_recommendations.show()

Recommending by 0
Recommending by 1
Recommending by 2
Recommending by 3
Recommending by 4
Recommending by 5
Recommending by 6
Recommending by 7
Recommending by 8
Recommending by 9
+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   148|[[408, 2.01506903...|
|   463|[[124, 1.99898111...|
|   471|[[969, 1.76125515...|
|   496|[[98, 0.920692272...|
|   833|[[32, 1.884712659...|
|   243|[[137, 2.29751922...|
|   392|[[286, 2.83141433...|
|   540|[[117, 2.39173550...|
|   623|[[50, 1.670710236...|
|   737|[[23, 0.926466422...|
|   858|[[286, 2.56716712...|
|   897|[[98, 1.351963298...|
|    31|[[484, 1.84409073...|
|   516|[[127, 1.70551274...|
|    85|[[381, 1.68441748...|
|   137|[[181, 3.71769568...|
|   251|[[50, 3.185611522...|
|   451|[[331, 1.74581842...|
|   580|[[748, 1.72157082...|
|   808|[[302, 2.15206930...|
+------+--------------------+
only showing top 20 rows

