In [1]:
import glob
import os
import sys

# Configure the environment before PySpark is loaded: launch a plain pyspark shell,
# run Python 3, and use the cluster's Spark 2 client installation.
os.environ["PYSPARK_SUBMIT_ARGS"] = 'pyspark-shell'
os.environ["PYSPARK_PYTHON"] = 'python3'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark sources shipped with the Spark client importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))

# Locate the bundled py4j archive by pattern instead of pinning one version:
# the archive name changes between Spark releases, and a pinned name that does
# not exist is silently ignored by the import system.
for py4j_zip in sorted(glob.glob(os.path.join(spark_home, 'python/lib/py4j-*-src.zip'))):
    sys.path.insert(0, py4j_zip)

# Run PySpark's shell bootstrap in this namespace; it prints the banner and
# makes the SparkSession available as `spark`. The file is closed once read.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.3.0
      /_/

Using Python version 3.6.4 (default, Jan 28 2018 00:00:00)
SparkSession available as 'spark'.


In [2]:
# Display the SparkSession created by the bootstrap cell to confirm it is available.
spark

![ALS](pics/mf.png)

## Our goal is to represent the matrix of user-item ratings as the product of two LOW RANK matrices 
## $$R \approx U \times P^\top$$
## where 
## $$U \in \mathbb{R}^{n \times k}, k \ll n$$
## and 
## $$P \in \mathbb{R}^{m \times k}, k \ll m$$

## This means that
## $$R_{i,j} = u_i \bullet p_j$$
## recalling
## $$\cos(\theta) = \frac{a \bullet b}{\|a\|\|b\|}$$

## How do we find a solution? By minimizing the cost function!
## $$J = \|R - U \times P^\top\|_F^2 + \lambda(\|U\|_F^2 + \|P\|_F^2)$$

## How do we optimize this?
![GD](pics/gradient_descent.jpeg)

## There are two problems here though:
+ the number of parameters is $n \times k + m \times k$
+ the loss is non-convex (https://www.quora.com/Why-is-the-matrix-factorization-optimization-function-in-recommender-systems-not-convex)

## So what do we do? ALS (alternating least squares)
## Recall Ordinary Least Squares
## $$J(\beta) = \|y - X\beta\|_2^2$$
## $$\beta = (X^\top X)^{-1}X^\top y$$

## ALS is a two-step iterative optimization process
## $$ \forall u_i : J(u_i) = \|R_i - u_i \times P^\top\|_2^2 + \lambda\|u_i\|_2^2$$
## $$ \forall p_j : J(p_j) = \|R_j - U \times p^{\top}_{j}\|_2^2 + \lambda\|p_j\|_2^2$$
## The solution is
## $$u_i = (P^\top \times P + \lambda I)^{-1} \times P^\top \times R_i$$
## $$p_j = (U^\top \times U + \lambda I)^{-1} \times U^\top \times R_j$$

In [3]:
from pyspark.sql.types import *

In [4]:
# Column layout of the tab-separated ml-100k rating files:
# user id, item id, integer rating, and a timestamp stored as a long.
schema = (
    StructType()
    .add("user", IntegerType())
    .add("item", IntegerType())
    .add("rating", IntegerType())
    .add("timestamp", LongType())
)

In [5]:
# Training split: parse with the explicit schema, drop the unused timestamp,
# spread the rows over 20 partitions and keep them cached for the repeated fits.
dataset = (
    spark.read
    .csv("/share/ml-100k/ua.base", sep="\t", schema=schema)
    .drop("timestamp")
    .repartition(20)
    .cache()
)

In [6]:
# Peek at a few rows: only user, item and rating remain after dropping the timestamp.
dataset.take(5)

[Row(user=297, item=1, rating=3),
 Row(user=119, item=924, rating=4),
 Row(user=373, item=230, rating=4),
 Row(user=900, item=508, rating=3),
 Row(user=13, item=828, rating=1)]

In [7]:
# Confirm the repartition(20) requested at load time took effect.
dataset.rdd.getNumPartitions()

20

In [8]:
from pyspark.ml.recommendation import ALS

In [9]:
# Baseline estimator: 10 latent factors, 5 alternating sweeps, and a fixed seed
# so that the factor initialisation is repeatable.
als = ALS(
    rank=10,
    maxIter=5,
    seed=5757,
)

In [10]:
# Fit the baseline ALS model on the training split.
model = als.fit(dataset)

In [11]:
# Sanity check: the fitted model reports the number of latent factors it was trained with.
model.rank

10

In [12]:
# Held-out split: same parsing as the training data, but only 4 partitions.
test = (
    spark.read
    .csv("/share/ml-100k/ua.test", sep="\t", schema=schema)
    .drop("timestamp")
    .repartition(4)
    .cache()
)

In [13]:
# Score every (user, item) pair of the held-out split; this adds a `prediction` column.
predictions = model.transform(test)

In [14]:
# Inspect a few scored rows: the true rating next to the model's prediction.
predictions.take(5)

[Row(user=251, item=148, rating=2, prediction=3.0381240844726562),
 Row(user=580, item=148, rating=4, prediction=3.0317912101745605),
 Row(user=602, item=148, rating=4, prediction=3.5022647380828857),
 Row(user=372, item=148, rating=5, prediction=3.834611177444458),
 Row(user=274, item=148, rating=2, prediction=3.3601744174957275)]

In [15]:
from pyspark.ml.evaluation import RegressionEvaluator

In [16]:
# Root-mean-squared error between the true `rating` and the model's `prediction`.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)

In [17]:
# RMSE on the held-out split. With the baseline model this comes out as nan;
# the isnan checks further down show which rows cause it.
evaluator.evaluate(predictions)

nan

In [18]:
# Distribution of true ratings among the scored rows — the labels themselves are all valid.
predictions.groupBy("rating").count().collect()

[Row(rating=1, count=542),
 Row(rating=3, count=2424),
 Row(rating=5, count=2153),
 Row(rating=4, count=3316),
 Row(rating=2, count=995)]

In [19]:
import pyspark.sql.functions as f

In [20]:
# How many held-out rows received a NaN prediction?
predictions.filter(f.isnan("prediction")).count()

2

In [21]:
# Show the rows with NaN predictions so their users and items can be looked up in the training data.
predictions.filter(f.isnan("prediction")).collect()

[Row(user=675, item=1653, rating=5, prediction=nan),
 Row(user=405, item=1582, rating=1, prediction=nan)]

In [22]:
# Does the user of the first NaN row (675) appear in the training data? Yes — so the user is not the problem.
dataset.filter(dataset.user == 675).collect()

[Row(user=675, item=235, rating=1),
 Row(user=675, item=311, rating=3),
 Row(user=675, item=937, rating=1),
 Row(user=675, item=258, rating=3),
 Row(user=675, item=900, rating=4),
 Row(user=675, item=242, rating=4),
 Row(user=675, item=750, rating=4),
 Row(user=675, item=86, rating=4),
 Row(user=675, item=272, rating=3),
 Row(user=675, item=244, rating=3),
 Row(user=675, item=318, rating=5),
 Row(user=675, item=891, rating=2),
 Row(user=675, item=344, rating=4),
 Row(user=675, item=1628, rating=5),
 Row(user=675, item=1007, rating=4),
 Row(user=675, item=303, rating=5),
 Row(user=675, item=223, rating=1),
 Row(user=675, item=427, rating=5),
 Row(user=675, item=286, rating=4),
 Row(user=675, item=896, rating=5),
 Row(user=675, item=269, rating=5),
 Row(user=675, item=312, rating=2),
 Row(user=675, item=321, rating=2),
 Row(user=675, item=463, rating=5)]

In [23]:
# Does the item of the first NaN row (1653) appear in the training data? No — the item was never seen during fitting.
dataset.filter(dataset.item == 1653).collect()

[]

In [24]:
# Same check for the second NaN row (user=405, item=1582): look for its item in
# the training data. The previous filter paired user 675 from the first NaN row
# with item 1582 from the second, so it was empty regardless of whether item 1582
# had been seen during fitting.
dataset.filter(dataset.item == 1582).collect()

[]

In [25]:
# Discard the rows that could not be scored so the metric becomes computable.
predictions = predictions.na.drop()

In [26]:
# Re-evaluate now that the rows with NaN predictions have been removed.
evaluator.evaluate(predictions)

0.9590533627741923

## But how do we deal with the cold start in Spark?!

In [27]:
# Refit with a per-call parameter override: the resulting model drops rows it
# cannot score instead of emitting NaN for them.
model = als.fit(
    dataset,
    params={
        als.coldStartStrategy: "drop",
    },
)

In [28]:
# Verify that the override passed to fit() is recorded on the fitted model.
model.getOrDefault("coldStartStrategy")

'drop'

In [29]:
# Re-score the held-out split with the newly fitted model.
predictions = model.transform(test)

In [30]:
# RMSE without a manual dropna(): the metric is finite because unscorable rows were already dropped by the model.
evaluator.evaluate(predictions)

0.9590533627741922

## Can we do better?

In [93]:
# Same estimator, but let ALS run 20 alternating sweeps instead of 5.
model = als.fit(
    dataset,
    params={
        als.coldStartStrategy: "drop",
        als.maxIter: 20,
    },
)

In [94]:
# Re-score the held-out split with the longer-trained model.
predictions = model.transform(test)

In [95]:
# RMSE after 20 iterations, to compare with the 5-iteration baseline above.
evaluator.evaluate(predictions)

0.9558872242636991

## Or even better?

In [97]:
# Keep the 20 sweeps and additionally widen the latent space from 10 to 100 factors.
model = als.fit(
    dataset,
    params={
        als.coldStartStrategy: "drop",
        als.maxIter: 20,
        als.rank: 100,
    },
)

In [98]:
# Re-score the held-out split with the higher-rank model.
predictions = model.transform(test)

In [99]:
# RMSE for rank 100 with 20 iterations, to compare with the rank-10 results above.
evaluator.evaluate(predictions)

0.9475039684939921

## What else do we have here?

In [31]:
# For every item, the 5 users with the highest predicted rating; show the first 5 items.
model.recommendForAllItems(5).take(5)

[Row(item=1342, recommendations=[Row(user=662, rating=3.724311590194702), Row(user=118, rating=3.557661533355713), Row(user=440, rating=3.4993064403533936), Row(user=270, rating=3.4897232055664062), Row(user=427, rating=3.3939871788024902)]),
 Row(item=148, recommendations=[Row(user=507, rating=4.894386291503906), Row(user=907, rating=4.649669647216797), Row(user=513, rating=4.618252277374268), Row(user=688, rating=4.602916240692139), Row(user=849, rating=4.570733070373535)]),
 Row(item=1088, recommendations=[Row(user=36, rating=3.9826483726501465), Row(user=644, rating=3.8986754417419434), Row(user=137, rating=3.739666700363159), Row(user=309, rating=3.6950972080230713), Row(user=471, rating=3.689685344696045)]),
 Row(item=1238, recommendations=[Row(user=519, rating=4.736871719360352), Row(user=810, rating=4.2670488357543945), Row(user=36, rating=4.163325309753418), Row(user=688, rating=3.951127529144287), Row(user=126, rating=3.911823272705078)]),
 Row(item=1580, recommendations=[Row

In [32]:
# For every user, the 5 items with the highest predicted rating; show the first 5 users.
model.recommendForAllUsers(5).take(5)

[Row(user=496, recommendations=[Row(item=838, rating=5.011465549468994), Row(item=1589, rating=4.678163051605225), Row(item=320, rating=4.476866722106934), Row(item=1449, rating=4.4084930419921875), Row(item=853, rating=4.3602375984191895)]),
 Row(user=148, recommendations=[Row(item=1463, rating=5.407992839813232), Row(item=1643, rating=5.2708821296691895), Row(item=169, rating=4.9199419021606445), Row(item=1449, rating=4.914376735687256), Row(item=1367, rating=4.882181167602539)]),
 Row(user=463, recommendations=[Row(item=1463, rating=4.4693193435668945), Row(item=1449, rating=4.301555156707764), Row(item=1512, rating=4.2987494468688965), Row(item=1084, rating=4.217663764953613), Row(item=1251, rating=4.2151079177856445)]),
 Row(user=833, recommendations=[Row(item=1368, rating=5.062682628631592), Row(item=1643, rating=4.714048862457275), Row(item=1536, rating=4.3504438400268555), Row(item=1597, rating=4.296138763427734), Row(item=320, rating=4.292140007019043)]),
 Row(user=471, recomm

In [34]:
# Inspect the learned item latent vectors: one `features` list per item `id`.
# NOTE(review): the recorded output has 10 entries per vector and this cell's
# execution count precedes the maxIter/rank experiments placed above it, so it
# was produced by a rank-10 model. On a top-to-bottom run `model` is the
# rank-100 model and the vectors will have 100 entries.
model.itemFactors.take(5)

[Row(id=10, features=[0.4910842180252075, 0.5687710046768188, 0.6812558770179749, -0.5482439398765564, 0.27426472306251526, 0.3678845167160034, -1.098655104637146, 0.2107856124639511, 0.8881864547729492, 0.2629821300506592]),
 Row(id=20, features=[0.9072177410125732, 0.5339254140853882, 0.5115773677825928, -1.2328755855560303, 0.542868435382843, 0.44991403818130493, -0.4627978801727295, 0.22674496471881866, -0.17887842655181885, -0.09169133007526398]),
 Row(id=30, features=[0.5513217449188232, 0.30307069420814514, 0.8260578513145447, -0.6058114171028137, 0.48463699221611023, 0.17869749665260315, -0.6395977735519409, 1.0904102325439453, 0.3720622658729553, -0.11694475263357162]),
 Row(id=40, features=[1.0447009801864624, 0.25295910239219666, -0.10710208117961884, -0.6006267666816711, 0.36589130759239197, -0.19846627116203308, -0.24621443450450897, 0.017913779243826866, 1.0390408039093018, 0.5447020530700684]),
 Row(id=50, features=[0.6331756711006165, 0.15788942575454712, 0.922681808471

In [35]:
# Inspect the learned user latent vectors, mirroring the item-factor cell.
# The recorded output of this cell is a list of factor rows, which is what
# take(5) returns; count() would have produced a single integer.
model.userFactors.take(5)

[Row(id=10, features=[0.9489773511886597, 0.5666304230690002, 1.103602409362793, -1.2175092697143555, 0.11277274787425995, -0.24626755714416504, -0.7906545400619507, 1.0588929653167725, 0.6422020792961121, 0.6299095749855042]),
 Row(id=20, features=[1.147719383239746, 0.6592509746551514, -0.3106571137905121, -1.272091031074524, -0.3390337824821472, -0.4940218925476074, -0.4846492409706116, 0.5361677408218384, 0.9728611707687378, 0.6635444760322571]),
 Row(id=30, features=[0.5372956395149231, 0.13461139798164368, 0.7390300631523132, -1.470736026763916, 0.1607738584280014, 0.02893531695008278, -0.9771374464035034, 1.3123841285705566, 1.0575284957885742, 0.3354685306549072]),
 Row(id=40, features=[0.5047808289527893, -0.5589486360549927, 0.3527592420578003, -1.2439231872558594, 0.2284407764673233, -0.24316594004631042, -1.0813521146774292, 0.6494705677032471, 0.464910089969635, 0.448367714881897]),
 Row(id=50, features=[0.6008652448654175, 0.12734994292259216, 2.0770909786224365, -0.73200

In [None]:
# Pair every item factor vector with every user factor vector.
# Both factor tables use the column names `id` and `features`, so rename them
# per side first; otherwise the joined frame has duplicate column names that
# cannot be referenced unambiguously.
model.itemFactors.toDF("item", "item_features").crossJoin(
    model.userFactors.toDF("user", "user_features")
)