In [32]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [33]:
import numpy as np
from numpy.linalg import inv
from pyspark.sql import SparkSession

In [34]:
spark = SparkSession\
  .builder\
  .appName('scratch-ALS')\
  .getOrCreate()
sc = spark.sparkContext

In [35]:
numWorkers = sc.defaultParallelism
numWorkers

4

In [36]:
def getRelativeIndex(value, index_list):
  return index_list[value]

In [37]:
def sortByRelativeIndex(user_or_item, input):
  if user_or_item == 'user':
    return input\
      .map(lambda x: x[1])\
      .distinct()\
      .sortBy(lambda x: x, ascending=True)\
      .zipWithIndex().collect()
  else: return input\
      .map(lambda x: x[2][1])\
      .distinct()\
      .sortBy(lambda x: x, ascending=True)\
      .zipWithIndex().collect()


In [38]:
def getBlock(user_or_item, ratings, sorted_users, sorted_items):
  if user_or_item == 'user':
    return ratings\
      .map(lambda x: (getRelativeIndex(x[0], sorted_users), getRelativeIndex(x[1][0], sorted_items)))\
      .groupByKey()
  else:
    return ratings\
      .map(lambda x: (getRelativeIndex(x[1][0], sorted_items), getRelativeIndex(x[0], sorted_users)))\
      .groupByKey()


In [83]:
data = sc.textFile("ml-100k/u.data")
raw_data = data.map(
  lambda l: l.split('\t')
).map(lambda l: (int(l[0]), int(l[1]), float(l[2])))

In [84]:
sorted_users = dict(raw_data.map(lambda x: x[0]).distinct().sortBy(lambda idx: idx, ascending = True)\
  .zipWithIndex().collect())

sorted_items = dict(raw_data.map(lambda x: x[1]).distinct().sortBy(lambda idx: idx, ascending = True)\
  .zipWithIndex().collect())

item_count = len(sorted_items)
user_count = len(sorted_users)

In [85]:
M = raw_data.map(lambda x: (getRelativeIndex(x[0], sorted_users), getRelativeIndex(x[1], sorted_items), x[2]))

In [86]:
numFactors = 10
num_factors = 10
W = M.map(lambda x: tuple([int(x[0]),1])).reduceByKey(lambda x,y : x+y).map(lambda x: tuple([x[0], np.random.rand(1,numFactors).astype('float16')])).persist()
H = M.map(lambda x: tuple([int(x[1]),1])).reduceByKey(lambda x,y : x+y).map(lambda x: tuple([x[0], np.random.rand(1,numFactors).astype('float16')])).persist()

In [87]:
R_u = M.map(lambda x: (x[0], (x[1], x[2]))).groupByKey().cache()
R_i = M.map(lambda x: (x[1], (x[0], x[2]))).groupByKey().cache()

In [88]:
np.array(W.sortByKey().map(lambda x: x[1][0]).collect()).shape

(943, 10)

In [100]:
w_broadcast = sc.broadcast(np.array(W.sortByKey().map(lambda x: x[1][0]).collect()))
h_broadcast = sc.broadcast(np.array(H.sortByKey().map(lambda x: x[1][0]).collect()))

                                                                                

In [104]:
def computeOptimizeMatrix(iterables, constant_matrix_broadcast, lamb):
  fixed_matrix = constant_matrix_broadcast.value
  iter_dict = dict(iterables)
  #X = np.array([fixed_matrix[k] for k in iter_dict.keys()])
  X = fixed_matrix[list(iter_dict.keys()), :]
  R = np.matrix(list(iter_dict.values()))
  XtX = X.T.dot(X)
  XtR = X.T.dot(R.T)
  return np.linalg.solve(XtX + lamb.value * np.eye(num_factors), XtR)

In [None]:
# random_user = R_u.groupByKey().collect()[6]
# iterables = list(random_user[1])
# constant_matrix_broadcast = h_broadcast

# fixed_matrix = constant_matrix_broadcast.value
# num_factors = fixed_matrix.shape[0]

# iter_dict = dict(iterables)
# XtX = np.zeros((num_factors, num_factors))
# XXt = fixed_matrix.dot(fixed_matrix.T)
# XtX = np.zeros((numFactors, numFactors))
# RX = np.zeros((numFactors, 1))
# for i in iter_dict.items():
#   index = sorted_items[i[0]]
#   rating = i[1]
#   C = H[:, [index]]
#   RX += rating * C
#   XtX += (C.dot(C.T))
# np.linalg.solve(XtX, RX)

In [68]:
LAMBDA = 0.01   # regularization
np.random.seed(42)


def get_rmse(R, ms: np.ndarray, us: np.ndarray) -> np.float64:
    diff = R - ms * us.T
    return np.sqrt(np.sum(np.power(diff, 2)) / (M_count * U_count))


def update(i: int, mat: np.ndarray, ratings: np.ndarray) -> np.ndarray:
    uu = mat.shape[0]
    ff = mat.shape[1]

    XtX = mat.T * mat
    Xty = mat.T * ratings[i, :].T

    for j in range(ff):
        XtX[j, j] += LAMBDA * uu

    return np.linalg.solve(XtX, Xty)


In [91]:
lamb = sc.broadcast(0.01)

In [None]:
def get_error_square(rating, i, j):
  pred = w_broadcast.value[i].dot(h_broadcast.value[j].T)
  return (rating - pred)**2

In [75]:
R_i.groupByKey()

PythonRDD[1377] at RDD at PythonRDD.scala:53

In [None]:
R_u.groupWith()

In [109]:
np.array(R_u\
    .mapValues(lambda row:computeOptimizeMatrix(row,h_broadcast,lamb))\
    .sortByKey()\
    .mapValues(lambda m: np.array(m.T))\
    .map(lambda x: x[1][0])\
    .collect())

array([[ 0.87903601,  0.33789654,  0.85999694, ...,  0.79700616,
         0.89964003,  0.53204307],
       [ 0.62626005,  0.67105382,  1.06663227, ...,  0.56396863,
         0.71561498, -0.0546765 ],
       [-0.19668581,  1.04795343,  0.36372988, ...,  0.34568758,
         0.69381249, -0.60824673],
       ...,
       [-0.0269491 ,  1.73624279,  1.28529887, ...,  1.58294335,
         0.65567751,  0.99572395],
       [ 1.03905764,  0.40092323,  0.03545471, ...,  1.75437576,
         0.33064409,  1.03950409],
       [ 0.51109285,  0.81673427,  0.24543816, ...,  0.70225582,
         0.63196032,  0.64909968]])

In [None]:
ITERATIONS = 50
for i in range(ITERATIONS):
  newW = R_u\
    .mapValues(lambda row:computeOptimizeMatrix(row,h_broadcast,lamb))\
    .sortByKey()\
    .collect()
  W = np.array(list(map(lambda x: x, newW))).T
  w_broadcast.destroy()
  w_broadcast = sc.broadcast(dict(W.collect()))
  newH = R_i\
    .mapValues(lambda row:computeOptimizeMatrix(row,w_broadcast,lamb))\
    .sortByKey()\
    .collect()
  H = np.array(list(map(lambda x: np.array(x.flatten())[0], newH))).T
  h_broadcast.destroy()
  h_broadcast = sc.broadcast(dict(H.collect()))
  sse = M.map(lambda x: get_error_square(x[2], x[0], x[1])).reduce(lambda x,y: x+y)[0,0]
  count = M.count()
  mse = pow((sse/count), 0.5)
  print("Iteration %d:" % i)
  print("\nRMSE: %5.4f\n" % mse)


                                                                                

Iteration 0:

RMSE: 0.8367



                                                                                

Iteration 1:

RMSE: 0.7730



                                                                                

Iteration 2:

RMSE: 0.7472



                                                                                

Iteration 3:

RMSE: 0.7327



                                                                                

Iteration 4:

RMSE: 0.7237



                                                                                

Iteration 5:

RMSE: 0.7174



                                                                                

Iteration 6:

RMSE: 0.7127



                                                                                

Iteration 7:

RMSE: 0.7091



                                                                                

Iteration 8:

RMSE: 0.7062



                                                                                

Iteration 9:

RMSE: 0.7039



                                                                                

Iteration 10:

RMSE: 0.7019



                                                                                

Iteration 11:

RMSE: 0.7001



                                                                                

Iteration 12:

RMSE: 0.6986



                                                                                

Iteration 13:

RMSE: 0.6973



                                                                                

Iteration 14:

RMSE: 0.6961



                                                                                

Iteration 15:

RMSE: 0.6950



                                                                                

Iteration 16:

RMSE: 0.6940



                                                                                

Iteration 17:

RMSE: 0.6931



                                                                                

Iteration 18:

RMSE: 0.6923



                                                                                

Iteration 19:

RMSE: 0.6916



                                                                                

Iteration 20:

RMSE: 0.6909



                                                                                

Iteration 21:

RMSE: 0.6903



                                                                                

Iteration 22:

RMSE: 0.6897



                                                                                

Iteration 23:

RMSE: 0.6892



                                                                                

Iteration 24:

RMSE: 0.6887



                                                                                

Iteration 25:

RMSE: 0.6882



                                                                                

Iteration 26:

RMSE: 0.6878



                                                                                

Iteration 27:

RMSE: 0.6874



                                                                                

Iteration 28:

RMSE: 0.6870



                                                                                

Iteration 29:

RMSE: 0.6866



                                                                                

Iteration 30:

RMSE: 0.6863



                                                                                

Iteration 31:

RMSE: 0.6860



                                                                                

Iteration 32:

RMSE: 0.6858



                                                                                

Iteration 33:

RMSE: 0.6855



                                                                                

Iteration 34:

RMSE: 0.6853



                                                                                

Iteration 35:

RMSE: 0.6850



                                                                                

Iteration 36:

RMSE: 0.6848



                                                                                

Iteration 37:

RMSE: 0.6847



                                                                                

Iteration 38:

RMSE: 0.6845



                                                                                

Iteration 39:

RMSE: 0.6843



                                                                                

Iteration 40:

RMSE: 0.6841



                                                                                

Iteration 41:

RMSE: 0.6840



                                                                                

Iteration 42:

RMSE: 0.6838



                                                                                

Iteration 43:

RMSE: 0.6837



                                                                                

Iteration 44:

RMSE: 0.6836



                                                                                

Iteration 45:

RMSE: 0.6834



                                                                                

Iteration 46:

RMSE: 0.6833



                                                                                

Iteration 47:

RMSE: 0.6832



                                                                                

Iteration 48:

RMSE: 0.6831



                                                                                

Iteration 49:

RMSE: 0.6830



In [None]:
np.array(newW[0].flatten())[0]

array([0.67517169, 1.46851969, 0.35888359, 0.40483493, 0.74416579,
       0.52722992, 0.43725883, 0.96443489, 1.07526524, 0.57850029])

In [None]:
np.array(list(map(lambda x: x.flatten(), newW))).T.shape

(10, 1, 943)

In [None]:
M_count = len(sorted_items)
U_count = len(sorted_users)
F = numFactors
partitions = numWorkers
ITERATIONS = 2

In [None]:
R = np.zeros((M_count, U_count))
rating_rdd = M.map(lambda x: (x[0], x[1][0], x[1][1])).collect()


In [None]:
ms = np.matrix(np.random.rand(M_count, F))
us = np.matrix(np.random.rand(U_count, F))

Rb = sc.broadcast(R)
msb = sc.broadcast(ms)
usb = sc.broadcast(us)



for i in range(ITERATIONS):
    ms_ = sc.parallelize(range(M), partitions) \
        .map(lambda x: update(x, usb.value, Rb.value)) \
        .collect()
    # collect() returns a list, so array ends up being
    # a 3-d array, we take the first 2 dims for the matrix
    ms = np.matrix(np.array(ms_)[:, :, 0])
    msb = sc.broadcast(ms)

    us_ = sc.parallelize(range(U), partitions) \
        .map(lambda x: update(x, msb.value, Rb.value.T)) \
        .collect()
    us = np.matrix(np.array(us_)[:, :, 0])
    usb = sc.broadcast(us)

    error = rmse(R, ms, us)
    print("Iteration %d:" % i)
    print("\nRMSE: %5.4f\n" % error)


In [None]:
def get_error_square(rating, i, j):
  pred = us[[i], :].dot(ms[[j], :].T)[0][0]
  return (rating - pred)**2

ITERATIONS = 100
for i in range(ITERATIONS):
    ms_ = sc.parallelize(range(M_count), partitions) \
        .map(lambda x: update(x, usb.value, Rb.value)) \
        .collect()
    # collect() returns a list, so array ends up being
    # a 3-d array, we take the first 2 dims for the matrix
    ms = np.matrix(np.array(ms_)[:, :, 0])
    msb = sc.broadcast(ms)

    us_ = sc.parallelize(range(U_count), partitions) \
        .map(lambda x: update(x, msb.value, Rb.value.T)) \
        .collect()
    us = np.matrix(np.array(us_)[:, :, 0])
    usb = sc.broadcast(us)

    
    sse = M.map(lambda x: get_error_square(x[1][1], sorted_users[x[0]], sorted_items[x[1][0]])).reduce(lambda x,y: x+y)[0, 0]
    count = M.count()
    rmse = pow(sse/count, 0.5)
    error = get_rmse(R, ms, us)
    print("Iteration %d:" % i)
    print("\nRMSE: %5.4f" % error)
    print("\nGlobal RMSE: %5.4f\n" % rmse)


22/12/11 11:31:56 ERROR Executor: Exception in task 1.0 in stage 829.0 (TID 1239)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_54366/243499966.py", line 8, in <lambda>
  File "/tmp/ipykernel_54366/4186574233.py", line 15, in update
  File "/usr/lib/python3/dist-packages/numpy/matrixlib/defmatrix.py", line 2

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 829.0 failed 1 times, most recent failure: Lost task 2.0 in stage 829.0 (TID 1240) (192.168.1.16 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_54366/243499966.py", line 8, in <lambda>
  File "/tmp/ipykernel_54366/4186574233.py", line 15, in update
  File "/usr/lib/python3/dist-packages/numpy/matrixlib/defmatrix.py", line 218, in __mul__
    return N.dot(self, asmatrix(other))
  File "<__array_function__ internals>", line 5, in dot
ValueError: shapes (10,943) and (1,943) not aligned: 943 (dim 1) != 1 (dim 0)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:765)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:747)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2268)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor51.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_54366/243499966.py", line 8, in <lambda>
  File "/tmp/ipykernel_54366/4186574233.py", line 15, in update
  File "/usr/lib/python3/dist-packages/numpy/matrixlib/defmatrix.py", line 218, in __mul__
    return N.dot(self, asmatrix(other))
  File "<__array_function__ internals>", line 5, in dot
ValueError: shapes (10,943) and (1,943) not aligned: 943 (dim 1) != 1 (dim 0)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:765)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:747)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2268)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
usb.value.shape[1]

In [None]:
def get_error_square(rating, i, j):
  pred = us[:, [i]].T.dot(ms[:, [j]])[0][0]
  return (rating - pred)**2

In [None]:
ms_ = sc.parallelize(range(M_count), partitions) \
    .map(lambda x: update(x, usb.value, Rb.value)) \
    .collect()
# collect() returns a list, so array ends up being
# a 3-d array, we take the first 2 dims for the matrix
ms = np.matrix(np.array(ms_)[:, :, 0])
msb = sc.broadcast(ms)

us_ = sc.parallelize(range(U_count), partitions) \
    .map(lambda x: update(x, msb.value, Rb.value.T)) \
    .collect()
us = np.matrix(np.array(us_)[:, :, 0])
usb = sc.broadcast(us)
mse = get_rmse(M, ms, us)
error = rmse(R, ms, us)
print("Iteration %d:" % i)
print("\nErr: %5.4f\n" % error)
print("\nRMSE: %5.4f\n" % rmse)


In [None]:
R

In [None]:
XtX = usb.value.T * usb.value
inv(XtX + 0.01 * np.eye(F)) * Xty

In [None]:
Xty = usb.value.T * Rb.value[6, :].T
Xty