In [32]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [33]:
import numpy as np
from numpy.linalg import inv
from pyspark.sql import SparkSession

In [34]:
spark = SparkSession\
  .builder\
  .appName('scratch-ALS')\
  .getOrCreate()
sc = spark.sparkContext

In [35]:
numWorkers = sc.defaultParallelism
numWorkers

4

In [36]:
def getRelativeIndex(value, index_list):
  return index_list[value]

In [37]:
def sortByRelativeIndex(user_or_item, input):
  if user_or_item == 'user':
    return input\
      .map(lambda x: x[1])\
      .distinct()\
      .sortBy(lambda x: x, ascending=True)\
      .zipWithIndex().collect()
  else: return input\
      .map(lambda x: x[2][1])\
      .distinct()\
      .sortBy(lambda x: x, ascending=True)\
      .zipWithIndex().collect()


In [38]:
def getBlock(user_or_item, ratings, sorted_users, sorted_items):
  if user_or_item == 'user':
    return ratings\
      .map(lambda x: (getRelativeIndex(x[0], sorted_users), getRelativeIndex(x[1][0], sorted_items)))\
      .groupByKey()
  else:
    return ratings\
      .map(lambda x: (getRelativeIndex(x[1][0], sorted_items), getRelativeIndex(x[0], sorted_users)))\
      .groupByKey()


In [180]:
data = sc.textFile("ml-100k/u.data")
raw_data = data.map(
  lambda l: l.split('\t')
).map(lambda l: (int(l[0]), int(l[1]), float(l[2])))

In [181]:
trainRDD, testRDD = raw_data.randomSplit([8,2],42)

In [182]:
sorted_users = dict(raw_data.map(lambda x: x[0]).distinct().sortBy(lambda idx: idx, ascending = True)\
  .zipWithIndex().collect())

sorted_items = dict(raw_data.map(lambda x: x[1]).distinct().sortBy(lambda idx: idx, ascending = True)\
  .zipWithIndex().collect())

item_count = len(sorted_items)
user_count = len(sorted_users)

In [183]:
M = trainRDD.map(lambda x: (getRelativeIndex(x[0], sorted_users), getRelativeIndex(x[1], sorted_items), x[2]))

In [166]:
numFactors = 10
num_factors = 10
# W = M.map(lambda x: tuple([int(x[0]),1])).reduceByKey(lambda x,y : x+y).map(lambda x: tuple([x[0], np.random.rand(1,numFactors).astype('float16')])).persist()
# H = M.map(lambda x: tuple([int(x[1]),1])).reduceByKey(lambda x,y : x+y).map(lambda x: tuple([x[0], np.random.rand(1,numFactors).astype('float16')])).persist()

In [167]:
W = np.random.rand(user_count, num_factors)
H = np.random.rand(item_count, num_factors)


In [168]:
R_u = M.map(lambda x: (x[0], (x[1], x[2]))).groupByKey().cache()
R_i = M.map(lambda x: (x[1], (x[0], x[2]))).groupByKey().cache()

In [170]:
w_broadcast = sc.broadcast(W)
h_broadcast = sc.broadcast(H)

In [220]:
def computeOptimizeMatrix(iterables, constant_matrix_broadcast, lamb):
  fixed_matrix = constant_matrix_broadcast.value
  iter_dict = dict(iterables)
  #X = np.array([fixed_matrix[k] for k in iter_dict.keys()])
  X = fixed_matrix[list(iter_dict.keys()), :]
  R = np.array(list(iter_dict.values()))
  XtX = X.T.dot(X)
  XtR = X.T.dot(R.T)
  return np.linalg.solve(XtX + lamb.value * np.eye(num_factors), XtR).T

In [None]:
# random_user = R_u.groupByKey().collect()[6]
# iterables = list(random_user[1])
# constant_matrix_broadcast = h_broadcast

# fixed_matrix = constant_matrix_broadcast.value
# num_factors = fixed_matrix.shape[0]

# iter_dict = dict(iterables)
# XtX = np.zeros((num_factors, num_factors))
# XXt = fixed_matrix.dot(fixed_matrix.T)
# XtX = np.zeros((numFactors, numFactors))
# RX = np.zeros((numFactors, 1))
# for i in iter_dict.items():
#   index = sorted_items[i[0]]
#   rating = i[1]
#   C = H[:, [index]]
#   RX += rating * C
#   XtX += (C.dot(C.T))
# np.linalg.solve(XtX, RX)

In [172]:
LAMBDA = 0.01   # regularization
np.random.seed(42)


def get_rmse(R, ms: np.ndarray, us: np.ndarray) -> np.float64:
    diff = R - ms * us.T
    return np.sqrt(np.sum(np.power(diff, 2)) / (M_count * U_count))


def update(i: int, mat: np.ndarray, ratings: np.ndarray) -> np.ndarray:
    uu = mat.shape[0]
    ff = mat.shape[1]

    XtX = mat.T * mat
    Xty = mat.T * ratings[i, :].T

    for j in range(ff):
        XtX[j, j] += LAMBDA * uu

    return np.linalg.solve(XtX, Xty)


In [173]:
lamb = sc.broadcast(0.01)

In [174]:
def get_error_square(rating, i, j):
  pred = w_broadcast.value[i].dot(h_broadcast.value[j].T)
  return (rating - pred)**2

In [189]:

set(testRDD.map(lambda x: getRelativeIndex(x[1], sorted_items)).collect()) - set(R_i.map(lambda x: x[0]).collect())

{676,
 710,
 1342,
 1347,
 1451,
 1475,
 1499,
 1514,
 1545,
 1561,
 1565,
 1579,
 1580,
 1582,
 1613,
 1615,
 1617,
 1625,
 1629,
 1639,
 1640,
 1648,
 1666,
 1678,
 1681}

In [222]:
newH = dict(R_i\
    .mapValues(lambda row:computeOptimizeMatrix(row,w_broadcast,lamb))\
    .sortByKey()\
    .collect())
newH

{0: array([0.67200031, 0.70816672, 0.52045569, 0.9667302 , 0.5900885 ,
        0.86265606, 0.90499907, 0.68578903, 0.83411493, 0.86792303]),
 1: array([ 0.27007417,  0.74390799, -0.05262054,  0.72775365,  1.04355591,
         1.50344473,  0.46093752,  0.62643114,  0.35636519,  0.58537   ]),
 2: array([ 1.17027021,  0.50608063,  1.23448649, -0.76405448,  1.13640943,
         0.69372575,  1.63976562, -0.37293304, -1.42794024,  1.57878319]),
 3: array([0.49258354, 0.56759592, 0.94468541, 0.24772671, 0.77074288,
        0.73294322, 0.72606825, 0.75486566, 0.6402327 , 1.04828753]),
 4: array([ 1.39244134,  1.10901273,  0.40765951,  0.01024306,  0.63634537,
         0.64794339,  1.18366956,  0.60462749, -0.24827763,  0.60390901]),
 5: array([-3.71368302,  5.61294196, -0.97663152, -1.94725255,  3.45681417,
         7.51131265,  4.24768266,  1.91301239, -0.58699314, -7.35541835]),
 6: array([0.30707789, 0.56964701, 0.43454619, 0.63451405, 0.54194982,
        0.62124196, 1.03275659, 1.40087003,

In [208]:
H[[1], :].shape

(1, 10)

In [197]:
W[0].dot(H[5].T)

3.6129803537544607

In [219]:
train_users = M.map(lambda x: x[0]).distinct().collect()
train_items = M.map(lambda x: x[0]).distinct().collect()
testRDD\
    .map(lambda x: (getRelativeIndex(x[0], sorted_users), getRelativeIndex(x[1], sorted_items), x[2]))\
    .filter(lambda x: x[0] in train_users and x[1] in train_items).collect()

                                                                                

[(5, 85, 3.0),
 (302, 784, 3.0),
 (166, 485, 4.0),
 (307, 0, 4.0),
 (101, 767, 2.0),
 (159, 233, 5.0),
 (286, 326, 5.0),
 (80, 431, 2.0),
 (71, 678, 2.0),
 (200, 218, 4.0),
 (12, 525, 3.0),
 (59, 426, 5.0),
 (193, 164, 4.0),
 (41, 402, 3.0),
 (94, 624, 4.0),
 (7, 337, 4.0),
 (31, 293, 3.0),
 (89, 381, 5.0),
 (165, 327, 5.0),
 (249, 495, 4.0),
 (278, 831, 3.0),
 (236, 513, 4.0),
 (93, 788, 4.0),
 (43, 194, 5.0),
 (263, 199, 5.0),
 (215, 657, 3.0),
 (86, 273, 4.0),
 (298, 110, 3.0),
 (193, 465, 4.0),
 (302, 918, 4.0),
 (48, 558, 2.0),
 (304, 116, 2.0),
 (79, 465, 5.0),
 (98, 78, 4.0),
 (25, 257, 3.0),
 (199, 672, 5.0),
 (118, 327, 4.0),
 (212, 171, 5.0),
 (275, 321, 3.0),
 (12, 359, 4.0),
 (14, 404, 2.0),
 (250, 747, 2.0),
 (147, 407, 5.0),
 (96, 227, 5.0),
 (109, 687, 1.0),
 (177, 567, 4.0),
 (302, 841, 2.0),
 (300, 400, 4.0),
 (35, 338, 5.0),
 (69, 745, 3.0),
 (278, 67, 4.0),
 (118, 236, 5.0),
 (267, 1, 2.0),
 (180, 263, 2.0),
 (296, 132, 4.0),
 (274, 97, 4.0),
 (218, 81, 1.0),
 (221, 

In [211]:
ITERATIONS = 50
for i in range(ITERATIONS):
  newW = dict(R_u\
    .mapValues(lambda row:computeOptimizeMatrix(row,h_broadcast,lamb))\
    .sortByKey()\
    .mapValues(lambda x: list(np.array(x)[0]))\
    .collect())
  W = np.array([newW.get(i, val) for i, val in enumerate(W) ])
  #w_broadcast.destroy()
  w_broadcast = sc.broadcast(W)
  newH = dict(R_i\
    .mapValues(lambda row:computeOptimizeMatrix(row,w_broadcast,lamb))\
    .sortByKey()\
    .mapValues(lambda x: list(np.array(x)[0]))\
    .collect())
  H = np.array([newH.get(i, val) for i, val in enumerate(H) ])
  #h_broadcast.destroy()
  h_broadcast = sc.broadcast(H)
  sse = M.map(lambda x: (x[2]- W[[x[0]],:].dot(H[[x[1]], :].T))**2).reduce(lambda x,y: x+y)[0,0]
  count = M.count()
  mse = pow((sse/count), 0.5)
  print("Iteration %d:" % i)
  print("\nRMSE: %5.4f\n" % mse)


                                                                                

Iteration 0:

RMSE: 0.7698



                                                                                

Iteration 1:

RMSE: 0.7349



                                                                                

Iteration 2:

RMSE: 0.7144



                                                                                

Iteration 3:

RMSE: 0.7009



                                                                                

Iteration 4:

RMSE: 0.6920



                                                                                

Iteration 5:

RMSE: 0.6857



                                                                                

Iteration 6:

RMSE: 0.6810



                                                                                

Iteration 7:

RMSE: 0.6775



                                                                                

Iteration 8:

RMSE: 0.6747



                                                                                

Iteration 9:

RMSE: 0.6725



                                                                                

Iteration 10:

RMSE: 0.6707



                                                                                

Iteration 11:

RMSE: 0.6691



                                                                                

Iteration 12:

RMSE: 0.6678



                                                                                

Iteration 13:

RMSE: 0.6666



                                                                                

Iteration 14:

RMSE: 0.6656



                                                                                

Iteration 15:

RMSE: 0.6647



                                                                                

Iteration 16:

RMSE: 0.6639



                                                                                

Iteration 17:

RMSE: 0.6632



                                                                                

Iteration 18:

RMSE: 0.6625



                                                                                

Iteration 19:

RMSE: 0.6619



                                                                                

Iteration 20:

RMSE: 0.6614



                                                                                

Iteration 21:

RMSE: 0.6609



                                                                                

Iteration 22:

RMSE: 0.6604



                                                                                

Iteration 23:

RMSE: 0.6599



                                                                                

Iteration 24:

RMSE: 0.6594



                                                                                

Iteration 25:

RMSE: 0.6590



                                                                                

Iteration 26:

RMSE: 0.6586



                                                                                

Iteration 27:

RMSE: 0.6582



ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/noing/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/noing/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

Exception in thread "serve RDD 2489" java.net.SocketTimeoutException: Accept timed out
	at java.net.PlainSocketImpl.socketAccept(Native Method)
	at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
	at java.net.ServerSocket.implAccept(ServerSocket.java:560)
	at java.net.ServerSocket.accept(ServerSocket.java:528)
	at org.apache.spark.security.SocketAuthServer$$anon$1.run(SocketAuthServer.scala:64)


In [None]:
np.array(newW[0].flatten())[0]

array([0.67517169, 1.46851969, 0.35888359, 0.40483493, 0.74416579,
       0.52722992, 0.43725883, 0.96443489, 1.07526524, 0.57850029])

In [None]:
np.array(list(map(lambda x: x.flatten(), newW))).T.shape

(10, 1, 943)

In [None]:
M_count = len(sorted_items)
U_count = len(sorted_users)
F = numFactors
partitions = numWorkers
ITERATIONS = 2

In [None]:
R = np.zeros((M_count, U_count))
rating_rdd = M.map(lambda x: (x[0], x[1][0], x[1][1])).collect()


In [None]:
ms = np.matrix(np.random.rand(M_count, F))
us = np.matrix(np.random.rand(U_count, F))

Rb = sc.broadcast(R)
msb = sc.broadcast(ms)
usb = sc.broadcast(us)



for i in range(ITERATIONS):
    ms_ = sc.parallelize(range(M), partitions) \
        .map(lambda x: update(x, usb.value, Rb.value)) \
        .collect()
    # collect() returns a list, so array ends up being
    # a 3-d array, we take the first 2 dims for the matrix
    ms = np.matrix(np.array(ms_)[:, :, 0])
    msb = sc.broadcast(ms)

    us_ = sc.parallelize(range(U), partitions) \
        .map(lambda x: update(x, msb.value, Rb.value.T)) \
        .collect()
    us = np.matrix(np.array(us_)[:, :, 0])
    usb = sc.broadcast(us)

    error = rmse(R, ms, us)
    print("Iteration %d:" % i)
    print("\nRMSE: %5.4f\n" % error)


In [None]:
def get_error_square(rating, i, j):
  pred = us[[i], :].dot(ms[[j], :].T)[0][0]
  return (rating - pred)**2

ITERATIONS = 100
for i in range(ITERATIONS):
    ms_ = sc.parallelize(range(M_count), partitions) \
        .map(lambda x: update(x, usb.value, Rb.value)) \
        .collect()
    # collect() returns a list, so array ends up being
    # a 3-d array, we take the first 2 dims for the matrix
    ms = np.matrix(np.array(ms_)[:, :, 0])
    msb = sc.broadcast(ms)

    us_ = sc.parallelize(range(U_count), partitions) \
        .map(lambda x: update(x, msb.value, Rb.value.T)) \
        .collect()
    us = np.matrix(np.array(us_)[:, :, 0])
    usb = sc.broadcast(us)

    
    sse = M.map(lambda x: get_error_square(x[1][1], sorted_users[x[0]], sorted_items[x[1][0]])).reduce(lambda x,y: x+y)[0, 0]
    count = M.count()
    rmse = pow(sse/count, 0.5)
    error = get_rmse(R, ms, us)
    print("Iteration %d:" % i)
    print("\nRMSE: %5.4f" % error)
    print("\nGlobal RMSE: %5.4f\n" % rmse)


22/12/11 11:31:56 ERROR Executor: Exception in task 1.0 in stage 829.0 (TID 1239)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_54366/243499966.py", line 8, in <lambda>
  File "/tmp/ipykernel_54366/4186574233.py", line 15, in update
  File "/usr/lib/python3/dist-packages/numpy/matrixlib/defmatrix.py", line 2

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 2 in stage 829.0 failed 1 times, most recent failure: Lost task 2.0 in stage 829.0 (TID 1240) (192.168.1.16 executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_54366/243499966.py", line 8, in <lambda>
  File "/tmp/ipykernel_54366/4186574233.py", line 15, in update
  File "/usr/lib/python3/dist-packages/numpy/matrixlib/defmatrix.py", line 218, in __mul__
    return N.dot(self, asmatrix(other))
  File "<__array_function__ internals>", line 5, in dot
ValueError: shapes (10,943) and (1,943) not aligned: 943 (dim 1) != 1 (dim 0)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:765)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:747)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2268)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:750)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:180)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.GeneratedMethodAccessor51.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:750)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 686, in main
    process()
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 678, in process
    serializer.dump_stream(out_iter, outfile)
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 273, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/noing/data_framework/spark-3.3.0-bin-hadoop3/python/lib/pyspark.zip/pyspark/util.py", line 81, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_54366/243499966.py", line 8, in <lambda>
  File "/tmp/ipykernel_54366/4186574233.py", line 15, in update
  File "/usr/lib/python3/dist-packages/numpy/matrixlib/defmatrix.py", line 218, in __mul__
    return N.dot(self, asmatrix(other))
  File "<__array_function__ internals>", line 5, in dot
ValueError: shapes (10,943) and (1,943) not aligned: 943 (dim 1) != 1 (dim 0)

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:559)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:765)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:747)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:512)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1021)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2268)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:90)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
usb.value.shape[1]

In [None]:
def get_error_square(rating, i, j):
  pred = us[:, [i]].T.dot(ms[:, [j]])[0][0]
  return (rating - pred)**2

In [None]:
ms_ = sc.parallelize(range(M_count), partitions) \
    .map(lambda x: update(x, usb.value, Rb.value)) \
    .collect()
# collect() returns a list, so array ends up being
# a 3-d array, we take the first 2 dims for the matrix
ms = np.matrix(np.array(ms_)[:, :, 0])
msb = sc.broadcast(ms)

us_ = sc.parallelize(range(U_count), partitions) \
    .map(lambda x: update(x, msb.value, Rb.value.T)) \
    .collect()
us = np.matrix(np.array(us_)[:, :, 0])
usb = sc.broadcast(us)
mse = get_rmse(M, ms, us)
error = rmse(R, ms, us)
print("Iteration %d:" % i)
print("\nErr: %5.4f\n" % error)
print("\nRMSE: %5.4f\n" % rmse)


In [None]:
R

In [None]:
XtX = usb.value.T * usb.value
inv(XtX + 0.01 * np.eye(F)) * Xty

In [None]:
Xty = usb.value.T * Rb.value[6, :].T
Xty