# Exercise 2

## Imports

In [1]:
import pickle
import os.path
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, ArrayType, FloatType, DoubleType, IntegerType
from itertools import combinations
from typing import Iterable, Any, List, Set

import pandas as pd
import numpy as np
from typing import Union

import matplotlib.pyplot as plt

from pyspark.ml.clustering import BisectingKMeans

## Spark initialization

In [3]:
# Local-mode Spark session for this notebook.
#
# With master `local[*]` the tasks run inside the driver JVM, so the heap that
# matters is the driver's. `spark.executor.memory` alone does not enlarge it,
# and the recorded run of this notebook died with
# `java.lang.OutOfMemoryError: Java heap space` while pickling rows for the
# Python UDFs. `spark.driver.memory` is therefore set explicitly.
# NOTE: this only takes effect when the JVM is started by this call (fresh
# kernel); it mitigates, but does not remove, the large per-row payload built
# by the prediction pipeline further down.
spark = (
    SparkSession.builder
    .appName('exercise2')
    .config('spark.master', 'local[*]')
    .config("spark.executor.memory", "1g")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

# Keep the number of shuffle partitions small; the dataset is tiny.
spark.conf.set("spark.sql.shuffle.partitions", "16")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/25 21:09:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Prepare the data

*Start with the small dataset first.*

In [4]:
# Read the tab-separated, header-less ratings file. Spark names the columns
# `_c0`, `_c1`, ... because no header is present.
df = (
    spark.read
    .options(header='false', sep='\t')
    .csv('data/ml-100k/u.data')
)

# Keep the first three columns as integers under readable names; the fourth
# column (`_c3`) is discarded.
df = df.select(
    F.col('_c0').cast(IntegerType()).alias('user_id'),
    F.col('_c1').cast(IntegerType()).alias('item_id'),
    F.col('_c2').cast(IntegerType()).alias('rating'),
)

## Collaborative Filtering

### Calculate baselines.

In [5]:
# Baseline statistics, collected to the driver as plain Python objects so the
# Python UDFs below can look them up.
# NOTE(review): these averages are computed over the full `df`, i.e. they also
# include the ratings that are later held out as `test_set`.

# user_id -> mean rating given by that user
user_baseLine = {
    user_id: user_avg
    for user_id, user_avg in (
        df.groupBy('user_id')
        .agg(F.avg('rating').alias('user_avg'))
        .collect()
    )
}

# item_id -> mean rating received by that item
item_baseLine = {
    item_id: item_avg
    for item_id, item_avg in (
        df.groupBy('item_id')
        .agg(F.avg('rating').alias('item_avg'))
        .collect()
    )
}

# Global mean rating over all rows of `df`.
mu = df.agg(F.avg('rating')).first()[0]

                                                                                

In [6]:
def calculate_user_item_baseline(user_id: int, item_id: int) -> float:
    """Return the baseline estimate for a (user, item) pair.

    The estimate is ``user_avg + item_avg - mu``, using the module-level
    ``user_baseLine`` / ``item_baseLine`` dictionaries and the global mean
    ``mu``.

    Raises:
        KeyError: if ``user_id`` or ``item_id`` has no entry in the
            corresponding baseline dictionary.
    """
    user_deviation = user_baseLine[user_id] - mu
    return user_deviation + item_baseLine[item_id]

### Clustering the movies

In [7]:
# Distinct user / item counts.
# `user_number` is used as the length of each utility-matrix row, which is
# indexed by `user_id - 1`; this assumes user ids are contiguous and start at 1.
user_number = df.dropDuplicates(['user_id']).count()
item_number = df.dropDuplicates(['item_id']).count()

In [8]:
@F.udf(returnType=ArrayType(IntegerType(),False))
def utility_matrix_row(elems: Iterable[Any]):
    """Build one dense utility-matrix row for an item.

    Args:
        elems: iterable of ``(user_id, rating)`` pairs for a single item.

    Returns:
        A list of length ``user_number`` where position ``user_id - 1`` holds
        that user's rating and every other position is 0 (meaning "not rated").
    """
    dense_row = [0 for _ in range(user_number)]

    for pair in elems:
        uid, score = pair
        dense_row[uid - 1] = score

    return dense_row

### Find similar items

In [9]:
from pyspark.sql.types import StructType, StructField

@F.udf(returnType=StructType([
    StructField('items_in_cluster', ArrayType(IntegerType(), False)),
    StructField('ratings_in_cluster', ArrayType(ArrayType(DoubleType(), False), False)),
]))
def find_most_similar(user_id: int, items_in_cluster: List[int], ratings_in_cluster: List[List[float]]):
    """Keep only the cluster items that ``user_id`` has rated.

    Despite its name, this function does not rank by similarity; it filters the
    cluster down to items whose rating row has a non-zero entry for the user.

    Args:
        user_id: 1-based user id; the user's rating sits at index ``user_id - 1``.
        items_in_cluster: item ids belonging to the cluster.
        ratings_in_cluster: rating rows, aligned with ``items_in_cluster``.

    Returns:
        A 2-tuple ``(item_ids, rating_rows)`` matching the declared struct.
        Both lists are empty when the user rated nothing in the cluster.
    """
    kept_items: List[int] = []
    kept_ratings: List[List[float]] = []

    for item_id, ratings in zip(items_in_cluster, ratings_in_cluster):
        if ratings[user_id - 1] != 0:
            kept_items.append(item_id)
            kept_ratings.append(ratings)

    # Always return exactly two fields. The previous `tuple(zip(*...))` form
    # collapsed to an empty tuple when nothing matched, which does not fit the
    # two-field struct schema.
    return kept_items, kept_ratings

Calculate similarities.

In [10]:
@F.udf(returnType=DoubleType())
def predict_rating_no_numpy(user_id: int, item_id: int, ratings: List[float], items_in_cluster: List[int], ratings_in_cluster: List[List[float]]) -> float:
    """Predict the rating of ``item_id`` by ``user_id`` (pure-Python version).

    The prediction is the user/item baseline plus a similarity-weighted average
    of how much the user's ratings of the neighbouring items deviate from their
    own baselines. Similarity is the Pearson correlation (centred cosine)
    between the target item's rating row and each neighbour's row.

    Args:
        user_id: 1-based user id; the user's rating sits at index ``user_id - 1``.
        item_id: id of the item whose rating is predicted.
        ratings: rating row of ``item_id`` (0 means "not rated").
        items_in_cluster: ids of the neighbouring items.
        ratings_in_cluster: rating rows aligned with ``items_in_cluster``.

    Returns:
        The predicted rating. Falls back to the plain baseline when there are no
        neighbours or no neighbour has a usable (non-zero) similarity.
    """

    # TODO: maybe remove item_id itself?

    def centre(row):
        # Subtract the mean of the *rated* entries from the rated entries only.
        # Unrated entries stay 0 so they do not contribute to the correlation.
        rated = [r for r in row if r != 0]
        if not rated:
            return [0.0] * len(row)
        mean = sum(rated) / len(rated)
        return [r - mean if r != 0 else 0.0 for r in row]

    def norm(row):
        return sum(r * r for r in row) ** .5

    base_line = calculate_user_item_baseline(user_id, item_id)

    target = centre(ratings)
    target_norm = norm(target)

    weighted_deviations = 0.0
    weight_total = 0.0
    for other_item, other_row in zip(items_in_cluster, ratings_in_cluster):
        other = centre(other_row)
        denominator = target_norm * norm(other)
        if denominator == 0:
            # Constant (or empty) rating row: correlation is undefined, skip it.
            continue

        similarity = sum(a * b for a, b in zip(target, other)) / denominator
        deviation = other_row[user_id - 1] - calculate_user_item_baseline(user_id, other_item)

        weighted_deviations += similarity * deviation
        # Normalise by |similarity| so negative correlations cannot cancel the
        # denominator or flip the sign of the correction.
        weight_total += abs(similarity)

    if weight_total == 0:
        return base_line

    return base_line + weighted_deviations / weight_total

In [11]:
@F.udf(returnType=DoubleType())
def predict_rating(user_id: int, item_id: int, ratings: List[float], items_in_cluster: List[int], ratings_in_cluster: List[List[float]]) -> float:
    """Predict the rating of ``item_id`` by ``user_id`` (NumPy version).

    Same contract as ``predict_rating_no_numpy``: baseline plus a
    similarity-weighted average of the user's baseline-adjusted ratings of the
    neighbouring items, with Pearson correlation (centred cosine) between
    rating rows as the similarity.

    Args:
        user_id: 1-based user id; the user's rating sits at index ``user_id - 1``.
        item_id: id of the item whose rating is predicted.
        ratings: rating row of ``item_id`` (0 means "not rated").
        items_in_cluster: ids of the neighbouring items.
        ratings_in_cluster: rating rows aligned with ``items_in_cluster``.

    Returns:
        The predicted rating as a Python ``float``. Falls back to the plain
        baseline when there are no neighbours or no neighbour has a usable
        (non-zero) similarity.
    """

    # TODO: maybe remove item_id itself?

    # TODO: with numpy or not?
    base_line = calculate_user_item_baseline(user_id, item_id)
    if not items_in_cluster:
        return base_line

    def centre(matrix):
        # Subtract each row's mean over its rated entries, leaving unrated
        # entries at 0. Rows without any rating stay all-zero.
        rated = matrix != 0
        counts = np.maximum(rated.sum(axis=-1, keepdims=True), 1)
        means = matrix.sum(axis=-1, keepdims=True) / counts
        return np.where(rated, matrix - means, 0.0)

    cluster_matrix = np.asarray(ratings_in_cluster, dtype=float)  # one row per neighbour item
    target = centre(np.asarray(ratings, dtype=float))
    others = centre(cluster_matrix)

    dots = others @ target
    norms = np.linalg.norm(others, axis=1) * np.linalg.norm(target)
    # Undefined correlations (zero norm) are treated as similarity 0.
    pearson_correlation_similarities = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Normalise by |similarity| so negative correlations cannot cancel the
    # denominator or flip the sign of the correction.
    weight_total = float(np.abs(pearson_correlation_similarities).sum())
    if weight_total == 0:
        return base_line

    other_ratings = cluster_matrix[:, user_id - 1]
    other_baselines = np.array([calculate_user_item_baseline(user_id, item) for item in items_in_cluster])

    # Convert to a built-in float: the UDF's declared return type is DoubleType.
    return base_line + float(np.dot(pearson_correlation_similarities, other_ratings - other_baselines) / weight_total)

## Testing

Leave out 10% of the ratings for evaluation.

In [12]:
seed_random = 1
# TODO: maybe add a random column and filter on it instead? randomSplit seems too contiguous
train_set, test_set = df.randomSplit(weights=[9.0, 1.0], seed=seed_random)

# One row per item: `ratings` is the dense per-user rating vector built by
# `utility_matrix_row`, cast to doubles.
train_matrix = (
    train_set
    .groupBy('item_id')
    .agg(F.collect_list(F.array('user_id', 'rating')).alias('ratings'))
    .select(
        'item_id',
        utility_matrix_row('ratings').cast(ArrayType(DoubleType(), False)).alias('ratings'),
    )
)

In [13]:
def print_progress(progress: float, message: str):
    """Print a single-line progress indicator that overwrites itself.

    Args:
        progress: completed fraction, e.g. ``0.5`` for 50%.
        message: status text, padded to 100 characters so a shorter message
            fully overwrites a longer previous one.

    The line ends with a carriage return instead of a newline, so the next call
    rewrites the same line.
    """
    # `4.0%` renders a right-aligned whole-number percentage (e.g. " 50%",
    # "100%"). The previous `3%` spec used the default precision of six
    # decimals ("50.000000%"), which made the width meaningless.
    print(f"[{progress:4.0%}] {message:100}", end="\r")

In [14]:
# Cluster the items on their rating vectors with bisecting k-means.
neighbours = 10
bkm = BisectingKMeans(
    featuresCol='ratings',
    predictionCol="cluster_id",
    k=10,
    seed=1,
    minDivisibleClusterSize=neighbours,
)
model = bkm.fit(train_matrix)

# Same rows as `train_matrix`, plus the assigned `cluster_id`.
train_matrix_with_clusters = model.transform(train_matrix)

[Stage 27:>                                                         (0 + 1) / 1]

23/04/25 21:09:35 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/04/25 21:09:35 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

In [15]:
# TODO: show how many of the items in the test set could not be predicted because they were not in the train set

# Rating vector and cluster of every training item.
item_features = train_matrix_with_clusters.select('item_id', 'ratings', 'cluster_id')

# For every cluster: the ids of its items and their rating vectors (aligned lists).
cluster_members = (
    train_matrix_with_clusters
    .groupBy('cluster_id')
    .agg(
        F.collect_list('item_id').alias('items_in_cluster'),
        F.collect_list('ratings').alias('ratings_in_cluster'),
    )
)

# NOTE(review): after the second join every test row carries the rating vectors
# of *all* items in its cluster, and those rows are pickled for the Python UDFs.
# The recorded run failed with java.lang.OutOfMemoryError at exactly that step.
predictions = (
    test_set
    .withColumnRenamed('rating', 'true_rating')
    .join(item_features, on='item_id', how='inner')       # ratings and cluster_id of item_id
    .join(cluster_members, on='cluster_id', how='inner')  # all items sharing that cluster
    .withColumn(
        'find_most_similar_results',
        find_most_similar('user_id', 'items_in_cluster', 'ratings_in_cluster'),
    )
    # Replace the cluster lists with the filtered ones returned by the UDF.
    .withColumns({
        'items_in_cluster': F.col('find_most_similar_results')['items_in_cluster'],
        'ratings_in_cluster': F.col('find_most_similar_results')['ratings_in_cluster'],
    })
    .drop('find_most_similar_results')
    .withColumn(
        'predicted_rating',
        predict_rating_no_numpy('user_id', 'item_id', 'ratings', 'items_in_cluster', 'ratings_in_cluster'),
    )
    .select('user_id', 'item_id', 'true_rating', 'predicted_rating')
)

In [16]:
# Show the first 20 predictions; this is the first action that actually runs the pipeline.
predictions.show(n=20)

Traceback (most recent call last):                                  (0 + 1) / 1]
  File "/home/pedro/Desktop/MEI-4ano/2semestre/MDLE/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/daemon.py", line 187, in manager
  File "/home/pedro/Desktop/MEI-4ano/2semestre/MDLE/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/home/pedro/Desktop/MEI-4ano/2semestre/MDLE/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 730, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/home/pedro/Desktop/MEI-4ano/2semestre/MDLE/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 595, in read_int
    raise EOFError
EOFError


23/04/25 21:10:07 ERROR Utils: Uncaught exception in thread stdout writer for /usr/bin/python3
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
	at java.base/java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:120)
	at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:95)
	at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:156)
	at java.base/java.io.OutputStream.write(OutputStream.java:122)
	at net.razorvine.pickle.Pickler.put_float(Pickler.java:707)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:286)
	at net.razorvine.pickle.Pickler.save(Pickler.java:185)
	at net.razorvine.pickle.Pickler.put_collection(Pickler.java:407)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:363)
	at net.razorvine.pickle.Pickler.save(Pickler.java:185)
	at net.razorvine.pickle.Pickler.put_collection(Pickler.java:407)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:363)
	at

Exception in thread "stdout writer for /usr/bin/python3" java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
	at java.base/java.io.ByteArrayOutputStream.grow(ByteArrayOutputStream.java:120)
	at java.base/java.io.ByteArrayOutputStream.ensureCapacity(ByteArrayOutputStream.java:95)
	at java.base/java.io.ByteArrayOutputStream.write(ByteArrayOutputStream.java:156)
	at java.base/java.io.OutputStream.write(OutputStream.java:122)
	at net.razorvine.pickle.Pickler.put_float(Pickler.java:707)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:286)
	at net.razorvine.pickle.Pickler.save(Pickler.java:185)
	at net.razorvine.pickle.Pickler.put_collection(Pickler.java:407)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:363)
	at net.razorvine.pickle.Pickler.save(Pickler.java:185)
	at net.razorvine.pickle.Pickler.put_collection(Pickler.java:407)
	at net.razorvine.pickle.Pickler.dispatch(Pickler.java:363)
	at net.razorvine.pickle.Pickler.save(Pic

Use an evaluation metric on the held-out 10% (precision at 10 / rank correlation?).

In [17]:
# Sum of squared errors between the true and the predicted ratings.
(predictions
    .select(
        ((F.col('true_rating') - F.col('predicted_rating')) ** 2).alias('square_error')
    )
    .agg(F.sum('square_error'))
).show()

[Stage 294:>                                                        (0 + 1) / 1]

23/04/25 21:10:11 ERROR Utils: Uncaught exception in thread stdout writer for /usr/bin/python3
java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
	at java.base/java.io.ByteArrayOutputStream.toByteArray(ByteArrayOutputStream.java:211)
	at net.razorvine.pickle.Pickler.dumps(Pickler.java:142)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.$anonfun$evaluate$4(BatchEvalPythonExec.scala:77)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$Lambda$3773/0x0000000841560040.apply(Unknown Source)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFR

Exception in thread "stdout writer for /usr/bin/python3" java.lang.OutOfMemoryError: Java heap space
	at java.base/java.util.Arrays.copyOf(Arrays.java:3745)
	at java.base/java.io.ByteArrayOutputStream.toByteArray(ByteArrayOutputStream.java:211)
	at net.razorvine.pickle.Pickler.dumps(Pickler.java:142)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.$anonfun$evaluate$4(BatchEvalPythonExec.scala:77)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec$$Lambda$3773/0x0000000841560040.apply(Unknown Source)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.api.python.PythonRDD$.writeIteratorToStream(PythonRDD.scala:307)
	at org.apache.spark.sql.execution.python.PythonUDFRunner$$anon$1.writeIteratorToStream(PythonUDFRunner.scala:53)
	at org.apache.spark.a