# Exercise 2

## Imports

In [74]:
import pickle
import os.path
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.types import StringType, ArrayType, FloatType, DoubleType, IntegerType
from itertools import combinations
from typing import Iterable, Any, List, Set

import pandas as pd
import numpy as np
from typing import Union

import matplotlib.pyplot as plt

## Spark initialization

In [75]:
# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName('exercise2')
    .config('spark.master', 'local[*]')
    .getOrCreate()
)

# Number of partitions used by shuffles (groupBy / joins) in this session.
spark.conf.set("spark.sql.shuffle.partitions", "16")

## Prepare the data

*Start with the small dataset first.*

In [76]:
# Read the headerless, tab-separated ratings file and keep the first three
# columns as integers: user_id, item_id, rating. The fourth column (_c3) is
# dropped because nothing below uses it.
df = (
    spark.read
    .option('sep', '\t')
    .option('header', 'false')
    .csv('data/ml-100k/u.data')
    .drop('_c3')
    .select(
        F.col('_c0').cast(IntegerType()).alias('user_id'),
        F.col('_c1').cast(IntegerType()).alias('item_id'),
        F.col('_c2').cast(IntegerType()).alias('rating'),
    )
)

## Collaborative Filtering

### Calculate baselines.

In [77]:
# Global mean rating over the whole dataset (collected to the driver as a scalar).
mu = df.agg(F.avg('rating')).collect()[0][0]

# Mean rating given by each user -> columns: user_id, user_avg.
user_baseLine_df = df.groupBy('user_id').agg(
    F.avg('rating').alias('user_avg')
)

# Mean rating received by each item -> columns: item_id, item_avg.
item_baseLine_df = df.groupBy('item_id').agg(
    F.avg('rating').alias('item_avg')
)

In [78]:
def calculate_user_item_baseline(user_id: int, item_id: int) -> float:
    """Return the baseline rating estimate for a (user, item) pair.

    The baseline is ``user_avg + item_avg - mu``, where ``mu`` is the global
    mean rating, ``user_avg`` the user's mean rating and ``item_avg`` the
    item's mean rating.

    Args:
        user_id: Id of the user, looked up in ``user_baseLine_df``.
        item_id: Id of the item, looked up in ``item_baseLine_df``.

    Returns:
        The baseline estimate. If the user (or the item) has no ratings, the
        missing average is replaced by ``mu`` so that component contributes
        no deviation, instead of raising ``IndexError`` on the empty lookup.

    Note:
        Each call runs two small Spark jobs (one ``collect`` per lookup).
    """
    user_rows = user_baseLine_df.filter(F.col('user_id') == user_id).collect()
    item_rows = item_baseLine_df.filter(F.col('item_id') == item_id).collect()

    # Cold start: an unseen user/item has no average, so fall back to the global mean.
    user_avg = user_rows[0]["user_avg"] if user_rows else mu
    item_avg = item_rows[0]["item_avg"] if item_rows else mu

    return -mu + user_avg + item_avg

### Clustering the movies

In [None]:
# Number of distinct users and items in the ratings table.
# NOTE(review): user_number is used as the length of each utility-matrix row,
# which presumably relies on user ids being exactly 1..user_number — confirm
# before switching to a dataset with gaps in the ids.
user_number, item_number = (
    df.select(id_column).distinct().count()
    for id_column in ('user_id', 'item_id')
)

In [79]:
@F.udf(returnType=ArrayType(IntegerType(),False))
def utility_matrix_row(elems: Iterable[Any]):
    """Build one dense utility-matrix row from (user_id, rating) pairs.

    Args:
        elems: Iterable of two-element sequences ``(user_id, rating)``.

    Returns:
        A list of length ``user_number`` in which position ``user_id - 1``
        holds that user's rating and every other position is 0.
    """
    row = [0] * user_number

    for uid, score in elems:
        # User ids are 1-based, list positions are 0-based.
        row[uid - 1] = score

    return row

In [80]:
from pyspark.ml.clustering import BisectingKMeans

# Passed to BisectingKMeans as minDivisibleClusterSize.
neighbours = 10

# One row per item: 'ratings' is the dense per-user rating vector, cast to a
# non-nullable float array so it can be used as the features column.
dataset = (
    df.groupBy('item_id')
    .agg(F.collect_list(F.array('user_id', 'rating')).alias('ratings'))
    .withColumn(
        'ratings',
        utility_matrix_row(F.col('ratings')).cast(ArrayType(FloatType(), False)),
    )
)

# Cluster the items into 10 groups; the fixed seed keeps runs reproducible.
bkm = BisectingKMeans(
    featuresCol='ratings',
    predictionCol="cluster_id",
    minDivisibleClusterSize=neighbours,
    k=10,
    seed=1,
)
model = bkm.fit(dataset)


                                                                                

DataFrame with the cluster id assigned to each movie.

In [81]:
# Apply the fitted model to the item vectors; adds the "cluster_id" prediction column.
clusters_df = model.transform(dataset)

### Find similar items

In [82]:
def find_most_similar(user_id: int , item_id: int):
    """Return the items in ``item_id``'s cluster that ``user_id`` has rated.

    Args:
        user_id: 1-based id of the user whose rated items are wanted.
        item_id: Id of the item whose cluster is searched.

    Returns:
        A DataFrame with the rows of ``clusters_df`` that belong to the same
        cluster as ``item_id`` and whose rating by ``user_id`` is non-zero
        (0 encodes "not rated").

    Raises:
        IndexError: If ``item_id`` is not present in ``clusters_df``.
    """
    item_id_cluster = clusters_df.filter(F.col('item_id') == item_id).select('cluster_id').collect()[0][0]

    # The 'ratings' array is 0-based while user ids are 1-based: the rating of
    # user u is stored at position u - 1 (see utility_matrix_row). Indexing
    # with user_id directly would test the *next* user's rating.
    user_position = user_id - 1

    cluster = (clusters_df.filter(F.col('cluster_id') == item_id_cluster)
        .filter(F.col('ratings')[user_position] != 0)
    )

    return cluster
    

Calculate similarities.

In [106]:
def _center_rated(values: Iterable[Any]) -> np.ndarray:
    """Mean-centre the rated (non-zero) entries of a rating vector.

    Unrated entries are encoded as 0 and are left at 0, so they contribute
    nothing to dot products or norms computed on the result.
    """
    centered = np.array(values, dtype=float)
    rated = centered != 0
    if rated.any():
        centered[rated] -= centered[rated].mean()
    return centered


def predict_rating(user_id:int , item_id:int) -> List[Any]:
    """Compute the similarity between ``item_id`` and the candidate items.

    The candidates are the items returned by ``find_most_similar``: items in
    the same cluster as ``item_id`` that ``user_id`` has rated. For each
    candidate the centred-cosine (Pearson) similarity with the target item's
    rating vector is computed.

    Args:
        user_id: Id of the user the prediction is for.
        item_id: Id of the item whose rating should be predicted.

    Returns:
        The collected candidate rows, each extended with a
        ``pearson_correlation`` column. The similarity is 0.0 when either
        vector has zero norm after centring (no usable signal).

    Raises:
        IndexError: If ``item_id`` is not present in ``clusters_df``.
    """
    candidate_items_df = find_most_similar(user_id, item_id)

    # Look the target item up in the full clustered table: the candidate set
    # only holds items the user has already rated, so it does not contain the
    # target in the case that matters (predicting an unseen item).
    item_ratings = clusters_df.filter(F.col('item_id') == item_id).select('ratings').collect()[0][0]

    item_centered = _center_rated(item_ratings)
    item_norm = float(np.sqrt((item_centered ** 2).sum()))

    @F.udf(returnType=DoubleType())
    def calculate_pearson_correlation(elems: Iterable[Any]):
        ratings = _center_rated(elems)
        denominator = item_norm * float(np.sqrt((ratings ** 2).sum()))

        # A constant (or empty) rating vector has zero norm after centring;
        # report "no similarity" instead of dividing by zero and emitting NaN.
        if denominator == 0:
            return 0.0

        return float((item_centered * ratings).sum() / denominator)

    pearson_correlation = (candidate_items_df
        .withColumn('pearson_correlation', calculate_pearson_correlation(F.col('ratings')))
        .collect()
    )

    # TODO: combine these similarities with calculate_user_item_baseline(user_id, item_id)
    # to produce the final predicted rating; for now the similarity rows are returned.
    return pearson_correlation


In [107]:
# Example call for user 1 and item 1; the returned rows are shown as the cell output.
predict_rating(1,1)

                                                                                

[Row(item_id=1, ratings=[5.0, 4.0, 0.0, 0.0, 4.0, 4.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 3.0, 0.0, 1.0, 5.0, 4.0, 5.0, 0.0, 3.0, 5.0, 0.0, 5.0, 0.0, 5.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 4.0, 5.0, 5.0, 4.0, 5.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.0, 5.0, 5.0, 2.0, 0.0, 0.0, 2.0, 3.0, 4.0, 3.0, 3.0, 3.0, 0.0, 0.0, 4.0, 0.0, 4.0, 2.0, 0.0, 4.0, 0.0, 5.0, 0.0, 4.0, 0.0, 4.0, 4.0, 4.0, 2.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 4.0, 5.0, 4.0, 5.0, 5.0, 4.0, 0.0, 4.0, 0.0, 3.0, 3.0, 0.0, 0.0, 0.0, 4.0, 0.0, 4.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 4.0, 4.0, 0.0, 0.0, 3.0, 4.0, 0.0, 0.0, 4.0, 0.0, 5.0, 4.0, 0.0, 0.0, 5.0, 0.0, 0.0, 3.0, 4.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 3.0, 0.0, 0.0, 4.0, 0.0, 4.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 4.0, 0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 3.0, 4.0, 0.0, 0.0, 3.0, 4.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 4.0, 4.0, 0.0, 

## Testing

Leave out 10% of the ratings for evaluation.

In [None]:
# TODO: hold out 10% of the ratings for evaluation (not implemented yet).
...

Use an evaluation metric on the held-out 10% (candidates: precision at 10 or rank correlation).

In [None]:
# TODO: compute the evaluation metric on the held-out ratings (not implemented yet).
...