# User-User collaborative filtering

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import udf, col, lit
from pyspark.sql.types import FloatType
import pyspark.sql.functions as F
from pyspark.sql.functions import col, lit, avg

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import BucketedRandomProjectionLSH

## data preparation

In [2]:
  # Initialize Spark session
spark = SparkSession.builder.appName("ALSRecommender").config("spark.driver.memory", "4g").config("spark.executor.memory", "4g").getOrCreate()


# Load data from a CSV file, considering semicolon delimiter and quotes
data = spark.read.csv("Book reviews/BX-Book-Ratings.csv", header=True, inferSchema=True, sep=';', quote='"')

# Select and rename the columns according to the CSV file's format
ratings = data.select(
    col('User-ID').cast('int').alias('userId'), 
    col('ISBN').alias('bookId'), 
    col('Book-Rating').cast('int').alias('rating')
)

# Transform the ISBN string to an index using StringIndexer
stringIndexer = StringIndexer(inputCol="bookId", outputCol="bookIdIndexed")
model = stringIndexer.fit(ratings)
ratingsIndexed = model.transform(ratings)

# Split data into training and test sets
seed = 12345
(training, test) = ratingsIndexed.randomSplit([0.8, 0.2], seed=seed)

## Modeling

In [4]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import dask.dataframe as dd


ddf = dd.read_csv('Book reviews/BX-Book-Ratings.csv', sep=';', encoding='latin-1',dtype={'User-ID': 'int32', 'Book-Rating': 'int8', 'ISBN': 'category'})

rating_summary = ddf.groupby('ISBN')['Book-Rating'].mean().compute()

df = ddf.compute()
df = df.head(60000)


#df = df[df['Book-Rating'] > 0] 


# create user-item interaction matrix
ratings_matrix = df.pivot(index='User-ID', columns='ISBN', values='Book-Rating').fillna(0)


# calculate the cosine similarity of the users
cosine_sim = cosine_similarity(ratings_matrix, ratings_matrix)

# convert the similarity matrix into a DataFrame
cosine_sim_df = pd.DataFrame(cosine_sim, index=ratings_matrix.index, columns=ratings_matrix.index)

# create a df with the 5 most similar users for each user
top_5_similar_users = {}
for user in cosine_sim_df.columns:
    top_5_similar_users[user] = cosine_sim_df[user].sort_values(ascending=False)[1:6]
    
top_5_similar_users_df = pd.DataFrame(top_5_similar_users)


# predict the rating that a user would give to a book using mean of top 5 similar users
def predict_rating(user_id, book_id):
    top_5_similar = top_5_similar_users_df[user_id]
    
    # get the ratings of the top 5 similar users for the book
    ratings = []
    for user in top_5_similar.index:
        rating = ratings_matrix.loc[user, book_id]
        ratings.append(rating)
        
    # calculate the mean rating
    prediction = np.mean(ratings)
    
    return prediction


# get the mean rating for each row in df and put it in predictions column
df['prediction'] = df.apply(lambda x: predict_rating(x['User-ID'], x['ISBN']), axis=1)

# calculate the RMSE
rmse = np.sqrt(np.mean((df['Book-Rating'] - df['prediction'])**2))
print(rmse)

5.200383931769
