## Required installations

In [None]:
!pip install gdown
!pip install pyspark

## Required imports

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from keras.layers import Input,Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.regularizers import l2
from sklearn.metrics import recall_score, precision_score, f1_score
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow.keras.utils import get_custom_objects
get_custom_objects().update({'identity': tf.identity})
import numpy as np
import pandas as pd

## Spark Session


In [None]:
# One Spark session for the whole notebook, run in local mode ("local[*]")
# with 16g configured for both driver and executor memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Recommender")
    .config("spark.driver.memory", "16g")
    .config("spark.executor.memory", "16g")
    .getOrCreate()
)

## Load and format the ratings data

### 100k

In [None]:
# Download the 100k ratings file by its Google Drive file id.
# NOTE(review): load_100k() reads /kaggle/working/u.data by default - confirm
# that this download lands under that name in the working directory.
!gdown 1lwPW7OefaJnwsaqYBQs-wgcIGiatYLXb

def load_100k(path='/kaggle/working/u.data'):
    """Read the tab-separated 100k ratings file into a Spark DataFrame.

    Args:
        path: Location of the header-less, tab-delimited ratings file.
            Defaults to the Kaggle working directory used by this notebook.

    Returns:
        A Spark DataFrame with integer columns ``userId``, ``movieId`` and
        ``rating`` taken from the first three fields of each line. Any
        further fields in the file are dropped.
    """
    raw = spark.read.option("delimiter", "\t")\
                    .option("header", "False")\
                    .csv(path)
    # Spark names header-less CSV columns _c0, _c1, ...; cast and rename in one pass.
    return raw.select(
        F.col('_c0').cast("int").alias('userId'),
        F.col('_c1').cast("int").alias('movieId'),
        F.col('_c2').cast("int").alias('rating'),
    )

def dataPreprocessor(rating_df, movies_list, users_list):
    """Turn (user, movie, rating) rows into a dense user x movie matrix.

    Args:
        rating_df: Object whose ``collect()`` returns three-field rows of
            ``(userId, movieId, rating)`` (a Spark DataFrame in this notebook).
        movies_list: Unique movie ids; defines the column order.
        users_list: Unique user ids; defines the row order.

    Returns:
        A float32 pandas DataFrame indexed by ``users_list`` with
        ``movies_list`` as columns. Cells without a rating are 0. If a
        (user, movie) pair occurs more than once, the last row wins.

    Raises:
        KeyError: If a row refers to an id that is missing from
            ``users_list`` or ``movies_list``. Accepting it would silently
            change the shape of the matrix.
    """
    user_index = pd.Index(users_list)
    movie_index = pd.Index(movies_list)
    # Allocate the zero matrix directly as float32 instead of filling an
    # object-dtype frame one cell at a time through ``.loc``.
    matrix = np.zeros((len(user_index), len(movie_index)), dtype=np.float32)

    rows = rating_df.collect()
    if rows:
        triples = pd.DataFrame(
            [tuple(row) for row in rows], columns=["user", "movie", "rating"]
        ).drop_duplicates(subset=["user", "movie"], keep="last")

        # Translate ids to positional indices; -1 marks an id not in the lists.
        row_pos = user_index.get_indexer(triples["user"])
        col_pos = movie_index.get_indexer(triples["movie"])
        if (row_pos < 0).any() or (col_pos < 0).any():
            raise KeyError(
                "rating_df contains a userId or movieId that is not present "
                "in users_list / movies_list"
            )
        matrix[row_pos, col_pos] = triples["rating"].to_numpy(dtype=np.float32)

    return pd.DataFrame(matrix, index=user_index, columns=movie_index)

    
ratings_df = load_100k()

# The distinct ids fix the column (movie) and row (user) order of the dense
# matrices built below.
movies_list = [row[0] for row in ratings_df.select("movieId").distinct().collect()]
users_list = [row[0] for row in ratings_df.select("userId").distinct().collect()]

num_users = len(users_list)
num_items = len(movies_list)

# users_list already holds every distinct userId, so it is reused for the
# per-user sampling fractions instead of re-running distinct()/collect().
# NOTE(review): sampleBy keeps only ~90% (then ~80%) of each user's ratings and
# the rest is discarded before randomSplit, so a sizeable share of the ratings
# ends up in no split at all - confirm this is intended.
test_fractions = {user_id: 0.9 for user_id in users_list}
train_df, test_df = (
    ratings_df
    .sampleBy('userId', fractions=test_fractions, seed=42)
    .randomSplit([0.9, 0.1], seed=42)
)

validation_fractions = {user_id: 0.8 for user_id in users_list}
train_df, validation_df = (
    train_df
    .sampleBy('userId', fractions=validation_fractions, seed=42)
    .randomSplit([0.8, 0.2], seed=42)
)

# From here on train_df / validation_df are dense pandas matrices, while
# test_df stays a Spark DataFrame until the evaluation cell converts it.
train_df = dataPreprocessor(train_df, movies_list, users_list)
validation_df = dataPreprocessor(validation_df, movies_list, users_list)

### 1M

In [None]:
# Download the 1M ratings file by its Google Drive file id.
# NOTE(review): load_1m() reads /kaggle/working/ratings.dat by default -
# confirm that this download lands under that name in the working directory.
!gdown 18sHWE7Eu28hDqXib2PvesBYMea5AQmZs

def load_1m(path='/kaggle/working/ratings.dat'):
    """Read the ``::``-separated 1M ratings file into a Spark DataFrame.

    Args:
        path: Location of the header-less, ``::``-delimited ratings file.
            Defaults to the Kaggle working directory used by this notebook.

    Returns:
        A Spark DataFrame with integer columns ``userId``, ``movieId`` and
        ``rating`` taken from the first three fields of each line. Any
        further fields in the file are dropped.
    """
    raw = spark.read.option("delimiter", "::")\
                    .option("header", "False")\
                    .csv(path)
    # Spark names header-less CSV columns _c0, _c1, ...; cast and rename in one pass.
    return raw.select(
        F.col('_c0').cast("int").alias('userId'),
        F.col('_c1').cast("int").alias('movieId'),
        F.col('_c2').cast("int").alias('rating'),
    )
    
def dataPreprocessor(rating_df, movies_list, users_list):
    """Turn (user, movie, rating) rows into a dense user x movie matrix.

    Args:
        rating_df: Object whose ``collect()`` returns three-field rows of
            ``(userId, movieId, rating)`` (a Spark DataFrame in this notebook).
        movies_list: Unique movie ids; defines the column order.
        users_list: Unique user ids; defines the row order.

    Returns:
        A float32 pandas DataFrame indexed by ``users_list`` with
        ``movies_list`` as columns. Cells without a rating are 0. If a
        (user, movie) pair occurs more than once, the last row wins.

    Raises:
        KeyError: If a row refers to an id that is missing from
            ``users_list`` or ``movies_list``. Accepting it would silently
            change the shape of the matrix.
    """
    user_index = pd.Index(users_list)
    movie_index = pd.Index(movies_list)
    # Allocate the zero matrix directly as float32 instead of filling an
    # object-dtype frame one cell at a time through ``.loc``.
    matrix = np.zeros((len(user_index), len(movie_index)), dtype=np.float32)

    rows = rating_df.collect()
    if rows:
        triples = pd.DataFrame(
            [tuple(row) for row in rows], columns=["user", "movie", "rating"]
        ).drop_duplicates(subset=["user", "movie"], keep="last")

        # Translate ids to positional indices; -1 marks an id not in the lists.
        row_pos = user_index.get_indexer(triples["user"])
        col_pos = movie_index.get_indexer(triples["movie"])
        if (row_pos < 0).any() or (col_pos < 0).any():
            raise KeyError(
                "rating_df contains a userId or movieId that is not present "
                "in users_list / movies_list"
            )
        matrix[row_pos, col_pos] = triples["rating"].to_numpy(dtype=np.float32)

    return pd.DataFrame(matrix, index=user_index, columns=movie_index)

    
ratings_df = load_1m()

# The distinct ids fix the column (movie) and row (user) order of the dense
# matrices built below.
movies_list = [row[0] for row in ratings_df.select("movieId").distinct().collect()]
users_list = [row[0] for row in ratings_df.select("userId").distinct().collect()]

num_users = len(users_list)
num_items = len(movies_list)

# users_list already holds every distinct userId, so one fractions mapping is
# built from it and reused instead of re-running distinct()/collect() per split.
# NOTE(review): sampleBy keeps only ~90% of each user's ratings at each step and
# the rest is discarded before randomSplit, so a share of the ratings ends up
# in no split at all - confirm this is intended.
keep_fractions = {user_id: 0.9 for user_id in users_list}
train_df, test_df = (
    ratings_df
    .sampleBy('userId', fractions=keep_fractions, seed=42)
    .randomSplit([0.9, 0.1], seed=42)
)
train_df, validation_df = (
    train_df
    .sampleBy('userId', fractions=keep_fractions, seed=42)
    .randomSplit([0.9, 0.1], seed=42)
)

# From here on train_df / validation_df are dense pandas matrices, while
# test_df stays a Spark DataFrame until the evaluation cell converts it.
train_df = dataPreprocessor(train_df, movies_list, users_list)
validation_df = dataPreprocessor(validation_df, movies_list, users_list)

### 10M

In [None]:
# Download the 10M ratings file by its Google Drive file id.
# NOTE(review): load_10m() reads /kaggle/working/ratings.dat, the same path
# load_1m() uses - confirm the file name this download produces and whether it
# clashes with the 1M file when both cells are run in one session.
!gdown 1e064MFX83PYtPDcISjYQw4fTQtv-PG38

def load_10m(path='/kaggle/working/ratings.dat'):
    """Read the ``::``-separated 10M ratings file into a Spark DataFrame.

    Args:
        path: Location of the header-less, ``::``-delimited ratings file.
            Defaults to the Kaggle working directory used by this notebook.

    Returns:
        A Spark DataFrame with integer ``userId`` / ``movieId`` columns and a
        float ``rating`` column, taken from the first three fields of each
        line. Any further fields in the file are dropped.
    """
    raw = spark.read.option("delimiter", "::")\
                    .option("header", "False")\
                    .csv(path)
    # Spark names header-less CSV columns _c0, _c1, ...; cast and rename in one pass.
    # The rating is cast to float rather than int: an int cast would truncate
    # any fractional rating, and a 0.5 would become 0, which the masked loss
    # treats as "not rated". Whole-number ratings produce the same dense matrix
    # either way, because dataPreprocessor converts everything to float32.
    # NOTE(review): assumes this file may contain fractional ratings - confirm.
    return raw.select(
        F.col('_c0').cast("int").alias('userId'),
        F.col('_c1').cast("int").alias('movieId'),
        F.col('_c2').cast("float").alias('rating'),
    )
    
def dataPreprocessor(rating_df, movies_list, users_list):
    """Turn (user, movie, rating) rows into a dense user x movie matrix.

    Args:
        rating_df: Object whose ``collect()`` returns three-field rows of
            ``(userId, movieId, rating)`` (a Spark DataFrame in this notebook).
        movies_list: Unique movie ids; defines the column order.
        users_list: Unique user ids; defines the row order.

    Returns:
        A float32 pandas DataFrame indexed by ``users_list`` with
        ``movies_list`` as columns. Cells without a rating are 0. If a
        (user, movie) pair occurs more than once, the last row wins.

    Raises:
        KeyError: If a row refers to an id that is missing from
            ``users_list`` or ``movies_list``. Accepting it would silently
            change the shape of the matrix.
    """
    user_index = pd.Index(users_list)
    movie_index = pd.Index(movies_list)
    # Allocate the zero matrix directly as float32 instead of filling an
    # object-dtype frame one cell at a time through ``.loc``.
    matrix = np.zeros((len(user_index), len(movie_index)), dtype=np.float32)

    rows = rating_df.collect()
    if rows:
        triples = pd.DataFrame(
            [tuple(row) for row in rows], columns=["user", "movie", "rating"]
        ).drop_duplicates(subset=["user", "movie"], keep="last")

        # Translate ids to positional indices; -1 marks an id not in the lists.
        row_pos = user_index.get_indexer(triples["user"])
        col_pos = movie_index.get_indexer(triples["movie"])
        if (row_pos < 0).any() or (col_pos < 0).any():
            raise KeyError(
                "rating_df contains a userId or movieId that is not present "
                "in users_list / movies_list"
            )
        matrix[row_pos, col_pos] = triples["rating"].to_numpy(dtype=np.float32)

    return pd.DataFrame(matrix, index=user_index, columns=movie_index)

    
ratings_df = load_10m()

# The distinct ids fix the column (movie) and row (user) order of the dense
# matrices built below.
movies_list = [row[0] for row in ratings_df.select("movieId").distinct().collect()]
users_list = [row[0] for row in ratings_df.select("userId").distinct().collect()]

num_users = len(users_list)
num_items = len(movies_list)

# users_list already holds every distinct userId, so one fractions mapping is
# built from it and reused instead of re-running distinct()/collect() per split.
# NOTE(review): sampleBy keeps only ~90% of each user's ratings at each step and
# the rest is discarded before randomSplit - confirm this is intended.
keep_fractions = {user_id: 0.9 for user_id in users_list}
train_df, test_df = (
    ratings_df
    .sampleBy('userId', fractions=keep_fractions, seed=42)
    .randomSplit([0.9, 0.1], seed=42)
)

# The training cell passes validation_df to fit(). Without this split the 10M
# path either fails with a NameError or picks up a stale matrix left behind by
# another dataset cell. The split mirrors the one used in the 1M cell.
# NOTE(review): each dense matrix at this scale is large - check available memory.
train_df, validation_df = (
    train_df
    .sampleBy('userId', fractions=keep_fractions, seed=42)
    .randomSplit([0.9, 0.1], seed=42)
)

# From here on train_df / validation_df are dense pandas matrices, while
# test_df stays a Spark DataFrame until the evaluation cell converts it.
train_df = dataPreprocessor(train_df, movies_list, users_list)
validation_df = dataPreprocessor(validation_df, movies_list, users_list)

## Train model

In [None]:
def masked_mse(y_true, y_pred):
    """Mean squared error restricted to the non-zero entries of ``y_true``.

    A zero in ``y_true`` is treated as "no rating" and contributes nothing.
    The summed squared error is divided by the number of non-zero targets,
    floored at 1 so that an all-zero batch gives 0 rather than a division
    by zero.
    """
    # 1.0 where a target exists, 0.0 elsewhere.
    observed = K.cast(K.not_equal(y_true, 0), K.floatx())
    squared_error = K.square(observed * (y_true - y_pred))
    n_observed = K.maximum(K.sum(observed), 1)
    return K.sum(squared_error) / n_observed
    
def masked_rmse_clip(y_true, y_pred):
    """Root mean squared error restricted to the non-zero entries of ``y_true``.

    A zero in ``y_true`` is treated as "no rating" and contributes nothing.
    The summed squared error is divided by the number of non-zero targets
    (floored at 1) before the square root is taken.

    NOTE(review): despite the ``_clip`` suffix, ``y_pred`` is not clipped to
    any rating range here - confirm whether clipping was meant to be applied.
    """
    # 1.0 where a target exists, 0.0 elsewhere.
    observed = K.cast(K.not_equal(y_true, 0), K.floatx())
    squared_error = K.square(observed * (y_true - y_pred))
    n_observed = K.maximum(K.sum(observed), 1)
    return K.sqrt(K.sum(squared_error) / n_observed)

def I_AutoRec(n, k, f, g, reg, lr):
    """
    Build and compile a one-hidden-layer AutoRec autoencoder.

    The network maps an ``n``-dimensional rating vector to ``k`` hidden units
    and back to ``n`` outputs.

    Args:
        n: Length of each input rating vector (and of its reconstruction).
        k: The number of hidden units.
        f: The activation function of the output (decoder) layer.
        g: The activation function of the hidden (encoder) layer.
        reg: L2 regularization strength applied to both layer kernels.
        lr: The learning rate for the Adam optimizer.

    Returns:
        A compiled Keras model that uses ``masked_mse`` as its loss and
        reports ``masked_rmse_clip`` as a metric.

    NOTE(review): the notebook calls this with ``n=num_items`` and one row per
    user, so each sample is a user's ratings over items - confirm that the
    "I_" (item-based) prefix matches the intended orientation.
    """
    ratings_in = Input(shape=(n,))

    # Encoder first, then decoder: g is applied to the hidden layer and f to
    # the output layer.
    hidden_layer = Dense(k, activation=g, use_bias=True, kernel_regularizer=l2(reg))
    output_layer = Dense(n, activation=f, use_bias=True, kernel_regularizer=l2(reg))
    reconstruction = output_layer(hidden_layer(ratings_in))

    autoencoder = Model(ratings_in, reconstruction)
    autoencoder.compile(
        optimizer=Adam(learning_rate=lr),
        loss=masked_mse,
        metrics=[masked_rmse_clip],
    )
    return autoencoder


# Autoencoder over rating vectors of length num_items: sigmoid hidden layer
# and an 'identity' (linear) output layer.
autorec = I_AutoRec(
    n=num_items,
    k=600,   # 500 for 100k - 1100 for 1M
    g='sigmoid',
    f='identity',
    reg=0.001,
    lr=0.0001,
)

# The model reconstructs the training matrix from itself. For validation the
# same training rows are fed in and scored against the held-out ratings; the
# masked loss only looks at the non-zero validation targets.
# Keras documents validation_data as a tuple (x_val, y_val), so a tuple is
# passed rather than a list.
history = autorec.fit(
    x=train_df,
    y=train_df,
    validation_data=(train_df, validation_df),
    epochs=200,
    batch_size=64,  # 64 for 100k - 128 for 1M
)


In [None]:
def create_binarised_output(ratings, threshold=None):
    """Map each rating to 1 if it is strictly above the threshold, else 0.

    Args:
        ratings: Iterable of numeric ratings.
        threshold: Cut-off to compare against. When omitted, the
            notebook-level ``treshold`` variable is used, as before.

    Returns:
        A list of 0/1 integers, one per rating, in input order.
    """
    if threshold is None:
        # Fall back to the notebook-level cut-off (spelled ``treshold`` in
        # this notebook); it is looked up at call time.
        threshold = treshold
    return [1 if rating > threshold else 0 for rating in ratings]

# Cut-off read by create_binarised_output: ratings strictly above it count as positive.
treshold = 3.5

# Held-out ratings flattened to one vector; the predictions are the model's
# reconstruction of the training matrix, flattened the same way.
true = dataPreprocessor(test_df, movies_list, users_list).values.flatten()
pred = np.array(autorec.predict(train_df), dtype=np.float32).flatten()
rmse = masked_rmse_clip(true, pred)

print(f"RMSE1: {rmse}")

# Only positions that carry a held-out rating take part in the
# classification metrics.
nonzero_indices = np.flatnonzero(true)

y = true[nonzero_indices].tolist()
pred = pred[nonzero_indices].tolist()

y_binary = create_binarised_output(y)
pred_binary = create_binarised_output(pred)

# Precision of the binarised predictions
precision = precision_score(y_binary, pred_binary)
print("Precision:", precision)

# Recall of the binarised predictions
recall = recall_score(y_binary, pred_binary)
print("Recall:", recall)

# F-measure (F1); note that the variable is named `accuracy` but holds the F1 score
accuracy = f1_score(y_binary, pred_binary)
print("Fmeasure:", accuracy)

## Making recommendations for a given user

In [None]:
# # Get predictions for test set
# predictions = autorec.predict(np.array(test_df, dtype=np.float32))
# predictions = K.clip(predictions, 1, 5)
# predictions = pd.DataFrame(predictions, index=users_list,columns=movies_list)

# # Get the user ID or index in the test data
# user_id = 943

# # Extract the row of the user's ratings from the test matrix
# user_ratings = test_df.loc[user_id, :]

# # Get the indices where the user has rated the item
# rated_item_indices = user_ratings[user_ratings != 0].index

# not_rated_item_indices = list( set(movies_list)-set(rated_item_indices))

# # Get all predicted user ratings
# predicted_user_ratings = predictions.loc[user_id , not_rated_item_indices]

# sorted_ratings = predicted_user_ratings.sort_values(ascending=False)

# recommendations = sorted_ratings.index[:20]
# print("Recommendations for user", user_id, ":")
# for i, movie_id in enumerate(recommendations):
#     print(i+1, ".", movie_id , "  \t\t predicted rating : ",sorted_ratings.values[i] )
