In [2]:
import numpy as np
from surprise import Dataset
from surprise.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from cluster_rec import CB_SVDpp
from time import time

In [3]:
# Import and split data
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

# Train on random values
algo = CB_SVDpp(num_clusters=50, alpha=0.15, n_epochs=20, verbose=True)
start = time()
algo.fit(trainset)
algo.calc_Nu(trainset)

# Predict on random values and calculate the RMSE and runtime
y_pred, y_true = algo.predict_df(testset)
end = time()
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"RMSE: {rmse}")
print(f"Took {round(end-start,2)} seconds to fit and predict.")

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
Model fitted! Parameters updated.
RMSE: 0.9236186030060799
Took 36.62 seconds to fit and predict.


In [4]:

# Define a custom cross validator
def custom_cross_validation(algo_class, train_test: list, alpha_values: list, cluster_values: list, cv: int):
    """
    Perform cross-validation to find the best alpha and cluster values.
    
    Args:
        algo_class: The algorithm class to use (e.g., CB_SVDpp).
        data: The dataset to use for training and testing.
        alpha_values (list): List of alpha values to try.
        cluster_values (list): List of cluster counts to try.
        cv (int): Number of splits for cross-validation.
        
    Returns:
        dict: Results for each parameter combination.
        tuple: Best alpha and cluster values based on mean RMSE.
    """
    trainset = train_test[0]
    testset = train_test[1]
    
    results = {}
    for alpha in alpha_values:
        for clusters in cluster_values:
            start = time()
            print(f"Testing alpha={alpha}, clusters={clusters}")
            rmses = []
            
            model = algo_class(num_clusters=clusters, alpha=alpha, n_epochs=20)
            model.fit(trainset)
            model.calc_Nu(trainset)
            
            for _ in range(cv):

                y_pred, y_true = model.predict_df(testset)  # Assuming predict_df returns (predictions, _)
                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                rmses.append(rmse)
            
            mean_rmse = np.mean(rmses)
            results[(alpha, clusters)] = mean_rmse
            print(f"Mean RMSE: {mean_rmse}")
            end = time()
            print(f"Took {round(end-start, 2)} seconds to fit and predict.")
        
    
    # Find the best parameters
    best_params = min(results, key=results.get)
    print(f"Best params (alpha, clusters): {best_params}, RMSE: {results[best_params]}")
    
    return results, best_params

In [5]:
# Initialize different alpha and cluster values
alpha_values = [0.1, 0.15, 0.2]
cluster_values = [50, 100, 150]

# Cross validate alpha and cluster values. Output best params and total runtime
cv_start = time()
results, best_params = custom_cross_validation(CB_SVDpp, [trainset, testset], alpha_values, cluster_values, cv=3)
cv_end = time()
print(f"CV total runtime: {round(cv_end - cv_start, 2)} seconds.")

Testing alpha=0.1, clusters=50
Model fitted! Parameters updated.
Mean RMSE: 0.9247156694979318
Took 27.05 seconds to fit and predict.
Testing alpha=0.1, clusters=100
Model fitted! Parameters updated.
Mean RMSE: 0.9258901440822158
Took 28.9 seconds to fit and predict.
Testing alpha=0.1, clusters=150
Model fitted! Parameters updated.
Mean RMSE: 0.9291062036463457
Took 29.3 seconds to fit and predict.
Testing alpha=0.15, clusters=50
Model fitted! Parameters updated.
Mean RMSE: 0.9236186030060799
Took 27.99 seconds to fit and predict.
Testing alpha=0.15, clusters=100
Model fitted! Parameters updated.
Mean RMSE: 0.9247592337551623
Took 29.88 seconds to fit and predict.
Testing alpha=0.15, clusters=150
Model fitted! Parameters updated.
Mean RMSE: 0.930137213657449
Took 29.84 seconds to fit and predict.
Testing alpha=0.2, clusters=50
Model fitted! Parameters updated.
Mean RMSE: 0.923285378949365
Took 25.59 seconds to fit and predict.
Testing alpha=0.2, clusters=100
Model fitted! Parameters up

In [6]:
import plotly.graph_objects as go

alpha = []
clusters = []
rmses = list(results.values())  # This line is fine as is.

for t in results.keys():
    alpha.append(t[0])  # t is a tuple (alpha, clusters)
    clusters.append(t[1])
    
x = list(alpha)
y = list(clusters)
z = list(rmses)

fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z, mode='markers', 
                                   marker=dict(size=5, color=z, colorscale='Viridis', opacity=0.8))])

fig.update_layout(title='3D Scatter Plot of RMSEs by Alpha and Cluster Count', scene=dict(
                    xaxis_title='Alpha',
                    yaxis_title='Clusters',
                    zaxis_title='RMSE'))

fig.show()