In [18]:
import numpy as np
import pandas as pd
from surprise import Dataset, SVDpp
from surprise.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from collections import defaultdict

# Load the ML100k dataset
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

# Initialize and train the SVD++ model
algo = SVDpp(random_state = 0, n_epochs=10, verbose=True)
algo.fit(trainset)

# Clustering
num_clusters = 50  # Number of clusters
user_features = algo.pu
item_features = algo.qi
yj_features = algo.yj

# Clustering user features to find pCu
kmeans_user = KMeans(n_clusters=num_clusters, random_state=42)
user_labels = kmeans_user.fit_predict(user_features)
pCu = kmeans_user.cluster_centers_

# Clustering item features to find qCi
kmeans_item = KMeans(n_clusters=num_clusters, random_state=42)
item_labels = kmeans_item.fit_predict(item_features)
qCi = kmeans_item.cluster_centers_

# Clustering yj features to find yCj
kmeans_yj = KMeans(n_clusters=num_clusters, random_state=42)
yj_labels = kmeans_yj.fit_predict(yj_features)
yCj = kmeans_yj.cluster_centers_

bi = algo.bi
bu = algo.bu

# Initialize dictionaries to store |N(u)|^-0.5 and N(u) for each user
Nu_minus_half = defaultdict(float)
Nu = defaultdict(set)

for u, i, _ in trainset.all_ratings():
    # Update the set of items for user 'u'
    Nu[u].add(i)

# Calculate |N(u)|^-0.5 for each user
for u in Nu:
    Nu_minus_half[u] = 1 / np.sqrt(len(Nu[u]))

# Convert Nu from a set of item indices to a list for easier processing later
Nu = {u: list(items) for u, items in Nu.items()}

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9


In [2]:
def predictor_function(alpha, user_id, item_id, item_labels, user_labels, qi, qCi, pu, pCu, Nu_minus_half, Nu, yj, yCj, bi, bu):
    user_id = int(user_id)
    item_id = int(item_id)
    
    part1 = ((1-alpha)*qi[item_id] + alpha*qCi[item_labels[item_id]]).T
    part2 = ((1-alpha)*pu[user_id]+alpha*pCu[user_labels[user_id]])

    # Assuming 'yj' and 'yCj' are passed correctly as dictionaries or arrays where you can access an item using its ID or cluster ID
    sum_result = np.zeros_like(pu[user_id])   # Use 'pu' dimension as a reference for initializing 'sum_result'
    
    for j in Nu[user_id]:  # Iterating over items in N(u)
        # Map item ID to cluster ID for accessing yCj
        cluster_id = item_labels[j]
        
        term = (1 - alpha) * yj[j] + alpha * yCj[cluster_id]  # Accessing yCj with cluster_id
        sum_result += term

    intermediate = part2 + (Nu_minus_half[user_id] * sum_result)
    res = np.dot(part1, intermediate) + bi[item_id] + bu[user_id] # Adding biases at the end
    return res


In [3]:
minus_half_powers = {user: len(items)**(-0.5) for user, items in Nu.items()}

In [6]:
minus_half_powers[0], Nu_minus_half[0]

(0.05538487756217113, 0.05538487756217113)

In [18]:
u = trainset.to_inner_uid(str(100))
i = trainset.to_inner_uid(str(50))

pred = predictor_function(0.2, u, i, item_labels, user_labels, item_features, qCi, user_features, pCu, pow(yj_features, -0.5), yj_features, yj_features, yCj, bi, bu)

  pred = predictor_function(0.2, u, i, item_labels, user_labels, item_features, qCi, user_features, pCu, pow(yj_features, -0.5), yj_features, yj_features, yCj, bi, bu)


IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [14]:
alpha = 0.2
predictions = []
y_true = []

for user_id, item_id, actual_rating in testset:  
    
    try:
        user_id = algo.trainset.to_inner_uid(str(user_id))
        item_id = algo.trainset.to_inner_iid(str(item_id))

        predicted_rating = predictor_function(alpha, user_id, item_id, item_labels, user_labels, item_features, qCi,
                                        user_features, pCu, Nu_minus_half, Nu, yj_features, yCj, algo.bi, algo.bu)

        predictions.append(predicted_rating)
        y_true.append(actual_rating)
    except Exception as e:
        pass


In [15]:
min_val = min(predictions)
max_val = max(predictions)
new_min = min(y_true)
new_max = max(y_true)

# Apply scaling to each prediction
scaled_predictions = [(x - min_val) / (max_val - min_val) * (new_max - new_min) + new_min for x in predictions]


In [16]:
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_true, scaled_predictions))}")

Test RMSE: 0.9682211099578129


In [6]:
user_id = algo.trainset.to_inner_uid(str(50))
item_id = algo.trainset.to_inner_iid(str(100))
alpha = 0.15

predicted_rating = predictor_function(alpha, user_id, item_id, item_labels, user_labels, item_features, qCi,
                                user_features, pCu, Nu_minus_half, Nu, yj_features, yCj, algo.bi, algo.bu)

In [7]:
predicted_rating

0.41250036631492965

In [30]:
from cluster_rec import CB_SVDpp

In [1]:
import numpy as np
import pandas as pd
from surprise import Dataset, SVDpp
from surprise.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error
from collections import defaultdict

# Load the ML100k dataset
data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=0.2)

In [47]:
algo = CB_SVDpp(num_clusters=200, alpha=0.15)

In [48]:
algo.fit(trainset=trainset, random_state=123)
algo.calc_Nu(trainset)

Model fitted! Parameters updated.


In [49]:
y_pred, y_true = algo.predict_df(testset)

In [50]:
print(f"RMSE: {np.sqrt(mean_squared_error(y_true, y_pred))}")

RMSE: 0.9646200554398879


In [41]:
algo = SVDpp()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x14d14db90>

In [44]:
from surprise import accuracy

In [45]:
predictions = algo.test(testset)

# Compute RMSE
rmse = accuracy.rmse(predictions)

RMSE: 0.9217


In [46]:
rmse

0.9216692470634698

In [14]:
import numpy as np
import random

# Set a global random seed
random.seed(42)
np.random.seed(42)

# Assuming train_test_split is used somewhere before this
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Then your existing setup
algo = CB_SVDpp(50, 0.15)
algo.fit(trainset, 123)  # This random_state only affects KMeans within fit
algo.calc_Nu(trainset)
y_pred, y_true = algo.predict_df(testset)

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(rmse)

Model fitted! Parameters updated.
1.08172642354242


In [16]:
# Then your existing setup
algo = CB_SVDpp(200, 0.15)
algo.fit(trainset, 123)  # This random_state only affects KMeans within fit
algo.calc_Nu(trainset)
y_pred, y_true = algo.predict_df(testset)

# Compute RMSE
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(rmse)

Model fitted! Parameters updated.
0.9748010215668316


In [3]:
from surprise import Dataset
from surprise.model_selection import cross_validate, train_test_split
import numpy as np

# Define your custom cross-validation function
def custom_cross_validation(algo_class, data, alpha_values, cluster_values, cv=3):
    """
    Perform cross-validation to find the best alpha and cluster values.
    
    Args:
        algo_class: The algorithm class to use (e.g., CB_SVDpp).
        data: The dataset to use for training and testing.
        alpha_values (list): List of alpha values to try.
        cluster_values (list): List of cluster counts to try.
        cv (int): Number of splits for cross-validation.
        
    Returns:
        dict: Results for each parameter combination.
        tuple: Best alpha and cluster values based on mean RMSE.
    """
    
    results = {}
    for alpha in alpha_values:
        for clusters in cluster_values:
            print(f"Testing alpha={alpha}, clusters={clusters}")
            rmses = []
            
            trainset, testset = train_test_split(data, test_size=0.25, random_state=123)
            model = algo_class(num_clusters=clusters, alpha=alpha)
            model.fit(trainset, random_state=42)
            model.calc_Nu(trainset)
            
            for _ in range(cv):

                y_pred, y_true = model.predict_df(testset)  # Assuming predict_df returns (predictions, _)
                rmse = np.sqrt(mean_squared_error(y_true, y_pred))
                rmses.append(rmse)
            
            mean_rmse = np.mean(rmses)
            results[(alpha, clusters)] = mean_rmse
            print(f"Mean RMSE: {mean_rmse}")
    
    # Find the best parameters
    best_params = min(results, key=results.get)
    print(f"Best params (alpha, clusters): {best_params}, RMSE: {results[best_params]}")
    
    return results, best_params


In [4]:
alpha_values = [0.1, 0.15, 0.2]
cluster_values = [50, 100, 150]

results, best_params = custom_cross_validation(CB_SVDpp, data, alpha_values, cluster_values, cv=3)

Testing alpha=0.1, clusters=50
Model fitted! Parameters updated.
Mean RMSE: 1.1613853935622103
Testing alpha=0.1, clusters=100
Model fitted! Parameters updated.
Mean RMSE: 0.9961615256573507
Testing alpha=0.1, clusters=150
Model fitted! Parameters updated.
Mean RMSE: 1.141414111900156
Testing alpha=0.15, clusters=50
Model fitted! Parameters updated.
Mean RMSE: 1.1603923447066868
Testing alpha=0.15, clusters=100
Model fitted! Parameters updated.
Mean RMSE: 1.0851258033435416
Testing alpha=0.15, clusters=150
Model fitted! Parameters updated.
Mean RMSE: 1.1718569812299953
Testing alpha=0.2, clusters=50
Model fitted! Parameters updated.
Mean RMSE: 1.0100738633112216
Testing alpha=0.2, clusters=100
Model fitted! Parameters updated.
Mean RMSE: 1.0002334839327713
Testing alpha=0.2, clusters=150
Model fitted! Parameters updated.
Mean RMSE: 1.0069956227244
Best params (alpha, clusters): (0.1, 100), RMSE: 0.9961615256573507


In [21]:
alpha = []
clusters = []
rmses = list(results.values())  # This line is fine as is.

for t in results.keys():
    alpha.append(t[0])  # t is a tuple (alpha, clusters)
    clusters.append(t[1])

In [22]:
import plotly.graph_objects as go

# Assuming alpha, clusters, and rmses are lists with corresponding elements
x = list(alpha)  # Ensure these are lists or arrays
y = list(clusters)
z = list(rmses)

# Create a 3D scatter plot
fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z, mode='markers', 
                                   marker=dict(size=5, color=z, colorscale='Viridis', opacity=0.8))])

# Set plot layout
fig.update_layout(title='3D Scatter Plot of RMSEs by Alpha and Cluster Count', scene=dict(
                    xaxis_title='Alpha',
                    yaxis_title='Clusters',
                    zaxis_title='RMSE'))

# Show plot
fig.show()
