In [1]:
import pandas as pd
import numpy as np
from scipy import stats
#Similarity Scoring
from sklearn.metrics import jaccard_score
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from math import sqrt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import lil_matrix

import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px

In [2]:
!pip install surprise



In [3]:
from surprise import SVD, accuracy, SVDpp
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
from surprise.model_selection.split import train_test_split
from collections import defaultdict

# imports

In [4]:
modern_boardgames_df = pd.read_csv('games_sample.csv')

In [63]:
users_df = pd.read_csv('user_sample.csv')

In [6]:
themes_df = pd.read_csv('themes_sample.csv')

In [7]:
mechanics_df = pd.read_csv('mechanics_sample.csv')

# Baseline

In [8]:
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(users_df[['Username','BGGId','Rating']], reader)
train, test = train_test_split(data, test_size=.2, random_state=42)

In [9]:
# baseline model
algo = SVD(random_state = 42)
algo.fit(train)
predictions = algo.test(test)

In [10]:
def get_top_picks(predictions, num_recs = 10):
    """Return the top-N recommendation for each user from a set of predictions.

    predictions(list of Prediction objects): The list of predictions, as returned by suprise's algorithm.
    num_recs (int): The number of recommendation to output for each user. Default is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size num_recs.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:num_recs]

    return top_n

top_n = get_top_picks(predictions, num_recs=10)

# # Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])

In [11]:
# evaluate the rmse result of the prediction and ground thuth
accuracy.rmse(predictions)

RMSE: 1.4051


1.4051410392683918

# Cosine

In [16]:
# Merge the dataframes on 'BGGId'
combined = pd.merge(themes_df, mechanics_df, on='BGGId', how='inner')
combined

Unnamed: 0,BGGId,Adventure,Fantasy,Books / Libraries,Fighting / Combat,Environmental,Medical / Science,Economic,Industry / Manufacturing,Transportation,...,Board & Grid Mechanics,Economics & Resource Management,Physical / Dexterity,Puzzle Solving / Memory,Action Mechanics,Legacy Games,Drawing,Special End Of Game Conditions,Solo,Secrets / Deception
0,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20966,346703,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20967,346965,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
20968,347521,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
20969,348955,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [18]:
def compute_sparse_cosine_similarity(data, top_n=10): # add top_n here
    """
    Compute a sparse cosine similarity matrix for the board games.

    data: The dataframe containing game data (including the 'combined' feature for mechanics + themes).
    features_col: The column name containing the one-hot encoded feature matrix.

    Returns: A sparse cosine similarity matrix.
    """
    # Extract the feature matrix from the 'combined' DataFrame
    feature_matrix = combined.values  # This is a 2D numpy array

    # Calculate cosine similarity (dense matrix)
    similarity_matrix = cosine_similarity(feature_matrix)

    # Create a sparse matrix to store the top N similarities
    sparse_similarity_matrix = lil_matrix(similarity_matrix.shape)

    # For each game, keep only the top N most similar games
    for i in range(similarity_matrix.shape[0]):
        # Get the indices of the top N similar games (excluding the game itself)
        top_n_indices = np.argsort(similarity_matrix[i])[::-1][1:top_n+1] # top_n was not in scope
        for j in top_n_indices:
            sparse_similarity_matrix[i, j] = similarity_matrix[i, j]

    return sparse_similarity_matrix

In [19]:
cosine_matrix = compute_sparse_cosine_similarity(combined)

In [20]:
def compute_top_n_sparse_cosine_similarity(data, features_col='combined', top_n=10):
    """
    Compute a sparse cosine similarity matrix and store only the top N most similar games.

    data: The dataframe containing game data (including the 'combined' feature for mechanics + themes).
    features_col: The column name containing the one-hot encoded feature matrix.
    top_n: The number of top similar games to store for each game.

    Returns: A sparse matrix containing only the top N similarities.
    """
    # Extract the one-hot encoded feature matrix
    feature_matrix = data[features_col].values

    # Calculate cosine similarity (this will be dense, then we convert to sparse)
    similarity_matrix = cosine_similarity(feature_matrix)

    # Create a sparse matrix to store the top N similarities
    sparse_similarity_matrix = lil_matrix(similarity_matrix.shape)

    # For each game, keep only the top N most similar games
    for i in range(similarity_matrix.shape[0]):
        # Get the top N similar games (excluding the game itself)
        top_n_indices = np.argsort(similarity_matrix[i])[::-1][1:top_n+1]
        for j in top_n_indices:
            sparse_similarity_matrix[i, j] = similarity_matrix[i, j]

    return sparse_similarity_matrix

In [21]:
def get_content_based_recommendations(game_id, n=5):
    """
    Get recommendations for a user, using the sparse cosine similarity matrix.

    user_id: The ID of the user.
    game_id: The ID of the game the user has rated.
    sparse_cosine_matrix: The sparse cosine similarity matrix.
    n: Number of recommendations to return.

    Returns: List of top-N recommended game IDs.
    """
    # Get the row index of the game in the DataFrame
    game_idx = data[data['BGGId'] == game_id].index[0]

    # Get similarities for this game from the sparse matrix
    similarities = cosine_matrix[game_idx].toarray().flatten()

    # Sort the similarities in descending order
    top_n_indices = np.argsort(similarities)[::-1][:n]

    # Get the corresponding game IDs
    recommended_game_ids = data.iloc[top_n_indices]['BGGId'].tolist()

    return recommended_game_ids

In [22]:
def generate_predictions(game_id, username, similarity_matrix, data, users_data, n=5):
    try:
        # Find the index of the game in the data DataFrame
        idx = data[data['BGGId'] == game_id].index[0]
    except IndexError:
        # print(f"Error: Game with ID {game_id} not found in data.")
        return None

    # Ensure the index is within the bounds of the similarity matrix
    if idx >= similarity_matrix.shape[0]:
        # print(f"Error: Index {idx} is out of bounds for the similarity matrix.")
        return None

    # Get the pairwise similarity scores for the game
    sim_scores = [(i, similarity_matrix[idx, i]) for i in range(len(data)) if i != idx]

    # Sort the games based on the similarity scores (in descending order)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the top n most similar games
    top_n_games = [i[0] for i in sim_scores[:n]]

    # print(f"Top {n} similar games for game {game_id}: {top_n_games}")

    # Get the ratings for these games by the given user
    user_ratings = users_data[users_data['Username'] == username]

    # print(f"User {username} has rated the following games: {user_ratings[['BGGId', 'Rating']]}")

    # Check if 'Rating' and 'BGGId' exist in users_data
    if 'Rating' not in user_ratings.columns or 'BGGId' not in user_ratings.columns:
        # print(f"Error: Missing 'Rating' or 'BGGId' column in user ratings for user {username}.")
        return None

    # Get the ratings for the top n games that this user has rated
    rated_games = user_ratings[user_ratings['BGGId'].isin(data.iloc[top_n_games]['BGGId'])]

    # print(f"User {username} has rated the following top {n} similar games: {rated_games[['BGGId', 'Rating']]}")

    if len(rated_games) > 0:
        # If the user has rated any of the top n games, calculate the predicted rating as the mean of those ratings
        predicted_rating = np.mean(rated_games['Rating'])
        # print(f"Predicted Rating for User {username}, Game {game_id}: {predicted_rating}")
    else:
        # If the user hasn't rated any of the top n games, fallback to the mean rating of the top n games
        top_n_ratings = [data.iloc[i]['AvgRating'] for i in top_n_games if pd.notnull(data.iloc[i]['AvgRating'])]

        # print(f"Ratings for top {n} similar games: {top_n_ratings}")

        if len(top_n_ratings) > 0:
            # If there are valid ratings for the top n games, use the mean of those as the fallback prediction
            predicted_rating = np.mean(top_n_ratings)
            # print(f"Fallback Predicted Rating (mean of top n similar games) for User {username}, Game {game_id}: {predicted_rating}")
        else:
            # If none of the top n games have ratings, fallback to the global average rating
            global_avg_rating = np.mean([rating for rating in data['AvgRating'] if pd.notnull(rating)])
            # print(f"Fallback Predicted Rating (global avg) for User {username}, Game {game_id}: {global_avg_rating}")
            predicted_rating = global_avg_rating

    return predicted_rating

In [23]:
def compute_rmse(predictions, actual_ratings):
    mse = mean_squared_error(actual_ratings, predictions)
    rmse = sqrt(mse)
    return rmse

def evaluate_rmse_sampled(users_df, data, similarity_matrix, similarity_type='Cosine', n=5, sample_size=200):
    """
    Evaluate RMSE for a random sample of users. This function computes the RMSE for predictions generated
    using either Jaccard or Cosine similarity matrices.
    """
    # Randomly sample users for evaluation
    sampled_users = users_df.sample(n=sample_size, random_state=42)

    actual_ratings = []
    predicted_ratings = []

    # Loop through the sampled users and predict the rating for each game they rated
    for _, row in sampled_users.iterrows():
        game_id = row['BGGId']
        username = row['Username']
        actual_rating = row['Rating']

        # Check if 'Rating' and 'BGGId' exist in the sampled user data
        if 'Rating' not in row or 'BGGId' not in row:
            # print(f"Missing 'Rating' or 'BGGId' for user {username} (game {game_id}). Skipping.")
            continue

        # print(f"Evaluating: User {username}, Game {game_id}, Actual Rating: {actual_rating}")

        # Generate predictions using the specified similarity matrix
        predicted_rating = generate_predictions(game_id, username, similarity_matrix, data, users_df, n)

        # If a prediction was successfully made, append it to the lists
        if predicted_rating is not None:
            actual_ratings.append(actual_rating)
            predicted_ratings.append(predicted_rating)

    # Check if we have ratings to compute RMSE
    if len(actual_ratings) == 0:
        # print("No valid ratings to compute RMSE.")
        return None

    # Compute RMSE
    rmse = compute_rmse(predicted_ratings, actual_ratings)
    return rmse

In [24]:
cosine_rmse = evaluate_rmse_sampled(users_df, modern_boardgames_df, cosine_matrix, similarity_type='Cosine', n=3, sample_size=1000)

In [25]:
print(f'Cosine Similarity RMSE (Sampled): {cosine_rmse}')

Cosine Similarity RMSE (Sampled): 1.7267741078278938


# Hybrid

In [67]:
def calculate_hybrid_prediction(user_id, item_id, algo, similarity_matrix, data, users_data, alpha=0.5, n=5):
    """
    Generates a hybrid prediction by combining SVD and cosine similarity predictions, with the option to weight them.

    Parameters:
        user_id (str): The ID of the user for whom we are predicting.
        item_id (int): The ID of the item (game/movie/etc.) for which we are predicting.
        algo: The SVD model object used for collaborative filtering.
        similarity_matrix: The precomputed cosine similarity matrix.
        data: The data used for generating predictions (could be user-item ratings or some other dataset).
        users_data: The user data used for similarity calculations (e.g., user features).
        alpha (float): Weight for the SVD prediction. Default is 0.5 (equal weight).
        n (int): The number of nearest neighbors to use for cosine similarity (default is 5).

    Returns:
        float: The hybrid prediction (a weighted average of SVD and cosine similarity predictions).
    """

    # Get the SVD prediction
    svd_prediction = algo.predict(user_id, item_id).est

    # Get the cosine similarity prediction
    cosine_prediction = generate_predictions(item_id, user_id, similarity_matrix, data, users_data, n)

    # If no cosine prediction is available, only use the SVD prediction
    if cosine_prediction is None:
        hybrid_prediction = svd_prediction
    else:
        # Combine the predictions using the alpha weighting
        hybrid_prediction = (alpha * svd_prediction) + ((1 - alpha) * cosine_prediction)

    return hybrid_prediction

# Usage example:
user_id = '8wolves'  # Replace with the actual user ID
item_id = 300580  # Replace with the actual item ID (game ID, movie ID, etc.)
alpha = 0.5  # Weight for the SVD prediction (can be adjusted)

# Calling the hybrid prediction function
hybrid_prediction_result = calculate_hybrid_prediction(user_id, item_id, algo, cosine_matrix, modern_boardgames_df, users_df, alpha=alpha, n=5)

# Print the result
print(f"Predicted rating of item {item_id} by user {user_id}: {hybrid_prediction_result}")


Predicted rating of item 300580 by user 8wolves: 6.383480149260041


In [60]:
def evaluate_hybrid_rmse_sampled(users_df, data, algo, similarity_matrix, alpha=0.5, n=5, sample_size=200):
    """
    Evaluate RMSE for a random sample of users using a hybrid prediction approach.
    """

    actual_ratings = []
    predicted_ratings = []

    for _, row in users_df.iterrows():
        game_id = row['BGGId']
        username = row['Username']
        actual_rating = row['Rating']

        if 'Rating' not in row or 'BGGId' not in row:
            continue

        hybrid_prediction = calculate_hybrid_prediction(username, game_id, algo, similarity_matrix, data, users_df, alpha, n)

        if hybrid_prediction is not None:
            actual_ratings.append(actual_rating)
            predicted_ratings.append(hybrid_prediction)

    if len(actual_ratings) == 0:
        return None

    rmse = compute_rmse(predicted_ratings, actual_ratings)
    return rmse

# Evaluate the RMSE for the hybrid approach
hybrid_rmse = evaluate_hybrid_rmse_sampled(users_df, modern_boardgames_df, algo, cosine_matrix, alpha=0.5, n=5, sample_size=1000)
print(f"Hybrid RMSE (Sampled): {hybrid_rmse}")

KeyboardInterrupt: 

In [68]:
def calculate_hybrid_recommendations(user_id, algo, similarity_matrix, data, users_data, alpha=0.5, n=5):
    """
    Generates top 10 hybrid recommendations for a user by combining SVD and cosine similarity predictions.

    Parameters:
        user_id (str): The ID of the user for whom we are predicting.
        algo: The SVD model object used for collaborative filtering.
        similarity_matrix: The precomputed cosine similarity matrix.
        data: The data used for generating predictions (user-item ratings).
        users_data: The user data used for similarity calculations (e.g., user features).
        alpha (float): Weight for the SVD prediction. Default is 0.5 (equal weight).
        n (int): The number of nearest neighbors to use for cosine similarity (default is 5).

    Returns:
        list: A list of tuples with item IDs and their corresponding hybrid predicted ratings, sorted in descending order.
    """
    # Get all items
    item_ids = data['BGGId'].unique()  # Assuming data has an 'item_id' column

    # Initialize a dictionary to hold predictions
    predictions = {}

    for item_id in item_ids:
        # Get the SVD prediction for the current item
        svd_prediction = algo.predict(user_id, item_id).est

        # Get the cosine similarity prediction for the current item
        cosine_prediction = generate_predictions(item_id, user_id, similarity_matrix, data, users_data, n)

        # Combine the predictions using the alpha weighting
        if cosine_prediction is None:
            hybrid_prediction = svd_prediction  # Only use SVD prediction if cosine prediction is not available
        else:
            hybrid_prediction = (alpha * svd_prediction) + ((1 - alpha) * cosine_prediction)

        # Store the prediction for the current item
        predictions[item_id] = hybrid_prediction

    # Convert predictions to a DataFrame for easier manipulation
    predictions_df = pd.DataFrame(predictions.items(), columns=['item_id', 'predicted_rating'])

    # Get the top 10 recommended items based on hybrid predictions
    top_10_recommendations = predictions_df.sort_values(by='predicted_rating', ascending=False).head(10)

    return top_10_recommendations

# Usage example:
user_id = '8wolves'  # me!
alpha = 0.5  # Weight for the SVD prediction (can be adjusted)

# Calling the hybrid recommendation function
top_10_recommendations_result = calculate_hybrid_recommendations(user_id, algo, cosine_matrix, modern_boardgames_df, users_df, alpha=alpha, n=5)

# Print the top 10 recommended items
print("Top 10 recommended items for user:", user_id)
print(top_10_recommendations_result)

Top 10 recommended items for user: 8wolves
      item_id  predicted_rating
4052     7935          7.908259
6763    23421          7.761527
9153    62922          7.758215
8054    36314          7.711883
8476    40258          7.711527
8447    39939          7.708630
7992    35476          7.674713
5779    15999          7.662067
7807    33434          7.646235
7424    29383          7.633836
