Step 1: Get data

In [41]:
import pandas as pd

# Load the CSV file
users = pd.read_csv('rating.csv')
anime = pd.read_csv('anime.csv')
print(users.head())
print(anime.head())


   user_id  anime_id  rating
0        1        20      -1
1        1        24      -1
2        1        79      -1
3        1       226      -1
4        1       241      -1
   anime_id                              name  \
0     32281                    Kimi no Na wa.   
1      5114  Fullmetal Alchemist: Brotherhood   
2     28977                          Gintama°   
3      9253                       Steins;Gate   
4      9969                     Gintama&#039;   

                                               genre   type episodes  rating  \
0               Drama, Romance, School, Supernatural  Movie        1    9.37   
1  Action, Adventure, Drama, Fantasy, Magic, Mili...     TV       64    9.26   
2  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.25   
3                                   Sci-Fi, Thriller     TV       24    9.17   
4  Action, Comedy, Historical, Parody, Samurai, S...     TV       51    9.16   

   members  
0   200630  
1   793665  
2   114262 

In [42]:
from sklearn.model_selection import train_test_split
import pickle

In [43]:
# just crop data to be first 500,000
users = users[:1000000]
train_users, test_users = train_test_split(users, test_size=0.1)

In [44]:
from collections import defaultdict

# get some useful info from users
users = set()
animes = set()
usersPerAnime = defaultdict(set) # Maps an anime to the user who rated it
animesPerUser = defaultdict(set) # Maps a user to the anime that they rated
reviewsPerUser = defaultdict(list)
reviewsPerAnime = defaultdict(list)
ratingDict = {} # To retrieve a rating for a specific user/item pair

# item = book in this case
for d in users:
    review = d['review_id']
    user = d['user_id']
    users.add(user)
    anime = d['anime_id']
    animes.add(anime)
    usersPerAnime[anime].add(user)
    animesPerUser[user].add(anime)
    reviewsPerAnime[anime].append(d)
    reviewsPerUser[user].append(d)
    ratingDict[(user,anime)] = d['rating']


In [45]:
# # Load from pickle files
# with open('train_users.pkl', 'rb') as f:
#     train_users = pickle.load(f)
# 
# with open('test_users.pkl', 'rb') as f:
#     test_users = pickle.load(f)

In [46]:
# train_users = pd.DataFrame(columns=users.columns)
# test_users = pd.DataFrame(columns=users.columns)
# 
# prev_user = None
# count = 0
# 
# for idx, row in users.iterrows():
#     current_user = row['user_id']
#     
#     # Check if different user, restart count
#     if prev_user != current_user:
#         count = 0
#         prev_user = current_user
#     
#     # Increment count
#     count += 1
#     
#     # Every 5th item (20%) goes to test set
#     if count % 5 == 0:
#         test_users_alt = pd.concat([test_users, pd.DataFrame([row])]) 
#     else:
#         train_users_alt = pd.concat([train_users, pd.DataFrame([row])])
# 
# # Reset indices
# train_users_alt = train_users.reset_index(drop=True)
# test_users_alt = test_users.reset_index(drop=True)
# 
# # Random sampling to exactly 300,000 rows
# if len(train_users) > 300000:
#     train_users = train_users.sample(n=300000, random_state=42).reset_index(drop=True)
#     
# if len(test_users) > 10000:
#     test_users = test_users.sample(n=300000, random_state=42).reset_index(drop=True)
# 
# 
# # Print the sizes of the datasets
print(f"train set : {len(train_users)}, test set people: {len(test_users)}")
print(test_users.head())

train set : 900000, test set people: 100000
        user_id  anime_id  rating
156220     1551     30307       8
58622       611      1896      -1
100607     1034     15809      10
530426     5331      4720       8
459645     4744      5114      10


In [47]:
# # Save train and test dataframes
# with open('train_users.pkl', 'wb') as f:
#     pickle.dump(train_users, f)
# 
# with open('test_users.pkl', 'wb') as f:
#     pickle.dump(test_users, f)

In [48]:
# what is the rating scale? 
# print('max rating is ', max(train_users['rating'])) # its 10
# print('min rating is ', min(train_users['rating'])) # its -1

Now create baseline model for recommendation

In [49]:
from surprise import Dataset, Reader, SVDpp
from surprise.model_selection import train_test_split, cross_validate
from surprise import accuracy
from itertools import combinations
from scipy.spatial.distance import cosine

In [50]:
reader = Reader(rating_scale=(-1, 10))  # Assuming ratings from 1 to 5
train_data = Dataset.load_from_df(train_users, reader)  # Assuming ratings_df is your ratings dataframe
trainset = train_data.build_full_trainset()
test_data = Dataset.load_from_df(test_users, reader)
svdpp = SVDpp(n_factors=3, n_epochs=20, lr_all=0.007, reg_all=0.02, verbose=True)


In [51]:
svdpp.fit(trainset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x107a62d60>

In [52]:
from surprise.model_selection import KFold

kf = KFold(random_state=0)  # folds will be the same for all algorithms.
cross_validate(svdpp, test_data, ["rmse","mae"],kf)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8


{'test_rmse': array([2.50515915, 2.49598847, 2.51622981, 2.52748539, 2.60830336]),
 'test_mae': array([1.69521731, 1.68843092, 1.70955328, 1.7036978 , 1.77628598]),
 'fit_time': (0.8848099708557129,
  0.8438119888305664,
  0.8538579940795898,
  0.8514468669891357,
  0.8563671112060547),
 'test_time': (0.4562990665435791,
  0.4413421154022217,
  0.4351181983947754,
  0.4782261848449707,
  0.44852495193481445)}

Now we need to test diversity

In [53]:
from collections import defaultdict
from sklearn.metrics import jaccard_score
import numpy as np
import pandas as pd
from itertools import combinations


def jaccard_similarity(set1, set2):
    """Compute Jaccard similarity between two sets."""
    return len(set1 & set2) / len(set1 | set2) if len(set1 | set2) > 0 else 0


def calculate_diversity(predictions, anime_df, embedding_matrix, popularity_dict):
    """Calculate an enhanced diversity score for a list of anime recommendations."""
    
    anime_ids = predictions  # List of recommended anime IDs
    genre_sets = []
    latent_vectors = []
    popularity_scores = []

    for anime_id in anime_ids:
        genre_str = anime_df.loc[anime_df['anime_id'] == anime_id, 'genre']
        if not genre_str.empty:
            genre_sets.append(set(genre_str.values[0].split(', ')))
        else:
            genre_sets.append(set())  # Handle missing genres

        # Retrieve latent representation from embeddings (if available)
        if anime_id in embedding_matrix:
            latent_vectors.append(embedding_matrix[anime_id])
        
        # Retrieve popularity score (if available)
        popularity_scores.append(popularity_dict.get(anime_id, 0.5))  # Default to 0.5 if missing

    # Compute pairwise genre similarities
    genre_similarities = [jaccard_similarity(g1, g2) for g1, g2 in combinations(genre_sets, 2)]
    
    # Compute pairwise embedding (latent space) dissimilarities
    latent_dissimilarities = [1 - cosine(v1, v2) for v1, v2 in combinations(latent_vectors, 2) if v1 is not None and v2 is not None]

    # Compute average popularity score (higher means more mainstream recommendations)
    avg_popularity = sum(popularity_scores) / len(popularity_scores)

    # Calculate final diversity components
    genre_diversity = 1 - (sum(genre_similarities) / len(genre_similarities)) if genre_similarities else 1
    latent_diversity = sum(latent_dissimilarities) / len(latent_dissimilarities) if latent_dissimilarities else 1
    long_tail_penalty = 1 - avg_popularity  # Encourages recommending lesser-known anime

    # Weighted combination
    diversity_score = (0.4 * genre_diversity) + (0.4 * latent_diversity) + (0.2 * long_tail_penalty)

    return diversity_score


    # Compute pairwise genre similarities
    similarities = [jaccard_similarity(g1, g2) for g1, g2 in combinations(genre_sets, 2)]

    # Diversity is 1 - average similarity
    diversity_score = 1 - (sum(similarities) / len(similarities)) if similarities else 1
    return diversity_score


def get_recommendations_with_diversity(test_users, svdpp, anime_df, n=5):
    """
    Get anime recommendations and calculate diversity scores.
    
    Args:
        test_users: DataFrame with user information
        svdpp: Trained SVD++ model
        anime_df: DataFrame with anime information
        n: Number of top recommendations to consider
        
    Returns:
        float: Average diversity score across all test users
    """
    diversity_scores = []
    
    for _, user in test_users.iterrows():
        uid = int(user["user_id"])
        
        # Find similar users and candidate anime
        similar_users = get_similar_users(uid, k=10)
        candidate_anime = get_candidate_anime(similar_users)
        
        if not candidate_anime:
            continue
            
        # Get predictions for candidate anime only
        predictions = []
        for iid in candidate_anime:
            pred = svdpp.predict(uid, int(iid))
            predictions.append((iid, pred.est))
        
        # Get top N recommendations
        top_n_anime = [iid for iid, _ in sorted(predictions, key=lambda x: x[1], reverse=True)[:n]]
        
        # Calculate diversity score for this user's recommendations
        diversity_score = calculate_diversity(top_n_anime, anime_df)
        diversity_scores.append(diversity_score)
    
    # Return average diversity
    return sum(diversity_scores) / len(diversity_scores) if diversity_scores else 0


def get_similar_users(user_id, k=10):
    """Find k most similar users based on Jaccard similarity."""
    if user_id not in animesPerUser:
        return []
    
    user_watched = animesPerUser[user_id]
    similarities = {}

    for other_user, other_watched in animesPerUser.items():
        if other_user == user_id:
            continue
        similarities[other_user] = jaccard_similarity(user_watched, other_watched)
    
    # Return top k similar users
    return [uid for uid, _ in sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:k]]


def get_candidate_anime(similar_users):
    """Get anime watched by similar users."""
    if not similar_users:
        return []
    
    candidate_anime = set()
    for user in similar_users:
        candidate_anime.update(animesPerUser[user])
    
    return list(candidate_anime)


In [54]:
# Run recommendations and calculate diversity score
avg_diversity = get_recommendations_with_diversity(test_users, svdpp, anime, n=5)

# Print the final diversity score
print(f"Average Diversity Score: {float(avg_diversity)}")


Average Diversity Score: 0.0
