In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/anime-recommendations-database/rating.csv
/kaggle/input/anime-recommendations-database/anime.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, average_precision_score
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix # For ALS, representing user-item matrix efficiently

# --- 1. Load and Preprocess Data ---
# Load datasets (assuming the paths are correct in your environment)
anime_df = pd.read_csv('/kaggle/input/anime-recommendations-database/anime.csv', delimiter=',')
rating_df = pd.read_csv('/kaggle/input/anime-recommendations-database/rating.csv', delimiter=',')

# Clean rating data by removing -1 and averaging duplicate ratings
rating_clean = rating_df[rating_df['rating'] != -1]
rating_clean = rating_clean.groupby(['user_id', 'anime_id']).agg({'rating': 'mean'}).reset_index()

# Merge anime and rating data for content features
merged_content_df = pd.merge(rating_clean, anime_df, on='anime_id', how='inner')

# Prepare the data for embedding for TF-IDF: Combine name, genre, type, and episodes
# Handle potential NaN values by filling them with empty strings
anime_df['name'] = anime_df['name'].fillna('')
anime_df['genre'] = anime_df['genre'].fillna('')
anime_df['type'] = anime_df['type'].fillna('')
anime_df['episodes'] = anime_df['episodes'].fillna('')

# Remove commas from the 'genre' column for TF-IDF
anime_df['genre'] = anime_df['genre'].str.replace(',', ' ')


# Create a combined text string for each anime for TF-IDF
anime_combined_features = []
for index, row in anime_df.iterrows():
    combined_text = f"{row['name']} genre: {row['genre']} type: {row['type']} episodes: {row['episodes']}"
    anime_combined_features.append(combined_text)

anime_id_to_name = dict(zip(anime_df['anime_id'], anime_df['name'])) # Still useful for displaying recommendations
name_to_anime_id = dict(zip(anime_df['name'], anime_df['anime_id'])) # Still useful for reverse lookup


# Create mappings for unique user IDs and anime IDs for the user-item matrix (for ALS)
# This is crucial for creating a sparse matrix for collaborative filtering
unique_user_ids = rating_clean['user_id'].unique()
unique_anime_ids = anime_df['anime_id'].unique()

user_to_idx = {user_id: idx for idx, user_id in enumerate(unique_user_ids)}
anime_to_idx = {anime_id: idx for idx, anime_id in enumerate(unique_anime_ids)}
idx_to_anime = {idx: anime_id for anime_id, idx in anime_to_idx.items()}


# --- 2. TF-IDF Implementation (Content-Based Recommendation) ---
print("Building TF-IDF model...")
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000) # Limit features for performance
tfidf_matrix = tfidf_vectorizer.fit_transform(anime_combined_features)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Calculate cosine similarity based on TF-IDF vectors
print("Calculating cosine similarity matrix based on TF-IDF...")
tfidf_similarity_matrix = cosine_similarity(tfidf_matrix)
print("TF-IDF cosine similarity matrix calculated.")

# Create mappings from anime_id to its index in the TF-IDF matrix
tfidf_anime_id_to_index = {anime_id: idx for idx, anime_id in enumerate(anime_df['anime_id'])}
tfidf_index_to_anime_id = {idx: anime_id for anime_id, idx in tfidf_anime_id_to_index.items()}


def get_tfidf_nearest_neighbors(anime_id: int, similarity_matrix: np.ndarray, top_n: int = 10):
    """
    Finds the top_n most similar animes for a given anime_id based on TF-IDF similarity.
    Excludes the anime itself.
    """
    if anime_id not in tfidf_anime_id_to_index:
        return []

    anime_idx = tfidf_anime_id_to_index[anime_id]
    similarities = similarity_matrix[anime_idx]

    # Get indices of top_n+1 most similar items (including itself)
    # Use argsort to get indices, then reverse to get descending order of similarity
    top_similar_indices = similarities.argsort()[::-1][1:top_n + 1] # [1:] to exclude itself

    # Map indices back to anime_ids and their similarity scores
    nearest_neighbors = []
    for idx in top_similar_indices:
        neighbor_anime_id = tfidf_index_to_anime_id[idx]
        neighbor_similarity = similarities[idx]
        nearest_neighbors.append((neighbor_anime_id, neighbor_similarity))

    return nearest_neighbors


def recommend_by_tfidf(user_id: int, rating_data: pd.DataFrame, similarity_matrix: np.ndarray, top_k: int = 10):
    """
    Generates content-based recommendations for a user based on their watched anime and similar items
    using TF-IDF cosine similarity.
    """
    user_watched_anime = rating_data[rating_data['user_id'] == user_id]
    if user_watched_anime.empty:
        return []

    recommended_anime_scores = {}
    already_watched_anime_ids = set(user_watched_anime['anime_id'].tolist())

    for _, row in user_watched_anime.iterrows():
        watched_anime_id = row['anime_id']
        watched_rating = row['rating']

        if watched_anime_id not in tfidf_anime_id_to_index:
            continue

        neighbors = get_tfidf_nearest_neighbors(watched_anime_id, similarity_matrix, top_n=50)

        for neighbor_anime_id, similarity_score in neighbors:
            if neighbor_anime_id not in already_watched_anime_ids and neighbor_anime_id in anime_id_to_name:
                if neighbor_anime_id not in recommended_anime_scores:
                    recommended_anime_scores[neighbor_anime_id] = 0
                recommended_anime_scores[neighbor_anime_id] += similarity_score * watched_rating # Weighted by user's rating

    sorted_recommendations = sorted(recommended_anime_scores.items(), key=lambda item: item[1], reverse=True)

    final_recommendations = []
    for anime_id, score in sorted_recommendations[:top_k]:
        anime_name = anime_id_to_name.get(anime_id, f"Unknown Anime (ID: {anime_id})")
        final_recommendations.append({'anime_id': anime_id, 'name': anime_name, 'score': score})

    return final_recommendations


# --- 3. ALS (Alternating Least Squares) Conceptual Approach (Collaborative Filtering) ---

print("\n--- ALS (Alternating Least Squares) Conceptual Approach ---")
print("Preparing user-item matrix for collaborative filtering...")

# Create the user-item interaction matrix (sparse matrix is essential for large datasets)
# Filter rating_clean to only include users and animes present in our mappings
filtered_ratings = rating_clean[
    rating_clean['user_id'].isin(user_to_idx) &
    rating_clean['anime_id'].isin(anime_to_idx)
].copy()

# Map original user_id and anime_id to their respective indices
filtered_ratings['user_idx'] = filtered_ratings['user_id'].map(user_to_idx)
filtered_ratings['anime_idx'] = filtered_ratings['anime_id'].map(anime_to_idx)

# Create a sparse matrix: rows are users, columns are animes, values are ratings
# This matrix will be used by ALS models
user_item_matrix = csr_matrix((
    filtered_ratings['rating'],
    (filtered_ratings['user_idx'], filtered_ratings['anime_idx'])
), shape=(len(unique_user_ids), len(unique_anime_ids)))

print(f"User-item matrix shape: {user_item_matrix.shape} (users x animes)")
print("This sparse matrix is the primary input for ALS models.")

print("\n--- How ALS works conceptually: ---")
print("ALS (Alternating Least Squares) is a matrix factorization algorithm primarily used for collaborative filtering.")
print("It decomposes the user-item interaction matrix (like the one created above) into two lower-dimensional matrices:")
print("1. User latent features matrix (U)")
print("2. Item latent features matrix (V)")
print("Such that their product approximates the original user-item matrix (R ≈ U * V^T).")
print("These latent features capture underlying preferences or characteristics that are not explicitly observed.")
print("The algorithm iteratively optimizes U and V by holding one fixed and solving for the other, alternating until convergence.")

print("\n--- Generating recommendations with ALS (Conceptual): ---")
print("In a practical implementation, you would use a library like 'implicit' (for implicit feedback, but can adapt for explicit) or 'LightFM':")
print("1. Initialize and train an ALS model on the 'user_item_matrix'.")
print("   e.g., `model = implicit.als.AlternatingLeastSquares(...)`")
print("   e.g., `model.fit(user_item_matrix)`")
print("2. Once trained, you can use the model to recommend items for a specific user:")
print("   e.g., `recommendations = model.recommend(user_id_idx, user_item_matrix[user_id_idx], N=10)`")
print("   The `recommend` method usually considers items the user has not yet interacted with.")
print("3. These recommendations would then be evaluated using MAP@K and NDCG@K, similar to the TF-IDF approach.")
print("\n*Note: A full, runnable ALS implementation is beyond the scope of a single, self-contained Python file without external library installations.*")


# --- 4. Evaluation (MAP@10 and NDCG@10) ---

# To evaluate, we need to split data into train and test sets.
# We'll use the train set to generate recommendations and the test set as ground truth.
train_rating, test_rating = train_test_split(rating_clean, test_size=0.2, random_state=42)

# Ensure all anime_ids in test_rating are present in our processed anime_df
test_rating = test_rating[test_rating['anime_id'].isin(anime_df['anime_id'])]

def calculate_map_at_k(recommended_items, ground_truth_items, k=10):
    """
    Calculates Mean Average Precision at K (MAP@K).
    recommended_items: List of recommended item IDs (ordered by relevance).
    ground_truth_items: Set of relevant item IDs.
    """
    if not ground_truth_items:
        return 0.0

    relevant_count = 0
    precision_sum = 0.0

    for i, item_id in enumerate(recommended_items[:k]):
        if item_id in ground_truth_items:
            relevant_count += 1
            precision_sum += relevant_count / (i + 1)
    
    return precision_sum / min(len(ground_truth_items), k) if relevant_count > 0 else 0.0


def calculate_ndcg_at_k(recommended_items, ground_truth_items, k=10):
    """
    Calculates Normalized Discounted Cumulative Gain at K (NDCG@K).
    recommended_items: List of recommended item IDs (ordered by relevance).
    ground_truth_items: Set of relevant item IDs.
    """
    if not ground_truth_items:
        return 0.0

    relevance = [1 if item_id in ground_truth_items else 0 for item_id in recommended_items[:k]]

    # Ensure y_true and y_score have the same length (k)
    # Pad relevance if less than k
    if len(relevance) < k:
        relevance.extend([0] * (k - len(relevance)))
    
    y_true = np.asarray([relevance])
    y_score = np.asarray([np.arange(k, 0, -1)]) # A simple descending score for ranking

    try:
        return ndcg_score(y_true, y_score)
    except ValueError as e:
        print(f"NDCG calculation error for a user: {e}. Returning 0.0.")
        return 0.0


print("\nStarting evaluation for TF-IDF based recommendations (MAP@10 and NDCG@10)...")
user_ids_to_evaluate = test_rating['user_id'].unique()
map_scores_tfidf = []
ndcg_scores_tfidf = []

# Filter users who have ratings in the training set
users_with_train_data = train_rating['user_id'].unique()
user_ids_for_evaluation = [uid for uid in user_ids_to_evaluate if uid in users_with_train_data]

# --- Optimization: Limit the number of users for evaluation ---
MAX_EVAL_USERS = 500 # Adjust this number based on desired evaluation speed and thoroughness
if len(user_ids_for_evaluation) > MAX_EVAL_USERS:
    print(f"Limiting evaluation to the first {MAX_EVAL_USERS} users for faster results.")
    user_ids_for_evaluation = user_ids_for_evaluation[:MAX_EVAL_USERS]
# --- End Optimization ---


if not user_ids_for_evaluation:
    print("No users with sufficient data in both train and test sets for evaluation. Please check your data split.")
else:
    for user_id in tqdm(user_ids_for_evaluation, desc="Evaluating users (TF-IDF)"):
        # Get ground truth from the test set
        # Consider ratings >= 7 as "relevant"
        ground_truth_anime_ids = set(test_rating[
            (test_rating['user_id'] == user_id) & (test_rating['rating'] >= 7)
        ]['anime_id'].tolist())

        if not ground_truth_anime_ids:
            continue # Skip users with no relevant items in test set in the test set

        # Generate recommendations based on the training data using TF-IDF
        recommendations_tfidf = recommend_by_tfidf(user_id, train_rating, tfidf_similarity_matrix, top_k=10)
        recommended_anime_ids_tfidf = [rec['anime_id'] for rec in recommendations_tfidf]

        # Calculate MAP@10
        map_scores_tfidf.append(calculate_map_at_k(recommended_anime_ids_tfidf, ground_truth_anime_ids, k=10))

        # Calculate NDCG@10
        ndcg_scores_tfidf.append(calculate_ndcg_at_k(recommended_anime_ids_tfidf, ground_truth_anime_ids, k=10))

    avg_map_at_10_tfidf = np.mean(map_scores_tfidf) if map_scores_tfidf else 0
    avg_ndcg_at_10_tfidf = np.mean(ndcg_scores_tfidf) if ndcg_scores_tfidf else 0

    print(f"\n--- TF-IDF Evaluation Results ---")
    print(f"Average MAP@10 (TF-IDF): {avg_map_at_10_tfidf:.4f}")
    print(f"Average NDCG@10 (TF-IDF): {avg_ndcg_at_10_tfidf:.4f}")



Building TF-IDF model...
TF-IDF matrix shape: (12294, 5000)
Calculating cosine similarity matrix based on TF-IDF...
TF-IDF cosine similarity matrix calculated.

--- ALS (Alternating Least Squares) Conceptual Approach ---
Preparing user-item matrix for collaborative filtering...
User-item matrix shape: (69600, 12294) (users x animes)
This sparse matrix is the primary input for ALS models.

--- How ALS works conceptually: ---
ALS (Alternating Least Squares) is a matrix factorization algorithm primarily used for collaborative filtering.
It decomposes the user-item interaction matrix (like the one created above) into two lower-dimensional matrices:
1. User latent features matrix (U)
2. Item latent features matrix (V)
Such that their product approximates the original user-item matrix (R ≈ U * V^T).
These latent features capture underlying preferences or characteristics that are not explicitly observed.
The algorithm iteratively optimizes U and V by holding one fixed and solving for the othe

Evaluating users (TF-IDF): 100%|██████████| 500/500 [02:04<00:00,  4.02it/s]


--- TF-IDF Evaluation Results ---
Average MAP@10 (TF-IDF): 0.0155
Average NDCG@10 (TF-IDF): 0.1216



