# Anime Recommendation System
## A Jupyter Notebook designed to provide personalized anime recommendations based on user preferences using data from MyAnimelist.

In [105]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np
from itertools import product
from ast import literal_eval
from IPython.display import display, HTML
import xml.etree.ElementTree as ET
from collections import Counter

In [106]:
# Load the dataset
anime_data = pd.read_csv('../datasets/anime/anime_data.csv')
anime_data.head()

Unnamed: 0,mal_id,url,image,trailer,title,title_en,title_jp,title_synonyms,type,source,...,members,favorites,synopsis,season,year,genres,explicit_genres,themes,demographic,studios
0,52991,https://myanimelist.net/anime/52991/Sousou_no_...,https://cdn.myanimelist.net/images/anime/1015/...,https://www.youtube.com/watch?v=ZEkwCGJ3o7M,Sousou no Frieren,Frieren: Beyond Journey's End,葬送のフリーレン,['Frieren at the Funeral'],TV,Manga,...,907857,53529,During their decade-long quest to defeat the D...,fall,2023,"['Adventure', 'Drama', 'Fantasy']",[],[],['Shounen'],['Madhouse']
1,5114,https://myanimelist.net/anime/5114/Fullmetal_A...,https://cdn.myanimelist.net/images/anime/1208/...,https://www.youtube.com/watch?v=--IcmZkvL0Q,Fullmetal Alchemist: Brotherhood,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,['Hagane no Renkinjutsushi: Fullmetal Alchemis...,TV,Manga,...,3420908,228575,After a horrific alchemy experiment goes wrong...,spring,2009,"['Action', 'Adventure', 'Drama', 'Fantasy']",[],['Military'],['Shounen'],['Bones']
2,9253,https://myanimelist.net/anime/9253/Steins_Gate,https://cdn.myanimelist.net/images/anime/1935/...,https://www.youtube.com/watch?v=27OZc-ku6is,Steins;Gate,Steins;Gate,STEINS;GATE,[],TV,Visual novel,...,2621367,191530,Eccentric scientist Rintarou Okabe has a never...,spring,2011,"['Drama', 'Sci-Fi', 'Suspense']",[],"['Psychological', 'Time Travel']",,['White Fox']
3,28977,https://myanimelist.net/anime/28977/Gintama°,https://cdn.myanimelist.net/images/anime/3/720...,,Gintama°,Gintama Season 4,銀魂°,"[""Gintama' (2015)""]",TV,Manga,...,645869,16826,"Gintoki, Shinpachi, and Kagura return as the f...",spring,2015,"['Action', 'Comedy', 'Sci-Fi']",[],"['Gag Humor', 'Historical', 'Parody', 'Samurai']",['Shounen'],['Bandai Namco Pictures']
4,38524,https://myanimelist.net/anime/38524/Shingeki_n...,https://cdn.myanimelist.net/images/anime/1517/...,https://www.youtube.com/watch?v=hKHepjfj5Tw,Shingeki no Kyojin Season 3 Part 2,Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,[],TV,Manga,...,2349899,59580,Seeking to restore humanity's diminishing hope...,spring,2019,"['Action', 'Drama', 'Suspense']",[],"['Gore', 'Military', 'Survival']",['Shounen'],['Wit Studio']


In [107]:
# One-hot encode the columns
encoder = OneHotEncoder(sparse_output=False)

# Encode 'genres' column by combining all genres into one string per row, then applying one-hot encoding
genres_encoded = encoder.fit_transform(anime_data["genres"].apply(lambda x: [', '.join(eval(x))]).to_list())

# Similarly, encode 'studios' column
studios_encoded = encoder.fit_transform(anime_data["studios"].apply(lambda x: [', '.join(eval(x))]).to_list())

# Themes 
themes_encoded = encoder.fit_transform(anime_data["themes"].apply(lambda x: [', '.join(eval(x))]).to_list())

# Reshape the demographic data into a 2D array before passing it to the encoder
demographic_encoded = encoder.fit_transform(
    anime_data["demographic"].apply(lambda x: x if x != 'Unknown' else '').values.reshape(-1, 1)
)


In [108]:
weights = {'genres': 1, 
           'themes': 1, 
           'demographic': 0.1,
           'studios': 0.1,
           'score': 0.05,
           'popularity': 0.01}

In [109]:
# Combine the encoded genres, studios, and demographic with the numerical columns like 'score' and 'popularity' using the best weights
features = pd.concat([
    pd.DataFrame(genres_encoded) * weights['genres'], 
    pd.DataFrame(themes_encoded) * weights['themes'],
    pd.DataFrame(demographic_encoded) * weights['demographic'], 
    pd.DataFrame(studios_encoded) * weights['studios'], 
    anime_data["score"] * weights['score'], 
    anime_data["popularity"] * weights['popularity']
], axis=1)

# Ensure all column names are strings
features.columns = features.columns.astype(str)


In [110]:
# Define the k-NN model
knn = NearestNeighbors(n_neighbors=5, metric='cosine')
# Train the model
knn.fit(features)

In [111]:
def display_anime_details(anime_title, sources):
    # Filter anime based on the title or synonyms
    anime_details = anime_data[
        (anime_data["title"] == anime_title) | 
        (anime_data["title_en"] == anime_title) | 
        (anime_data["title_jp"] == anime_title) |
        anime_data["title_synonyms"].apply(lambda synonyms: anime_title in literal_eval(synonyms) if pd.notna(synonyms) else False)
    ]

    # Ensure the anime exists in the dataset
    if not anime_details.empty:
        # Extract required details
        title = anime_details["title"].values[0]
        title_jp = anime_details["title_jp"].values[0]
        synopsis = anime_details["synopsis"].values[0]
        score = anime_details["score"].values[0]
        series_type = anime_details["type"].values[0]
        episodes = anime_details["episodes"].values[0]
        genres = ', '.join(literal_eval(anime_details["genres"].values[0]))
        themes = ', '.join(literal_eval(anime_details["themes"].values[0]))
        
        demographic_raw = anime_details["demographic"].values[0]

        # Check if demographic_raw is not None and is not NaN or an empty string
        if pd.notna(demographic_raw) and demographic_raw != '':
            try:
                demographic = ', '.join(literal_eval(demographic_raw))  # Use literal_eval if you are sure about the format
            except (ValueError, SyntaxError):  # Handle potential errors
                demographic = demographic_raw  # Fallback to raw value if literal_eval fails
        else:
            demographic = None

        season_raw = anime_details["season"].values[0]

        # Check if season_raw is None or not a string before capitalizing
        if season_raw is not None and isinstance(season_raw, str):
            season = season_raw.capitalize()
        else:
            # If season is None, fall back to aired information without "Unknown"
            aired_info = anime_details["aired"].values[0] if pd.notna(anime_details["aired"].values[0]) else None
            season = aired_info.split(',')[0].strip() if aired_info else None


        year = anime_details["year"].values[0]
        image_url = anime_details["image"].values[0]  # Image URL for the anime
        mal_url = anime_details["url"].values[0]  # MAL URL for the anime
        trailer_url = anime_details["trailer"].values[0]  # Trailer URL if available

        # Header and basic details (Markdown style for formatting)
        display(HTML(f"""
        <h1 style="font-family:Arial;">{title}</h1>
        <h3 style="font-family:Arial;">Because you watched: {sources}</h3>
        <img src="{image_url}" alt="{title}" style="width:225px;height:auto;float:left;margin-right:15px;">
        <p><b>Synopsis:</b> {synopsis}</p>
        <p><b>Score:</b> {score}</p>
        <p><b>Type:</b> {series_type}</p>
        <p><b>Episodes:</b> {episodes}</p>
        <p><b>Genres:</b> {genres}</p>
        {"<p><b>Themes:</b> " + themes + "</p>" if themes else ""}
        {"<p><b>Demographic:</b> " + demographic + "</p>" if demographic else ""}
        <p><b>Season:</b> {season} {year}</p>
        <p><b>MAL Page:</b> <a href="{mal_url}" target="_blank">Link to MyAnimeList</a></p>
        """))

        # Display trailer if available
        if trailer_url:
            display(HTML(f'<p><b>Trailer:</b> <a href="{trailer_url}" target="_blank">Watch Trailer</a></p>'))
        else:
            display(HTML(f'<p><b>Trailer:</b> No trailer available.</p>'))

        display(HTML('<hr>'))  # Horizontal line separator

    else:
        print(f"Anime titled '{anime_title}' not found in the dataset.")



In [112]:
def get_anime_recommendations(anime_title, k=5):
    # Check if the anime exists in the dataset (across all possible title columns)
    if not (
        anime_title in anime_data["title"].values or 
        anime_title in anime_data["title_jp"].values or 
        anime_title in anime_data["title_en"].values or 
        anime_data['title_synonyms'].apply(lambda x: anime_title in x).any()
    ):
        print(f"Anime '{anime_title}' not found in the dataset.")
        return []

    # Fetch the anime details
    anime_details = anime_data[
        (anime_data["title"] == anime_title) | 
        (anime_data["title_en"] == anime_title) | 
        (anime_data["title_jp"] == anime_title)
    ]

    if anime_details.empty:
        print(f"Anime '{anime_title}' not found in the dataset.")
        return []

    # Extract the first matching anime details
    anime_idx = anime_details.index[0]
    
    # Display the details of the selected anime (optional)
    title = anime_details["title"].values[0]
    # display_anime_details(title)

    # Use k-NN to find similar anime based on the features
    query_features = features.iloc[anime_idx].to_frame().T  # Get the features of the selected anime
    
    # Get k+1 neighbors (including the original anime itself)
    distances, indices = knn.kneighbors(query_features, n_neighbors=k+1)

    # Remove the original anime from the recommendations
    recommended_indices = indices.flatten()[1:]  # Remove the first one (itself)

    # Get the titles of the recommended anime based on the indices
    recommended_titles = anime_data.iloc[recommended_indices]["title"].values.tolist()

    return recommended_titles  # Return the list of recommended titles


In [113]:
# Load the anime profile data from an XML file

tree = ET.parse("../datasets/anime/animelist.xml")
root = tree.getroot()

anime_list = root.findall('anime')

In [114]:
print(len(anime_list))

# Sort anime_list by 'my_score' as a numerical value (use 0 for missing/invalid scores)
anime_list.sort(key=lambda x: int(x.find('my_score').text or 0) if x.find('my_score') is not None else 0, reverse=True)

# Print the sorted list of anime titles and their corresponding scores
for anime in anime_list:
    title = anime.find('series_title').text
    score = anime.find('my_score').text if anime.find('my_score') is not None else "N/A"
    print(f"{title} - Score: {score}")

245
Chainsaw Man - Score: 10
Kanojo, Okarishimasu - Score: 10
Kanojo, Okarishimasu 2nd Season - Score: 10
Kimi no Na wa. - Score: 10
Kimi no Suizou wo Tabetai - Score: 10
Made in Abyss: Retsujitsu no Ougonkyou - Score: 10
Mushoku Tensei II: Isekai Ittara Honki Dasu - Score: 10
Mushoku Tensei II: Isekai Ittara Honki Dasu Part 2 - Score: 10
Mushoku Tensei: Isekai Ittara Honki Dasu - Score: 10
Mushoku Tensei: Isekai Ittara Honki Dasu Part 2 - Score: 10
Sousou no Frieren - Score: 10
Vinland Saga - Score: 10
Vinland Saga Season 2 - Score: 10
Violet Evergarden - Score: 10
Violet Evergarden Movie - Score: 10
Bocchi the Rock! - Score: 9
Fullmetal Alchemist: Brotherhood - Score: 9
Fumetsu no Anata e - Score: 9
Gabriel DropOut - Score: 9
Gintama - Score: 9
Gintama' - Score: 9
Gintama': Enchousen - Score: 9
Grand Blue - Score: 9
Horimiya - Score: 9
JoJo no Kimyou na Bouken (TV) - Score: 9
JoJo no Kimyou na Bouken Part 3: Stardust Crusaders - Score: 9
JoJo no Kimyou na Bouken Part 3: Stardust Crus

In [115]:

# Initialize a set to store existing anime titles for fast lookup
existing_titles = {anime.find('series_title').text for anime in anime_list}

# Filter the anime list by status (e.g., 'Watching' or 'Completed')
filtered_anime_list = [anime for anime in anime_list if anime.find('my_status').text in ['Watching', 'Completed']]

# Filter the anime list by score (>= 8)
filtered_anime_list = [anime for anime in filtered_anime_list if int(anime.find('my_score').text or 0) >= 8]

print(len(filtered_anime_list))

# Initialize an empty dictionary to store recommendations
recommended = {}

for anime in filtered_anime_list:
    anime_title = anime.find('series_title').text
    recoanime = get_anime_recommendations(anime_title, 5)

    # Check if the recommended anime titles are not already in the anilist
    for rec in recoanime:
        if rec not in existing_titles:
            # If the recommendation is new, add it to the dictionary
            recommended.setdefault(anime_title, []).append(rec)

103


In [116]:
# Counter to track how many times each anime is recommended
recommendation_count = Counter()

# Count how many times each anime is recommended and store which anime recommended it
recommendation_sources = {}

# Iterate through the 'recommended' dictionary
for anime_title, recs in recommended.items():
    for rec in recs:
        recommendation_count[rec] += 1
        # Track which anime the recommendation came from
        if rec not in recommendation_sources:
            recommendation_sources[rec] = []
        recommendation_sources[rec].append(anime_title)

# Sort recommendations by the most frequent first
most_recommended = recommendation_count.most_common()

# Print the total number of recommendations
print(f"Total recommendations: {len(recommended)}")

# Iterate through the sorted recommendations and print the most recommended first
for rec, count in most_recommended:
    # Print which anime this recommendation came from
    source_anime = recommendation_sources[rec]
    sources = ', '.join(source_anime)
    
    # Check if the recommended anime exists in the 'anime_data' DataFrame
    if rec in anime_data["title"].values:
        # Get the index of the recommended anime in the 'anime_data' DataFrame
        idx = anime_data[anime_data["title"] == rec].index[0]
        # Display the details of the recommended anime using the index
        display_anime_details(anime_data.iloc[idx]["title"], sources)  # Access the title for details
    else:
        print(f"Recommended anime '{rec}' not found in the dataset.")
        

Total recommendations: 95
