In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import regex as re

In [2]:
# Load the two source tables: anime.csv into anime_data and rating.csv into rating_data.
anime_data, rating_data = (
    pd.read_csv(csv_path) for csv_path in ('anime.csv', 'rating.csv')
)

In [3]:
# Columns of anime_data:
#   anime_id -- myanimelist.net's unique id identifying an anime.
#   name     -- full name of anime.
#   genre    -- comma separated list of genres for this anime.
#   type     -- movie, TV, OVA, etc.
#   episodes -- how many episodes in this show. (1 if movie).
#   rating   -- average rating out of 10 for this anime.
#   members  -- number of community members that are in this anime's group
#
# Drop every anime without an average rating and renumber the rows from 0.
missing_ratings = anime_data['rating'].isna()
anime_data = anime_data.loc[~missing_ratings].reset_index(drop=True)
anime_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama Season 4,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama Season 2;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12059,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12060,5543,Under World,Hentai,OVA,1,4.28,183
12061,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12062,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
rating_data
# Columns of rating_data (loaded from rating.csv):
# user_id -- non-identifiable, randomly generated user id.
# anime_id -- the anime that this user has rated.
# rating -- rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [5]:
# Data preprocessing: clean the datasets (missing values, duplicates, other
# data-quality issues) and make sure they are in a suitable format for analysis.
# Here: a rating of -1 means "watched but not rated", so only explicit ratings are kept.
rating_data_cleaned = rating_data.loc[rating_data['rating'].ne(-1)]
rating_data_cleaned

Unnamed: 0,user_id,anime_id,rating
47,1,8074,10
81,1,11617,10
83,1,11757,10
101,1,15451,10
153,2,11771,10
...,...,...,...
7813732,73515,16512,7
7813733,73515,17187,9
7813734,73515,22145,10
7813735,73516,790,9


In [6]:
# Second catalogue (animes.csv) supplies the synopsis text; its id column is
# called 'uid', so it is renamed to line up with anime_data's 'anime_id'.
other_anime_data = pd.read_csv('animes.csv').rename(columns={'uid': 'anime_id'})

# Inner-join on anime_id, keep the anime_data versions of the overlapping
# columns (the '_x' suffixed ones) plus the synopsis, restore the plain column
# names, and keep a single row per anime_id.
data = (
    anime_data
    .merge(other_anime_data, on='anime_id', how='inner')
    .loc[:, ['anime_id', 'name', 'genre_x', 'type', 'episodes_x', 'rating', 'synopsis']]
    .rename(columns={'genre_x': 'genre', 'episodes_x': 'episodes'})
    .drop_duplicates(subset='anime_id', keep='first')
    .reset_index(drop=True)
)
data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,synopsis
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,"Mitsuha Miyamizu, a high school girl, yearns t..."
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,"""In order for something to be obtained, someth..."
2,28977,Gintama Season 4,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,"Gintoki, Shinpachi, and Kagura return as the f..."
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,The self-proclaimed mad scientist Rintarou Oka...
4,9969,Gintama Season 2;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,"After a one-year hiatus, Shinpachi Shimura ret..."
...,...,...,...,...,...,...,...
11948,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,A young man receives a life-size nude girl in ...
11949,5543,Under World,Hentai,OVA,1,4.28,Based on the cartoon by one of the greatest Ko...
11950,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,Tetsuya was the product of his mother being ra...
11951,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,Based on the manga by Satou Masaaki \r\n


In [7]:
# Keep only the anime that have all four fields used further down
# (name, genre, type, synopsis) and renumber the rows from 0.
# No up-front data.copy() is needed: the filtered/reset frame assigned below is
# a new object, so the copy was overwritten before it was ever used.
null_values = data[['name', 'genre', 'type', 'synopsis']].isnull().any(axis=1)
content_data = data[~null_values].reset_index(drop=True)
content_data

Unnamed: 0,anime_id,name,genre,type,episodes,rating,synopsis
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,"Mitsuha Miyamizu, a high school girl, yearns t..."
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,"""In order for something to be obtained, someth..."
2,28977,Gintama Season 4,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,"Gintoki, Shinpachi, and Kagura return as the f..."
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,The self-proclaimed mad scientist Rintarou Oka...
4,9969,Gintama Season 2;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,"After a one-year hiatus, Shinpachi Shimura ret..."
...,...,...,...,...,...,...,...
11593,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,A young man receives a life-size nude girl in ...
11594,5543,Under World,Hentai,OVA,1,4.28,Based on the cartoon by one of the greatest Ko...
11595,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,Tetsuya was the product of his mother being ra...
11596,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,Based on the manga by Satou Masaaki \r\n


In [8]:
def preprocess_data(source=None):
    """Return a copy of the catalogue with text columns normalised for TF-IDF.

    The ``genre`` and ``synopsis`` columns are lower-cased and stripped of every
    character that is not an ASCII letter or whitespace (so ``"Sci-Fi, Action"``
    becomes ``"scifi action"``). A new ``genre_synopsis`` column holds the two
    cleaned texts joined by a single space.

    Parameters
    ----------
    source : pandas.DataFrame, optional
        Frame with ``genre`` and ``synopsis`` columns. When omitted, the
        notebook-level ``content_data`` frame is used, which is what the
        original zero-argument call did.

    Returns
    -------
    pandas.DataFrame
        A copy of the input; the input frame itself is not modified.
    """
    base = content_data if source is None else source
    df = base.copy()
    for column in ('genre', 'synopsis'):
        df[column] = (
            df[column]
            .fillna('')  # missing text is treated as empty instead of raising inside re.sub
            .str.lower()
            .map(lambda text: re.sub(r'[^a-zA-Z\s]', '', text))
        )
    df['genre_synopsis'] = df['genre'] + ' ' + df['synopsis']
    return df

In [9]:
# Step 4: Generate item profiles
def generate_item_profiles(df):
    """Fit a default ``TfidfVectorizer`` on ``df['genre_synopsis']`` and return the transformed matrix (one row per row of ``df``)."""
    corpus = df['genre_synopsis']
    return TfidfVectorizer().fit_transform(corpus)

In [10]:
# Calculate cosine similarity
def calculate_item_similarity(item_profiles):
    """Return the pairwise cosine similarity between all rows of ``item_profiles``."""
    return cosine_similarity(item_profiles)

In [11]:
def find_similar_items(genres, type_input, anime_names, item_similarity, df, top_n=100):
    """Collect the titles most similar to each requested anime.

    For every entry of ``anime_names`` the first row of ``df`` whose ``name``
    contains that text is taken as the query item. The match is a
    case-insensitive, literal substring test, so characters such as ``(``,
    ``.`` or ``?`` in a title are not interpreted as a regular expression.
    The ``top_n`` rows with the highest similarity to the query item (the query
    itself is always excluded) contribute their names to the result.

    Parameters
    ----------
    genres, type_input
        Not used by this function; kept so existing callers keep working.
    anime_names : iterable of str
        Titles (or parts of titles) to look up. Blank entries and entries that
        match no row are skipped instead of raising.
    item_similarity : 2-D array
        Similarity matrix; row/column ``i`` is expected to describe the row at
        position ``i`` of ``df``.
    df : pandas.DataFrame
        Catalogue with a ``name`` column.
    top_n : int, default 100
        Maximum number of similar titles returned per requested anime.

    Returns
    -------
    list of str
        Similar titles, grouped per requested anime in input order; a title may
        appear more than once when it is similar to several requested anime.
    """
    similar_items = []
    names = df['name']
    for name in anime_names:
        if not name.strip():
            # An empty pattern would match every row and silently pick row 0.
            continue
        matches = names.str.contains(name, case=False, regex=False, na=False)
        # Work with positions (not index labels) because both the similarity
        # matrix and the .iloc lookup below are positional.
        match_positions = np.flatnonzero(matches.to_numpy())
        if match_positions.size == 0:
            continue
        query_position = match_positions[0]
        similarity_scores = item_similarity[query_position]
        ranked = np.argsort(similarity_scores)[::-1]
        # Drop the query explicitly: with tied scores it is not guaranteed to
        # be ranked first, so slicing off element 0 could drop another item.
        ranked = ranked[ranked != query_position][:top_n]
        similar_items.extend(names.iloc[ranked].tolist())
    return similar_items

In [12]:
def get_recommendations():
    """Prompt for genres, a type and favourite titles; return up to 10 recommendations.

    Steps:
    1. Read three comma-separated inputs from the user (genres, type, anime names).
    2. Build the TF-IDF item profiles and the item-item similarity matrix.
    3. Take the titles similar to the entered anime, keep those of the requested
       type that contain every requested genre, and keep the 10 best rated.
    4. If fewer than 10 remain, fill the free slots with the best rated titles
       from a narrower similarity search (top 25 per entered anime) that are not
       already selected. These fill-ins are based on the anime names only and
       are not filtered by genre or type.

    Returns
    -------
    pandas.DataFrame
        Columns ``type``, ``name``, ``genre`` and ``synopsis``, taken from
        ``content_data`` so the text is shown in its original form.
    """
    genre_inputs = input("Enter genres separated by commas: ")
    # preprocess_data() lower-cases the 'genre' column and removes every
    # non-letter character (e.g. "Sci-Fi" -> "scifi"), so the entered genres get
    # the same normalisation; otherwise "sci-fi" would never match.
    genre_names = [
        re.sub(r'[^a-zA-Z\s]', '', genre.lower()).strip()
        for genre in genre_inputs.split(',')
    ]
    genre_names = [genre for genre in genre_names if genre]
    type_input = input("Enter type: ").strip().lower()
    anime_input = input("Enter anime names separated by commas: ")
    anime_names = [name.strip() for name in anime_input.split(',') if name.strip()]

    df = preprocess_data()
    item_profiles = generate_item_profiles(df)
    item_similarity = calculate_item_similarity(item_profiles)

    candidate_names = find_similar_items(genre_names, type_input, anime_names, item_similarity, df)
    filtered_table = df[df['name'].isin(candidate_names)]
    if type_input:
        # A blank type means "any type" rather than "no rows".
        filtered_table = filtered_table[filtered_table['type'].str.lower() == type_input]
    for genre in genre_names:
        # Literal substring match: the user's text is not a regular expression.
        filtered_table = filtered_table[filtered_table['genre'].str.contains(genre, case=False, regex=False)]
    filtered_table = filtered_table.nlargest(10, 'rating')

    if len(filtered_table) < 10:
        # Fill only the free slots, and only with rows that are not already
        # selected, so no anime is recommended twice and the filtered matches
        # are never pushed out by the fill-ins.
        remaining_rows = 10 - len(filtered_table)
        fallback_names = find_similar_items(genre_names, type_input, anime_names, item_similarity, df, top_n=25)
        is_fallback = df['name'].isin(fallback_names)
        already_selected = df['anime_id'].isin(filtered_table['anime_id'])
        additional_rows = df[is_fallback & ~already_selected].nlargest(remaining_rows, 'rating')
        filtered_table = pd.concat([filtered_table, additional_rows])

    # Join back to content_data to show the original (uncleaned) genre and synopsis.
    merged = filtered_table.merge(content_data, on='anime_id', how='inner')
    output = merged[['type_y', 'name_x', 'genre_y', 'synopsis_y']].rename(
        columns={'type_y': 'type', 'name_x': 'name', 'genre_y': 'genre', 'synopsis_y': 'synopsis'}
    )
    return output

In [15]:
# Run this code cell to get recommendations.
# Input: genres, type (Movie, TV, OVA, etc. -- it is compared, ignoring case, for equality
# with the 'type' column, so e.g. "TV-Show" matches nothing), anime names
get_recommendations()

Unnamed: 0,type,name,genre,synopsis
0,TV,Fate/stay night: Unlimited Blade Works 2nd Season,"Action, Fantasy, Magic, Shounen, Supernatural","In the midst of the Fifth Holy Grail War, Cast..."
1,TV,Berserk,"Action, Adventure, Demons, Drama, Fantasy, Hor...","Born from the corpse of his mother, a young me..."
2,TV,Neon Genesis Evangelion,"Action, Dementia, Drama, Mecha, Psychological,...","In the year 2015, the world stands on the brin..."
3,TV,Macross F,"Action, Mecha, Military, Music, Romance, Sci-F...",Following a catastrophic war against a race of...
4,TV,Nejimaki Seirei Senki: Tenkyou no Alderamin,"Action, Adventure, Fantasy, Military",Ikta Solork is a carefree young man who only w...
5,TV,Arslan Senki (TV),"Action, Adventure, Drama, Fantasy, Historical,...",The year is 320. Under the rule of the bellige...
6,TV,No.6,"Action, Sci-Fi","Many years ago, after the end of a bloody worl..."
7,TV,World Trigger,"Action, School, Sci-Fi, Shounen, Supernatural",When a gate to another world suddenly opens on...
8,TV,Arslan Senki (TV): Fuujin Ranbu,"Action, Adventure, Drama, Fantasy, Historical,...","Continuing on his quest to retake Ecbatana, Pr..."
9,TV,Aldnoah.Zero,"Action, Mecha, Sci-Fi",The discovery of a hypergate on the Moon once ...
