### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
from scipy.sparse import csr_matrix

import warnings
warnings.filterwarnings("ignore")
import pickle

### importing Dataset

This dataset contains information on user preference data from 73,516 users on 12,294 anime. Each user is able to add anime to their completed list and give it a rating and this dataset is a compilation of those ratings.

#### anime.csv

- anime_id : myanimelist.net's unique id identifying an anime.
- name : full name of anime.
- genre : comma separated list of genres for this anime.
- type : movie, TV, OVA, etc.
- episodes : how many episodes in this show. (1 if movie).
- rating : average rating out of 10 for this anime.
- members : number of community members that are in this anime's "group".

#### rating.csv

- user_id : non identifiable randomly generated user id.
- anime_id : the anime that this user has rated.
- rating : rating out of 10 this user has assigned (-1 if the user watched it but didn't assign a rating).

- source ~ https://www.kaggle.com/datasets/CooperUnion/anime-recommendations-database

In [2]:
anime = pd.read_csv("data/anime.csv")
ratings= pd.read_csv("data/rating.csv")

In [3]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
anime.shape

(12294, 7)

In [5]:
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [7]:
ratings.shape

(7813737, 3)

In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7813737 entries, 0 to 7813736
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   anime_id  int64
 2   rating    int64
dtypes: int64(3)
memory usage: 178.8 MB


Checking null values in anime data frame

In [9]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Dropping Null Values in anime data frame

In [10]:
anime.dropna(inplace=True)

In [11]:
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [12]:
# No Duplicates present in anime data frame
anime.duplicated().sum()

0

In [13]:
ratings.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [14]:
ratings.duplicated().sum()

1

In [15]:
ratings.drop_duplicates(inplace=True)
print(ratings.duplicated().sum())

0


# Feature Engineering

In [16]:
# Since -1 represents missing ratings. i will fill them with NaN rather than dropping.
ratings['rating'] = ratings['rating'].apply(lambda x: np.nan if x==-1 else x)
ratings.head(2)

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,


In [17]:
ratings['user_id'].nunique()

73515

In [18]:
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [19]:
anime['genre'].nunique()

3229

In [20]:
anime['type'].unique()

array(['Movie', 'TV', 'OVA', 'Special', 'Music', 'ONA'], dtype=object)

In [21]:
anime['type'].value_counts()

type
TV         3668
OVA        3284
Movie      2259
Special    1670
ONA         648
Music       488
Name: count, dtype: int64

#### Merging the datasets

In [22]:
merged_df = ratings.merge(anime, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])
print(merged_df.shape)
merged_df.head(2)

(7813610, 9)


Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating,members
0,1,20,,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8.0,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [23]:
# For collaborative filtering we'll need to create a pivot table of users on one axis and tv show names along the other. 
# The pivot table will help us in defining the similarity between users and shows to better predict who will like what.

colab_df = merged_df[['user_id', 'name', 'rating_user']]
colab_df= colab_df[colab_df['user_id']<=10000]
colab_piv = colab_df.pivot_table(index='user_id', columns='name', values='rating_user')

In [24]:
colab_piv.shape

(9467, 7924)

In [25]:
# Value Normalization and Filling NaN Values

piv_norm = colab_piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)

# Drop all columns containing only zeros representing users who did not rate

piv_norm.fillna(0, inplace=True)

In [26]:
piv_norm.head()

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.26168,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Transpose
piv_norm = piv_norm.T

# Drop all columns containing only zeros representing users who did not rate any anime.
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

In [28]:
piv_norm.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,9991,9992,9993,9994,9995,9996,9997,9998,9999,10000
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.130098,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.018987,0.0,0.0,0.0,0.0,0.0
&quot;Bungaku Shoujo&quot; Movie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.160976,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Returner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//G.U. Trilogy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.092124,0.0,0.0,0.0,0.0,0.0


In [29]:
piv_norm.shape

(7924, 8661)

In [30]:
# convert to sparse matrix format for the similarity computation as in the dataframe more than 2/3rd values are 0.
piv_sparse = csr_matrix(piv_norm.values)

### Cosine Similarity

In [31]:
#model based on anime similarity
anime_similarity = cosine_similarity(piv_sparse)

#Df of anime similarities
ani_sim_df = pd.DataFrame(anime_similarity, index = piv_norm.index, columns = piv_norm.index)

In [32]:
ani_sim_df.head(2)

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,1.0,0.215921,0.217107,-0.027137,-0.011489,-0.053508,-0.01844,0.008172,-0.039032,0.038659,...,0.0,-0.008973,0.00815,0.00144,0.030337,0.025256,0.053544,0.074282,0.040303,0.0
&quot;Bungaku Shoujo&quot; Memoire,0.215921,1.0,0.412676,-0.092209,-0.042651,-0.160894,-0.059462,-0.02493,-0.039077,0.086153,...,0.0,-0.007546,-0.016147,-0.024695,-0.008115,-0.011057,-0.014199,0.004943,-0.002507,0.0


In [33]:
ani_sim_df.shape

(7924, 7924)

## Collaborative Recommender

In [34]:
# This function will return the top 10 shows with the highest cosine similarity value and show match percent

def anime_recommendation(ani_name):   
    try:
    
        number = 1
        print('Recommended because you watched {}:\n'.format(ani_name))
        for anime in ani_sim_df.sort_values(by = ani_name, ascending = False).index[1:11]:
            print(f'#{number}: {anime}, {round(ani_sim_df[anime][ani_name]*100,2)}% match')
            number +=1
    except IndexError:
        # If the anime name is not found, handle the exception
        return "Anime not found. Please enter a valid anime name."
    except Exception as e:
        # Handle other potential exceptions
        return f"An error occurred. Please Try again.{e}"

In [35]:
anime_recommendation('Naruto')

Recommended because you watched Naruto:

#1: Naruto: Shippuuden Movie 1, 11.7% match
#2: Bleach, 9.75% match
#3: Naruto Movie 1: Dai Katsugeki!! Yuki Hime Shinobu Houjou Dattebayo!, 9.31% match
#4: Naruto: Shippuuden Movie 2 - Kizuna, 7.79% match
#5: Naruto: Shippuuden Movie 6 - Road to Ninja, 7.5% match
#6: Naruto: Shippuuden Movie 5 - Blood Prison, 7.41% match
#7: Gunslinger Stratos, 6.8% match
#8: Sword Gai, 6.79% match
#9: Dragon Ball Z, 6.43% match
#10: Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsugu Mono, 6.08% match


In [36]:
anime_recommendation('Shingeki no Kyojin')

Recommended because you watched Shingeki no Kyojin:

#1: Fullmetal Alchemist: Brotherhood, 28.29% match
#2: Steins;Gate, 24.84% match
#3: Death Note, 24.2% match
#4: Code Geass: Hangyaku no Lelouch R2, 23.16% match
#5: Code Geass: Hangyaku no Lelouch, 22.54% match
#6: One Punch Man, 22.45% match
#7: Hunter x Hunter (2011), 21.62% match
#8: Kiseijuu: Sei no Kakuritsu, 20.94% match
#9: Shingeki no Kyojin: Kuinaki Sentaku, 18.48% match
#10: Psycho-Pass, 18.13% match


## Content based Recommender

In [37]:
def content_based_recommendation(anime_name, anime_df=anime):
    try:
        # Step 1: TF-IDF Vectorization -- Converts text data into numerical vectors
        # Captures the importance of each word in the 'genre' column. 
        # Words that are frequent in a specific anime's genre but rare across all anime will have higher weights

        tfidf_vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf_vectorizer.fit_transform(anime['genre'])


        # Step 2: Calculate Similarity Scores
        # measures the cosine of the angle between two vectors identifying how similar two anime's are.

        cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
        
        # Step 3: Get Index of the Anime to locate the position of the target anime in the similarity scores

        anime_index = anime_df[anime_df['name'] == anime_name].index[0]

        # Step 4: Get Similarity Scores -- Retrieves the similarity scores of the target anime with all other anime.

        sim_scores = list(enumerate(cosine_sim[anime_index]))

        # Step 5: Sort Scores in Descending Order

        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

        # Step 6: Get Top N Recommendations

        top_anime_indices = [i[0] for i in sim_scores[1:11]]
        top_anime_names = anime_df['name'].iloc[top_anime_indices]

        top_10_anime = top_anime_names.reset_index(drop=True)

        return top_10_anime

    except IndexError:
        # If the anime name is not found, handle the exception
        return "Anime not found. Please enter a valid anime name."
    except Exception as e:
        # Handle other potential exceptions
        return f"An error occurred. Please Try again.{e}"

In [38]:
content_based_recommendation('Naruto')

0                                   Naruto: Shippuuden
1                                               Naruto
2    Boruto: Naruto the Movie - Naruto ga Hokage ni...
3                                          Naruto x UT
4          Naruto: Shippuuden Movie 4 - The Lost Tower
5    Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...
6                 Naruto Shippuuden: Sunny Side Battle
7    Naruto Soyokazeden Movie: Naruto to Mashin to ...
8                              Kyutai Panic Adventure!
9           Naruto: Shippuuden Movie 6 - Road to Ninja
Name: name, dtype: object

In [39]:
content_based_recommendation('Death Note')

0                          Death Note Rewrite
1                             Mousou Dairinin
2               Higurashi no Naku Koro ni Kai
3               Higurashi no Naku Koro ni Rei
4                            Mirai Nikki (TV)
5           Mirai Nikki (TV): Ura Mirai Nikki
6                   Higurashi no Naku Koro ni
7                                     Monster
8                                   AD Police
9    Higurashi no Naku Koro ni Kaku: Outbreak
Name: name, dtype: object

In [40]:
content_based_recommendation('One Piece')

0    One Piece: Episode of Merry - Mou Hitori no Na...
1    One Piece: Episode of Nami - Koukaishi no Nami...
2    One Piece: Episode of Sabo - 3 Kyoudai no Kizu...
3               One Piece Film: Strong World Episode 0
4    One Piece: Episode of Luffy - Hand Island no B...
5                One Piece Movie 4: Dead End no Bouken
6    One Piece Movie 9: Episode of Chopper Plus - F...
7                   One Piece: Adventure of Nebulandia
8                 One Piece Movie 5: Norowareta Seiken
9              One Piece: Umi no Heso no Daibouken-hen
Name: name, dtype: object

In [41]:
pickle.dump(ani_sim_df,open("anime_sim.pkl",'wb'))
pickle.dump(anime,open("anime.pkl",'wb'))