In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score
import random
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('anime.csv')

In [3]:
df

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
...,...,...,...,...,...,...,...
12289,9316,Toushindai My Lover: Minami tai Mecha-Minami,Hentai,OVA,1,4.15,211
12290,5543,Under World,Hentai,OVA,1,4.28,183
12291,5621,Violence Gekiga David no Hoshi,Hentai,OVA,4,4.88,219
12292,6133,Violence Gekiga Shin David no Hoshi: Inma Dens...,Hentai,OVA,1,4.98,175


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


### Data Preprocessing

In [5]:
df.dropna(subset=['genre','rating'],inplace=True)

In [6]:
df['genre_list']=df['genre'].str.split(',')

In [7]:
genre_df=df[['anime_id','genre_list']]

### Feature Extraction

In [8]:
mlb=MultiLabelBinarizer()

In [9]:
mlb

In [10]:
onehot_encoded_genres=pd.DataFrame(mlb.fit_transform(genre_df['genre_list']),columns=mlb.classes_,index=genre_df['anime_id'])

In [11]:
onehot_encoded_genres

Unnamed: 0_level_0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
32281,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5114,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
28977,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9253,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9969,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9316,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5543,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5621,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
ratings_df=df[['anime_id','rating']]

In [13]:
ratings_df

Unnamed: 0,anime_id,rating
0,32281,9.37
1,5114,9.26
2,28977,9.25
3,9253,9.17
4,9969,9.16
...,...,...
12289,9316,4.15
12290,5543,4.28
12291,5621,4.88
12292,6133,4.98


In [14]:
merged_df=onehot_encoded_genres.merge(ratings_df,left_index=True,right_on='anime_id')

In [15]:
merged_df.fillna(0,inplace=True)

In [16]:
merged_df.head()

Unnamed: 0,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,Harem,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,anime_id,rating
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,32281,9.37
1,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,5114,9.26
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,28977,9.25
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9253,9.17
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,9969,9.16


### Recommendation System

In [17]:
def get_recommendations(anime_title,merged_df,top_n=10):
    anime_index=df[df['name']==anime_title].index
    if anime_index.empty:
        return None

    anime_index=anime_index[0]
    x=merged_df.drop(['anime_id','rating'],axis=1)
    model=NearestNeighbors(metric='cosine',algorithm='brute')
    model.fit(x)
    distances,indices=model.kneighbors(x.iloc[anime_index].values.reshape(1,-1),n_neighbors=top_n+1)
    similar_anime_indices=indices.flatten()[1:]
    recommended_anime=df.loc[similar_anime_indices,'name'].tolist()

    return recommended_anime

In [18]:
all_anime_titles=df['name'].unique()

In [19]:
random_anime_title=random.choice(all_anime_titles)

In [20]:
print(f"Selected anime: {random_anime_title}")

Selected anime: Hana no Zundamaru


In [21]:
recommendations=get_recommendations(random_anime_title,merged_df,top_n=10)

In [22]:
if recommendations:
    print("\nRecommended anime:")
    for anime in recommendations:
        print(anime)
else:
    print(f"\nNo recommendations found for {random_anime_title}")


Recommended anime:
Hayan Ma-eum Baeggu
Kiki to Lala no Hakuchouza no Ohimesama
Hawaiian Rock&#039;n Roll
Nanatsu no Umi no Tico Specials
Robot Taekwon V 3tan! Sujung Teukgongdae
Rusuban
Chibi Neko Chobi/Chibi Neko Kobi to Tomodachi
Robocar Poli 2
Rio: Rainbow Gate!
Hato no Oyome-san


### Evaluation

In [23]:
x=merged_df.drop(['anime_id','rating'],axis=1)
y=merged_df['rating'].astype(int)

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=100)

In [25]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(9613, 82)
(2404, 82)
(9613,)
(2404,)


In [26]:
knn=KNeighborsClassifier(n_neighbors=5)

In [27]:
knn.fit(x_train,y_train)

In [28]:
y_pred=knn.predict(x_test) 

In [29]:
accuracy=accuracy_score(y_test,y_pred)
precision=precision_score(y_test,y_pred,average='weighted')
recall=recall_score(y_test,y_pred,average='weighted')
f1=f1_score(y_test,y_pred,average='weighted')

In [30]:
print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')

Accuracy: 0.41
Precision: 0.40
Recall: 0.41
F1 Score: 0.39


The recommendation system has been implemented using cosine similarity on the anime dataset and evaluated using metrics such as accuracy, precision, recall, and F1-score. The system recommends anime based on similar genres and user ratings.

The evaluation results indicate that the model has an accuracy of approximately 41%. This suggests that there's room for improvement in the recommendation system's performance.

Potential areas for enhancement:

Feature Engineering: We can experiment with additional features or refine the existing ones to capture more nuanced similarities between anime.

Algorithm Selection: We can explore other recommendation algorithms or fine-tune the parameters of the current KNN algorithm to improve accuracy.

Data Quality: We can address potential issues in the dataset, such as inconsistencies or biases, that might impact the recommendations.

User Preferences: We can incorporate user-specific preferences or viewing history to personalize the recommendations further.

#### Interview Questions

1. Can you explain the difference between user-based and item-based collaborative filtering?

2. What is collaborative filtering, and how does it work?