Recommendation System

In [1]:
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

# Import necessary packages
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load Dataset anime.csv
df = pd.read_csv("anime.csv")

In [3]:
df.shape

(12294, 7)

In [4]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12294 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12294 non-null  int64  
 1   name      12294 non-null  object 
 2   genre     12232 non-null  object 
 3   type      12269 non-null  object 
 4   episodes  12294 non-null  object 
 5   rating    12064 non-null  float64
 6   members   12294 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 672.5+ KB


In [6]:
len(df.genre.unique())

3265

In [7]:
len(df.rating.unique())

599

In [8]:
len(df.name.unique())

12292

In [9]:
df.rating.unique()

array([ 9.37,  9.26,  9.25,  9.17,  9.16,  9.15,  9.13,  9.11,  9.1 ,
        9.06,  9.05,  9.04,  8.98,  8.93,  8.92,  8.88,  8.84,  8.83,
        8.82,  8.81,  8.8 ,  8.78,  8.77,  8.76,  8.75,  8.74,  8.73,
        8.72,  8.71,  8.69,  8.68,  8.67,  8.66,  8.65,  8.64,  8.62,
        8.61,  8.6 ,  8.59,  8.58,  8.57,  8.56,  8.55,  8.54,  8.53,
        8.52,  8.51,  8.5 ,  8.49,  8.48,  8.47,  8.46,  8.45,  8.44,
        8.43,  8.42,  8.41,  8.4 ,  8.39,  8.38,  8.37,  8.36,  8.35,
        8.34,  8.33,  8.32,  8.31,  8.3 ,  8.29,  8.28,  8.27,  8.26,
        8.25,  8.24,  8.23,  8.22,  8.21,  8.2 ,  8.19,  8.18,  8.17,
        8.16,  8.15,  8.14,  8.13,  8.12,  8.11,  8.1 ,  8.09,  8.08,
        8.07,  8.06,  8.05,  8.04,  8.03,  8.02,  8.01,  8.  ,  7.99,
        7.98,  7.97,  7.96,  7.95,  7.94,  7.93,  7.92,  7.91,  7.9 ,
        7.89,  7.88,  7.87,  7.86,  7.85,  7.84,  7.83,  7.82,  7.81,
        7.8 ,  7.79,  7.78,  7.77,  7.76,  7.75,  7.74,  7.73,  7.72,
        7.71,  7.7 ,

In [10]:
df.groupby('name')['rating'].mean().sort_values(ascending=False)[:6]

Unnamed: 0_level_0,rating
name,Unnamed: 1_level_1
Taka no Tsume 8: Yoshida-kun no X-Files,10.0
Spoon-hime no Swing Kitchen,9.6
Mogura no Motoro,9.5
Kimi no Na wa.,9.37
Kahei no Umi,9.33
Fullmetal Alchemist: Brotherhood,9.26


In [11]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
anime_id,12294.0,14058.221653,11455.294701,1.0,3484.25,10260.5,24794.5,34527.0
rating,12064.0,6.473902,1.026746,1.67,5.88,6.57,7.18,10.0
members,12294.0,18071.338864,54820.676925,5.0,225.0,1550.0,9437.0,1013917.0


In [12]:
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [13]:
# handling missing values
df = df.dropna(subset=["genre"])
df["rating"] = df["rating"].fillna(df["rating"].median())
df["type"] = df["type"].fillna(df["type"].mode()[0])
df['episodes'] = df['episodes'].replace({'Unknown': 0, '?': 0}).astype(int)  # Handle 'episodes' column: Replace 'Unknown' or '?' with 0 and convert to numeric

Feature Selection (Genre-Based):

In [14]:
df["genre_list"] = df["genre"].fillna("").apply(lambda x: [g.strip() for g in x.split(",")])
# converting the genre column from a single comma-separated string into a clean list of individual genres for each anime.

In [15]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
item_feature_matrix = mlb.fit_transform(df["genre_list"])
item_feature_matrix

array([[0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

MultiLabelBinarizer converts multiple labels per record into a binary (0/1) matrix

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(
    item_feature_matrix, test_size=0.2, random_state=42
)

In [17]:
df["genre_list"] = df["genre"].str.split(", ")
df["genre_list"]

Unnamed: 0,genre_list
0,"[Drama, Romance, School, Supernatural]"
1,"[Action, Adventure, Drama, Fantasy, Magic, Mil..."
2,"[Action, Comedy, Historical, Parody, Samurai, ..."
3,"[Sci-Fi, Thriller]"
4,"[Action, Comedy, Historical, Parody, Samurai, ..."
...,...
12289,[Hentai]
12290,[Hentai]
12291,[Hentai]
12292,[Hentai]


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

item_item_similarity = cosine_similarity(X_train)
item_item_similarity

array([[1.        , 0.35355339, 0.        , ..., 0.        , 0.25      ,
        0.37796447],
       [0.35355339, 1.        , 0.        , ..., 0.        , 0.35355339,
        0.26726124],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.18898224],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.21821789],
       [0.25      , 0.35355339, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.37796447, 0.26726124, 0.18898224, ..., 0.21821789, 0.        ,
        1.        ]])

In [19]:
from sklearn.metrics.pairwise import cosine_similarity,euclidean_distances

cosine_similarity -- how similar two vectors are based on angle, not magnitude.

euclidean_distance -- straight-line distance between two points.

Cosine_Similarity(Item based)

In [20]:
similarity_matrix = cosine_similarity(X_train)
similarity_matrix

array([[1.        , 0.35355339, 0.        , ..., 0.        , 0.25      ,
        0.37796447],
       [0.35355339, 1.        , 0.        , ..., 0.        , 0.35355339,
        0.26726124],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.18898224],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.21821789],
       [0.25      , 0.35355339, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.37796447, 0.26726124, 0.18898224, ..., 0.21821789, 0.        ,
        1.        ]])

In [21]:
def recommend_by_name(anime_name, top_n=10):

    idx = df[df["name"] == anime_name].index # Get index of the anime based on its name

    if len(idx) == 0:
        return "Anime not found in dataset"

    idx = idx[0]

    sim_scores = list(enumerate(similarity_matrix[idx])) # Compute similarity scores of selected anime with all others
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+1] # desc order sorting

    anime_indices = [i[0] for i in sim_scores]

    return df.iloc[anime_indices][["name", "genre", "rating"]]

In [22]:
recommend_by_name("Naruto", top_n=10)
# more recommended are...type name and get
# future obj: use embedded nlp

Unnamed: 0,name,genre,rating
841,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",7.81
2033,Detective Conan OVA 01: Conan vs. Kid vs. Yaiba,"Adventure, Comedy, Mystery, Shounen",7.37
2156,Mini Van,"Comedy, Game, Parody, Slice of Life",7.34
2801,Nanatsu-iro★Drops,"Magic, Romance, School",7.16
3467,Choujikuu Seiki Orguss 02,"Action, Adventure, Mecha, Romance, Sci-Fi, Sho...",6.98
3529,Naruto: Dai Katsugeki!! Yuki Hime Shinobu Houj...,"Action, Comedy, Fantasy, Martial Arts, Shounen...",6.96
7579,Kakegae no,Drama,5.27
8098,Aikatsu Stars! Movie,"Music, School, Shoujo, Slice of Life",7.0
8163,Ari to Kirigirisu,"Kids, Music",3.0
9447,Mak Dau Xiang Dang Dang,Comedy,6.81


In [23]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score

y_true = [] # truth labels
y_pred = [] # predicted labels

threshold = 0.6

for i in range(len(X_test)):
    # Compute cosine similarity between test sample and all training samples
    sims = cosine_similarity(X_test[i].reshape(1, -1), X_train)[0]
    pred = (sims > threshold).astype(int)

    true = (sims > 0.8).astype(int)

    y_pred.extend(pred)
    y_true.extend(true)

In [25]:
precision = precision_score(y_true, y_pred)
recall    = recall_score(y_true, y_pred)
f1        = f1_score(y_true, y_pred)

precision, recall, f1

(0.3036725888770016, 1.0, 0.4658724766754337)

Interview Questions:

`Problem 1` Can you explain the difference between user-based and item-based collaborative filtering?

User-Based Collaborative Filtering:

*   Finds similar users based on preferences
*   Recommends items liked by similar users
*   Performance degrades with large user bases
*   List item

Item-Based Collaborative Filtering:


*   Finds similar items based on user interactions
*   Recommends items similar to those already liked by the user
*   More scalable and stable
*   Widely used in production systems (Amazon, Netflix)










`Problem 2` What is collaborative filtering, and how does it work?


Collaborative Filtering is a recommendation technique that predicts user preferences by leveraging past interactions and similarities among users or items, and it works by analyzing interaction patterns to recommend items liked by similar users or related to previously liked items.

Working Steps:

1. Collect user–item interaction data (ratings, clicks, purchases)

2. Build a user–item matrix

3. Compute similarity between users or items

4. Identify nearest neighbors based on similarity

5. Generate recommendations from neighbors’ preferences