### Step 1: Data Preprocessing

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the anime dataset from 'anime.csv' (path is relative to the working directory).
# Columns visible in the preview below: anime_id, name, genre, type, episodes,
# rating, members.
df = pd.read_csv('anime.csv')

In [3]:
# Display the first five rows (the default for head()) as a quick sanity check
# that the file was parsed into the expected columns.
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
# Check for missing values: count the NaN entries in each column so we can
# decide how to handle them.
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [5]:
# Drop every row that contains at least one missing value.
# The surviving rows keep their original index labels, so after this step the
# index is no longer a contiguous 0..n-1 range; use positional access (.iloc)
# whenever a row position is needed.
df = df.dropna()

In [6]:

# Verify that there are no more missing values (every per-column count
# should now be 0).
df.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [7]:
# Display dataset information: row count, per-column non-null counts and dtypes.
# NOTE(review): the output below reports `episodes` as object dtype, i.e. it was
# not parsed as a number — TODO confirm whether it contains non-numeric
# placeholders before using it as a feature.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12017 entries, 0 to 12293
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   anime_id  12017 non-null  int64  
 1   name      12017 non-null  object 
 2   genre     12017 non-null  object 
 3   type      12017 non-null  object 
 4   episodes  12017 non-null  object 
 5   rating    12017 non-null  float64
 6   members   12017 non-null  int64  
dtypes: float64(1), int64(2), object(4)
memory usage: 751.1+ KB


In [8]:
# Display summary statistics. Only the numeric columns (anime_id, rating,
# members) appear in the output; object-dtype columns are left out by default.
df.describe()

Unnamed: 0,anime_id,rating,members
count,12017.0,12017.0,12017.0
mean,13638.001165,6.478264,18348.88
std,11231.076675,1.023857,55372.5
min,1.0,1.67,12.0
25%,3391.0,5.89,225.0
50%,9959.0,6.57,1552.0
75%,23729.0,7.18,9588.0
max,34519.0,10.0,1013917.0


### Step 2: Feature Extraction

In [9]:
# Convert genres to a set of binary (one-hot) features.
# `get_dummies(sep=', ')` splits each comma-separated genre string and emits one
# 0/1 column per distinct genre. The `genre` column itself is deliberately left
# as the original string: overwriting it with lists would make this cell unsafe
# to re-run, because a second run would call `.str.split` on lists, not strings.
df_genre = df['genre'].str.get_dummies(sep=', ')

In [10]:
# Normalize the ratings by dividing by the highest rating in the dataset, so
# the largest value becomes 1.0.
# This overwrites the `rating` column: any later display of `rating` shows the
# scaled value, not the original score.
df['rating'] = df['rating'].div(df['rating'].max())

In [11]:
# Build the feature matrix: the one-hot genre columns followed by the
# normalized rating column, aligned on the shared row index.
df_features = pd.concat(objs=[df_genre, df['rating']], axis='columns')

### Step 3: Recommendation System

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Compute the item-item cosine similarity matrix.
# Called with a single argument, cosine_similarity compares every row of
# df_features with every row of df_features, so the result is a square
# (n_rows x n_rows) array whose row/column order follows df_features.
cosine_sim = cosine_similarity(df_features)

In [17]:
# Function to recommend similar anime
def recommend_anime(title, cosine_sim=cosine_sim, df=df, top_n=10):
    """Return the ``top_n`` anime most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact value of the ``name`` column to look up. If several rows share
        the name, the first one (in row order) is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` must correspond to the ``i``-th
        row (by position) of ``df``.
    df : pandas.DataFrame
        Frame containing at least the ``name`` and ``rating`` columns.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        The ``name`` and ``rating`` columns of the most similar rows, ordered
        from most to least similar. The queried anime itself is never included.

    Raises
    ------
    IndexError
        If no row has ``name == title`` (exception type kept from the previous
        implementation, now with an explanatory message).
    """
    # Use the row *position*, not the index label: `cosine_sim` is a plain
    # positional matrix, and the frame's labels have gaps once rows were dropped.
    matches = (df['name'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"anime title not found: {title!r}")
    pos = int(matches[0])

    # Pair every other row position with its similarity to the query row.
    # The query is excluded explicitly rather than assumed to sort first,
    # because another title with identical features ties with it.
    sim_scores = [
        (i, score) for i, score in enumerate(cosine_sim[pos]) if i != pos
    ]

    # Stable sort, highest similarity first (ties keep row order).
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the positions of the top_n most similar anime.
    anime_positions = [i for i, _ in sim_scores[:top_n]]

    # Positional lookup matches the positional similarity matrix.
    return df.iloc[anime_positions][['name', 'rating']]

# Example usage: the 10 titles (the default top_n) most similar to 'Naruto'
recommend_anime('Naruto')

Unnamed: 0,name,rating
615,Naruto: Shippuuden,0.794
1103,Boruto: Naruto the Movie - Naruto ga Hokage ni...,0.768
486,Boruto: Naruto the Movie,0.803
1343,Naruto x UT,0.758
1472,Naruto: Shippuuden Movie 4 - The Lost Tower,0.753
1573,Naruto: Shippuuden Movie 3 - Hi no Ishi wo Tsu...,0.75
2458,Naruto Shippuuden: Sunny Side Battle,0.726
2997,Naruto Soyokazeden Movie: Naruto to Mashin to ...,0.711
175,Katekyo Hitman Reborn!,0.837
7628,Kyutai Panic Adventure!,0.521


### Step 4: Evaluation

In [18]:
# Split the dataset into training (80%) and testing (20%) sets; random_state
# fixes the shuffle so the split is reproducible.
# NOTE(review): `train` and `test` are not referenced by any later cell shown
# in this notebook — TODO wire them into the evaluation below.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [19]:
# Evaluate using precision, recall, and F1-score
from sklearn.metrics import precision_score, recall_score, f1_score

In [20]:
# Implement evaluation metrics based on top-n recommendations for test set
# ...

# Example: Define precision, recall, and F1-score calculation functions
def precision_at_k(actual, predicted, k):
    """Precision@k: share of the top-``k`` predictions that are relevant.

    Parameters
    ----------
    actual : iterable of hashable
        All relevant (ground-truth) items. The whole collection is used; it is
        not truncated to ``k``, because an item is relevant regardless of where
        it sits in this list.
    predicted : sequence of hashable
        Recommended items, best first. Only the first ``k`` are considered.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        ``|relevant ∩ top-k predicted| / k``, or ``0.0`` when ``k <= 0``
        (instead of dividing by zero).
    """
    if k <= 0:
        return 0.0
    relevant = set(actual)
    top_k = set(predicted[:k])
    return len(relevant & top_k) / float(k)

In [21]:
def recall_at_k(actual, predicted, k):
    """Recall@k: share of the relevant items found in the top-``k`` predictions.

    Parameters
    ----------
    actual : iterable of hashable
        All relevant (ground-truth) items. The whole collection is used as the
        denominator; truncating it to ``k`` would make recall collapse to the
        same value as precision.
    predicted : sequence of hashable
        Recommended items, best first. Only the first ``k`` are considered.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        ``|relevant ∩ top-k predicted| / |relevant|``, or ``0.0`` when there
        are no relevant items or ``k <= 0`` (instead of dividing by zero).
    """
    relevant = set(actual)
    if not relevant or k <= 0:
        return 0.0
    hits = relevant & set(predicted[:k])
    return len(hits) / float(len(relevant))

In [22]:
def f1_at_k(actual, predicted, k):
    """F1@k: harmonic mean of ``precision_at_k`` and ``recall_at_k``.

    Returns ``0.0`` when precision and recall sum to zero, so the harmonic
    mean is never computed with a zero denominator.
    """
    p = precision_at_k(actual, predicted, k)
    r = recall_at_k(actual, predicted, k)
    total = p + r
    return 2 * (p * r) / total if total != 0 else 0.0

### Interview Questions

1. Can you explain the difference between user-based and item-based collaborative filtering?

User-based collaborative filtering recommends items by finding users similar to the target user and suggesting items those similar users liked. Item-based collaborative filtering, on the other hand, recommends items by finding items similar to those the target user has liked and suggesting those similar items.

2. What is collaborative filtering, and how does it work?

Collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating). It works by finding patterns in user–item interactions (for example ratings, views, or purchases) and using those patterns to make recommendations; unlike content-based filtering, it does not rely on item attributes such as genre. It can be user-based, finding similar users, or item-based, finding items that the same users tend to rate similarly.