In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

## Data Processing

In [2]:
df= pd.read_csv('anime.csv')  # loading dataset
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [3]:
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,62
type,25
episodes,0
rating,230
members,0


In [4]:
df['genre'].fillna('Unknown',inplace=True)  # Handling missing values
df['type'].fillna('Unknown',inplace=True)
df['rating'].fillna(df['rating'].median(),inplace=True)
df.isnull().sum()

Unnamed: 0,0
anime_id,0
name,0
genre,0
type,0
episodes,0
rating,0
members,0


In [5]:
df.duplicated().sum()  # no duplicate values

0

## Feature Extraction

In [6]:
def convert_episodes(val):           # function to convert episode obj dtype to numeric
    return pd.to_numeric(val,errors='coerce')

In [7]:
df['episodes']= df['episodes'].apply(convert_episodes)
df['episodes'].fillna(df['episodes'].median(),inplace=True)

In [8]:
genre_list= df['genre'].str.get_dummies(sep=', ')   # converting genres into one-hot encoding
df= pd.concat([df,genre_list],axis=1)
df.drop(columns=['genre'],inplace=True)

In [9]:
scaler= MinMaxScaler()  # Normalizing numerical features
df[['rating','episodes','members']]= scaler.fit_transform(df[['rating','episodes','members']])

## Recommendation System

In [10]:
feature_cols= ['rating','episodes','members']+list(genre_list.columns)  # compute similarity matrix
similarity_matrix= cosine_similarity(df[feature_cols])

In [11]:
def recommend_anime(anime_name,top_n=5):        # cosine similarity function to find relevant content
    if anime_name not in df['name'].values:
        return 'Anime not found in the dataset.'

    index= df[df['name']==anime_name].index[0]
    similarity_scores= list(enumerate(similarity_matrix[index]))
    similarity_scores= sorted(similarity_scores,key=lambda x:x[1],reverse=True)[1:top_n+1]

    recommendations= [df.iloc[i[0]]['name'] for i in similarity_scores]
    return recommendations


In [12]:
recommend_anime('Gintama°')

['Gintama&#039;',
 'Gintama&#039;: Enchousen',
 'Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare',
 'Gintama Movie: Shinyaku Benizakura-hen',
 'Gintama: Yorinuki Gintama-san on Theater 2D']

## Evaluation

In [13]:
train,test= train_test_split(df,test_size=0.2,random_state=42) # splitting dataset for evaluation

In [14]:
print(train.shape)
print(test.shape)

(9835, 50)
(2459, 50)


In [15]:
def evaluate_recommendations(test_sample_size=100):
    true_positives, false_positives, false_negatives = 0, 0, 0
    test_sample = test.sample(n=min(test_sample_size, len(test)))

    for _, row in test_sample.iterrows():
        recommendations = recommend_anime(row['name'], top_n=5)

        if isinstance(recommendations, str):  # Anime not found case
            continue

        if row['name'] in recommendations:
            true_positives += 1
        else:
            false_negatives += 1
            false_positives += len(recommendations)

    precision = precision_score([1]*true_positives + [0]*false_positives,
                                [1]*true_positives + [1]*false_positives, zero_division=1)
    recall = recall_score([1]*true_positives + [0]*false_negatives,
                          [1]*true_positives + [0]*false_negatives, zero_division=1)
    f1 = f1_score([1]*true_positives + [0]*(false_negatives + false_positives),
                  [1]*true_positives + [0]*(false_negatives + false_positives), zero_division=1)

    return {"Precision": precision, "Recall": recall, "F1-score": f1}

In [16]:
performance_metrics = evaluate_recommendations()
print("Evaluation Metrics:", performance_metrics)

Evaluation Metrics: {'Precision': 0.0, 'Recall': 1.0, 'F1-score': 1.0}


In [17]:
df.head()

Unnamed: 0,anime_id,name,type,episodes,rating,members,Action,Adventure,Cars,Comedy,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,32281,Kimi no Na wa.,Movie,0.0,0.92437,0.197872,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,TV,0.034673,0.911164,0.78277,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,TV,0.027518,0.909964,0.112689,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,TV,0.012658,0.90036,0.664325,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,9969,Gintama&#039;,TV,0.027518,0.89916,0.149186,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0


# New Section

## Interview Questions:

1. Can you explain the difference between user-based and item-based collaborative filtering?


*   User-Based Collaborative Filtering:
    Recommends items based on similarities between users.
    Less effective with large user bases due to sparsity and scalability issues.
*   Item-Based Collaborative Filtering:
    Recommends items based on item similarities.
    More stable and scalable than user-based filtering.  
  User-based focuses on similarities between users, while item-based focuses on relationships between items.



2.  What is collaborative filtering, and how does it work?  

   Collaborative Filtering is a recommendation technique that suggests items based on user behavior and preferences. It works by analyzing past interactions to identify patterns and similarities.   
   User-Based finds users with similar preferences and recommends items liked by similar users.  
   Item-Based finds items that are frequently liked together and recommends based on item similarity.  
   Instead of relying on item attributes,collaborative filtering leverages collective user interactions to make personalized recommendations.