## Data Preprocessing

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

In [5]:
df = pd.read_csv('anime.csv')

In [6]:
df.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [7]:
df.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

In [8]:
df['genre'] = df['genre'].fillna('Unknown')

In [9]:
df['rating'] = df['rating'].fillna(df['rating'].median())

In [10]:
df['episodes'] = pd.to_numeric(df['episodes'], errors='coerce')
df['episodes'] = df['episodes'].fillna(df['episodes'].median())

In [11]:
print("\nMissing Values after Cleaning:")
print(df.isnull().sum())


Missing Values after Cleaning:
anime_id     0
name         0
genre        0
type        25
episodes     0
rating       0
members      0
dtype: int64


## Feature Extraction

In [12]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(df['genre'])

In [13]:
scaler = MinMaxScaler()
numerical_features = df[['rating', 'members']]
scaled_numerical_features = scaler.fit_transform(numerical_features)

In [14]:
from scipy.sparse import hstack
final_features = hstack([tfidf_matrix, scaled_numerical_features])

## Recommendation System

In [30]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(final_features, final_features)

In [34]:
def get_recommendations(title, cosine_sim_matrix=cosine_sim, data=df, top_n=10):
    """
    Given an anime title, this function returns a list of the top N most similar anime
    based on the pre-computed cosine similarity matrix.

    Parameters:
    - title (str): The name of the target anime.
    - cosine_sim_matrix (np.array): The pre-computed cosine similarity matrix.
    - data (pd.DataFrame): The original DataFrame.
    - top_n (int): The number of recommendations to return.

    Returns:
    - pd.DataFrame: A DataFrame containing the recommended anime, their ratings,
                    and similarity scores.
    """
    try:
        idx = data[data['name'] == title].index[0]
    except IndexError:
        return "Anime not found in the dataset."

    sim_scores = list(enumerate(cosine_sim_matrix[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    sim_scores = sim_scores[1:top_n+1]

    anime_indices = [i[0] for i in sim_scores]

    similarity_scores = [i[1] for i in sim_scores]

    recommendations = data.iloc[anime_indices][['name', 'genre', 'rating']]
    recommendations['similarity_score'] = similarity_scores
    
    return recommendations

In [35]:
print("\nRecommendations for 'Kimi no Na wa.':")
print(get_recommendations('Kimi no Na wa.', top_n=5))

print("\nRecommendations for 'Fullmetal Alchemist: Brotherhood':")
print(get_recommendations('Fullmetal Alchemist: Brotherhood', top_n=10))

print("\nRecommendations for 'Naruto':")
print(get_recommendations('Naruto', top_n=5))


Recommendations for 'Kimi no Na wa.':
                                       name  \
159                            Angel Beats!   
223                                 Clannad   
1111  Aura: Maryuuin Kouga Saigo no Tatakai   
986                       Shakugan no Shana   
208           Kokoro ga Sakebitagatterunda.   

                                                  genre  rating  \
159         Action, Comedy, Drama, School, Supernatural    8.39   
223   Comedy, Drama, Romance, School, Slice of Life,...    8.30   
1111       Comedy, Drama, Romance, School, Supernatural    7.67   
986   Action, Drama, Fantasy, Romance, School, Super...    7.74   
208                              Drama, Romance, School    8.32   

      similarity_score  
159           1.643629  
223           1.628530  
1111          1.625539  
986           1.600383  
208           1.588398  

Recommendations for 'Fullmetal Alchemist: Brotherhood':
                                               name  \
200          

## Evaluation

In [36]:
def get_genres(anime_name, df):
    """Retrieves the list of genres for a given anime name."""
    try:
        genres_str = df[df['name'] == anime_name]['genre'].iloc[0]
        return set(genres_str.split(', '))
    except IndexError:
        return set()

def evaluate_recommendations(target_anime, recommended_list, df, k=10):
    """
    A simplified evaluation function based on genre overlap.
    
    Parameters:
    - target_anime (str): The name of the anime for which recommendations were made.
    - recommended_list (pd.DataFrame): The DataFrame of recommended anime.
    - df (pd.DataFrame): The original DataFrame.
    - k (int): The number of recommendations considered (Top K).

    Returns:
    - dict: A dictionary with precision, recall, and F1-score.
    """
    target_genres = get_genres(target_anime, df)
    
    if not target_genres:
        return {'precision': 0, 'recall': 0, 'f1_score': 0}

    hits = 0
    for _, row in recommended_list.head(k).iterrows():
        rec_genres = get_genres(row['name'], df)
        if not rec_genres.isdisjoint(target_genres):
            hits += 1

    precision = hits / k
    recall = hits / len(recommended_list.head(k))
    
    if (precision + recall) > 0:
        f1 = 2 * (precision * recall) / (precision + recall)
    else:
        f1 = 0
        
    return {'precision': precision, 'recall': recall, 'f1_score': f1}

In [37]:
target_anime_eval = 'Kimi no Na wa.'
recommendations = get_recommendations(target_anime_eval)
evaluation_results = evaluate_recommendations(target_anime_eval, recommendations, df)

print(f"\nEvaluation for '{target_anime_eval}':")
print(f"Precision@10: {evaluation_results['precision']:.2f}")
print(f"Recall@10: {evaluation_results['recall']:.2f}")
print(f"F1-Score: {evaluation_results['f1_score']:.2f}")


Evaluation for 'Kimi no Na wa.':
Precision@10: 1.00
Recall@10: 1.00
F1-Score: 1.00


## Interview questions

1.What is Collaborative Filtering?

Collaborative filtering is a technique used by recommendation systems to make automatic predictions about a user's interests by collecting preferences or taste information from many other users. The fundamental idea behind it is that if a person A has the same opinion as a person B on a particular item, then A is more likely to have B's opinion on a different item than a randomly chosen person.


The core principle is to find users or items that are similar to each other. It operates on the principle of "similar people like similar things" (user-based) or "people who like this item also like that item" (item-based).


**User-Based vs. Item-Based Collaborative Filtering
The primary difference between user-based and item-based collaborative filtering lies in how they find similarities to generate recommendations.

2.User-Based Collaborative Filtering

This approach, also known as user-to-user or neighborhood-based filtering, works by first identifying a group of users who are similar to the target user. Similarity is typically measured by looking at the items they have both liked, rated, or viewed. Once a group of "neighbors" is found, the system recommends items that these neighbors liked but the target user has not yet interacted with.

*How it Works: Find a user A who is similar to the target user B. Recommend items that A liked and B has not yet seen.

*Analogy: A music streaming service finds that you and a friend have rated 20 of the same songs identically. The service then recommends a new album that your friend rated highly, but you have not yet heard.


**Item-Based Collaborative Filtering

This approach, also known as item-to-item filtering, is more common and often more scalable. It works by identifying a group of items that are similar to the items the user has already liked or rated. The system then recommends those similar items. The similarity between two items is calculated by looking at the set of users who have rated both items.

*How it Works: Find an item Y that is similar to an item X that the user liked. Recommend item Y to the user.

*Analogy: An e-commerce site notices that customers who bought "Item A" also frequently bought "Item B" and "Item C". When a new customer buys "Item A", the site recommends "Item B" and "Item C", regardless of how similar the new customer is to others. This is a very efficient and common approach.