# Anime Recommendation Engine

In [None]:
# 1.
# Import core data science libraries (required)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
warnings.filterwarnings('ignore')
# Import Machine Learning libraries (required)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For Jupyter widgets (optional - for interactive features)
# The import is attempted once; the outcome is recorded in WIDGETS_AVAILABLE,
# which create_interactive_interface() and main() check before touching
# `widgets`, `display` or `clear_output`.
try:
    import ipywidgets as widgets
    from IPython.display import display, clear_output
    WIDGETS_AVAILABLE = True
except ImportError:
    # Either ipywidgets or IPython is missing: keep going without the
    # widget UI and tell the user why it will not appear.
    WIDGETS_AVAILABLE = False
    print("ipywidgets not available. Interactive features will be limited.")

In [None]:
# 2. 
# Create the base AnimeRecommendationEngine Class
class AnimeRecommendationEngine:
    """Content-based anime recommender built on TF-IDF and cosine similarity.

    The engine reads a CSV that must provide the columns ``name``,
    ``anime_id``, ``genre``, ``type``, ``episodes``, ``rating`` and
    ``members``. Each row is turned into one text "document"
    (``combined_features``), vectorised with TF-IDF, and compared against
    every other row.

    Attributes:
        df: The cleaned dataset, or ``None`` when the CSV could not be read.
        tfidf_matrix: TF-IDF matrix of ``combined_features`` (one row per anime).
        cosine_sim: Pairwise similarity matrix; row/column ``i`` corresponds
            to the anime at *position* ``i`` of ``df``.
    """

    def __init__(self, csv_path='data/anime.csv'):
        """Initialize the recommendation engine with anime dataset.

        Args:
            csv_path: Path of the CSV file to load. Loading problems are
                reported on stdout rather than raised; check ``self.df``
                afterwards.
        """
        self.df = None
        self.tfidf_matrix = None
        self.cosine_sim = None
        self.load_data(csv_path)

    def load_data(self, csv_path):
        """Load and preprocess the anime dataset, then build the similarity matrix.

        Errors are printed, not raised. When the file is missing ``self.df``
        stays ``None``; when a later step fails ``self.df`` may hold a
        partially processed frame while ``self.cosine_sim`` is still ``None``.

        Args:
            csv_path: Path of the CSV file to read.
        """
        try:
            print("Loading anime dataset...")
            self.df = pd.read_csv(csv_path,
                                  encoding='UTF-8',
                                  on_bad_lines='skip')
            print(f"Dataset loaded successfully! {len(self.df)} anime entries found!")

            # Basic data cleaning - remove any rows where 'name' is missing.
            # The index is reset so that index labels equal row positions,
            # which is what the similarity matrix is addressed by.
            self.df = self.df.dropna(subset=['name']).reset_index(drop=True)

            # Fill missing values so string concatenation below never sees NaN
            self.df['anime_id'] = self.df['anime_id'].fillna('')
            self.df['genre'] = self.df['genre'].fillna('')
            self.df['type'] = self.df['type'].fillna('')
            self.df['episodes'] = self.df['episodes'].fillna(0).astype(str)

            # 'rating' and 'members' stay numeric in the frame (sorting and
            # statistics need numbers); only a string copy goes into the
            # combined features document.
            self.df['rating'] = pd.to_numeric(self.df['rating'], errors='coerce').fillna(0.0)
            rating_str = self.df['rating'].astype(str)

            self.df['members'] = pd.to_numeric(self.df['members'], errors='coerce').fillna(0)
            members_str = self.df['members'].astype(str)

            # Create combined features for similarity calculation
            # This string will get treated as a document for NLP
            self.df['combined_features'] = (
                self.df['name'] + ' ' +
                self.df['genre'] + ' ' +
                self.df['type'] + ' ' +
                self.df['episodes'] + ' ' +
                rating_str + ' ' +
                members_str
            )

            # Build TF-IDF matrix
            self.build_similarity_matrix()

            print("Data preprocessing completed!")
            self.show_dataset_info()

        except FileNotFoundError:
            print(f"Error: Could not find {csv_path}")
            print("Please make sure the anime.csv file is in the same directory.")
        except Exception as e:
            # Best-effort loader: report the problem and leave the engine in
            # whatever state was reached instead of crashing the notebook.
            print(f"Error loading data: {e}")

    def build_similarity_matrix(self):
        """Build TF-IDF matrix and calculate cosine similarity.

        Populates ``self.tfidf_matrix`` and ``self.cosine_sim`` from
        ``self.df['combined_features']``.
        """
        print("Building similarity matrix...")

        # English stop words drop common filler words; unigrams and bigrams
        # give a richer text representation than single words alone.
        tfidf = TfidfVectorizer(
            stop_words='english',
            max_features=5000,
            ngram_range=(1, 2)
        )

        # Fit and transform the combined features
        self.tfidf_matrix = tfidf.fit_transform(self.df['combined_features'])

        # Calculate cosine similarity matrix - 1 is identical, 0 is completely dissimilar
        self.cosine_sim = cosine_similarity(self.tfidf_matrix, self.tfidf_matrix)

        print("Similarity matrix built successfully!")

    def _genre_counts(self):
        """Count anime per genre from the comma-separated 'genre' column, ignoring blanks."""
        counts = Counter()
        for genres in self.df['genre'].dropna():
            counts.update(
                genre for genre in (part.strip() for part in genres.split(',')) if genre
            )
        return counts

    def show_dataset_info(self):
        """Display basic information about the dataset on stdout."""
        print("\n" + "="*50)
        print("DATASET OVERVIEW")
        print("="*50)
        print(f"Total anime entries: {len(self.df)}")
        print(f"Columns: {list(self.df.columns)}")

        # Show distribution of ratings
        if 'rating' in self.df.columns:
            print(f"\nAverage rating: {self.df['rating'].mean():.2f}")
            print(f"Rating range: {self.df['rating'].min():.1f} - {self.df['rating'].max():.1f}")

        # Show top genres (blank genres are excluded before ranking, so the
        # list really contains up to 10 named genres)
        print("\nTop 10 genres:")
        for genre, count in self._genre_counts().most_common(10):
            print(f"  {genre}: {count}")

    def get_recommendations(self, anime_name, num_recommendations=10):
        """Get anime recommendations based on similarity.

        Args:
            anime_name: Text to look for in the 'name' column. It is matched
                literally (not as a regular expression) and case-insensitively;
                the first matching row is used.
            num_recommendations: Maximum number of recommendations to return.

        Returns:
            ``(anime_info, recommendations)`` on success, where ``anime_info``
            is the matched row and ``recommendations`` is a DataFrame with the
            columns name, genre, rating, episodes, type, members and
            Similarity, ordered from most to least similar.
            ``(None, message)`` when nothing matches or an error occurs.
        """
        try:
            if self.df is None or self.cosine_sim is None:
                return None, ("Error getting recommendations: "
                              "dataset or similarity matrix is not available.")

            # An empty query would match every title; treat it as "not found".
            if not anime_name.strip():
                return None, f"Anime '{anime_name}' not found in dataset."

            # Find the anime in the dataset. regex=False keeps titles such as
            # "Name [TV]" or "Name (2015)" from being parsed as patterns.
            mask = self.df['name'].str.contains(
                anime_name, case=False, na=False, regex=False
            )
            # Work with row positions, not index labels: cosine_sim and .iloc
            # are both positional, and labels need not be 0..n-1.
            positions = np.flatnonzero(mask.to_numpy(dtype=bool))

            if len(positions) == 0:
                return None, f"Anime '{anime_name}' not found in dataset."

            # Use the first match if multiple found
            anime_pos = int(positions[0])
            anime_info = self.df.iloc[anime_pos]

            # Rank every anime by similarity, highest first. The stable sort
            # keeps dataset order among equal scores.
            scores = np.asarray(self.cosine_sim[anime_pos])
            ranked = np.argsort(-scores, kind='stable')

            # Drop the queried anime explicitly instead of assuming it sorts
            # first (it does not when another entry ties with it).
            ranked = ranked[ranked != anime_pos][:max(num_recommendations, 0)]

            # Create recommendations dataframe
            recommendations = self.df.iloc[ranked][
                ['name', 'genre', 'rating', 'episodes', 'type', 'members']
            ].copy()

            # Add similarity scores
            recommendations['Similarity'] = scores[ranked]

            return anime_info, recommendations

        except Exception as e:
            # Callers expect a (None, message) tuple rather than an exception.
            return None, f"Error getting recommendations: {e}"

    def search_anime(self, query, max_results=10):
        """Search for anime by name.

        Args:
            query: Text matched literally and case-insensitively against 'name'.
            max_results: Maximum number of rows to return.

        Returns:
            DataFrame with the columns name, genre, rating and episodes.
        """
        matches = self.df[
            self.df['name'].str.contains(query, case=False, na=False, regex=False)
        ]
        return matches[['name', 'genre', 'rating', 'episodes']].head(max_results)

    def explore_genre(self, genre, top_n=15):
        """Explore anime by genre.

        Args:
            genre: Text matched literally and case-insensitively against 'genre'.
            top_n: Number of highest-rated matches to return.

        Returns:
            DataFrame (name, rating, episodes, genre, type) of the ``top_n``
            best-rated matches, or ``None`` (after printing a notice) when no
            anime matches.
        """
        genre_anime = self.df[
            self.df['genre'].str.contains(genre, case=False, na=False, regex=False)
        ]

        if len(genre_anime) == 0:
            print(f"No anime found for genre '{genre}'")
            return None

        # Sort by score
        top_anime = genre_anime.nlargest(top_n, 'rating')[
            ['name', 'rating', 'episodes', 'genre', 'type']
        ]

        return top_anime

    def plot_genre_distribution(self, top_n=15):
        """Plot a horizontal bar chart of the ``top_n`` most common genres."""
        # Blank genres are already excluded by the helper.
        top_genres = dict(self._genre_counts().most_common(top_n))

        plt.figure(figsize=(12, 8))
        plt.barh(list(top_genres.keys()), list(top_genres.values()))
        plt.title(f'Top {top_n} Anime Genres Distribution')
        plt.xlabel('Number of Anime')
        plt.tight_layout()
        plt.show()

    def plot_score_distribution(self):
        """Plot a histogram of anime ratings with the mean marked."""
        plt.figure(figsize=(10, 6))
        plt.hist(self.df['rating'].dropna(), bins=50, edgecolor='black', alpha=0.7)
        plt.title('Distribution of Anime Ratings')
        plt.xlabel('rating')
        plt.ylabel('Frequency')
        plt.axvline(self.df['rating'].mean(), color='red', linestyle='--',
                   label=f'Mean: {self.df["rating"].mean():.2f}')
        plt.legend()
        plt.tight_layout()
        plt.show()

In [None]:
# 3. 
def create_interactive_interface(engine):
    """Build and display the widget front end for a recommendation engine.

    Three controls are wired to ``engine``: a name search
    (``engine.search_anime``), a recommendation lookup
    (``engine.get_recommendations``) and a genre browser
    (``engine.explore_genre``). Results are printed into one shared output
    area that is cleared on every button click. When ``WIDGETS_AVAILABLE``
    is false a notice is printed and nothing else happens.
    """
    if not WIDGETS_AVAILABLE:
        print("Interactive widgets not available. Use the engine methods directly.")
        return

    def labelled_text(description, placeholder):
        # Every text field shares the same look; only the texts differ.
        return widgets.Text(
            placeholder=placeholder,
            description=description,
            style={'description_width': 'initial'},
        )

    # Text inputs
    search_box = labelled_text('Search:', 'Enter anime name...')
    rec_box = labelled_text('Get Recs:', 'Enter anime name for recommendations...')
    genre_box = labelled_text('Genre:', 'Enter genre (e.g., Action, Romance)...')

    # Action buttons
    search_button = widgets.Button(description='Search Anime', button_style='info')
    rec_button = widgets.Button(description='Get Recommendations', button_style='success')
    genre_button = widgets.Button(description='Explore Genre', button_style='warning')

    # How many recommendations to request
    num_recs = widgets.IntSlider(
        value=10,
        min=5,
        max=20,
        step=1,
        description='Number:',
        style={'description_width': 'initial'},
    )

    # Shared area that receives everything the handlers print
    output = widgets.Output()

    def on_search_click(b):
        with output:
            clear_output()
            query = search_box.value
            if not query:
                print("Please enter an anime name to search.")
                return
            found = engine.search_anime(query)
            print(f"Search results for '{query}':")
            print(found.to_string(index=False))

    def on_rec_click(b):
        with output:
            clear_output()
            title = rec_box.value
            if not title:
                print("Please enter an anime name for recommendations.")
                return
            anime_info, recommendations = engine.get_recommendations(
                title, num_recs.value
            )
            if anime_info is None:
                # On failure the second element is the error message.
                print(recommendations)
                return
            print(f"Based on: {anime_info['name']}")
            print(f"genre: {anime_info['genre']}")
            print(f"rating: {anime_info['rating']}")
            print("\nRecommendations:")
            print(recommendations.to_string(index=False))

    def on_genre_click(b):
        with output:
            clear_output()
            wanted = genre_box.value
            if not wanted:
                print("Please enter a genre to explore.")
                return
            ranking = engine.explore_genre(wanted)
            if ranking is None:
                return
            print(f"Top anime in '{wanted}' genre:")
            print(ranking.to_string(index=False))

    # Connect each button to its handler
    for button, handler in (
        (search_button, on_search_click),
        (rec_button, on_rec_click),
        (genre_button, on_genre_click),
    ):
        button.on_click(handler)

    # Display interface
    print("Interactive Anime Recommendation Engine")
    print("=" * 50)

    layout = [
        widgets.HTML("<h3>Search Anime</h3>"),
        widgets.HBox([search_box, search_button]),
        widgets.HTML("<h3>Get Recommendations</h3>"),
        widgets.HBox([rec_box, num_recs]),
        rec_button,
        widgets.HTML("<h3>Explore by Genre</h3>"),
        widgets.HBox([genre_box, genre_button]),
        widgets.HTML("<h3>Results</h3>"),
        output,
    ]
    display(widgets.VBox(layout))

In [None]:
# 4.
# Main execution
def main():
    """Run the demo: load the dataset, show the UI or examples, then plot.

    Creates an ``AnimeRecommendationEngine`` for ``'../data/anime.csv'``.
    With widgets available the interactive interface is displayed; otherwise
    a few example calls are printed. Both paths finish with the two dataset
    plots.

    Returns:
        The engine, or ``None`` when the dataset could not be loaded.
    """
    print("Anime Recommendation Engine")
    print("="*50)

    # Initialize the recommendation engine
    engine = AnimeRecommendationEngine('../data/anime.csv')

    if engine.df is None:
        print("Failed to load dataset. Please check the file path.")
        return None

    # Create interactive interface if widgets available
    if WIDGETS_AVAILABLE:
        create_interactive_interface(engine)
    else:
        # Provide example usage
        print("\n" + "="*50)
        print("EXAMPLE USAGE")
        print("="*50)

        # Example search
        print("\n1. Search for anime:")
        search_results = engine.search_anime("Attack on Titan")
        print(search_results)

        # Example recommendations
        print("\n2. Get recommendations:")
        anime_info, recs = engine.get_recommendations("Death Note", 5)
        if anime_info is not None:
            print(f"\nBased on: {anime_info['name']}")
            print("Recommendations:")
            print(recs[['name', 'rating', 'Similarity']])
        else:
            # On failure the second element is the explanatory message;
            # show it instead of silently printing nothing.
            print(recs)

        # Example genre exploration
        print("\n3. Explore Action genre:")
        action_anime = engine.explore_genre("Action", 5)
        # explore_genre returns None (and prints its own notice) when the
        # genre has no entries; indexing None would raise TypeError.
        if action_anime is not None:
            print(action_anime[['name', 'rating']])

    # Show visualizations
    print("\n" + "="*50)
    print("DATASET VISUALIZATIONS")
    print("="*50)

    engine.plot_genre_distribution()
    engine.plot_score_distribution()

    return engine

# Run the demo when this code is executed as the top-level module.
# The result of main() (the engine, or None if loading failed) is kept in
# `anime_engine` so it remains available after the run.
if __name__ == "__main__":
    anime_engine = main()