In [91]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

class MovieRecommendationSystem:
    """
    Class to handle the movie recommendation system based on movie title, genre, and description.

    Attributes:
    - movies_df: pandas.DataFrame
        DataFrame containing movie data with columns: 'title', 'genre', 'description'.
    - tfidf_matrix: numpy.ndarray
        Matrix representation of the movie descriptions using TF-IDF vectorization.
    - cosine_sim_matrix: numpy.ndarray
        Matrix representing the 
        similarity between movie descriptions.
    """

    def __init__(self, movies_df):
        """
        Constructor to instantiate the MovieRecommendationSystem class.

        Parameters:
        - movies_df: pandas.DataFrame
            DataFrame containing movie data with columns: 'title', 'genre', 'description'.
        """

        self.movies_df = movies_df

        # Preprocess the movie descriptions using TF-IDF vectorization
        tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = tfidf.fit_transform(movies_df['description'])

        # Calculate the cosine similarity matrix
        self.cosine_sim_matrix = linear_kernel(self.tfidf_matrix, self.tfidf_matrix)

    def recommend_movies(self, title, genre, description, top_n=5):
        """
        Recommends top N movies based on the given title, genre, and description.

        Parameters:
        - title: str
            Title of the movie.
        - genre: str
            Genre of the movie.
        - description: str
            Description of the movie.
        - top_n: int, optional (default=5)
            Number of movies to recommend.

        Returns:
        - list:
            List of top N recommended movies.
        """

        # Create a new DataFrame with the given movie details
        new_movie = pd.DataFrame({'title': [title], 'genre': [genre], 'description': [description]})

        # Append the new movie to the existing DataFrame
        updated_movies_df = pd.concat([self.movies_df, new_movie], ignore_index=True)

        # Preprocess the updated movie descriptions using TF-IDF vectorization
        tfidf = TfidfVectorizer(stop_words='english')
        tfidf_matrix = tfidf.fit_transform(updated_movies_df['description'])

        # Calculate the cosine similarity matrix for the updated movie descriptions
        cosine_sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

        # Get the index of the new movie in the DataFrame
        new_movie_index = updated_movies_df.index[-1]

        # Calculate the similarity scores between the new movie and all other movies
        similarity_scores = list(enumerate(cosine_sim_matrix[new_movie_index]))

        # Sort the movies based on similarity scores
        sorted_movies = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

        # Get the top N recommended movies (excluding the new movie itself)
        top_movies = [updated_movies_df['title'][i] for i, _ in sorted_movies[1:top_n+1]]

        return top_movies


In [92]:
import re

def clean(text):
    # Menggunakan ekspresi reguler untuk menyaring hanya huruf
    text = re.sub("[^a-zA-Z\s]", "", text)
    return text

In [96]:
import pandas as pd
import numpy as np

# Read the anime dataset
anime = pd.read_csv("anime-dataset-2023.csv")
anime = anime[["Name", "Genres", "Synopsis"]].replace('UNKNOWN', np.nan)
anime = anime.rename(columns={"Name": "title", "Genres": "genre", "Synopsis": "description"})

anime = anime.replace('UNKNOWN', np.nan).dropna()
anime.reset_index(drop=True, inplace=True)

anime['description'] = anime['description'].astype(str)
anime['genre'] = anime['genre'].astype(str)

# Display the dataframe
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19976 entries, 0 to 19975
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        19976 non-null  object
 1   genre        19976 non-null  object
 2   description  19976 non-null  object
dtypes: object(3)
memory usage: 468.3+ KB


In [98]:
anime = anime.astype(str)

In [94]:

import random

random_indices = random.sample(range(len(anime)), 1)

random = pd.concat([anime.iloc[index] for index in random_indices], axis=1).T

random = random.reset_index(drop=True)
random

Unnamed: 0,title,genre,description
0,Birthday Boy,"Action, Award Winning, Drama","Korean War, 1951. Little Manuk is playing on the streets of his village and dreaming of life at the front where his father is a soldier. He returns home to find a parcel on the doorstep and, thinking it is a birthday present, he opens it. But its contents will change his life.\n\n(Source: IMDB)"


In [95]:
recommendation_system = MovieRecommendationSystem(anime)

recommended_movies = recommendation_system.recommend_movies(random["title"], random["genre"], random['description'], 3)
print(recommendation_system)

AttributeError: 'Series' object has no attribute 'lower'