In [120]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [121]:
# Read the raw movie table from disk, then narrow it to the four columns this
# notebook works with: identifier, title, synopsis and the comma-separated
# genre string.
movies = pd.read_csv('dataset.csv')
movies = movies.loc[:, ['id', 'title', 'overview', 'genre']]
movies

Unnamed: 0,id,title,overview,genre
0,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"Drama,Crime"
1,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...","Comedy,Drama,Romance"
2,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","Drama,Crime"
3,424,Schindler's List,The true story of how businessman Oskar Schind...,"Drama,History,War"
4,240,The Godfather: Part II,In the continuing saga of the Corleone crime f...,"Drama,Crime"
...,...,...,...,...
9995,10196,The Last Airbender,"The story follows the adventures of Aang, a yo...","Action,Adventure,Fantasy"
9996,331446,Sharknado 3: Oh Hell No!,The sharks take bite out of the East Coast whe...,"Action,TV Movie,Science Fiction,Comedy,Adventure"
9997,13995,Captain America,"During World War II, a brave, patriotic Americ...","Action,Science Fiction,War"
9998,2312,In the Name of the King: A Dungeon Siege Tale,A man named Farmer sets out to rescue his kidn...,"Adventure,Fantasy,Action,Drama"


In [122]:
# The synopsis text is not used by the genre-based similarity below, so build
# a slimmer frame without it. `movies` itself is left untouched.
new_data = movies.drop('overview', axis='columns')

In [123]:
# Turn every genre string into a bag-of-words count vector (one row per movie).
# The vocabulary is capped at 10000 terms and English stop words are removed.
cv = CountVectorizer(stop_words='english', max_features=10000)
vector = cv.fit_transform(
    # astype('U') coerces every entry to text before tokenising.
    # NOTE(review): a missing genre would become the literal text 'nan' here
    # and be counted as a token — confirm whether the dataset has such rows.
    new_data['genre'].values.astype('U')
).toarray()

In [124]:
# Pairwise cosine similarity between all genre vectors: entry [i, j] scores
# how alike rows i and j of `vector` are.
# NOTE(review): the result is a dense square matrix; with the ~10k rows shown
# in the preview above that is on the order of 10^8 entries — confirm this
# fits in memory wherever the notebook is run.
similarity = cosine_similarity(X=vector)

In [125]:
# Function to recommend movies by genre
def recommend_by_genre(genre, top_n=5):
    """Print up to ``top_n`` movie titles related to ``genre``.

    Every row of the module-level ``new_data`` whose ``genre`` text contains
    ``genre`` (case-insensitive, literal substring) is used as a seed, in row
    order. For each seed, the other movies are visited from most to least
    similar according to the module-level ``similarity`` matrix, and titles
    not yet collected are added until ``top_n`` titles have been gathered.

    Parameters
    ----------
    genre : str
        Genre text to look for. It is matched literally, so characters such
        as ``(``, ``|`` or ``.`` have no special meaning.
    top_n : int, default 5
        Maximum number of titles to print.

    Returns
    -------
    None
        The result is written to standard output only: a header line followed
        by one numbered line per title, or a "No movies found" message when
        no row matches ``genre``.
    """
    # regex=False: the genre is user-supplied text, not a pattern. Without it
    # an input such as "(" raises re.error and "|" matches unintended rows.
    genre_mask = new_data['genre'].str.contains(
        genre, case=False, na=False, regex=False
    )

    if not genre_mask.any():
        print(f"No movies found for the genre: {genre}")
        return

    # `similarity` is addressed by row position, so work with positions
    # throughout instead of index labels; the two only coincide when
    # `new_data` carries a default 0..n-1 index.
    seed_positions = genre_mask.to_numpy().nonzero()[0]
    titles = new_data['title'].tolist()

    # Store recommendations (the list keeps the order, the set makes the
    # duplicate check cheap).
    recommended_movies = []
    seen_titles = set()

    print(f"Top {top_n} recommendations for genre: {genre}")
    for seed_pos in seed_positions:
        if len(recommended_movies) >= top_n:
            break

        scores = similarity[seed_pos]
        # Stable sort: rows with equal scores keep their original row order.
        ranked = sorted(
            range(len(scores)), key=lambda pos: scores[pos], reverse=True
        )

        for pos in ranked:
            if len(recommended_movies) >= top_n:  # Stop when top_n is reached
                break
            # Skip the seed explicitly. Dropping the first ranked entry would
            # remove the wrong row whenever an earlier row ties the seed's own
            # similarity score.
            if pos == seed_pos:
                continue
            movie_title = titles[pos]
            if movie_title not in seen_titles:  # Avoid duplicates
                seen_titles.add(movie_title)
                recommended_movies.append(movie_title)

    # Print the recommendations
    for idx, movie in enumerate(recommended_movies[:top_n], 1):
        print(f"{idx}: {movie}")

In [126]:
# Demo run: ask for five suggestions for the "Adventure" genre.
recommend_by_genre(genre="Adventure", top_n=5)

Top 5 recommendations for genre: Adventure
1: The Lord of the Rings: The Fellowship of the Ring
2: The Lord of the Rings: The Two Towers
3: Pirates of the Caribbean: The Curse of the Black Pearl
4: Shang-Chi and the Legend of the Ten Rings
5: The Hobbit: The Desolation of Smaug


In [127]:
# Persist the similarity matrix and the trimmed movie table to disk.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, rather than whenever the anonymous file object
# happens to be garbage-collected.
# NOTE: pickle files must only ever be loaded from a trusted source.
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(new_data, movies_file)