<a href="https://colab.research.google.com/github/paulusshewamre/content-collab-hybrid-recsys/blob/main/ContentBased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import pandas as pd
import kagglehub
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
# Download Netflix dataset
path = kagglehub.dataset_download("shivamb/netflix-shows")

# Load the CSV file
df = pd.read_csv(os.path.join(path, "netflix_titles.csv"))

Using Colab cache for faster access to the 'netflix-shows' dataset.


In [7]:
# Keep only useful columns
df = df[['title', 'listed_in']].dropna()

In [9]:
# Vectorize genres/categories
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['listed_in'])

In [10]:
# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [11]:
# Recommendation function
def recommend_content(movie_title, top_n=5):
    if movie_title not in df['title'].values:
        return f"'{movie_title}' not found in dataset."

    idx = df[df['title'] == movie_title].index[0]
    scores = list(enumerate(cosine_sim[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    recommended = [df['title'][i[0]] for i in scores[1:top_n+1]]
    return recommended

In [12]:
# Example
print("Content-based:", recommend_content("Inception"))  # might not exist in Netflix
print("Content-based:", recommend_content("Breaking Bad"))  # should work

Content-based: ['Chappie', 'Green Lantern', 'Beowulf', 'Mortal Kombat', 'Star Trek']
Content-based: ['Dare Me', 'The Blacklist', 'Ozark', 'Designated Survivor', 'Breaking Bad']
