# NOTEBOOK 4: CONTENT-BASED FILTERING 
## Content-Based Recommendation using:
1. Movie genres (one-hot encoded)
2. User tags (TF-IDF vectorization)
3. Cosine similarity for movie-to-movie matching

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import os
import warnings

# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

# Evaluation
from sklearn.metrics import precision_score, recall_score, f1_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Silence all library warnings so cell output stays readable.
# NOTE(review): this is a blanket filter — it also hides deprecation warnings;
# narrow it (category=/module=) when debugging.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')

# Startup banner with the run timestamp. The emoji are literal Unicode
# characters, so this notebook must be saved and read as UTF-8.
print("=" * 80)
print("🎬 CONTENT-BASED RECOMMENDER SYSTEM")
print("=" * 80)
print(f"📅 Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("🎯 Goal: Build content-based recommendations using genres and NLP")
print("=" * 80)

🎬 CONTENT-BASED RECOMMENDER SYSTEM
📅 Started: 2025-10-27 15:03:51
🎯 Goal: Build content-based recommendations using genres and NLP


## 2. LOAD PROCESSED DATA

In [None]:
rule = "=" * 80
print(f"\n{rule}\nLOADING PROCESSED DATA\n{rule}")

# Resolve the project root: when the kernel runs from a 'notebooks' folder,
# the root is its parent; otherwise the working directory is taken as the root.
BASE_DIR = os.getcwd()
if os.path.basename(BASE_DIR) == 'notebooks':
    BASE_DIR = os.path.dirname(BASE_DIR)

PROCESSED_DIR = os.path.join(BASE_DIR, 'data', 'processed')
MODELS_DIR = os.path.join(BASE_DIR, 'models')
RESULTS_DIR = os.path.join(BASE_DIR, 'reports', 'results')

# Output folders are created up front; the processed-data folder is only read from.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

print(
    f"\nData directory: {PROCESSED_DIR}",
    f"Models directory: {MODELS_DIR}",
    f"Results directory: {RESULTS_DIR}",
    sep="\n",
)

# Read the four processed CSVs, in the same order as they are unpacked.
print("\nLoading datasets...")
movies_features, movie_tags, train, test = (
    pd.read_csv(os.path.join(PROCESSED_DIR, csv_name))
    for csv_name in (
        'movies_features.csv',
        'movie_tags_aggregated.csv',
        'train.csv',
        'test.csv',
    )
)
print("Data loaded successfully")

# Per-dataset shape and deep memory footprint.
print(f"\n{rule}\nDATASET SUMMARY\n{rule}")

datasets = dict(zip(
    ('Movies (with features)', 'Movie tags (aggregated)', 'Train ratings', 'Test ratings'),
    (movies_features, movie_tags, train, test),
))

for name, df in datasets.items():
    n_rows, n_cols = df.shape
    size_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\n{name}:")
    print(f"  Shape: {n_rows:,} rows x {n_cols} columns")
    print(f"  Memory: {size_mb:.2f} MB")

# Share of movies carrying each kind of signal.
print(f"\n{rule}\nDATA QUALITY CHECK\n{rule}")

genre_flag = movies_features['has_genres']
tag_flag = movies_features['has_tags']
rated_flag = movies_features['rating_count'].notna()

print(f"\nMovies with genres: {genre_flag.sum():,} ({genre_flag.mean()*100:.1f}%)")
print(f"Movies with tags: {tag_flag.sum():,} ({tag_flag.mean()*100:.1f}%)")
print(f"Movies with ratings: {rated_flag.sum():,} ({rated_flag.mean()*100:.1f}%)")

# A movie counts as having content features if either flag equals 1.
content_mask = genre_flag.eq(1) | tag_flag.eq(1)
movies_with_content = movies_features[content_mask]

n_content = len(movies_with_content)
content_pct = n_content / len(movies_features) * 100
print("\nMovies with content features (genres OR tags):")
print(f"  {n_content:,} ({content_pct:.1f}%)")

# Plain-text preview of a few key columns.
print(f"\n{rule}\nDATA PREVIEW\n{rule}")
preview_cols = ['movieId', 'title', 'genres', 'release_year', 'rating_mean']
print("\nSample movies:")
print(movies_features[preview_cols].head(3).to_string())




LOADING PROCESSED DATA

Data directory: c:\Users\mhfou\Documents\MovieRecommenderSystem\data\processed
Models directory: c:\Users\mhfou\Documents\MovieRecommenderSystem\models
Results directory: c:\Users\mhfou\Documents\MovieRecommenderSystem\reports\results

Loading datasets...
Data loaded successfully

DATASET SUMMARY

Movies (with features):
  Shape: 62,423 rows x 23 columns
  Memory: 45.70 MB

Movie tags (aggregated):
  Shape: 45,249 rows x 4 columns
  Memory: 15.20 MB

Train ratings:
  Shape: 20,000,076 rows x 4 columns
  Memory: 610.35 MB

Test ratings:
  Shape: 5,000,019 rows x 4 columns
  Memory: 152.59 MB

DATA QUALITY CHECK

Movies with genres: 57,361 (91.9%)
Movies with tags: 45,249 (72.5%)
Movies with ratings: 59,047 (94.6%)

Movies with content features (genres OR tags):
  59,514 (95.3%)

DATA PREVIEW

Sample movies:
   movieId                    title                                       genres  release_year  rating_mean
0        1         Toy Story (1995)  Adventure|An

## 3. GENRE-BASED CONTENT PREPARATION

In [4]:
rule = "=" * 80
print(f"\n{rule}\nGENRE-BASED CONTENT PREPARATION\n{rule}")

# Work on a copy restricted to movies flagged as having genres, so the
# column assignment below does not write into movies_features.
genre_mask = movies_features['has_genres'] == 1
movies_with_genres = movies_features[genre_mask].copy()
print(f"\nMovies with genres: {len(movies_with_genres):,}")

# Split the pipe-delimited genre string and drop the '(no genres listed)'
# placeholder; anything that did not split into a list becomes an empty list.
print("\nParsing genre strings into lists...")
movies_with_genres['genres_list'] = (
    movies_with_genres['genres']
    .str.split('|')
    .apply(
        lambda parts: [g for g in parts if g != '(no genres listed)']
        if isinstance(parts, list)
        else []
    )
)

# Flatten every per-movie list into one list of genre labels.
all_genres = [
    label
    for movie_genres in movies_with_genres['genres_list']
    for label in movie_genres
]

print(f"Total genre tags: {len(all_genres):,}")
print(f"Unique genres: {len(set(all_genres))}")

# Frequency table; percentages are relative to the number of genre-bearing movies.
genre_counts = pd.Series(all_genres).value_counts()
n_movies = len(movies_with_genres)
print("\nTop 10 Most Common Genres:")
for rank, (genre, count) in enumerate(genre_counts.head(10).items(), start=1):
    share = count / n_movies * 100
    print(f"   {rank:2d}. {genre:15s}: {count:>6,} movies ({share:>5.1f}%)")

# Multi-hot encode: one row per movie, one binary column per genre.
print("\nOne-hot encoding genres...")
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_with_genres['genres_list'])

n_encoded, n_genres = genre_matrix.shape
print(f"Genre matrix created: {n_encoded:,} movies x {n_genres} genres")

# Wrap the matrix in a DataFrame indexed by movieId, with genre names as columns.
genre_features_df = pd.DataFrame(
    data=genre_matrix,
    index=movies_with_genres['movieId'],
    columns=mlb.classes_,
)

density_pct = (genre_matrix.sum() / genre_matrix.size) * 100
genres_per_movie = genre_matrix.sum(axis=1)
print(
    "\nGenre Feature Matrix:",
    f"   Shape: {genre_features_df.shape}",
    f"   Density: {density_pct:.2f}% (non-zero values)",
    f"   Avg genres per movie: {genres_per_movie.mean():.2f}",
    sep="\n",
)

print("\nSample of genre features (first 3 movies):")
print(genre_features_df.head(3))

print("\nGenre features ready for similarity calculation")
print(rule)



GENRE-BASED CONTENT PREPARATION

Movies with genres: 57,361

Parsing genre strings into lists...
Total genre tags: 107,245
Unique genres: 19

Top 10 Most Common Genres:
    1. Drama          : 25,606 movies ( 44.6%)
    2. Comedy         : 16,870 movies ( 29.4%)
    3. Thriller       :  8,654 movies ( 15.1%)
    4. Romance        :  7,719 movies ( 13.5%)
    5. Action         :  7,348 movies ( 12.8%)
    6. Horror         :  5,989 movies ( 10.4%)
    7. Documentary    :  5,605 movies (  9.8%)
    8. Crime          :  5,319 movies (  9.3%)
    9. Adventure      :  4,145 movies (  7.2%)
   10. Sci-Fi         :  3,595 movies (  6.3%)

One-hot encoding genres...
Genre matrix created: 57,361 movies x 19 genres

Genre Feature Matrix:
   Shape: (57361, 19)
   Density: 9.84% (non-zero values)
   Avg genres per movie: 1.87

Sample of genre features (first 3 movies):
         Action  Adventure  Animation  Children  Comedy  Crime  Documentary  \
movieId                                           

In [None]:
# Show the first five genre-filtered rows; genres_list holds the parsed Python lists.
movies_with_genres.head(n=5)

Unnamed: 0,movieId,title,genres,release_year,genres_list,genre_count,has_genres,rating_count,rating_mean,rating_std,...,last_rating_date,days_since_first_rating,days_since_last_rating,rating_period_days,popularity_score,popularity_category,tags_text,tag_count,unique_tag_count,has_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,"[Adventure, Animation, Children, Comedy, Fantasy]",5,1,57309.0,3.893708,0.921552,...,2019-11-20 21:23:42,8697.0,0.0,8696.0,10.95623,blockbuster,owned imdb top 250 pixar pixar time travel chi...,697,118,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,"[Adventure, Children, Fantasy]",3,1,24228.0,3.251527,0.959851,...,2019-11-20 19:07:01,8697.0,0.0,8696.0,10.095306,blockbuster,robin williams time travel fantasy based on ch...,180,42,1
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,"[Comedy, Romance]",2,1,11804.0,3.142028,1.008443,...,2019-11-11 02:30:45,8693.0,9.0,8683.0,9.376278,blockbuster,funny best friend duringcreditsstinger fishing...,29,22,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,"[Comedy, Drama, Romance]",3,1,2523.0,2.853547,1.108531,...,2019-11-20 01:24:15,8693.0,0.0,8692.0,7.8336,blockbuster,based on novel or book chick flick divorce int...,11,8,1
4,5,Father of the Bride Part II (1995),Comedy,1995.0,[Comedy],1,1,11714.0,3.058434,0.996611,...,2019-11-06 09:36:58,8693.0,14.0,8678.0,9.368625,blockbuster,aging baby confidence contraception daughter g...,24,19,1


In [13]:
# Show the first five rows of movies_features.
movies_features.head(n=5)

Unnamed: 0,movieId,title,genres,release_year,genres_list,genre_count,has_genres,rating_count,rating_mean,rating_std,...,last_rating_date,days_since_first_rating,days_since_last_rating,rating_period_days,popularity_score,popularity_category,tags_text,tag_count,unique_tag_count,has_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995.0,"['Adventure', 'Animation', 'Children', 'Comedy...",5,1,57309.0,3.893708,0.921552,...,2019-11-20 21:23:42,8697.0,0.0,8696.0,10.95623,blockbuster,owned imdb top 250 pixar pixar time travel chi...,697,118,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995.0,"['Adventure', 'Children', 'Fantasy']",3,1,24228.0,3.251527,0.959851,...,2019-11-20 19:07:01,8697.0,0.0,8696.0,10.095306,blockbuster,robin williams time travel fantasy based on ch...,180,42,1
2,3,Grumpier Old Men (1995),Comedy|Romance,1995.0,"['Comedy', 'Romance']",2,1,11804.0,3.142028,1.008443,...,2019-11-11 02:30:45,8693.0,9.0,8683.0,9.376278,blockbuster,funny best friend duringcreditsstinger fishing...,29,22,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995.0,"['Comedy', 'Drama', 'Romance']",3,1,2523.0,2.853547,1.108531,...,2019-11-20 01:24:15,8693.0,0.0,8692.0,7.8336,blockbuster,based on novel or book chick flick divorce int...,11,8,1
4,5,Father of the Bride Part II (1995),Comedy,1995.0,['Comedy'],1,1,11714.0,3.058434,0.996611,...,2019-11-06 09:36:58,8693.0,14.0,8678.0,9.368625,blockbuster,aging baby confidence contraception daughter g...,24,19,1
