In [5]:
import os
import pandas as pd
import numpy as np
import ast
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [6]:
# ========= LOAD DATA =========
DATA_DIR = os.path.join('..', '..', 'data', 'cleaned')
TARGET_DATA_DIR = os.path.join('..', '..', 'data', 'cleaned','joblib_dataframes')

# NOTE(security): joblib.load unpickles the file, which can execute arbitrary
# code. Only load artifacts produced by this project's own pipeline.
movies = joblib.load(os.path.join(TARGET_DATA_DIR, 'df_final.joblib'))

# Dimension table (CSV file inside DATA_DIR) for each metadata column of `movies`.
meta_files = {
    'actors':     'dim_movie_actors.csv',
    'directors':  'dim_movie_directors.csv',
    'producers':  'dim_movie_producers.csv',
    'writers':    'dim_movie_writers.csv',
    'categories': 'dim_movie_categories.csv',
}

# (id column, name column) of each dimension table listed in `meta_files`.
meta_columns = {
    'actors':     ('actor_name_id', 'actor_name'),
    'directors':  ('director_name_id', 'director_name'),
    'producers':  ('producer_name_id', 'producer_name'),
    'writers':    ('writer_name_id', 'writer_name'),
    'categories': ('movie_category_id', 'movie_category'),
}


def _load_id_to_name(file_name, id_col, name_col):
    """Read one dimension CSV from DATA_DIR and return its {id: name} dict.

    The id column is read as text so the dict keys are always strings. The
    decode step looks ids up with ``str(i)``; keys inferred as integers would
    never match and every id would be reported as unknown.
    """
    table = pd.read_csv(os.path.join(DATA_DIR, file_name), dtype={id_col: str})
    return table.set_index(id_col)[name_col].to_dict()


# One {id: name} lookup per metadata column, keyed like `meta_files`.
meta_maps = {
    name: _load_id_to_name(file_name, *meta_columns[name])
    for name, file_name in meta_files.items()
}

In [7]:
# ========= DECODE HELPER FUNCTION =========
def decode_ids(id_input, mapping_dict):
    """Translate one cell of ids into a '|'-separated string of names.

    Parameters
    ----------
    id_input : str, int, numpy integer, list or numpy.ndarray
        The id(s) to decode. A string is parsed as a Python literal
        (for example ``"[1, 2]"`` or ``"5"``). Any other type yields ``''``.
    mapping_dict : dict
        Lookup from id to name. Ids are looked up by their ``str()`` form.

    Returns
    -------
    str
        The names joined with ``'|'``. Ids missing from ``mapping_dict`` appear
        as ``Unknown_<id>``. Returns ``''`` for unsupported input types and
        whenever decoding fails (the error is printed, not raised).
    """
    try:
        # A string holds a serialized literal; parse it into a Python object.
        if isinstance(id_input, str):
            id_list = ast.literal_eval(id_input)

        # Scalars, lists and arrays are taken as they are.
        elif isinstance(id_input, (int, np.integer, list, np.ndarray)):
            id_list = id_input
        else:
            return ''

        # A lone id (passed directly or obtained from parsing) is not iterable
        # as a collection of ids, so wrap it in a one-element list.
        if isinstance(id_list, (int, np.integer, str)):
            id_list = [id_list]

        # Look every id up by its string form.
        return '|'.join([mapping_dict.get(str(i), f'Unknown_{i}') for i in id_list])

    except Exception as e:
        # Best effort: report the offending value and fall back to no names.
        print(f"Error decoding: {id_input} → {e}")
        return ''

In [8]:
# ========= FEATURE ENGINEERING =========
# Decode each metadata id column of `movies` into a '<name>_str' column that
# holds the matching names separated by '|'.
for name, mapping in meta_maps.items():
    movies[f'{name}_str'] = movies[name].apply(decode_ids, args=(mapping,))

# Merge the decoded columns that are present into one '|'-separated string per row.
existing_cols = [
    col
    for col in (f'{k}_str' for k in meta_maps)
    if col in movies.columns
]
movies['combined_features'] = (
    movies[existing_cols]
    .fillna('')
    .agg('|'.join, axis=1)
)

In [9]:
# ========= TF-IDF & COSINE SIMILARITY =========
"""
TF-IDF = Term Frequency – Inverse Document Frequency

- TF (Term Frequency):
  Measures how frequently a term appears within a single document.
  Example: If "Will Smith" is listed twice in one movie's combined features
           (e.g. as actor and as producer), his TF for that movie is higher.

- IDF (Inverse Document Frequency):
  Measures how rare a term is across all documents.
  Example: If "Will Smith" appears in many movies, its IDF is low.
           If "Michael Bay" appears in only one movie, its IDF is high.

Why TF-IDF?
-----------
TF-IDF helps highlight important and unique words in text data while downweighting common, less informative terms.

Cosine Similarity
-----------------
- Treats every movie as a vector (based on TF-IDF features).
- Measures the angle between two vectors (movies).
- Range: 0   → no similarity
         1   → exactly the same
"""

# Every '|'-separated name is one token, so multi-word names stay intact.
# TfidfVectorizer lowercases tokens by default, so feature names are lowercase.
tfidf = TfidfVectorizer(token_pattern=r'[^|]+')
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])
# The TF-IDF rows are L2-normalised by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Map each title to its row POSITION in `movies`. `cosine_sim` is a plain array
# and is addressed by position, so index labels would point at the wrong row
# (or out of bounds) whenever `movies` does not have a default 0..n-1 index.
title_to_index = pd.Series(np.arange(len(movies)), index=movies['title'])

In [10]:
# ========= RECOMMENDER FUNCTION =========
def content_recommendations(title, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``title_to_index``.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of str
        Up to ``top_n`` titles ordered by decreasing similarity, never
        including the queried movie itself. Returns ``["Title not found."]``
        when the title is unknown.
    """
    if title not in title_to_index:
        return ["Title not found."]

    idx = title_to_index[title]
    # Several movies can share a title; the lookup then returns a Series.
    # Use the first match so the similarity row stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim[idx])
    # Stable sort on the negated scores: highest similarity first, ties kept
    # in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried movie by its row instead of assuming it sorts first;
    # with tied scores (identical or empty features) it may not be first.
    ranked = ranked[ranked != idx]
    top_indices = ranked[:max(top_n, 0)]
    return movies['title'].iloc[top_indices].tolist()

# Demo: print the five closest matches for one known title.
print(content_recommendations(title="Bad Boys II", top_n=5))

['Hollywood Homicide', 'National Treasure: Book of Secrets', 'National Treasure', 'Pearl Harbor', "My Mom's New Boyfriend"]


In [11]:
# ========= CREATE OUTPUT FOR OTHER MODELING IN OTHER NOTEBOOKS =========
# Dense TF-IDF table: one column per token, rows labelled like `movies` so the
# column-wise concat below lines the two frames up row by row.
df_tfidf = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=movies.index,
    columns=tfidf.get_feature_names_out(),
)
df_movies_combined_rf = pd.concat([movies, df_tfidf], axis=1)

# Persist the combined frame next to the other joblib dataframes.
joblib.dump(
    df_movies_combined_rf,
    os.path.join(TARGET_DATA_DIR, "df_movies_combined_rf.joblib"),
)

['../../data/cleaned/joblib_dataframes/df_movies_combined_rf.joblib']