In [1]:
import pandas as pd

## Movie dataset

In [None]:
# Load the movie metadata; the path is relative to the notebook's working directory.
df_movies = pd.read_csv('./datasets/movies.csv')

In [3]:
# Preview the first rows to sanity-check the parsed columns.
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Column dtypes, non-null counts and memory footprint of the movie table.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  int64 
 1   title    87585 non-null  object
 2   genres   87585 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


## Ratings dataset

In [4]:
# Load the user ratings; the path is relative to the notebook's working directory.
df_ratings = pd.read_csv('./datasets/ratings.csv')

In [5]:
# Preview the first rows to sanity-check the parsed columns.
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [6]:
# Column dtypes and memory footprint of the ratings table.
# NOTE(review): for a frame this large pandas appears to omit the non-null
# counts by default (display.max_info_rows); pass show_counts=True to force them.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000204 entries, 0 to 32000203
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 976.6 MB


In [7]:
# Summary statistics of the rating values (count, mean, spread, quartiles).
df_ratings['rating'].describe()

count    3.200020e+07
mean     3.540396e+00
std      1.058986e+00
min      5.000000e-01
25%      3.000000e+00
50%      3.500000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64

In [8]:
# Inner-join every rating with its movie's title and genres on movieId.
# NOTE(review): `df` is not referenced by any later cell visible in this
# notebook, and the join copies title/genres onto every rating row, which is
# memory-hungry — consider dropping this cell if nothing else depends on it.
df = pd.merge(df_ratings, df_movies, on='movieId')

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import numpy as np

# --- 1. Prepare the shared data ---

# Replace missing genres with an empty string so every row can be tokenized
df_movies['genres'] = df_movies['genres'].fillna("")

# Keep only the movies that have received at least one rating
rated_movie_ids = df_ratings['movieId'].unique()
df_common_movies = df_movies[df_movies['movieId'].isin(rated_movie_ids)].reset_index(drop=True)

# Lookup tables: movieId -> row index, row index -> title, title -> row index.
# NOTE: when two movies share the same title, title_to_index keeps the last one.
movie_id_to_index = pd.Series(df_common_movies.index, index=df_common_movies['movieId']).to_dict()
index_to_title = pd.Series(df_common_movies['title'].values, index=df_common_movies.index).to_dict()
title_to_index = pd.Series(df_common_movies.index, index=df_common_movies['title']).to_dict()

# --- 2. Sparse genre matrix (one row per movie, one column per genre) ---

def _split_genres(genres):
    """Split a pipe-separated genre string, dropping empty tokens.

    Dropping empty tokens keeps movies without any genre from sharing a
    spurious '' feature (which would make them all look identical).
    """
    return [genre for genre in genres.split('|') if genre]

# token_pattern=None: the default pattern is unused once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a warning.
vectorizer = CountVectorizer(tokenizer=_split_genres, token_pattern=None)
genre_matrix_sparse = vectorizer.fit_transform(df_common_movies['genres'])

# --- 3. Sparse movie x user rating matrix (collaborative part) ---

# Keep only ratings whose movie exists in df_common_movies. Filtering on
# rated_movie_ids (derived from df_ratings itself) would keep everything, and a
# rated movie missing from the movie table would then map to a NaN row index.
df_ratings_filtered = df_ratings[df_ratings['movieId'].isin(df_common_movies['movieId'])].copy()
df_ratings_filtered['movie_index'] = df_ratings_filtered['movieId'].map(movie_id_to_index).astype('int64')

# Dense 0..n_users-1 codes so user ids can be used as column positions
user_categories = df_ratings_filtered['userId'].astype("category")
df_ratings_filtered['user_index'] = user_categories.cat.codes

# Explicit shape guarantees the rows line up with genre_matrix_sparse.
# Duplicate (movie, user) pairs, if any, are summed by the constructor.
movie_user_matrix = csr_matrix(
    (
        df_ratings_filtered['rating'],
        (df_ratings_filtered['movie_index'], df_ratings_filtered['user_index']),
    ),
    shape=(len(df_common_movies), len(user_categories.cat.categories)),
)

# --- 4. Fonction hybride ---

def recommend_hybrid(movie_title, top_n=5, alpha=0.5):
    """
    Recommend movies similar to ``movie_title`` with a hybrid score.

    Each candidate is scored as
    ``alpha * rating_similarity + (1 - alpha) * genre_similarity``, where both
    terms are cosine similarities between the query movie's row and every row
    of ``movie_user_matrix`` / ``genre_matrix_sparse`` respectively.

    Parameters
    ----------
    movie_title : str
        Exact title as stored in ``title_to_index``.
    top_n : int, default 5
        Maximum number of titles to return; values <= 0 give an empty list.
    alpha : float, default 0.5
        Weight of the rating similarity in [0, 1]
        (1 = ratings only, 0 = genres only).

    Returns
    -------
    list of str
        Up to ``top_n`` titles by decreasing combined score, never including
        the query movie itself. Empty when the title is unknown.

    Raises
    ------
    ValueError
        If ``alpha`` is outside [0, 1].
    """
    if not 0.0 <= alpha <= 1.0:
        raise ValueError(f"alpha must be in [0, 1], got {alpha!r}")
    if top_n <= 0 or movie_title not in title_to_index:
        return []

    idx = title_to_index[movie_title]

    # Genre-based similarity of the query movie against every movie
    genre_sim = cosine_similarity(genre_matrix_sparse[idx], genre_matrix_sparse).flatten()

    # Rating-based similarity of the query movie against every movie
    rating_sim = cosine_similarity(movie_user_matrix[idx], movie_user_matrix).flatten()

    # Weighted blend of the two similarities
    combined_score = alpha * rating_sim + (1 - alpha) * genre_sim

    # Rank best-first, then drop the query movie with a vectorised mask rather
    # than a Python-level loop over every movie.
    ranked = combined_score.argsort()[::-1]
    ranked = ranked[ranked != idx][:top_n]

    return [index_to_title[i] for i in ranked]


In [14]:
# Equal weighting: 50% rating similarity, 50% genre similarity (alpha=0.5)
print(recommend_hybrid("Toy Story (1995)", top_n=5, alpha=0.5))

['Toy Story 2 (1999)', 'Monsters, Inc. (2001)', 'Shrek (2001)', 'Finding Nemo (2003)', "Bug's Life, A (1998)"]


In [15]:
from collections import Counter

def recommend_hybrid_for_user(user_id, top_n=10, alpha=0.5, like_threshold=4.0):
    """
    Recommend movies to a user by pooling hybrid recommendations of liked movies.

    For every movie the user rated at least ``like_threshold``, the
    ``top_n`` hybrid neighbours are fetched with ``recommend_hybrid``.
    Candidates are ranked by how many liked movies recommended them; movies
    the user has already rated are removed.

    Parameters
    ----------
    user_id : int
        Value of the ``userId`` column identifying the user.
    top_n : int, default 10
        Number of neighbours fetched per liked movie and maximum number of
        titles returned.
    alpha : float, default 0.5
        Weight of the rating similarity (0 = genres only, 1 = ratings only),
        forwarded to ``recommend_hybrid``.
    like_threshold : float, default 4.0
        Minimum rating for a movie to count as liked.

    Returns
    -------
    list of str
        Up to ``top_n`` titles, most frequently recommended first. Empty when
        the user is unknown or has no liked movie.
    """
    # One boolean scan of the ratings table, reused for both "liked" and
    # "already seen". An unknown user simply yields no rows, so no separate
    # membership test over the whole userId column is needed.
    user_rows = df_ratings_filtered[df_ratings_filtered['userId'] == user_id]
    if user_rows.empty:
        return []

    # 1. Movies the user liked
    liked = user_rows.loc[user_rows['rating'] >= like_threshold, 'movie_index'].dropna().unique()
    if len(liked) == 0:
        return []

    # 2-3. Pool the hybrid recommendations of each liked movie and count how
    #      often each title comes up (implicit weighting)
    reco_counts = Counter()
    for idx in liked:
        reco_counts.update(recommend_hybrid(index_to_title[idx], top_n=top_n, alpha=alpha))

    # 4. Drop movies already rated by the user (set for O(1) membership tests)
    already_seen = set(user_rows['movie_index'].map(index_to_title).dropna())
    filtered_recos = [title for title, _ in reco_counts.most_common() if title not in already_seen]

    return filtered_recos[:top_n]


In [16]:
# Example: top 5 recommendations for user 1, weighting rating similarity at 70%
print("Recommandations pour l'utilisateur 1 :")
print(recommend_hybrid_for_user(user_id=1, top_n=5, alpha=0.7))

Recommandations pour l'utilisateur 1 :
['Godfather, The (1972)', 'Total Recall (1990)', "One Flew Over the Cuckoo's Nest (1975)", 'Full Metal Jacket (1987)', 'Raising Arizona (1987)']
