In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os

In [2]:
def read_ratings(ratings_csv, data_dir="../data/raw", original=True) -> pd.DataFrame:
    """
    Load a ratings CSV file into a DataFrame.

    Parameters
    ----------
    ratings_csv : str
        Name of the csv file to read. Must correspond to a ratings file.
    data_dir : str, default "../data/raw"
        Folder that contains ``ratings_csv``.
    original : bool, default True
        When True, the "movieId" column is replaced by the integer codes
        produced by ``LabelEncoder().fit_transform``. When False, the column
        is returned exactly as it was read from the file.

    Returns
    -------
    pd.DataFrame
        The ratings DataFrame as read from the file. For a ratings file the
        columns are, in order: "userId", "movieId", "rating" and "timestamp".
    """
    ratings = pd.read_csv(os.path.join(data_dir, ratings_csv))

    if not original:
        return ratings

    # NOTE(review): the original author suspected this re-encoding is a mistake.
    # After it, "movieId" holds encoder codes rather than the ids stored in the file.
    ratings["movieId"] = LabelEncoder().fit_transform(ratings["movieId"])
    return ratings

def read_movies(movies_csv, data_dir="../data/raw") -> pd.DataFrame:
    """
    Load a movies CSV file and expand its genres into indicator columns.

    Parameters
    ----------
    movies_csv : str
        Name of the csv file to read. Must correspond to a movies file with
        "movieId", "title" and "genres" columns.
    data_dir : str, default "../data/raw"
        Folder that contains ``movies_csv``.

    Returns
    -------
    pd.DataFrame
        The "movieId" and "title" columns followed by one 0/1 column per
        genre found in the "|"-separated "genres" column.
    """
    movies_raw = pd.read_csv(os.path.join(data_dir, movies_csv))

    # One indicator column per distinct genre token in the "|"-separated string
    genre_flags = movies_raw["genres"].str.get_dummies(sep="|")

    # Keep the identifying columns in front of the genre indicators
    identifiers = movies_raw[["movieId", "title"]]
    return pd.concat([identifiers, genre_flags], axis=1)

In [3]:
# Load the ratings with movieId left exactly as stored in the file (no label encoding),
# together with the one-hot encoded movies table.
user_ratings = read_ratings("ratings.csv", original=False)
movies = read_movies("movies.csv")

In [4]:
# Keep only ratings of 4 or more, then attach the movie columns to each rating.
user_ratings = user_ratings.loc[user_ratings["rating"] >= 4]
movie_ratings_modified = pd.merge(user_ratings, movies, on="movieId", how="inner")

# Number of merged rows per user, smallest first
movie_ratings_modified.groupby("userId").size().sort_values()

userId
66596        1
53274        1
32451        1
27120        1
93335        1
          ... 
131894    2349
118205    2377
8405      2503
82418     2655
125794    3177
Length: 138287, dtype: int64

In [7]:
# Same merge as before, but read_ratings runs with its default original=True
# (movieId is label-encoded) and no rating filter is applied.
# NOTE(review): `movies` still carries the movieId values from movies.csv, so this
# merge joins encoded ids against un-encoded ids — confirm that is intended.
user_ratings = read_ratings("ratings.csv")
movies = read_movies("movies.csv")

movie_ratings_modified = pd.merge(user_ratings, movies, on="movieId", how="inner")

# Number of merged rows per user, smallest first
movie_ratings_modified.groupby("userId").size().sort_values()

userId
109901       1
8165         1
135049       1
50634        1
136880       1
          ... 
34576     3704
82418     3991
83090     5083
8405      5509
118205    6096
Length: 138485, dtype: int64

In [None]:
# Number of ratings per user (before any merge with movies), smallest first
(
    user_ratings
    .groupby("userId")
    .size()
    .sort_values()
)

In [None]:
# Show the rows belonging to userId 1, ordered by ascending rating.
# NOTE(review): `movie_ratings` is not defined in any cell visible here (the merged
# frame built earlier is named `movie_ratings_modified`) — confirm where it comes
# from, otherwise this cell raises NameError on a fresh kernel.
movie_ratings[movie_ratings["userId"]==1].sort_values("rating")

In [None]:
# Aggregate every column per userId with the mean, then display the result.
# NOTE(review): relies on `movie_ratings`, which no visible cell defines — confirm
# its origin before running on a fresh kernel.
user_matrix = movie_ratings.groupby("userId").agg(
    "mean",
)
display(user_matrix)

In [None]:
# Reload both inputs (read_ratings with its default original=True), build the user
# matrix and write the movie and user matrices to CSV.
# NOTE(review): `create_user_matrix` is not defined in the visible cells — confirm it
# is defined or imported before this cell runs.
user_ratings = read_ratings("ratings.csv")
movies = read_movies("movies.csv")
user_matrix = create_user_matrix(user_ratings, movies)
# The movie matrix is saved without the "title" column.
movies = movies.drop("title", axis=1)
# NOTE(review): inputs are read from "../data/raw" (the readers' default) while the
# outputs go to "data/processed" relative to the working directory — confirm the
# differing prefixes are intended.
movies.to_csv("data/processed/movie_matrix.csv", index=False)
user_matrix.to_csv("data/processed/user_matrix.csv")
