In [1]:
import os
import pandas as pd
import ast
import joblib

In [3]:
# ========= LOAD CLEANED DATA =========
# Cleaned CSV inputs are read from DATA_DIR; the joblib outputs written at the
# end of this cell go to a sub-folder of that same directory.
DATA_DIR = os.path.join('..', '..', 'data', 'cleaned')
TARGET_DATA_DIR = os.path.join(DATA_DIR, 'joblib_dataframes')

# Read the four input tables, in this order, straight into their own names.
user_ratings, fact_movies, titles, calendar = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in (
        'dim_movie_user_ratings.csv',
        'fact_movies.csv',
        'dim_movie_titles.csv',
        'dim_movie_calendar.csv',
    )
)

# ========= STANDARDIZE IDs =========
# Put every id column that is present into one textual form: string,
# surrounding whitespace removed, lower-cased, and with 'tt' stripped out.
for frame in (user_ratings, fact_movies, titles):
    present_id_cols = [c for c in ('imdbId', 'movie_imdb_id') if c in frame.columns]
    for id_col in present_id_cols:
        as_text = frame[id_col].astype(str)
        frame[id_col] = as_text.str.strip().str.lower().str.replace('tt', '', regex=False)

# The merges below join on 'movie_imdb_id', so expose the ratings id under that name.
user_ratings['movie_imdb_id'] = user_ratings['imdbId']

# ========= MERGE USER RATINGS WITH MOVIE INFO =========
# Left joins keep every rating row; rows that end up without a 'movie_title'
# after the joins are then discarded.
df = (
    user_ratings
    .merge(fact_movies, on='movie_imdb_id', how='left')
    .merge(titles, on='movie_imdb_id', how='left')
    .merge(calendar, left_on='movie_release_date_id', right_on='date_id', how='left')
    .loc[lambda joined: joined['movie_title'].notna()]
    .copy()
)

# ========= HELPER FUNCTION TO LOAD GROUPED METADATA =========
def load_grouped_metadata(bridge_file, dim_file, fact_key, bridge_key, dim_key, new_column, data_dir=None):
    """Build a per-movie list of dimension ids from a bridge table.

    Parameters
    ----------
    bridge_file : str
        CSV file name of the bridge table (read from ``data_dir``).
    dim_file : str
        CSV file name of the dimension table (read from ``data_dir``).
    fact_key : str
        Column in the bridge table holding the movie id; it is normalised the
        same way as the other id columns in this notebook (string, stripped,
        lower-cased, 'tt' removed).
    bridge_key : str
        Column in the bridge table that references the dimension; it is
        renamed to ``dim_key`` before joining.
    dim_key : str
        Key column of the dimension table.
    new_column : str
        Name given to the resulting list column.
    data_dir : str, optional
        Directory containing both CSV files. Defaults to the notebook-level
        ``DATA_DIR``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``fact_key`` and ``new_column``, where ``new_column``
        holds the list of unique, non-null ``dim_key`` values for that id.
    """
    # Resolve the default lazily so the notebook-level DATA_DIR is looked up at call time.
    if data_dir is None:
        data_dir = DATA_DIR

    bridge_df = pd.read_csv(os.path.join(data_dir, bridge_file))
    dim_df = pd.read_csv(os.path.join(data_dir, dim_file))

    bridge_df = (
        bridge_df
        .rename(columns={bridge_key: dim_key})
        # Rows without a movie id must go before astype(str), which would
        # otherwise turn the missing value into a literal 'nan' key.
        .dropna(subset=[fact_key])
        .assign(**{
            fact_key: lambda b: (
                b[fact_key].astype(str).str.strip().str.lower().str.replace('tt', '', regex=False)
            )
        })
    )

    # NOTE(review): only dim_key is collected below, so this join adds no column
    # to the result; presumably a descriptive column from dim_df was meant to be
    # used here - confirm against the dimension schema.
    merged = bridge_df.merge(dim_df, on=dim_key, how='left')

    grouped = (
        merged.groupby(fact_key)[dim_key]
        .apply(lambda ids: list(ids.dropna().unique()))
        .reset_index()
        .rename(columns={dim_key: new_column})
    )

    return grouped

# ========= LOAD METADATA =========
# One grouped frame per metadata kind. The four people tables reference their
# dimension through 'movie_person_name_id'; categories already use the
# dimension's own key name in the bridge table.
actors = load_grouped_metadata(
    bridge_file='bridge_fact_movies_dim_movie_actors.csv',
    dim_file='dim_movie_actors.csv',
    fact_key='movie_imdb_id',
    bridge_key='movie_person_name_id',
    dim_key='actor_name_id',
    new_column='actors',
)
directors = load_grouped_metadata(
    bridge_file='bridge_fact_movies_dim_movie_directors.csv',
    dim_file='dim_movie_directors.csv',
    fact_key='movie_imdb_id',
    bridge_key='movie_person_name_id',
    dim_key='director_name_id',
    new_column='directors',
)
producers = load_grouped_metadata(
    bridge_file='bridge_fact_movies_dim_movie_producers.csv',
    dim_file='dim_movie_producers.csv',
    fact_key='movie_imdb_id',
    bridge_key='movie_person_name_id',
    dim_key='producer_name_id',
    new_column='producers',
)
writers = load_grouped_metadata(
    bridge_file='bridge_fact_movies_dim_movie_writers.csv',
    dim_file='dim_movie_writers.csv',
    fact_key='movie_imdb_id',
    bridge_key='movie_person_name_id',
    dim_key='writer_name_id',
    new_column='writers',
)
categories = load_grouped_metadata(
    bridge_file='bridge_fact_movies_dim_movie_categories.csv',
    dim_file='dim_movie_categories.csv',
    fact_key='movie_imdb_id',
    bridge_key='movie_category_id',
    dim_key='movie_category_id',
    new_column='categories',
)

# ========= AGGREGATE USER DATA =========
# For each movie id, gather the user ids, rating ids and rating values of all
# its rating rows into lists (one frame per attribute).
ratings_by_movie = df.groupby('movie_imdb_id')
user_ids_grouped, rating_ids_grouped, ratings_grouped = (
    ratings_by_movie[rating_col].apply(list).reset_index()
    for rating_col in ('userId', 'ratingId', 'rating')
)

# Movie-level attributes: keep the first row encountered for each movie id.
# 'movie_release_date_x' is the suffixed name produced by the earlier merges.
df_non_duplicated = df.drop_duplicates(subset='movie_imdb_id').loc[:, [
    'movie_imdb_id',
    'movie_title',
    'movie_budget_$',
    'movie_box_office_$',
    'movie_duration_minutes',
    'movie_release_date_x',
]]

# ========= MERGE ALL GROUPED METADATA INTO FINAL DF =========
# Attach each per-movie list column in turn; left joins keep every movie even
# when a given kind of metadata is missing for it.
df_final = df_non_duplicated
for per_movie_lists in (
    user_ids_grouped,
    rating_ids_grouped,
    ratings_grouped,
    actors,
    directors,
    producers,
    writers,
    categories,
):
    df_final = df_final.merge(per_movie_lists, on='movie_imdb_id', how='left')

# ========= RENAME COLUMNS =========
# Shorter, prefix-free names for the movie attributes and plural names for the
# per-movie list columns.
FINAL_COLUMN_NAMES = {
    'movie_imdb_id': 'imdb_id',
    'movie_title': 'title',
    'movie_budget_$': 'budget_$',
    'movie_box_office_$': 'box_office_$',
    'movie_duration_minutes': 'duration_minutes',
    'movie_release_date_x': 'release_date',
    'userId': 'user_ids',
    'ratingId': 'user_rating_ids',
    'rating': 'users_ratings',
}
df_final = df_final.rename(columns=FINAL_COLUMN_NAMES)

# ========= CONVERT STRINGS TO LISTS =========
# Cells that are already lists pass through untouched; only cells holding the
# string representation of a list are parsed.
for list_col in ('user_ids', 'users_ratings'):
    df_final[list_col] = df_final[list_col].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

# ========= EXPLODE USER-MOVIE RATINGS FOR MODELING =========
# One row per (movie, user, rating) triple.
df_exploded = df_final[['imdb_id', 'user_ids', 'users_ratings']].explode(['user_ids', 'users_ratings'])
# explode() yields object-dtype columns; cast the ratings to float so that
# pivot_table averages real numbers instead of generic Python objects.
df_exploded = df_exploded.astype({'users_ratings': 'float64'})

# Users as rows, movies as columns; pivot_table's default aggregation (mean)
# applies if a user has more than one rating for the same movie.
df_final_matrix = df_exploded.pivot_table(index='user_ids', columns='imdb_id', values='users_ratings')
# NOTE(review): filling with '' turns every column into object dtype (mixed
# floats and strings). Kept as is because consumers of the saved matrix may
# rely on the empty-string sentinel - confirm whether NaN or 0 would be better.
df_final_matrix = df_final_matrix.fillna('')

# ========= OUTPUT =========
print("Preprocessing complete. The dataframes are:")
print("df_final (one row per movie):", df_final.shape)
print("df_final_matrix (one row per user_id):", df_final_matrix.shape)

# Make sure the target folder exists before writing, so a missing directory
# does not fail the cell after all the processing above has already run.
os.makedirs(TARGET_DATA_DIR, exist_ok=True)
joblib.dump(df_final, os.path.join(TARGET_DATA_DIR, "df_final.joblib"))
joblib.dump(df_final_matrix, os.path.join(TARGET_DATA_DIR, "df_final_matrix.joblib"))

Preprocessing complete. The dataframes are:
df_final (one row per movie): (5002, 14)
df_final_matrix (one row per user_id): (31486, 5002)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5002 entries, 0 to 5001
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   imdb_id           5002 non-null   object 
 1   title             5002 non-null   object 
 2   budget_$          2902 non-null   float64
 3   box_office_$      3251 non-null   float64
 4   duration_minutes  5002 non-null   float64
 5   release_date      5002 non-null   object 
 6   user_ids          5002 non-null   object 
 7   user_rating_ids   5002 non-null   object 
 8   users_ratings     5002 non-null   object 
 9   actors            4762 non-null   object 
 10  directors         4998 non-null   object 
 11  producers         4934 non-null   object 
 12  writers           4882 non-null   object 
 13  categories        5002 non-null   object 
dty

['../../data/cleaned/joblib_dataframes/df_final_matrix.joblib']