In [1]:
from pathlib import Path

import pandas as pd

# Build the unified ratings dataset: MovieLens (ml-25m) ratings joined with the
# TMDB (tmdb_5000) plot overview and keywords of each movie, saved as one CSV.

# Single root for every input and output file; edit here when the data moves.
DATA_DIR = Path('/Users/mariostam/Documents/Projects/thesis/llm-powered-feature-engineering/data')
ML_DIR = DATA_DIR / 'ml-25m'

# Load original datasets from MovieLens (ml-25m) and TMDB (tmdb_5000).
ratings = pd.read_csv(ML_DIR / 'ratings.csv')
links = pd.read_csv(ML_DIR / 'links.csv')
movies_ml = pd.read_csv(ML_DIR / 'movies.csv')
movies_tmdb = pd.read_csv(DATA_DIR / 'tmdb_5000_movies.csv')
credits_tmdb = pd.read_csv(DATA_DIR / 'tmdb_5000_credits.csv')

# Combine TMDB movie details with their corresponding credits.
# The credits file keys movies by `movie_id`; align it with the `id` column
# used by the movies file so the two can be joined on a single name.
credits_tmdb = credits_tmdb.rename(columns={'movie_id': 'id'})
tmdb_df = movies_tmdb.merge(credits_tmdb, on='id')

# Add TMDB IDs and user ratings to the MovieLens dataset.
# Rows without a TMDB ID cannot be joined to TMDB; once they are gone the
# column can be cast to int64 to match the join below. Chaining the cast
# (instead of assigning into the result of dropna) avoids writing to an
# object that pandas may flag as a copy.
links = links.dropna(subset=['tmdbId']).astype({'tmdbId': 'int64'})

# NOTE(review): ratings are joined before the frame is restricted to movies
# present in TMDB, so `ml_df` carries every rating row with the movie columns
# repeated. If nothing else needs `ml_df`, joining the movie-level tables
# first would keep this intermediate much smaller.
ml_df = movies_ml.merge(links, on='movieId')
ml_df = ml_df.merge(ratings, on='movieId')

# Create the master dataset by merging MovieLens and TMDB dataframes.
# Default inner join: only movies found on both sides survive.
master_df = ml_df.merge(tmdb_df, left_on='tmdbId', right_on='id')

# Select and rename essential columns, then drop rows with any missing value.
# `title_x` is a merge-suffixed name; presumably the title from the TMDB
# movies file -- verify against the merged columns if the inputs change.
# One non-mutating chain replaces the in-place rename/dropna on a column
# subset, which is the pattern SettingWithCopyWarning is raised for.
master_df = (
    master_df[['userId', 'movieId', 'rating', 'title_x', 'overview', 'keywords']]
    .rename(columns={
        'userId': 'user_id',
        'movieId': 'movie_id',
        'title_x': 'title',
        'overview': 'plot_overview',
        'keywords': 'human_keywords',
    })
    .dropna()
)

# Save the unified and cleaned dataset to a CSV file.
master_df.to_csv(DATA_DIR / 'master_dataframe.csv', index=False)

print('Master dataframe created successfully!')
print(master_df.head())

Master dataframe created successfully!
   user_id  movie_id  rating      title  \
0        2         1     3.5  Toy Story   
1        3         1     4.0  Toy Story   
2        4         1     3.0  Toy Story   
3        5         1     4.0  Toy Story   
4        8         1     4.0  Toy Story   

                                       plot_overview  \
0  Led by Woody, Andy's toys live happily in his ...   
1  Led by Woody, Andy's toys live happily in his ...   
2  Led by Woody, Andy's toys live happily in his ...   
3  Led by Woody, Andy's toys live happily in his ...   
4  Led by Woody, Andy's toys live happily in his ...   

                                      human_keywords  
0  [{"id": 931, "name": "jealousy"}, {"id": 4290,...  
1  [{"id": 931, "name": "jealousy"}, {"id": 4290,...  
2  [{"id": 931, "name": "jealousy"}, {"id": 4290,...  
3  [{"id": 931, "name": "jealousy"}, {"id": 4290,...  
4  [{"id": 931, "name": "jealousy"}, {"id": 4290,...  
