In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import ast

movies = pd.read_csv('movies_metadata.csv')
ratings = pd.read_csv('ratings.csv')
links = pd.read_csv('links.csv')
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')

# drop rows where data is missing
movies.dropna(inplace=True)
ratings.dropna(inplace=True)
links.dropna(inplace=True)
credits.dropna(inplace=True)
keywords.dropna(inplace=True)

# normalize numerical values
# neural networks perform better for fixed range
scaler = MinMaxScaler(feature_range=(0, 1))
ratings['rating'] = scaler.fit_transform(ratings[['rating']])

#extracting from json the names
def extract_names(obj):
    try:
        obj = ast.literal_eval(obj)
        return [d['name'] for d in obj]
    except (ValueError, SyntaxError):
        return []

movies['genres'] = movies['genres'].apply(extract_names)
credits['cast'] = credits['cast'].apply(extract_names)
keywords['keywords'] = keywords['keywords'].apply(extract_names)

# save new csv normalized
movies.to_csv('updated/movies_cleaned.csv', index=False)
ratings.to_csv('updated/ratings_cleaned.csv', index=False)
credits.to_csv('updated/credits_cleaned.csv', index=False)
keywords.to_csv('updated/keywords_cleaned.csv', index=False)



print("New csv normalized files saved")


  movies = pd.read_csv('movies_metadata.csv')


New csv normalized files saved


In [None]:
# Load credits.csv
credits_df = pd.read_csv('updated/credits_cleaned.csv')
movies_metadata_df = pd.read_csv('updated/movies_cleaned.csv')
keywords_df = pd.read_csv('updated/keywords_cleaned.csv')
ratings_df = pd.read_csv('updated/ratings_cleaned.csv')
links_df = pd.read_csv('links.csv')


# Merge movies_metadata with credits on 'id'
merged_df = pd.merge(movies_metadata_df, credits_df, on='id', how='left')
merged_df = pd.merge(merged_df, keywords_df, on='id', how='left')
merged_df = pd.merge(merged_df, links_df, left_on='id', right_on='tmdbId', how='left')
final_df = pd.merge(merged_df, ratings_df, left_on='id', right_on='movieId', how='left')


# Convert JSON-like strings to lists/dictionaries
final_df['cast'] = final_df['cast'].apply(ast.literal_eval)
final_df['crew'] = final_df['crew'].apply(ast.literal_eval)
final_df['keywords'] = final_df['keywords'].apply(ast.literal_eval)

print(final_df.head())

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load data
ratings = pd.read_csv('ratings.csv')
movies_metadata = pd.read_csv('movies_metadata.csv', dtype={'id': str})  # Fix DtypeWarning
keywords = pd.read_csv('keywords.csv')
credits = pd.read_csv('credits.csv')
links = pd.read_csv('links.csv')

# Convert 'id' columns to string (to ensure consistency)
movies_metadata['id'] = movies_metadata['id'].astype(str)
keywords['id'] = keywords['id'].astype(str)
credits['id'] = credits['id'].astype(str)
links['tmdbId'] = links['tmdbId'].astype(str)

# Merge data
movies_metadata = pd.merge(movies_metadata, keywords, on='id', how='left')
movies_metadata = pd.merge(movies_metadata, credits, on='id', how='left')
movies_metadata = pd.merge(movies_metadata, links, left_on='id', right_on='tmdbId', how='left')
final_data = pd.merge(movies_metadata, ratings, left_on='id', right_on='movieId', how='left')

# Feature engineering
final_data['genres'] = final_data['genres'].apply(lambda x: [genre['name'] for genre in eval(x)] if isinstance(x, str) else [])
genres_df = final_data['genres'].apply(lambda x: '|'.join(x) if x else '')
genres_df = pd.get_dummies(genres_df.apply(lambda x: x.split('|')).apply(pd.Series).stack())

# Group by the index and sum to get one-hot encoded columns
genres_df = genres_df.groupby(level=0).sum()

# Combine genres with other features
X = pd.concat([final_data[['runtime', 'vote_average']], genres_df], axis=1)
y = final_data['rating']

# Handle missing data
X['runtime'].fillna(X['runtime'].mean(), inplace=True)
X['vote_average'].fillna(X['vote_average'].mean(), inplace=True)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f"Root Mean Squared Error (RMSE): {rmse}")



ValueError: You are trying to merge on object and int64 columns for key 'id'. If you wish to proceed you should use pd.concat