In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump


In [2]:
# Load your dataset (if not already loaded)
df_movies = pd.read_csv("tmdb_movies_500_page_final.csv")

# Drop rows with nulls in key fields
df_model = df_movies.dropna(subset=[
    'budget', 'runtime', 'popularity', 'vote_average', 'vote_count', 'revenue', 'genre1'
])

df_model.head()

Unnamed: 0,title,budget,revenue,runtime,genres,popularity,vote_average,vote_count,belongs_to_collection,director,lead_actor,profit,profit_margin,release__year,production_company,genre1,genre2,genre3
0,Lilo & Stitch,80000000,273144151,85,"['Animation', 'Family', 'Comedy']",182.4251,7.547,6556,True,Chris Sanders,Daveigh Chase,193144151,0.707114,2002,Walt Disney Pictures,Animation,Family,Comedy
1,Moana 2,150000000,1059544057,100,"['Animation', 'Adventure', 'Family', 'Comedy']",114.242,7.081,2418,True,David G. Derrick Jr.,Auliʻi Cravalho,909544057,0.85843,2024,Walt Disney Pictures,Animation,Adventure,Family
2,Mufasa: The Lion King,200000000,721046090,118,"['Adventure', 'Family', 'Animation']",85.5401,7.397,2165,True,Barry Jenkins,Aaron Pierre,521046090,0.722625,2024,Walt Disney Pictures,Adventure,Family,Animation
3,Sonic the Hedgehog 3,122000000,486018457,110,"['Action', 'Science Fiction', 'Comedy', 'Family']",81.6122,7.715,2633,True,Jeff Fowler,Jim Carrey,364018457,0.748981,2024,Paramount Pictures,Action,Science Fiction,Comedy
4,Mission: Impossible - Dead Reckoning Part One,291000000,571125435,164,"['Action', 'Adventure', 'Thriller']",73.4752,7.532,4311,True,Christopher McQuarrie,Tom Cruise,280125435,0.49048,2023,Paramount Pictures,Action,Adventure,Thriller


In [3]:
# Encode the primary genre as numeric
df_model['genre1_encoded'] = df_model['genre1'].astype('category').cat.codes

# Define features and target
features = ['budget', 'runtime', 'vote_average', 'vote_count', 'genre1_encoded']
target = 'revenue'

X = df_model[features]
y = df_model[target]

In [4]:
# Train/test split (optional, for evaluation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Evaluate (optional)
preds = rf_model.predict(X_test)
print("R²:", round(r2_score(y_test, preds), 4))
print("MAE:", round(mean_absolute_error(y_test, preds), 2))
print("RMSE:", round(np.sqrt(mean_squared_error(y_test, preds)), 2))

R²: 0.6771
MAE: 70347525.41
RMSE: 149981105.51


In [5]:
import os
from joblib import dump

# Create 'models' directory if it doesn't exist
os.makedirs("models", exist_ok=True)

# Save the model
dump(rf_model, 'models/final_rf_model.pkl')


['models/final_rf_model.pkl']

In [6]:
import os
import joblib
import pandas as pd
import ipywidgets as widgets
from IPython.display import display, Markdown

# Safely get the directory of this notebook (works in Voila and Jupyter)
base_dir = os.getcwd()

# Safe paths
model_path = os.path.join(base_dir, 'models', 'final_rf_model.pkl')
csv_path = os.path.join(base_dir, 'tmdb_movies_500_page_final.csv')

# Load model and data
model = joblib.load(model_path)
df_movies = pd.read_csv(csv_path)

# Prepare genre mapping
df_model = df_movies.dropna(subset=[
    'budget', 'runtime', 'vote_average', 'vote_count', 'revenue', 'genre1'
])
genre_mapping = {genre: code for code, genre in enumerate(df_model['genre1'].astype('category').cat.categories)}

# Input widgets
budget = widgets.FloatText(description='Budget ($M):', value=100)
runtime = widgets.IntSlider(description='Runtime (min):', min=60, max=240, value=120)
vote_avg = widgets.FloatSlider(description='Vote Avg:', min=0, max=10, value=7)
vote_count = widgets.IntText(description='Vote Count:', value=1000)
genre = widgets.Dropdown(options=list(genre_mapping.keys()), description='Genre:')

predict_btn = widgets.Button(description='Predict Revenue', button_style='success')

# Callback
prediction = 0
def predict_revenue(b):
    input_df = pd.DataFrame([{
        'budget': budget.value * 1_000_000,
        'runtime': runtime.value,
        'vote_average': vote_avg.value,
        'vote_count': vote_count.value,
        'genre1_encoded': genre_mapping.get(genre.value, -1)
    }])
    prediction.value = model.predict(input_df)[0]
    display(Markdown(f"### 🎬 Predicted Revenue: **${prediction:,.2f}**"))

predict_btn.on_click(predict_revenue)
prediction = widgets.FloatText(description='Prediction:', value=prediction)
# Display chatbot
display(Markdown("## 🎥 Movie Revenue Prediction Chatbot"))
display(budget, runtime, vote_avg, vote_count, genre, predict_btn, prediction)


## 🎥 Movie Revenue Prediction Chatbot

FloatText(value=100.0, description='Budget ($M):')

IntSlider(value=120, description='Runtime (min):', max=240, min=60)

FloatSlider(value=7.0, description='Vote Avg:', max=10.0)

IntText(value=1000, description='Vote Count:')

Dropdown(description='Genre:', options=('Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', …

Button(button_style='success', description='Predict Revenue', style=ButtonStyle())

FloatText(value=0.0, description='Prediction:')

TypeError: unsupported format string passed to FloatText.__format__

### To run chatbot:

* pip install voila
* conda activate my_env
* voila movie_chatbot.ipynb
