In [None]:
import requests
import pandas as pd
from time import sleep
import os
import json

# Import the API key from the local config module when it is available.
# The import is optional so that a key supplied only through the
# TMDB_API_KEY environment variable still works without a config.py.
try:
    from config import TMDB_API_KEY
except ImportError:
    TMDB_API_KEY = None

# Resolve the API key: the environment variable wins, config.py is the fallback
api_key = os.getenv("TMDB_API_KEY") or TMDB_API_KEY

# TMDB base URL
base_url = "https://api.themoviedb.org/3"

def fetch_movie_data(movie_id):
    """Fetch movie details from the TMDB API.

    Args:
        movie_id: TMDB movie identifier; interpolated into the request URL.

    Returns:
        The decoded JSON body when the API answers with status 200, or
        None when the request cannot be completed or the API answers with
        any other status (the problem is printed in both cases).
    """
    url = f"{base_url}/movie/{movie_id}"
    params = {
        "api_key": api_key,
        "language": "en-US"
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        # Connection errors/timeouts are reported like any other failed fetch.
        print(f"Error fetching data for movie ID {movie_id}: {exc}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Error fetching data for movie ID {movie_id}: {response.status_code}")
    try:
        print(response.json())  # Print error details from API
    except ValueError:
        # The error body was not JSON; the status line above is all we have.
        pass
    return None

def main():
    """Fetch a handful of sample movies, echo each response, and tabulate them.

    Every successfully fetched movie is pretty-printed as JSON and collected;
    the collected records are turned into a DataFrame whose head and column
    index are printed. Prints a notice instead when nothing was fetched.
    """
    # Small fixed batch of TMDB IDs for a first smoke test
    sample_ids = [550, 551, 552, 553, 554]
    fetched = []

    for current_id in sample_ids:
        payload = fetch_movie_data(current_id)
        if payload:
            print("\n🔍 API Response for Movie ID:", current_id)
            print(json.dumps(payload, indent=4))  # indented dump for readability
            fetched.append(payload)
        sleep(1)  # pause between requests to stay within rate limits

    # Nothing came back: report and stop
    if not fetched:
        print("No movie data fetched.")
        return

    # One row per fetched movie
    df = pd.DataFrame(fetched)

    print("\n Fetched Movie DataFrame:")
    print(df.head())

    print("\n DataFrame Columns:")
    print(df.columns)

    # Save DataFrame (optional)
    # df.to_csv("tmdb_movies_raw.csv", index=False)

# Run the sample pipeline only when this code executes as the main module.
if __name__ == "__main__":
    main()



In [None]:
# Probe /discover/movie once to see how many movies and pages match the
# release-date window before starting the full scrape.
url = f"{base_url}/discover/movie"
params = {
    "api_key": api_key,
    "language": "en-US",
    "primary_release_date.gte": "2000-01-01",
    "primary_release_date.lte": "2024-12-31",
    "page": 1
}

response = requests.get(url, params=params, timeout=30)
# Fail with the HTTP error itself rather than a KeyError on the lookups below.
response.raise_for_status()
data = response.json()

print(f"Total movies matching query: {data['total_results']}")
print(f"Total pages: {data['total_pages']}")


In [None]:
# Storage for results: one dict per successfully fetched movie
movie_data = []

# Seconds to wait on any single request before giving up
REQUEST_TIMEOUT = 30

# Loop through TMDB's /discover/movie endpoint.
# range(1, 400) requests pages 1-399; lower the upper bound (e.g. range(1, 3))
# for a quick test run.
for page in range(1, 400):
    discover_url = f"{base_url}/discover/movie"
    params = {
        "api_key": api_key,
        "language": "en-US",
        "sort_by": "popularity.desc",
        "include_adult": False,
        "include_video": False,
        "page": page,
        "primary_release_date.gte": "2000-01-01",
        "primary_release_date.lte": "2024-12-31"
    }

    response = requests.get(discover_url, params=params, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        print(f"Failed on page {page} — Status: {response.status_code}")
        try:
            print(response.json())  # Show error
        except ValueError:
            print(response.text)  # error body was not JSON
        break

    movies = response.json().get("results", [])

    for movie in movies:
        movie_id = movie["id"]

        # Fetch movie details and credits
        details_response = requests.get(
            f"{base_url}/movie/{movie_id}",
            params={"api_key": api_key},
            timeout=REQUEST_TIMEOUT,
        )
        credits_response = requests.get(
            f"{base_url}/movie/{movie_id}/credits",
            params={"api_key": api_key},
            timeout=REQUEST_TIMEOUT,
        )

        # An error payload has none of the fields read below and would be
        # stored as a row of None values, so skip the movie instead.
        if details_response.status_code != 200 or credits_response.status_code != 200:
            print(
                f"Skipping movie ID {movie_id} — details status: "
                f"{details_response.status_code}, credits status: {credits_response.status_code}"
            )
            continue

        movie_details = details_response.json()
        credits = credits_response.json()

        # Get director (first crew entry whose job is "Director") and top-billed actor;
        # .get() tolerates entries that lack a "job" or "name" key.
        director = next(
            (c.get("name") for c in credits.get("crew", []) if c.get("job") == "Director"),
            None,
        )
        cast = credits.get("cast", [])
        lead_actor = cast[0].get("name") if cast else None

        # Collect relevant fields
        movie_data.append({
            "title": movie_details.get("title"),
            "release_date": movie_details.get("release_date"),
            "budget": movie_details.get("budget"),
            "revenue": movie_details.get("revenue"),
            "runtime": movie_details.get("runtime"),
            "genres": [g["name"] for g in movie_details.get("genres", [])],
            "popularity": movie_details.get("popularity"),
            "vote_average": movie_details.get("vote_average"),
            "vote_count": movie_details.get("vote_count"),
            "original_language": movie_details.get("original_language"),
            "production_companies": [pc["name"] for pc in movie_details.get("production_companies", [])],
            "belongs_to_collection": movie_details.get("belongs_to_collection") is not None,
            "director": director,
            "lead_actor": lead_actor
        })

    print(f"Page {page} complete — movies collected: {len(movie_data)}")
    sleep(0.3)  # brief pause between pages

# Convert to DataFrame
df_movies = pd.DataFrame(movie_data)

# Preview first few rows
df_movies.head()


In [None]:
# Keep only rows where both budget and revenue are non-zero
# (presumably 0 means the figure was not reported — confirm against the API docs).
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not trigger SettingWithCopyWarning.
df_movies = df_movies[(df_movies['budget'] != 0) & (df_movies['revenue'] != 0)].copy()

# profit is revenue minus budget; profit_margin is profit as a share of revenue
# (revenue is non-zero after the filter above, so the division is defined).
df_movies['profit'] = df_movies['revenue'] - df_movies['budget']
df_movies['profit_margin'] = df_movies['profit'] / df_movies['revenue']
print("\nFinal DataFrame with Profit and Profit Margin:")
print(df_movies[['title', 'release_date', 'budget', 'revenue', 'profit', 'profit_margin']].head())

df_movies

In [None]:
# Parse the release dates (unparseable values become NaT) and keep only the year
parsed_release_dates = pd.to_datetime(df_movies["release_date"], errors='coerce')
df_movies["release__year"] = parsed_release_dates.dt.year

df_movies

In [None]:
# Keep English-language movies only. .copy() detaches the result from the
# frame it was filtered from, so later column assignments on df_movies do not
# raise SettingWithCopyWarning.
df_movies = df_movies[df_movies["original_language"] == "en"].copy()

df_movies

In [None]:
# Reduce the company list to its first entry; empty lists and non-list values give None
df_movies['production_company'] = df_movies['production_companies'].apply(
    lambda companies: companies[0] if isinstance(companies, list) and companies else None
)

df_movies

In [None]:
# Columns that earlier cells already replaced with derived ones
# (release__year, the English-only filter, production_company)
obsolete_columns = ['release_date', 'original_language', 'production_companies']
df_movies.drop(columns=obsolete_columns, inplace=True)

df_movies

In [None]:
# Renumber the rows 0..n-1 after the filters above; the old index is discarded
df_movies.reset_index(inplace=True, drop=True)

df_movies

In [None]:
# Function to extract up to 3 genres
def extract_genres(genre_list):
    """Split a movie's genre list into three fixed slots.

    Args:
        genre_list: list (or tuple) of genre names. Any other value, such as
            None or NaN, is treated as "no genres".

    Returns:
        A pandas Series indexed "genre1", "genre2", "genre3" holding the
        first three genres, padded with None when fewer than three exist.
    """
    # Guard against missing / non-sequence cells instead of raising TypeError
    if isinstance(genre_list, (list, tuple)):
        genres = list(genre_list[:3])
    else:
        genres = []
    # Pad with None if fewer than 3 genres
    genres += [None] * (3 - len(genres))
    return pd.Series(genres, index=["genre1", "genre2", "genre3"])
# Expand every genre list into a three-column frame, then attach it to df_movies
genre_slots = df_movies["genres"].apply(extract_genres)
df_movies[["genre1", "genre2", "genre3"]] = genre_slots
# Optional: drop the original genres column if no longer needed
# df_movies.drop(columns=["genres"], inplace=True)
# Show the first rows to check the new columns
df_movies.head()

In [None]:
# Write the prepared frame to disk as CSV, leaving out the row index
output_file = "tmdb_movies_final.csv"
df_movies.to_csv(path_or_buf=output_file, index=False)
print(f"\nData exported to {output_file}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Load the dataset

In [18]:
# Read the prepared dataset from disk
df_movies = pd.read_csv("tmdb_movies_500_page_final.csv")

# A row is discarded when any of these modelling fields is null
required_fields = [
    'budget', 'runtime', 'popularity', 'vote_average', 'vote_count', 'revenue', 'genre1'
]
df_movies = df_movies.dropna(subset=required_fields)

# # Optional: log-transform highly skewed features
# for col in ['budget', 'popularity', 'vote_count']:
#     df_movies[col] = np.log1p(df_movies[col])

df_movies.head()

Unnamed: 0,title,budget,revenue,runtime,genres,popularity,vote_average,vote_count,belongs_to_collection,director,lead_actor,profit,profit_margin,release__year,production_company,genre1,genre2,genre3
0,Lilo & Stitch,80000000,273144151,85,"['Animation', 'Family', 'Comedy']",182.4251,7.547,6556,True,Chris Sanders,Daveigh Chase,193144151,0.707114,2002,Walt Disney Pictures,Animation,Family,Comedy
1,Moana 2,150000000,1059544057,100,"['Animation', 'Adventure', 'Family', 'Comedy']",114.242,7.081,2418,True,David G. Derrick Jr.,Auliʻi Cravalho,909544057,0.85843,2024,Walt Disney Pictures,Animation,Adventure,Family
2,Mufasa: The Lion King,200000000,721046090,118,"['Adventure', 'Family', 'Animation']",85.5401,7.397,2165,True,Barry Jenkins,Aaron Pierre,521046090,0.722625,2024,Walt Disney Pictures,Adventure,Family,Animation
3,Sonic the Hedgehog 3,122000000,486018457,110,"['Action', 'Science Fiction', 'Comedy', 'Family']",81.6122,7.715,2633,True,Jeff Fowler,Jim Carrey,364018457,0.748981,2024,Paramount Pictures,Action,Science Fiction,Comedy
4,Mission: Impossible - Dead Reckoning Part One,291000000,571125435,164,"['Action', 'Adventure', 'Thriller']",73.4752,7.532,4311,True,Christopher McQuarrie,Tom Cruise,280125435,0.49048,2023,Paramount Pictures,Action,Adventure,Thriller


In [19]:
# Categorical columns to expand into indicator columns; drop_first=True omits
# the first category of each so the indicators are not fully redundant
categorical_columns = ['production_company', 'director', 'lead_actor', 'genre1', 'genre2', 'genre3']
df_model = pd.get_dummies(df_movies, columns=categorical_columns, drop_first=True)
df_model.head()


Unnamed: 0,title,budget,revenue,runtime,genres,popularity,vote_average,vote_count,belongs_to_collection,profit,...,genre3_History,genre3_Horror,genre3_Music,genre3_Mystery,genre3_Romance,genre3_Science Fiction,genre3_TV Movie,genre3_Thriller,genre3_War,genre3_Western
0,Lilo & Stitch,80000000,273144151,85,"['Animation', 'Family', 'Comedy']",182.4251,7.547,6556,True,193144151,...,False,False,False,False,False,False,False,False,False,False
1,Moana 2,150000000,1059544057,100,"['Animation', 'Adventure', 'Family', 'Comedy']",114.242,7.081,2418,True,909544057,...,False,False,False,False,False,False,False,False,False,False
2,Mufasa: The Lion King,200000000,721046090,118,"['Adventure', 'Family', 'Animation']",85.5401,7.397,2165,True,521046090,...,False,False,False,False,False,False,False,False,False,False
3,Sonic the Hedgehog 3,122000000,486018457,110,"['Action', 'Science Fiction', 'Comedy', 'Family']",81.6122,7.715,2633,True,364018457,...,False,False,False,False,False,False,False,False,False,False
4,Mission: Impossible - Dead Reckoning Part One,291000000,571125435,164,"['Action', 'Adventure', 'Thriller']",73.4752,7.532,4311,True,280125435,...,False,False,False,False,False,False,False,True,False,False


In [20]:
# Turn +/- infinity into NaN so the dropna below removes those rows as well
infinite_values = [np.inf, -np.inf]
df_model.replace(infinite_values, np.nan, inplace=True)

# Drop every row that still contains a NaN
df_model.dropna(inplace=True)

In [21]:
# Select features and target.
# Excluded by name: identifiers/text (title, genres), the post-release signals
# (popularity, vote_count, vote_average) and everything derived from the target
# (profit, profit_margin, revenue itself).
excluded_cols = {'title', 'genres', 'popularity', 'vote_count', 'vote_average', 'profit', 'profit_margin', 'revenue'}

# Also excluded: any "Unnamed: N" column. pandas gives that name to a saved row
# index when the CSV is read back; it is a row number, not a property of the movie.
feature_cols = [
    col for col in df_model.columns
    if col not in excluded_cols and not str(col).startswith('Unnamed')
]

X = df_model[feature_cols]
y = df_model['revenue']

# Split into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Candidate regressors, keyed by the label used in the comparison table
models = {
    'LinearRegression': LinearRegression(),
    'DecisionTree': DecisionTreeRegressor(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42)
}

# One summary row per model: fit on the training split, score on the test split
results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Error metrics on the held-out data
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    row = {'Model': name}
    row['MAE'] = round(mae, 2)
    row['RMSE'] = round(rmse, 2)
    row['R²'] = round(r2, 4)
    results.append(row)

# Comparison table, best R² first
results_df = pd.DataFrame(results).sort_values(by='R²', ascending=False)
print(results_df)

              Model           MAE          RMSE      R²
3           XGBoost  7.768740e+07  1.519908e+08  0.6684
2      RandomForest  7.624430e+07  1.549151e+08  0.6555
1      DecisionTree  9.598921e+07  1.852111e+08  0.5076
0  LinearRegression  3.751534e+08  5.799172e+08 -3.8276


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Plot: Actual vs Predicted Revenue (Random Forest)
# Refit a random forest with the same settings used in the model comparison
best_model = RandomForestRegressor(n_estimators=100, random_state=42)
best_model.fit(X_train, y_train)
preds_rf = best_model.predict(X_test)

# Scatter of predictions against truth, with the y = x reference line in dashed red
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=preds_rf, alpha=0.6, ax=ax)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--')
ax.set_xlabel('Actual Revenue')
ax.set_ylabel('Predicted Revenue')
ax.set_title('🎬 Random Forest: Actual vs Predicted Revenue')
ax.grid(True)
fig.tight_layout()
plt.show()




In [None]:
# 2. Plot: Feature Importance
importances = best_model.feature_importances_
features = X_train.columns
feat_imp = pd.Series(importances, index=features).sort_values(ascending=False)

# The feature matrix holds one indicator column per director, lead actor,
# production company and genre, so plotting every feature gives an unreadable
# chart. Show only the strongest ones; feat_imp itself still holds the full ranking.
TOP_N_FEATURES = 20
top_feat_imp = feat_imp.head(TOP_N_FEATURES)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_feat_imp.values, y=top_feat_imp.index, palette='viridis')
plt.title('🔍 Feature Importance - Random Forest')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Dump every column name of the encoded frame as a plain list
print(list(df_model.columns))

In [None]:
genre_columns = [col for col in df_model.columns if col.startswith('genre1_')]

# The indicators were built with drop_first=True, so the first genre category
# has no genre1_ column: its rows are all-False, and idxmax over genre_columns
# would label them with whichever genre1_ column happens to come first.
# Take the label from the frame the indicators were encoded from instead;
# the rows are matched through the shared index labels.
# NOTE(review): assumes df_movies is still the frame df_model was built from.
df_model['genre1'] = df_movies.loc[df_model.index, 'genre1']

In [None]:
import plotly.express as px

#Option 1

# Scatter settings gathered in one place, then handed to plotly express
scatter_options = dict(
    x='budget',
    y='revenue',
    color='genre1',          # one colour per genre label (e.g., Action, Comedy)
    size='popularity',       # marker size follows popularity
    hover_data=['title'],    # movie title appears on hover
    title='💸 Budget vs Revenue (Colored by Genre)',
    labels={'budget': 'Budget ($)', 'revenue': 'Revenue ($)'},
)
fig1 = px.scatter(df_model, **scatter_options)

fig1.update_layout(template='plotly_white')
fig1.show()

In [None]:

#Option 2

# Same budget/revenue scatter; the white template is applied in the same
# expression (update_layout hands back the figure it modified)
fig1 = px.scatter(
    data_frame=df_model,
    x='budget',
    y='revenue',
    color='genre1',  # string genre label such as 'Comedy' or 'Drama'
    size='popularity',
    hover_data=['title'],
    title='💸 Budget vs Revenue (Colored by Genre)',
    labels={'budget': 'Budget ($)', 'revenue': 'Revenue ($)'}
).update_layout(template='plotly_white')
fig1.show()

In [None]:
# 2. Feature Importance Bar Chart
feat_imp = pd.Series(best_model.feature_importances_, index=X_train.columns).sort_values(ascending=False)

# One bar per indicator column (every director, actor, company, genre) is
# unreadable, so chart only the strongest features; feat_imp keeps the full ranking.
TOP_N_FEATURES = 20
top_feat_imp = feat_imp.head(TOP_N_FEATURES)

fig2 = px.bar(x=top_feat_imp.values, y=top_feat_imp.index,
              orientation='h',
              title='🎯 Feature Importance (Random Forest)',
              labels={'x': 'Importance Score', 'y': 'Feature'})
# Reverse the y axis so the most important feature is drawn at the top
fig2.update_layout(yaxis=dict(autorange="reversed"))
fig2.show()

In [None]:
# 3. Box Plot: Revenue by Genre
# One box per primary genre, with every individual movie drawn as a point
box_options = {
    'x': 'genre1',
    'y': 'revenue',
    'points': "all",
    'title': '🎬 Revenue Distribution by Primary Genre',
}
fig3 = px.box(df_movies, **box_options)
fig3.update_layout(xaxis_title='Genre', yaxis_title='Revenue')
fig3.show()