In [34]:
# Predicting whether or not a movie is going to be successful based on its
# budget and top actor popularity. The popularity of the movie will be based on
# its ratings by the people. Each movie needs at least 1000 votes to be valid.

# For a more focused sample size, I have only picked the 500 most popular
# movies listed for the U.S. region.

#Part 1: Data Curation

# I have made a table with the movie names, ratings, budget and top actor
# popularity. Eliminated any movies with under 1000 votes. Changed budget into
# millions.


import os
import time

import pandas as pd
import requests

# TMDB credentials and endpoint.
# The key is read from the TMDB_API_KEY environment variable so it can be
# supplied without editing the notebook.
# FIXME(security): the literal fallback below is a credential committed to the
# notebook. It is kept only so existing runs keep working; rotate the key and
# delete the fallback once TMDB_API_KEY is set wherever this notebook runs.
API_KEY = os.environ.get("TMDB_API_KEY", "6c30ec8965840313bb630941ccabc149")
BASE_URL = "https://api.themoviedb.org/3"

# Function to get the most popular movies
def get_popular_movies(region="US", total_movies=500, timeout=10):
    """Collect entries from the ``/movie/popular`` listing, page by page.

    Args:
        region: Value sent as the ``region`` query parameter.
        total_movies: Maximum number of movie records to return.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        A list of at most ``total_movies`` result dicts, in the order they
        were received. If a request fails part-way through, a diagnostic is
        printed and whatever was collected up to that point is returned.
    """
    page_size = 20  # a page shorter than this is treated as the last one
    movies = []
    page = 1

    while len(movies) < total_movies:
        query = {
            "api_key": API_KEY,
            "language": "en-US",
            "region": region,
            "page": page,
        }
        try:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(
                f"{BASE_URL}/movie/popular", params=query, timeout=timeout
            )
        except requests.RequestException as exc:
            print("Error fetching data:", exc)
            break

        if response.status_code != 200:
            # Error bodies are not guaranteed to be JSON (e.g. a proxy's HTML
            # error page), so fall back to the raw text.
            try:
                error_body = response.json()
            except ValueError:
                error_body = response.text
            print("Error fetching data:", error_body)
            break

        # `or []` also covers a payload whose "results" is null.
        data = response.json().get("results") or []
        movies.extend(data)

        if len(data) < page_size:
            break

        page += 1

    return movies[:total_movies]

# Function to get movie details by ID
def get_movie_details(movie_id, timeout=10):
    """Fetch the detail record for one movie.

    Args:
        movie_id: Movie identifier, interpolated into the request path.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` on any other status, or
        when the request itself fails; a diagnostic is printed in both cases.
    """
    try:
        # A timeout keeps one stalled request from hanging the whole batch.
        response = requests.get(
            f"{BASE_URL}/movie/{movie_id}",
            params={"api_key": API_KEY},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Same best-effort contract as a non-200 reply: report and return None.
        print(f"Error fetching details for movie ID {movie_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching details for movie ID {movie_id}: {response.status_code}")
        return None

    return response.json()

# Function to get movie credits by ID
def get_movie_credits(movie_id, timeout=10):
    """Fetch the credits record for one movie.

    Args:
        movie_id: Movie identifier, interpolated into the request path.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The decoded JSON body on HTTP 200. ``None`` on any other status, or
        when the request itself fails; a diagnostic is printed in both cases.
    """
    try:
        # A timeout keeps one stalled request from hanging the whole batch.
        response = requests.get(
            f"{BASE_URL}/movie/{movie_id}/credits",
            params={"api_key": API_KEY},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Same best-effort contract as a non-200 reply: report and return None.
        print(f"Error fetching credits for movie ID {movie_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching credits for movie ID {movie_id}: {response.status_code}")
        return None

    return response.json()

# Function to get actor popularity by ID
def get_actor_popularity(actor_id, timeout=10):
    """Fetch the ``popularity`` field of one person record.

    Args:
        actor_id: Person identifier, interpolated into the request path.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The value stored under ``'popularity'`` in the response body (``None``
        if that key is absent). ``None`` is also returned, after printing a
        diagnostic, on a non-200 status or when the request itself fails.
    """
    try:
        # A timeout keeps one stalled request from hanging the whole batch.
        response = requests.get(
            f"{BASE_URL}/person/{actor_id}",
            params={"api_key": API_KEY},
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Same best-effort contract as a non-200 reply: report and return None.
        print(f"Error fetching popularity for actor ID {actor_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching popularity for actor ID {actor_id}: {response.status_code}")
        return None

    return response.json().get('popularity')


# Tunable values used by the steps below.
MIN_VOTE_COUNT = 1000          # movies with fewer votes are discarded
REQUEST_DELAY_SECONDS = 0.05   # small pause per movie to avoid hitting API limits
BUDGET_UNIT = 1_000_000        # budget is reported in millions

movies_data = get_popular_movies()

# Selecting columns of interest. Passing `columns=` to the constructor (instead
# of indexing afterwards) keeps these six columns present even when the fetch
# returned nothing, so the rest of the cell runs rather than raising KeyError.
columns = ["id", "title", "release_date", "vote_average", "vote_count", "popularity"]
df = pd.DataFrame(movies_data, columns=columns)

# Filter out movies with vote count less than 1000.
# .copy() gives an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning.
df = df[df['vote_count'] >= MIN_VOTE_COUNT].copy()

# Look up the budget of every remaining movie. Values are collected in a list
# and attached to the frame in one step rather than written row by row.
budgets = []
for movie_id in df['id']:
    details = get_movie_details(movie_id)
    budgets.append(details.get('budget') if details else None)
    time.sleep(REQUEST_DELAY_SECONDS)

# Look up the popularity of the first-listed cast member of every movie.
top_actor_popularities = []
for movie_id in df['id']:
    movie_credits = get_movie_credits(movie_id)
    cast = movie_credits.get('cast') if movie_credits else None
    if cast:
        top_actor_popularities.append(get_actor_popularity(cast[0]['id']))
    else:
        # No credits, or an empty/missing cast list: leave the value missing.
        top_actor_popularities.append(None)
    time.sleep(REQUEST_DELAY_SECONDS)

# Failed lookups are stored as None; pd.to_numeric(..., errors='coerce') turns
# them into NaN so the columns are numeric and the division below cannot raise
# TypeError on a None entry.
# NOTE(review): the "(out of 10)" in the column name is kept unchanged for
# compatibility; nothing in this cell establishes that scale -- confirm.
df['top_actor_popularity (out of 10)'] = pd.to_numeric(
    pd.Series(top_actor_popularities, index=df.index, dtype=object),
    errors='coerce',
)

# Converting budget to millions; the raw budget is never stored as a column.
df['budget_in_millions'] = pd.to_numeric(
    pd.Series(budgets, index=df.index, dtype=object),
    errors='coerce',
) / BUDGET_UNIT

# Sort by vote_average in descending order
df = df.sort_values(by='vote_average', ascending=False)

display(df)


Unnamed: 0,id,title,release_date,vote_average,vote_count,popularity,top_actor_popularity (out of 10),budget_in_millions
115,278,The Shawshank Redemption,1994-09-23,8.713,29052,30.3534,2.2896,25.0
110,238,The Godfather,1972-03-24,8.700,21954,31.2192,1.6597,6.0
269,240,The Godfather Part II,1974-12-20,8.572,13263,18.6726,2.7997,13.0
366,424,Schindler's List,1993-12-15,8.567,16785,15.6155,5.3971,22.0
217,129,Spirited Away,2002-09-20,8.535,17566,20.8639,1.2996,19.0
...,...,...,...,...,...,...,...,...
396,439079,The Nun,2018-09-07,5.906,6990,14.9550,2.2533,22.0
153,216015,Fifty Shades of Grey,2015-02-13,5.880,12130,24.9334,4.4752,40.0
478,957452,The Crow,2024-08-23,5.823,1147,13.4592,3.8964,50.0
499,9714,Home Alone 3,1997-12-12,5.276,3152,13.2795,3.9553,32.0
