# Step 1: Fetch Movie Data from the TMDB API


In [11]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
import json

# Load variables from a local .env file (if one exists) into the process
# environment before reading any of them.
load_dotenv()

# Bearer token sent with every API request; None when the variable is unset.
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")

In [12]:
base_url = "https://api.themoviedb.org/3/movie/"

# Seconds to wait for the server before giving up on a single request.
# Without a timeout a stalled connection would hang the notebook indefinitely.
REQUEST_TIMEOUT_S = 10

# Fail early with a clear message instead of sending "Bearer None" and
# collecting an error response for every single movie.
if not ACCESS_TOKEN:
    raise RuntimeError(
        "ACCESS_TOKEN is not set; define it in the environment or in a .env file."
    )

headers = {"accept": "application/json", "Authorization": f"Bearer {ACCESS_TOKEN}"}

# The first ID (0) does not resolve to a movie (the API answers 404), so it
# exercises the failure branch below.
movie_ids = [
    0,
    299534,
    19995,
    140607,
    299536,
    597,
    135397,
    420818,
    24428,
    168259,
    99861,
    284054,
    12445,
    181808,
    330457,
    351286,
    109445,
    321612,
    260513,
]

movies = []

# Fetch movie data for each ID. A session reuses the underlying connection
# across requests and carries the auth headers for all of them.
with requests.Session() as session:
    session.headers.update(headers)

    for movie_id in movie_ids:
        url = f"{base_url}{movie_id}"
        try:
            response = session.get(
                url,
                params={"append_to_response": "credits"},
                timeout=REQUEST_TIMEOUT_S,
            )
        except requests.RequestException as exc:
            # Network-level failure (DNS, connection reset, timeout, ...):
            # report it and keep going so one bad request does not lose the rest.
            print(f"Failed to fetch data for movie ID {movie_id}: {exc}")
            continue

        if response.status_code == 200:
            movies.append(response.json())
        else:
            print(
                f"Failed to fetch data for movie ID {movie_id}: {response.status_code}"
            )

# Keep a copy of the raw API payloads on disk.
with open("movies_data.json", "w", encoding="utf-8") as f:
    json.dump(movies, f, indent=4)

# Create dataframe
movies_df = pd.DataFrame(movies)
print(f"Fetched {len(movies_df)} movies successfully.")

Failed to fetch data for movie ID 0: 404
Fetched 18 movies successfully.


# Step 2: Data Cleaning and Preprocessing


In [13]:
# NOTE: this cell consumes the raw `movies_df` built by the fetch cell and
# rebinds `movies_df` to the cleaned frame at the very end. Re-run the fetch
# cell before re-running this one.


def _join_names(items, key="name", sep="|"):
    """Join ``item[key]`` of every dict in ``items`` into one ``sep``-delimited string.

    Returns ``None`` when ``items`` is not a list (for example a missing value),
    so a missing field stays missing instead of raising ``TypeError``.
    Entries that are not dicts or have no value under ``key`` are skipped.
    """
    if not isinstance(items, list):
        return None
    return sep.join(
        item[key] for item in items if isinstance(item, dict) and item.get(key)
    )


def _credit_members(credits, role):
    """Return the list stored under ``credits[role]``, or ``[]`` when it is absent."""
    if isinstance(credits, dict) and isinstance(credits.get(role), list):
        return credits[role]
    return []


# Drop irrelevant columns (returns a new frame; the raw one is left untouched)
movies_clean_df = movies_df.drop(
    columns=[
        "adult",
        "imdb_id",
        "original_title",
        "video",
        "homepage",
        "backdrop_path",
    ]
)

# Extract name from collection. The name is written back into
# "belongs_to_collection" because that is the column exported below; storing it
# under a separate column meant the export still carried the raw dict.
movies_clean_df["belongs_to_collection"] = movies_clean_df[
    "belongs_to_collection"
].apply(lambda x: x["name"] if isinstance(x, dict) and "name" in x else None)

# Flatten list-of-dict columns into "a|b|c" strings.
# Maps column name -> key holding the display name inside each dict.
nested_name_keys = {
    "genres": "name",
    "spoken_languages": "english_name",
    "production_countries": "name",
    "production_companies": "name",
}
for column, name_key in nested_name_keys.items():
    movies_clean_df[column] = movies_clean_df[column].apply(
        lambda items, name_key=name_key: _join_names(items, name_key)
    )

# Extract cast, director, and their sizes
cast_lists = movies_clean_df["credits"].apply(
    lambda credits: _credit_members(credits, "cast")
)
crew_lists = movies_clean_df["credits"].apply(
    lambda credits: _credit_members(credits, "crew")
)
movies_clean_df["cast"] = cast_lists.apply(_join_names)
movies_clean_df["cast_size"] = cast_lists.apply(len)
movies_clean_df["director"] = crew_lists.apply(
    lambda crew: _join_names(
        [
            member
            for member in crew
            if isinstance(member, dict) and member.get("job") == "Director"
        ]
    )
)
movies_clean_df["crew_size"] = crew_lists.apply(len)
movies_clean_df = movies_clean_df.drop(columns=["credits"])

# Convert column data types
for column in ["budget", "id", "popularity", "revenue", "runtime"]:
    movies_clean_df[column] = pd.to_numeric(movies_clean_df[column], errors="coerce")
movies_clean_df["release_date"] = pd.to_datetime(
    movies_clean_df["release_date"], errors="coerce"
)

# Replace unrealistic values with NaN. `.where` keeps strictly positive values
# and upcasts to float only when something is actually replaced, instead of
# assigning pd.NA into an integer column through `.loc`.
for column in ["budget", "revenue", "runtime"]:
    movies_clean_df[column] = movies_clean_df[column].where(
        movies_clean_df[column] > 0
    )

# convert budget and revenue to millions
movies_clean_df["budget_musd"] = movies_clean_df["budget"] / 1_000_000
movies_clean_df["revenue_musd"] = movies_clean_df["revenue"] / 1_000_000
movies_clean_df = movies_clean_df.drop(columns=["budget", "revenue"])

# an average over zero votes carries no information
movies_clean_df["vote_average"] = movies_clean_df["vote_average"].mask(
    movies_clean_df["vote_count"] == 0
)

# replace text placeholders with NaN
known_placeholders = ["", "-", "N/A", "No Data", "No overview", "Unknown", "None"]
for column in ["overview", "tagline"]:
    movies_clean_df[column] = movies_clean_df[column].replace(
        known_placeholders, pd.NA
    )

# remove duplicates, then drop rows with unknown id or title
movies_clean_df = movies_clean_df.drop_duplicates(subset=["id"], keep="first").dropna(
    subset=["id", "title"]
)

# keep row with at least 10 non-NaN values
movies_clean_df = movies_clean_df[movies_clean_df.notna().sum(axis=1) >= 10]

# filter only movies released then drop status
movies_clean_df = movies_clean_df[movies_clean_df["status"] == "Released"].drop(
    columns=["status"]
)

ordered_columns = [
    "id",
    "title",
    "tagline",
    "release_date",
    "genres",
    "belongs_to_collection",
    "original_language",
    "budget_musd",
    "revenue_musd",
    "production_companies",
    "production_countries",
    "vote_count",
    "vote_average",
    "popularity",
    "runtime",
    "overview",
    "spoken_languages",
    "poster_path",
    "cast",
    "cast_size",
    "director",
    "crew_size",
]

movies_df = movies_clean_df[ordered_columns].reset_index(drop=True)

# Sanity check: duplicates were removed above, so ids must be unique.
assert movies_df["id"].is_unique, "duplicate movie ids after cleaning"

movies_df.to_csv("cleaned_movies_data.csv", index=False)

# info() writes its summary itself and returns None, so it is not wrapped in print().
movies_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     18 non-null     int64         
 1   title                  18 non-null     object        
 2   tagline                18 non-null     object        
 3   release_date           18 non-null     datetime64[ns]
 4   genres                 18 non-null     object        
 5   belongs_to_collection  16 non-null     object        
 6   original_language      18 non-null     object        
 7   budget_musd            18 non-null     float64       
 8   revenue_musd           18 non-null     float64       
 9   production_companies   18 non-null     object        
 10  production_countries   18 non-null     object        
 11  vote_count             18 non-null     int64         
 12  vote_average           18 non-null     float64       
 13  popular