## Querying TMDB with IDs from Grouplens

In [6]:
import pandas as pd
from google.cloud import bigquery
import requests
import time
import os

In [8]:
# Test that the API call work
api_key = os.getenv("TMDB_API_KEY")

In [3]:
query = """
SELECT tmdbId
FROM `film-wizard-453315.Grouplens.grouplens_links`
"""

In [4]:
client = bigquery.Client()
tmdb_list = client.query(query)
df = tmdb_list.to_dataframe()
df



Unnamed: 0,tmdbId
0,2
1,3
2,5
3,6
4,11
...,...
87456,1179468
87457,1181568
87458,1181806
87459,1182286


In [5]:
df.isna().sum()

tmdbId    0
dtype: int64

### Movie details

In [5]:
movie_details_list = []

for i in range(df.shape[0]):
    if pd.notna(df.iloc[i, 0]):  # Skip if NA
        tmdbId = int(df.iloc[i, 0])
        
        url = f"https://api.themoviedb.org/3/movie/{tmdbId}?language=en-US"
        
        headers = {
            "accept": "application/json",
            "Authorization": "Bearer TMDB_API_KEY"
print(api_key)"
        }
        
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:  # Check if the request was successful
            movie_data = response.json()  # Parse JSON response
            
            # Example: Extract relevant movie details
            movie_details = {
                "tmdbId": movie_data.get("id"),
                "title": movie_data.get("title"),
                "original_title": movie_data.get("original_title"),
                "overview": movie_data.get("overview"),
                "release_date": movie_data.get("release_date"),
                "runtime": movie_data.get("runtime"),
                "status": movie_data.get("status"),
                "tagline": movie_data.get("tagline"),
                "budget": movie_data.get("budget"),
                "revenue": movie_data.get("revenue"),
                "vote_average": movie_data.get("vote_average"),
                "vote_count": movie_data.get("vote_count"),
                "popularity": movie_data.get("popularity"),
                
                # Splitting genres into separate columns (up to the first 3)
                "genre_1": movie_data.get("genres", [])[0]['name'] if len(movie_data.get("genres", [])) > 0 else '',
                "genre_2": movie_data.get("genres", [])[1]['name'] if len(movie_data.get("genres", [])) > 1 else '',
                "genre_3": movie_data.get("genres", [])[2]['name'] if len(movie_data.get("genres", [])) > 2 else '',
                
                # Splitting production_companies into separate columns (up to the first 3)
                "production_company_1": movie_data.get("production_companies", [])[0]['name'] if len(movie_data.get("production_companies", [])) > 0 else '',
                "production_company_2": movie_data.get("production_companies", [])[1]['name'] if len(movie_data.get("production_companies", [])) > 1 else '',
                "production_company_3": movie_data.get("production_companies", [])[2]['name'] if len(movie_data.get("production_companies", [])) > 2 else '',
                
                # Splitting production_countries into separate columns (up to the first 3)
                "production_country_1": movie_data.get("production_countries", [])[0]['name'] if len(movie_data.get("production_countries", [])) > 0 else '',
                "production_country_2": movie_data.get("production_countries", [])[1]['name'] if len(movie_data.get("production_countries", [])) > 1 else '',
                "production_country_3": movie_data.get("production_countries", [])[2]['name'] if len(movie_data.get("production_countries", [])) > 2 else '',
                
                # Splitting spoken_languages into separate columns (up to the first 3)
                "spoken_language_1": movie_data.get("spoken_languages", [])[0]['name'] if len(movie_data.get("spoken_languages", [])) > 0 else '',
                "spoken_language_2": movie_data.get("spoken_languages", [])[1]['name'] if len(movie_data.get("spoken_languages", [])) > 1 else '',
                "spoken_language_3": movie_data.get("spoken_languages", [])[2]['name'] if len(movie_data.get("spoken_languages", [])) > 2 else '',
                
                "homepage": movie_data.get("homepage"),
                "imdb_id": movie_data.get("imdb_id"),
                "backdrop_path": movie_data.get("backdrop_path"),
                "poster_path": movie_data.get("poster_path"),
                
                # Splitting origin_country into separate columns (up to the first 3)
                "origin_country_1": movie_data.get("origin_country", [])[0] if len(movie_data.get("origin_country", [])) > 0 else '',
                "origin_country_2": movie_data.get("origin_country", [])[1] if len(movie_data.get("origin_country", [])) > 1 else '',
                "origin_country_3": movie_data.get("origin_country", [])[2] if len(movie_data.get("origin_country", [])) > 2 else '',
                
                "original_language": movie_data.get("original_language"),
            }
            
            movie_details_list.append(movie_details)

        # Add a 0.1 second break every 50 iterations
        if (i + 1) % 50 == 0:
            print(f"Pausing after {i + 1} iterations...")
            time.sleep(0.1)

# Convert the list of dictionaries into a pandas DataFrame
movie_df = pd.DataFrame(movie_details_list)

# # Intermediate check
# pd.set_option('display.max_columns', None)
# movie_df

# Save the DataFrame to a CSV file
movie_df.to_csv("/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/movie_details.csv", index=False, encoding='utf-8')

print("Movie details saved to 'movie_details.csv'")

Pausing after 50 iterations...
Pausing after 100 iterations...
Pausing after 150 iterations...
Pausing after 200 iterations...
Pausing after 250 iterations...
Pausing after 300 iterations...
Pausing after 350 iterations...
Pausing after 400 iterations...
Pausing after 450 iterations...
Pausing after 500 iterations...
Pausing after 550 iterations...
Pausing after 600 iterations...
Pausing after 650 iterations...
Pausing after 700 iterations...
Pausing after 750 iterations...
Pausing after 800 iterations...
Pausing after 850 iterations...
Pausing after 900 iterations...
Pausing after 950 iterations...
Pausing after 1000 iterations...
Pausing after 1050 iterations...
Pausing after 1100 iterations...
Pausing after 1150 iterations...
Pausing after 1200 iterations...
Pausing after 1250 iterations...
Pausing after 1300 iterations...
Pausing after 1350 iterations...
Pausing after 1400 iterations...
Pausing after 1450 iterations...
Pausing after 1500 iterations...
Pausing after 1550 iterations.

Pausing after 12450 iterations...
Pausing after 12500 iterations...
Pausing after 12550 iterations...
Pausing after 12600 iterations...
Pausing after 12650 iterations...
Pausing after 12700 iterations...
Pausing after 12750 iterations...
Pausing after 12800 iterations...
Pausing after 12850 iterations...
Pausing after 12900 iterations...
Pausing after 12950 iterations...
Pausing after 13000 iterations...
Pausing after 13050 iterations...
Pausing after 13100 iterations...
Pausing after 13150 iterations...
Pausing after 13200 iterations...
Pausing after 13250 iterations...
Pausing after 13300 iterations...
Pausing after 13350 iterations...
Pausing after 13400 iterations...
Pausing after 13450 iterations...
Pausing after 13500 iterations...
Pausing after 13550 iterations...
Pausing after 13600 iterations...
Pausing after 13650 iterations...
Pausing after 13700 iterations...
Pausing after 13750 iterations...
Pausing after 13800 iterations...
Pausing after 13850 iterations...
Pausing after 

Pausing after 24500 iterations...
Pausing after 24550 iterations...
Pausing after 24600 iterations...
Pausing after 24650 iterations...
Pausing after 24700 iterations...
Pausing after 24750 iterations...
Pausing after 24800 iterations...
Pausing after 24850 iterations...
Pausing after 24900 iterations...
Pausing after 24950 iterations...
Pausing after 25000 iterations...
Pausing after 25050 iterations...
Pausing after 25100 iterations...
Pausing after 25150 iterations...
Pausing after 25200 iterations...
Pausing after 25250 iterations...
Pausing after 25300 iterations...
Pausing after 25350 iterations...
Pausing after 25400 iterations...
Pausing after 25450 iterations...
Pausing after 25500 iterations...
Pausing after 25550 iterations...
Pausing after 25600 iterations...
Pausing after 25650 iterations...
Pausing after 25700 iterations...
Pausing after 25750 iterations...
Pausing after 25800 iterations...
Pausing after 25850 iterations...
Pausing after 25900 iterations...
Pausing after 

Pausing after 36550 iterations...
Pausing after 36600 iterations...
Pausing after 36650 iterations...
Pausing after 36700 iterations...
Pausing after 36750 iterations...
Pausing after 36800 iterations...
Pausing after 36850 iterations...
Pausing after 36900 iterations...
Pausing after 36950 iterations...
Pausing after 37000 iterations...
Pausing after 37050 iterations...
Pausing after 37100 iterations...
Pausing after 37150 iterations...
Pausing after 37200 iterations...
Pausing after 37250 iterations...
Pausing after 37300 iterations...
Pausing after 37350 iterations...
Pausing after 37400 iterations...
Pausing after 37450 iterations...
Pausing after 37500 iterations...
Pausing after 37550 iterations...
Pausing after 37600 iterations...
Pausing after 37650 iterations...
Pausing after 37700 iterations...
Pausing after 37750 iterations...
Pausing after 37800 iterations...
Pausing after 37850 iterations...
Pausing after 37900 iterations...
Pausing after 37950 iterations...
Pausing after 

Pausing after 48600 iterations...
Pausing after 48650 iterations...
Pausing after 48700 iterations...
Pausing after 48750 iterations...
Pausing after 48800 iterations...
Pausing after 48850 iterations...
Pausing after 48900 iterations...
Pausing after 48950 iterations...
Pausing after 49000 iterations...
Pausing after 49050 iterations...
Pausing after 49100 iterations...
Pausing after 49150 iterations...
Pausing after 49200 iterations...
Pausing after 49250 iterations...
Pausing after 49300 iterations...
Pausing after 49350 iterations...
Pausing after 49400 iterations...
Pausing after 49450 iterations...
Pausing after 49500 iterations...
Pausing after 49550 iterations...
Pausing after 49600 iterations...
Pausing after 49650 iterations...
Pausing after 49700 iterations...
Pausing after 49750 iterations...
Pausing after 49800 iterations...
Pausing after 49850 iterations...
Pausing after 49900 iterations...
Pausing after 49950 iterations...
Pausing after 50000 iterations...
Pausing after 

Pausing after 60650 iterations...
Pausing after 60700 iterations...
Pausing after 60750 iterations...
Pausing after 60800 iterations...
Pausing after 60850 iterations...
Pausing after 60900 iterations...
Pausing after 60950 iterations...
Pausing after 61000 iterations...
Pausing after 61050 iterations...
Pausing after 61100 iterations...
Pausing after 61150 iterations...
Pausing after 61200 iterations...
Pausing after 61250 iterations...
Pausing after 61300 iterations...
Pausing after 61350 iterations...
Pausing after 61400 iterations...
Pausing after 61450 iterations...
Pausing after 61500 iterations...
Pausing after 61550 iterations...
Pausing after 61600 iterations...
Pausing after 61650 iterations...
Pausing after 61700 iterations...
Pausing after 61750 iterations...
Pausing after 61800 iterations...
Pausing after 61850 iterations...
Pausing after 61900 iterations...
Pausing after 61950 iterations...
Pausing after 62000 iterations...
Pausing after 62050 iterations...
Pausing after 

Pausing after 72700 iterations...
Pausing after 72750 iterations...
Pausing after 72800 iterations...
Pausing after 72850 iterations...
Pausing after 72900 iterations...
Pausing after 72950 iterations...
Pausing after 73000 iterations...
Pausing after 73050 iterations...
Pausing after 73100 iterations...
Pausing after 73150 iterations...
Pausing after 73200 iterations...
Pausing after 73250 iterations...
Pausing after 73300 iterations...
Pausing after 73350 iterations...
Pausing after 73400 iterations...
Pausing after 73450 iterations...
Pausing after 73500 iterations...
Pausing after 73550 iterations...
Pausing after 73600 iterations...
Pausing after 73650 iterations...
Pausing after 73700 iterations...
Pausing after 73750 iterations...
Pausing after 73800 iterations...
Pausing after 73850 iterations...
Pausing after 73900 iterations...
Pausing after 73950 iterations...
Pausing after 74000 iterations...
Pausing after 74050 iterations...
Pausing after 74100 iterations...
Pausing after 

Pausing after 84750 iterations...
Pausing after 84800 iterations...
Pausing after 84850 iterations...
Pausing after 84900 iterations...
Pausing after 84950 iterations...
Pausing after 85000 iterations...
Pausing after 85050 iterations...
Pausing after 85100 iterations...
Pausing after 85150 iterations...
Pausing after 85200 iterations...
Pausing after 85250 iterations...
Pausing after 85300 iterations...
Pausing after 85350 iterations...
Pausing after 85400 iterations...
Pausing after 85450 iterations...
Pausing after 85500 iterations...
Pausing after 85550 iterations...
Pausing after 85600 iterations...
Pausing after 85650 iterations...
Pausing after 85700 iterations...
Pausing after 85750 iterations...
Pausing after 85800 iterations...
Pausing after 85850 iterations...
Pausing after 85900 iterations...
Pausing after 85950 iterations...
Pausing after 86000 iterations...
Pausing after 86050 iterations...
Pausing after 86100 iterations...
Pausing after 86150 iterations...
Pausing after 

### Movie credits

In [8]:
movie_credits_list = []

# Loop through the desired rows
for i in range(87300, 87310):    
    if pd.notna(df.iloc[i, 0]):  # Skip if NA
        tmdbId = int(df.iloc[i, 0])
        
        url = f"https://api.themoviedb.org/3/movie/{tmdbId}/credits?language=en-US"
        
        headers = {
            "accept": "application/json",
            "Authorization": "Bearer TMDB_API_KEY"
        }
        
        # Fetch the movie credits data
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            # Parse JSON response
            movie_credits_data = response.json()
            
            # Extract tmbdId
            tmbdId = movie_credits_data.get("id")
            tmbdId = {'tmbdId': tmbdId}
            
            # Extract actor names (only first 5 actors if available)
            cast = movie_credits_data.get("cast", [])
            actors = {f"actor_{i+1}": cast[i]["name"] for i in range(min(5, len(cast)))}

            # Filter crew members with specific jobs
            crew = movie_credits_data.get("crew", [])
            filtered_crew = {}
            for person in crew:
                if person.get("job") in ['Director', 'Writer', 'Producer', 'Executive Producer', 'Director of Photography', 'Editor', 'Original Music Composer']:
                    # Avoid overwriting if multiple people have the same job
                    job_title = person["job"]
                    if job_title in filtered_crew:
                        filtered_crew[job_title] += f", {person['name']}"
                    else:
                        filtered_crew[job_title] = person["name"]

            # Combine movie details, actors, and filtered crew into a single dictionary
            movie_credits = {**tmbdId, **actors, **filtered_crew}
            movie_credits_list.append(movie_credits)

# Convert the list of dictionaries into a pandas DataFrame
movie_credits_df = pd.DataFrame(movie_credits_list)

# Display the DataFrame with all columns shown
pd.set_option('display.max_columns', None)
movie_credits_df

Unnamed: 0,tmbd,actor_1,actor_2,actor_3,actor_4,actor_5,Director,Editor,Producer,Writer,Executive Producer,Director of Photography,Original Music Composer
0,1114816,Le Phong Vu,Nguyen Thinh,Nguyen Thi Truc Quynh,Vu Ngoc Manh,Dylan Besseau,Phạm Thiên Ân,Phạm Thiên Ân,"Jeremy Chua, Tran Van Thi",Phạm Thiên Ân,Tran Van Thi,Dinh Duy Hung,
1,1114842,Simone Buttelli,,,,,Maria Augusta V. Nunes,,,Maria Augusta V. Nunes,,,
2,1114901,,,,,,Henry Roosevelt,"Jawad Metni, Pax Wassermann",,,"Dan Cogan, Liz Garbus, Jon Bardin, Kate Barry",Patrick Ginnetty,
3,1114905,Manolo Cardona,Maribel Verdú,Carla Adell,Juan Carlos Remolina,Adriana Paz,Manolo Cardona,Camilo Abadía,,,,Luis Enrique Carrión,
4,1114928,Gayle Rankin,Hari Nef,Annabelle Dexter-Jones,Rad Pereira,Jared Abrahamson,Stewart Thorndike,"Kathryn J. Schubert, Thomas Emmet Ashton","Lizzie Shapiro, Lexi Tannenholtz",Stewart Thorndike,"Emily Gotto, Samuel Zimmerman, Nicholas Lazo",Grant Greenberg,Jason Falker
5,1114972,River Gallo,Alicia Roth Weigel,Sean Saifa Wall,Julie Cohen,,Julie Cohen,Kelly Kendrick,"Molly O'Brien, Tommy Nguyen",,"Andy Berg, Liz Cole, Elizabeth Fischer, Noah O...","Amy Bench, Kate Phelan, Leah Anova",Amanda Yamate
6,1115091,Rock Hudson,Joe Carberry,Tim Turner,Lee Garlington,Paul Garlington,Stephen Kijak,Claire Didier,"Sarah Schechter, Will Clarke, Greg Berlanti, G...",,"Mike Runagall, Michael McGrath, Andy Mayson, N...",,Laura Karpman
7,1115095,Lovell Gates,Jakkar Thompson,Clayton B. Stevens,,,Sam Pollard,Dave Marcus,"Dave Sirulnick, Byron Motley, Jen Isaacson, Ro...",,"Jeffrey Lurie, Jon Kamen, Todd Wagner, Shawn G...","Henry Adebonojo, Angel Barroeta",
8,1115128,Jeff Daniels,Charlie Day,Mike Veeck,Libby Veeck,Night Train Veeck,"Morgan Neville, Jeff Malmberg","Alan Lowe, Jeff Malmberg","Morgan Neville, Jon Berg, Danny Breen",,"Caitrin Rogers, Greg Silverman, Fran Zeuli",Antonio Cisneros,"Daniel Wohl, Garth Neustadter"
9,1115191,Bill Marler,Darin Detwiler,Dr. John Kobayashi,"Lance Price, Ph.D.",Robert Nugent,Stephanie Soechtig,Weston Cadwell,"Ross M. Dinerstein, Kristin Lazure",Jeff Benedict,"Jeff Benedict, Rebecca Evans, Ross Girard",,


In [24]:
url = "https://api.themoviedb.org/3/discover/movie"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer TMDB_API_KEY"
}

# Initialize an empty list to store all movies
all_movies = []
page = 1  # Start from page 1
max_pages = 200  # Limit to 20 pages

while page <= max_pages:
    params = {
        "include_adult": "false",
        "include_video": "false",
        "language": "en-US",
        "primary_release_year": 2025,
        "sort_by": "popularity.desc",
        "page": page
    }
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code == 200:
        data = response.json()
        movies = data.get("results", [])
        all_movies.extend(movies)  # Add movies to list
        
        total_pages = data.get("total_pages", 1)  # Update total pages
        
        print(f"Fetched page {page}/{min(total_pages, max_pages)}")  # Print progress
        
        if page >= total_pages:
            break  # Stop if there are no more pages
        
        page += 1  # Move to next page
        time.sleep(0.1)  # Pause for 0.1 seconds to avoid rate limits
    else:
        print(f"Failed to fetch page {page}: {response.status_code}")
        break  # Stop if there's an error

# Convert list of movies to DataFrame
df = pd.DataFrame(all_movies)

# Display DataFrame info
display(df.head())

# Convert to csv
df.to_csv("2025_tmdb_top_10k.csv", index=False)

Fetched page 1/200
Fetched page 2/200
Fetched page 3/200
Fetched page 4/200
Fetched page 5/200
Fetched page 6/200
Fetched page 7/200
Fetched page 8/200
Fetched page 9/200
Fetched page 10/200
Fetched page 11/200
Fetched page 12/200
Fetched page 13/200
Fetched page 14/200
Fetched page 15/200
Fetched page 16/200
Fetched page 17/200
Fetched page 18/200
Fetched page 19/200
Fetched page 20/200
Fetched page 21/200
Fetched page 22/200
Fetched page 23/200
Fetched page 24/200
Fetched page 25/200
Fetched page 26/200
Fetched page 27/200
Fetched page 28/200
Fetched page 29/200
Fetched page 30/200
Fetched page 31/200
Fetched page 32/200
Fetched page 33/200
Fetched page 34/200
Fetched page 35/200
Fetched page 36/200
Fetched page 37/200
Fetched page 38/200
Fetched page 39/200
Fetched page 40/200
Fetched page 41/200
Fetched page 42/200
Fetched page 43/200
Fetched page 44/200
Fetched page 45/200
Fetched page 46/200
Fetched page 47/200
Fetched page 48/200
Fetched page 49/200
Fetched page 50/200
Fetched p

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/9nhjGaFLKtddDPtPaX5EmKqsWdH.jpg,"[10749, 878, 53]",950396,en,The Gorge,Two highly trained operatives grow close from ...,126.244,/7iMBZzVZtG0oBug4TfqDb9ZxAOa.jpg,2025-02-13,The Gorge,False,7.8,1830
1,False,/ek8CJRZchT9YIB4p7ktEjPXuCIi.jpg,"[28, 53, 80]",1126166,en,Flight Risk,A U.S. Marshal escorts a government witness to...,119.998,/q0bCG4NX32iIEsRFZqRtuvzNCyZ.jpg,2025-01-22,Flight Risk,False,6.088,436
2,False,/hGLywNhy1Fo1rNFHsNZsXGS69B8.jpg,"[878, 35, 12]",696506,en,Mickey 17,Unlikely hero Mickey Barnes finds himself in t...,51.178,/edKpE9B5qN3e559OuMCLZdW1iBZ.jpg,2025-02-28,Mickey 17,False,7.0,418
3,False,/x8Kdi1OJbewkeGuijcSmUWrYyk2.jpg,"[28, 35, 10751, 878]",926670,en,Henry Danger: The Movie,Henry Hart meets a superfan—eager to fight cri...,49.298,/dFWj2rOGsqSIX1PHFghbCBgpMnk.jpg,2025-01-17,Henry Danger: The Movie,False,8.1,165
4,False,/sc1abgWNXc29wSBaerrjGBih06l.jpg,"[27, 878, 53]",1084199,en,Companion,During a weekend getaway at a secluded lakesid...,46.094,/oCoTgC3UyWGfyQ9thE10ulWR7bn.jpg,2025-01-22,Companion,False,7.0,730


## Cleaning up tmdb metadata for 1900-2023 films

In [112]:
# Shows that there are duplicates within the original API pull csv
df_meta_raw = pd.read_csv('/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/tmdb_movie_details.csv')
df_meta_raw.duplicated().sum()

  df_meta_raw = pd.read_csv('/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/tmdb_movie_details.csv')


np.int64(60855)

In [105]:
# Duplicates clean up
df_meta_raw.drop_duplicates(inplace=True)
df_meta_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 86360 entries, 0 to 147214
Data columns (total 34 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   tmdbId                86360 non-null  object 
 1   title                 86359 non-null  object 
 2   original_title        86360 non-null  object 
 3   overview              85752 non-null  object 
 4   release_date          86305 non-null  object 
 5   runtime               86354 non-null  float64
 6   status                86354 non-null  object 
 7   tagline               41360 non-null  object 
 8   budget                86354 non-null  float64
 9   revenue               86354 non-null  float64
 10  vote_average          86353 non-null  object 
 11  vote_count            86351 non-null  object 
 12  popularity            86350 non-null  object 
 13  genre_1               85314 non-null  object 
 14  genre_2               55955 non-null  object 
 15  genre_3               2

In [116]:
# Further clean up becasue Google bq is rejecting my upload
df_meta_raw['vote_average'] = pd.to_numeric(df_meta_raw['vote_average'], errors='coerce')
df_meta_raw['popularity'] = pd.to_numeric(df_meta_raw['popularity'], errors='coerce')
df_meta_raw['vote_count'] = pd.to_numeric(df_meta_raw['vote_count'], errors='coerce').astype('Int64')  # Keeps NaNsdf_meta_raw.info()
df_meta_raw = df_meta_raw[pd.to_numeric(df_meta_raw['tmdbId'], errors='coerce').notna()]

df_meta_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 147209 entries, 0 to 147214
Data columns (total 34 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   tmdbId                147209 non-null  object 
 1   title                 147209 non-null  object 
 2   original_title        147209 non-null  object 
 3   overview              146601 non-null  object 
 4   release_date          86340 non-null   object 
 5   runtime               86383 non-null   float64
 6   status                86383 non-null   object 
 7   tagline               41370 non-null   object 
 8   budget                86383 non-null   float64
 9   revenue               86383 non-null   float64
 10  vote_average          86383 non-null   float64
 11  vote_count            86383 non-null   Int64  
 12  popularity            86383 non-null   float64
 13  genre_1               85346 non-null   object 
 14  genre_2               55972 non-null   object 
 15  genre

In [117]:
df_meta_raw.to_csv('/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/tmdb_movie_details_clean.csv')