In [1]:
import os

import pandas as pd

# Read the four IMDb exports into DataFrames.
# read_table parses tab-separated text by default, so no explicit separator is needed.
# low_memory=False is passed for the first three files only, exactly as before.
titles_basics = pd.read_table('title.basics.tsv', low_memory=False)    # title details
ratings = pd.read_table('title.ratings.tsv', low_memory=False)         # rating per title
names_basics = pd.read_table('name.basics.tsv', low_memory=False)      # people
title_principals = pd.read_table('title.principals.tsv')               # person <-> title roles

In [2]:
# Step 1: Find Nicolas Cage's unique ID (nconst)
cage_data = names_basics[names_basics['primaryName'] == 'Nicolas Cage']      # Exact name match
if cage_data.empty:
    # Without this guard an empty match surfaces as an opaque IndexError from .iloc[0].
    raise ValueError("No person named 'Nicolas Cage' found in name.basics.tsv")
# If several people share this exact name, the first match in file order is used;
# check the printed ID against the expected one.
cage_nconst = cage_data['nconst'].iloc[0]                                    # Get his ID
print(f"Nicolas Cage's ID: {cage_nconst}")

Nicolas Cage's ID: nm0000115


In [3]:
# Step 2: Collect every title Nicolas Cage is credited on, whatever the role
cage_roles = title_principals.loc[title_principals['nconst'].eq(cage_nconst)]
cage_tconsts = pd.unique(cage_roles['tconst'])   # de-duplicated title IDs
print(f"Found {len(cage_tconsts)} titles with Nicolas Cage.")

# Note: to keep acting credits only, combine the filter above with
#   & (title_principals['category'] == 'actor')
# (wrap each condition in parentheses when combining with &).

Found 518 titles with Nicolas Cage.


In [4]:
# Step 3: Keep the rows of titles_basics whose tconst is one of Cage's titles
# (every title type is kept at this point, not only movies)
cage_movies = titles_basics.loc[titles_basics['tconst'].isin(cage_tconsts)]

# Note: to keep movies only, combine the filter above with
#   & (titles_basics['titleType'] == 'movie')

# Step 4: Narrow to the key columns and turn the '\N' placeholder into 'Unknown'.
# The chain builds a new, independent frame, so cage_movies itself is left untouched.
cage_movies_list = (
    cage_movies
    .loc[:, ['tconst', 'primaryTitle', 'startYear', 'genres']]
    .assign(
        startYear=lambda frame: frame['startYear'].replace('\\N', 'Unknown'),
        genres=lambda frame: frame['genres'].replace('\\N', 'Unknown'),
    )
)

In [5]:
# Step 5 (optional): attach rating data to each title.
# Left join on the shared tconst key: titles with no row in `ratings` are kept
# and simply have no averageRating / numVotes value.
cage_movies_with_ratings = pd.merge(
    cage_movies_list,
    ratings.loc[:, ['tconst', 'averageRating', 'numVotes']],
    on='tconst',
    how='left',
)

In [None]:
# Step 6: Print the whole table as plain text, without the index column
print("\nNicolas Cage Movies:")
print(
    cage_movies_with_ratings
    .loc[:, ['primaryTitle', 'startYear', 'genres', 'averageRating', 'numVotes']]
    .to_string(index=False)
)

# Step 7: Write every column of the merged table to a CSV file (index omitted)
cage_movies_with_ratings.to_csv('Nicolas_Cage_Works.csv', index=False)
print("\nResults saved to 'Nicolas_Cage_Works.csv'.")


Nicolas Cage Movies:
                                                                                        primaryTitle startYear                        genres  averageRating  numVotes
                                                                                   The Best of Times      1981                        Comedy            5.1     386.0
                                                                                         Rumble Fish      1983           Crime,Drama,Romance            7.1   39616.0
                                                                                         Valley Girl      1983                Comedy,Romance            6.4   19693.0
                                                                                               Birdy      1984                     Drama,War            7.2   25825.0
                                                                                     The Cotton Club      1984             Crime,Drama,Music        

In [6]:
import pandas as pd
import requests
from time import sleep

# TMDB configuration
# The API key is read from the TMDB_API_KEY environment variable so the credential
# is never stored in (or committed with) the notebook. Set it before starting
# Jupyter, e.g.  export TMDB_API_KEY=<your key>
# NOTE(review): a key was previously hardcoded on this line; it should be
# revoked/rotated in the TMDB account settings.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY', '')
if not TMDB_API_KEY:
    print("Warning: TMDB_API_KEY is not set; TMDB poster lookups will fail.")

# Prefix prepended to a poster path returned by TMDB to form the image URL.
TMDB_IMAGE_BASE = 'https://image.tmdb.org/t/p/w500'


def get_tmdb_poster(title, year=None, timeout=10):
    """Search TMDB for a title and return the URL of its poster image.

    Calls TMDB's ``/3/search/movie`` endpoint and uses the first search hit.
    The lookup is best-effort: any failure is printed and reported as ``None``
    so that one problematic title does not abort a batch of lookups.

    Args:
        title: Title text to search for.
        year: Optional release year used to narrow the search. Falsy values
            and the placeholder ``'Unknown'`` are not sent to TMDB.
        timeout: Seconds to wait for TMDB before abandoning the request.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        ``TMDB_IMAGE_BASE`` followed by the first hit's ``poster_path``, or
        ``None`` when there are no hits, the first hit has no poster, or the
        request fails.
    """
    try:
        response = requests.get(
            'https://api.themoviedb.org/3/search/movie',
            params={
                'api_key': TMDB_API_KEY,
                'query': title,
                # A None value makes requests leave the parameter out of the query string.
                'year': year if year and year != 'Unknown' else None,
                'include_adult': True
            },
            timeout=timeout)
        response.raise_for_status()

        results = response.json().get('results', [])
        if results:
            # .get() so a hit that lacks the key counts as "no poster" rather than an error.
            poster_path = results[0].get('poster_path')
            return f"{TMDB_IMAGE_BASE}{poster_path}" if poster_path else None
    except Exception as e:
        # Deliberately broad: report the failure and keep going with the next title.
        print(f"Error fetching poster for {title}: {e}")
    return None


# Add TMDB poster URLs to the DataFrame, one lookup per row.
# Pause between consecutive requests. The original note cites a limit of
# 4 requests/second; confirm against TMDB's current rate-limit documentation.
TMDB_REQUEST_DELAY_SECONDS = 0.3

print("\nFetching poster URLs from TMDB...")
poster_urls = []
for title, year in zip(cage_movies_with_ratings['primaryTitle'],
                       cage_movies_with_ratings['startYear']):
    poster_urls.append(get_tmdb_poster(title, year))
    # The sleep must sit inside the loop: placed after it, all requests are
    # fired back-to-back and the pause has no throttling effect.
    sleep(TMDB_REQUEST_DELAY_SECONDS)
cage_movies_with_ratings['posterUrl'] = poster_urls

# Save the results with poster URLs (overwrites the CSV written earlier)
cage_movies_with_ratings.to_csv('Nicolas_Cage_Works.csv', index=False)
print("\nResults with poster URLs saved to 'Nicolas_Cage_Works.csv'")


Fetching poster URLs from TMDB...

Results with poster URLs saved to 'Nicolas_Cage_Works.csv'


In [None]:
# Optional (currently disabled): save results as JSON Lines, one record per line
#cage_movies_with_ratings.to_json(
#    'Nicolas_Cage_Works.json', orient='records', lines=True)
#print("\nResults saved to 'Nicolas_Cage_Works.json'.")


Results saved to 'Nicolas_Cage_Works.json'.
