In [288]:
# Scrapes any IMDb "title search" results page, for example:
# https://www.imdb.com/search/title/?title_type=feature&release_date=2000-01-01,2023-12-31&groups=oscar_nominee&sort=num_votes,desc

In [None]:
# Uses Selenium to drive the browser and BeautifulSoup to parse the rendered HTML.

In [301]:
import re
import time

import pandas as pd
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [302]:
# Browser configuration used by the scraping driver.
chrome_options = Options()
chrome_options.add_argument('--headless')  # no visible browser window
chrome_options.add_argument('--disable-gpu')
# Chrome's --window-size switch is documented as "width,height"; the comma
# form is used here instead of "1920x1080" so the size is honoured regardless
# of which headless implementation the installed Chrome uses.
chrome_options.add_argument('--window-size=1920,1080')
# Turn off the Blink "AutomationControlled" feature flag.
chrome_options.add_argument('--disable-blink-features=AutomationControlled')
# Send a regular desktop-Chrome user-agent string with each request.
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36')

In [303]:
# Resolve a chromedriver binary via webdriver_manager, then start Chrome with
# the options configured above.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(options=chrome_options, service=service)

In [304]:
# url_2025 = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&groups=oscar_nominee&sort=num_votes,desc'

# Title-search URL: feature films, release dates 2000-01-01 to 2023-12-31,
# in the "oscar_nominee" group, sorted by number of votes (descending).
url_2000_2024 = (
    'https://www.imdb.com/search/title/'
    '?title_type=feature'
    '&release_date=2000-01-01,2023-12-31'
    '&groups=oscar_nominee'
    '&sort=num_votes,desc'
)

driver.get(url_2000_2024)
# Fixed pause after navigation, before the page source is read.
time.sleep(3)

In [305]:
def clean_title(title):
    """Return ``title`` without a leading list rank such as ``"12. "``.

    The rank is only removed when the digits sit at the very start of the
    string; the result is trimmed of surrounding whitespace.
    """
    without_rank = re.sub(r"^\d+\.\s*", "", title)
    return without_rank.strip()

def clean_rating_count(rating_count):
    """Convert a vote-count label such as ``"(2.6M)"`` into an integer.

    Args:
        rating_count: Text of the vote count, optionally wrapped in
            parentheses and/or whitespace, optionally using a ``K``
            (thousands) or ``M`` (millions) suffix. ``None``, an empty
            string, or the sentinel ``'N/A'`` mean "no count available".

    Returns:
        The count as an ``int``, or the string ``'N/A'`` when no count is
        available.

    Raises:
        ValueError: If the text is present but is not a recognisable number.
    """
    if not rating_count:
        return 'N/A'

    # Trim whitespace on both sides of the parentheses as well as inside them.
    text = rating_count.strip().strip('()').strip()

    # The scraping loop passes 'N/A' for a missing count; previously that fell
    # through to int('N/A') and raised ValueError.
    if not text or text == 'N/A':
        return 'N/A'

    text = text.replace(',', '')
    multipliers = {'K': 1_000, 'M': 1_000_000}
    suffix = text[-1].upper()
    if suffix in multipliers:
        # round() rather than int(): int() truncates, so a product that lands
        # a hair below the intended integer would come out one too low.
        return round(float(text[:-1]) * multipliers[suffix])
    return int(text)

In [306]:
def _parse_movie(movie):
    """Turn one result container into a record dict, or None if it has no <h3> title."""
    heading = movie.find('h3')
    if heading is None:
        return None

    # Match only on 'dli-title-metadata-item'. The extra 'sc-... cJVQtZ' class
    # names used previously look auto-generated (presumably they change between
    # site builds), and a multi-class string only matches that exact attribute.
    metadata_arr = [
        data.text.strip()
        for data in movie.find_all('span', class_='dli-title-metadata-item')
    ]
    # Metadata is read positionally; missing trailing entries become 'N/A'.
    # NOTE(review): if an item is absent in the middle (e.g. no runtime but an
    # age rating), later values shift left -- confirm against the page.
    year, runtime, age_rating = (metadata_arr + ['N/A'] * 3)[:3]

    star_rating = movie.find('span', class_='ipc-rating-star--rating')
    rating_count = movie.find('span', class_='ipc-rating-star--voteCount')

    return {
        'title': clean_title(heading.text.strip()),
        'year': year,
        'runtime': runtime,
        'age_rating': age_rating,
        'star_rating': star_rating.text.strip() if star_rating else 'N/A',
        # None (not the string 'N/A') for a missing count, so the helper's
        # "no value" branch is taken instead of attempting int('N/A').
        'rating_count': clean_rating_count(
            rating_count.text.strip() if rating_count else None
        ),
    }


# Keys of movies already recorded. Keyed on (title, year) rather than title
# alone so that different films sharing a title are not dropped.
seen_titles = set()
movies_data = []

try:
    while True:
        # Re-parse the rendered page on every pass; containers already
        # recorded on an earlier pass are skipped via seen_titles.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        movie_containers = soup.find_all('div', class_='ipc-metadata-list-summary-item__c')

        for movie in movie_containers:
            record = _parse_movie(movie)
            if record is None:
                continue

            key = (record['title'], record['year'])
            if key in seen_titles:
                continue

            seen_titles.add(key)
            movies_data.append(record)

        # "See more" logic: a missing button is the normal end of the run.
        try:
            see_more_button = driver.find_element(By.CLASS_NAME, 'ipc-see-more__button')
        except NoSuchElementException:
            print('Exiting...')
            break

        # Any other browser-side failure stops the run but is reported, so a
        # partial result is not mistaken for a complete one. Programming
        # errors (e.g. NameError) are deliberately not caught here.
        try:
            ActionChains(driver).move_to_element(see_more_button).click().perform()
        except WebDriverException as exc:
            print(f'Stopping early; could not click the see-more button: {exc}')
            break

        # Fixed pause after the click before the page source is read again.
        time.sleep(3)
finally:
    # Always release the browser, even if parsing raised part-way through.
    driver.quit()

The Dark Knight
Inception
Interstellar
The Lord of the Rings: The Fellowship of the Ring
The Lord of the Rings: The Return of the King
The Lord of the Rings: The Two Towers
Django Unchained
Gladiator
The Wolf of Wall Street
Inglourious Basterds
Batman Begins
Joker
The Prestige
The Avengers
The Departed
Avatar
Memento
Avengers: Endgame
Guardians of the Galaxy
Avengers: Infinity War
Pirates of the Caribbean: The Curse of the Black Pearl
WALL·E
Up
Iron Man
Mad Max: Fury Road
Finding Nemo
Catch Me If You Can
Eternal Sunshine of the Spotless Mind
Gone Girl
No Country for Old Men
Whiplash
Parasite
Monsters, Inc.
A Beautiful Mind
Star Wars: Episode VII - The Force Awakens
Harry Potter and the Deathly Hallows: Part 2
The Martian
The Pianist
Dune: Part One
Spider-Man: No Way Home
Requiem for a Dream
Captain America: The Winter Soldier
Iron Man 3
Toy Story 3
The Grand Budapest Hotel
Spider-Man
The Revenant
The Hobbit: An Unexpected Journey
Harry Potter and the Sorcerer's Stone
Spirited Away
Iron

Once
Greyhound
Frankenweenie
Fences
A Single Man
Dancer in the Dark
Talk to Her
Little Children
Guillermo del Toro's Pinocchio
Precious
House of Flying Daggers
Malena
The Queen
The Man Who Wasn't There
Dogtooth
Across the Universe
Super Size Me
The Iron Lady
The Wolfman
Frost/Nixon
Capernaum
Inherent Vice
ParaNorman
United 93
August Rush
Poseidon
The Cell
The Diving Bell and the Butterfly
The Good Shepherd
Aftersun
Volver
Anna Karenina
The Worst Person in the World
Amour
Ali
American Fiction
Extremely Loud & Incredibly Close
The Motorcycle Diaries
Eurovision Song Contest: The Story of Fire Saga
Philomena
The Wind Rises
A Prophet
Good Night, and Good Luck.
Minari
Persepolis
Thirteen
Unfaithful
Richard Jewell
Tár
The Great Beauty
West Side Story
Jackass Presents: Bad Grandpa
The Muppets
News of the World
Mary Poppins Returns
Gosford Park
Frida
August: Osage County
Selma
Belfast
Hereafter
Crazy Heart
Biutiful
Monster's Ball
Beginners
Trolls
Surf's Up
Jane Eyre
Judas and the Black Messiah


In [307]:
# Build one row per scraped record; columns come from the record dict keys
# (title, year, runtime, age_rating, star_rating, rating_count).
oscar_2000_2024_noms = pd.DataFrame(movies_data)
# Bare last expression so the notebook renders the frame.
oscar_2000_2024_noms

Unnamed: 0,title,year,runtime,age_rating,star_rating,rating_count
0,The Dark Knight,2008,2h 32m,PG-13,9.0,3000000
1,Inception,2010,2h 28m,PG-13,8.8,2600000
2,Interstellar,2014,2h 49m,PG-13,8.7,2300000
3,The Lord of the Rings: The Fellowship of the Ring,2001,2h 58m,PG-13,8.9,2100000
4,The Lord of the Rings: The Return of the King,2003,3h 21m,PG-13,9.0,2100000
...,...,...,...,...,...,...
972,Scottsboro: An American Tragedy,2000,1h 24m,,7.5,369
973,Watani: My Homeland,2016,1h 16m,,7.4,305
974,Open Heart,2013,40m,,7.1,173
975,Legacy,2000,1h 30m,Not Rated,6.9,161
