# Crawl the IMDb website for the top-grossing movies of each year and their details

In [None]:
%pip install selenium
%pip install bs4
%pip install pandas

In [13]:
import os
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


In [14]:
def safe_extract(soup_obj, selector, attribute=None, processing=None):
    """
    Best-effort lookup of a single element's text or attribute.

    Args:
        soup_obj: Object exposing ``select_one`` / ``find``. ``None``, ints
            and strings are rejected up front and yield ``None``.
        selector: A CSS selector string (passed to ``select_one``) or a
            sequence of positional arguments unpacked into ``find``.
        attribute: When truthy, the value of ``element.get(attribute)`` is
            used instead of ``element.text``.
        processing: Optional callable applied to the extracted value.

    Returns:
        The extracted (and optionally processed) value, or ``None`` when the
        element is missing/falsy or any exception is raised along the way.
    """
    if soup_obj is None or isinstance(soup_obj, (int, str)):
        return None

    try:
        if isinstance(selector, str):
            node = soup_obj.select_one(selector)
        else:
            node = soup_obj.find(*selector)

        if not node:
            return None

        raw = node.get(attribute) if attribute else node.text
        if processing:
            return processing(raw)
        return raw
    except Exception:
        # Deliberately silent: callers treat None as "value not available".
        return None

In [15]:
def parse_awards(awards_text):
    """
    Parse an awards summary string into win, nomination and "oscars" counts.

    The text is matched case-insensitively. Singular and plural forms are
    both accepted ("1 win", "2 wins", "1 nomination", "3 nominations"), the
    two counts are looked up independently (no "&" separator is required),
    and a count directly glued to preceding text (e.g. "Oscar10 wins") is
    still recognised.

    Args:
        awards_text: Raw awards text, e.g.
            "Nominated for 3 Oscars. 50 wins & 80 nominations total".
            ``None``, empty or non-string input yields all-zero counts.

    Returns:
        dict with integer keys "wins", "nominations" and "oscars".
        "oscars" is the number that immediately follows the phrase
        "nominated for"; the name of the award after that number is not
        inspected.
    """
    awards = {
        "wins": 0,
        "nominations": 0,
        "oscars": 0
    }

    if not awards_text or not isinstance(awards_text, str):
        return awards

    text = awards_text.lower()

    # "oscars": first whitespace-separated token after "nominated for",
    # accepted only when it is a plain number.
    _, found, tail = text.partition("nominated for")
    tokens = tail.split()
    if found and tokens and tokens[0].isdecimal():
        awards["oscars"] = int(tokens[0])

    # Wins and nominations are matched separately so that one missing or
    # malformed part does not prevent the other from being read.
    wins_match = re.search(r"(\d+)\s*wins?\b", text)
    if wins_match:
        awards["wins"] = int(wins_match.group(1))

    noms_match = re.search(r"(\d+)\s*nominations?\b", text)
    if noms_match:
        awards["nominations"] = int(noms_match.group(1))

    return awards

In [16]:
def extract_box_office_data(soup, test_id):
    """
    Read one box-office figure from a parsed title page as an integer.

    Looks up the ``<li>`` whose ``data-testid`` is
    ``title-boxoffice-<test_id>`` and reads the first
    ``span.ipc-metadata-list-item__list-content-item`` inside it.

    Args:
        soup: Parsed page exposing ``find``.
        test_id: Suffix of the ``data-testid`` attribute (e.g. "budget").

    Returns:
        The amount as ``int`` with currency symbol and thousands separators
        removed, or ``None`` when the element/text is missing, contains no
        number, or the lookup raises.
    """
    try:
        element = soup.find("li", {"data-testid": f"title-boxoffice-{test_id}"})
        if not element:
            return None

        content = element.find("span", class_="ipc-metadata-list-item__list-content-item")
        if not content or not content.text:
            return None

        # Only the first run of digits/commas is the amount; digits that
        # appear later (e.g. inside a trailing parenthetical) must not be
        # glued onto it.
        amount = re.search(r"\d[\d,]*", content.text)
        if not amount:
            return None
        return int(amount.group().replace(",", ""))
    except Exception:
        # Best-effort scraper helper: a malformed page yields "no value".
        return None

In [17]:
def extract_credits(soup):
    """
    Collect director, writer and star names from a parsed title page.

    Principal-credit rows (``li[data-testid="title-pc-principal-credit"]``)
    are searched inside the ``div.sc-70a366cc-2`` wrapper when it exists.
    That class name looks auto-generated, so when the wrapper is not found
    the rows are searched in the whole document instead of giving up.

    Args:
        soup: Parsed page exposing ``find`` / ``find_all``.

    Returns:
        dict with keys "directors", "writers" and "stars", each a list of
        stripped, non-empty names. If several rows share a label, the last
        one wins. Any exception is printed and the credits gathered so far
        are returned.
    """
    result = {"directors": [], "writers": [], "stars": []}

    try:
        # Prefer the dedicated wrapper; fall back to the full page so a
        # renamed wrapper class does not silently drop every credit.
        credits_section = soup.find("div", {"class": "sc-70a366cc-2"})
        scope = credits_section or soup

        credit_items = scope.find_all(
            "li", {"data-testid": "title-pc-principal-credit"}
        )

        for item in credit_items:
            # The label (Director/Writers/Stars) is either a span or a link.
            label = item.find(
                "span", {"class": "ipc-metadata-list-item__label"}
            ) or item.find("a", {"class": "ipc-metadata-list-item__label"})

            if not label:
                continue

            label_text = label.text.lower().strip()

            names = item.select("a.ipc-metadata-list-item__list-content-item--link")
            extracted_names = [name.text.strip() for name in names if name.text.strip()]

            # Substring match covers singular and plural labels.
            if "director" in label_text:
                result["directors"] = extracted_names
            elif "writer" in label_text:
                result["writers"] = extracted_names
            elif "star" in label_text:
                result["stars"] = extracted_names

    except Exception as e:
        print(f"Error extracting credits: {str(e)}")

    return result

In [18]:
def extract_list_data(soup, selector, class_name=None):
    """
    Return the stripped text of every matching link under ``selector``.

    Args:
        soup: Parsed page exposing ``select``.
        selector: CSS selector of the container element.
        class_name: CSS class of the ``<a>`` elements to read. When falsy,
            ``ipc-metadata-list-item__list-content-item--link`` is used.

    Returns:
        List of non-empty, stripped link texts (possibly empty), or ``None``
        when the lookup raises.
    """
    link_class = class_name or "ipc-metadata-list-item__list-content-item--link"
    try:
        elements = soup.select(f"{selector} a.{link_class}")
        return [elem.text.strip() for elem in elements if elem.text.strip()]
    except Exception:
        # Best-effort: callers store None for "could not be extracted".
        return None

In [19]:
def load_more_items(driver):
    """
    Click the list page's "see more" button once.

    Waits up to 2 seconds for a button carrying both the ``ipc-btn`` and
    ``ipc-see-more`` classes to become clickable, scrolls it to the centre
    of the viewport, clicks it, then pauses 2 seconds.

    Args:
        driver: Selenium WebDriver currently showing the list page.

    Returns:
        True when the click went through; False (after printing the error)
        when any step raised, including the wait timing out.
    """
    see_more_xpath = "//button[contains(@class, 'ipc-btn') and contains(@class, 'ipc-see-more')]"
    center_script = "arguments[0].scrollIntoView({ behavior: 'smooth', block: 'center' });"

    try:
        waiter = WebDriverWait(driver, 2)
        button = waiter.until(
            EC.element_to_be_clickable((By.XPATH, see_more_xpath))
        )

        # Bring the button into view before interacting with it.
        driver.execute_script(center_script, button)
        time.sleep(0.5)

        button.click()
        time.sleep(2)
    except Exception as e:
        print(f"Load more error: {str(e)}")
        return False

    return True

In [20]:
def extract_genres(soup):
    """
    Return the genre labels shown as chips on a parsed title page.

    Args:
        soup: Parsed page exposing ``select``.

    Returns:
        List of non-empty, stripped chip texts (possibly empty), or ``None``
        when the lookup raises.
    """
    try:
        genre_list = soup.select("div.ipc-chip-list__scroller a.ipc-chip--on-baseAlt span.ipc-chip__text")
        return [genre.text.strip() for genre in genre_list if genre.text.strip()]
    except Exception:
        # Best-effort, but no longer swallows KeyboardInterrupt/SystemExit.
        return None

In [21]:
def crawl_imdb_movies(year: int):
    """
    Crawl IMDb's feature-film search results for one release year.

    Steps:
      1. Open the search page for films released between Jan 1 and Dec 31
         of ``year``, sorted by US box-office gross (descending), in an
         Edge WebDriver started from ``edgedriver.exe``.
      2. Click "load more" repeatedly, then parse title, link, year,
         duration, MPA rating, rating and vote count for each list item and
         write them to ``Data/<year>/imdb_movies_<year>.csv``.
      3. Visit every collected movie link, extract box-office figures,
         credits, genres, origin/location/company/language lists, awards
         and release date, and write them to
         ``Data/<year>/advanced_movies_details_<year>.csv``.
      4. Left-merge both tables on "Movie Link" and write
         ``Data/<year>/merged_movies_data_<year>.csv``.

    Args:
        year: Release year to crawl; also used for the output folder name.

    Returns:
        The merged DataFrame; the initial DataFrame alone when no movie
        links were collected; or None when the list container is missing
        or no list item could be parsed.

    The WebDriver is always quit on exit from the ``try`` block.
    """
    output_dir = os.path.join("Data", str(year))
    os.makedirs(output_dir, exist_ok=True)

    url = f"https://www.imdb.com/search/title/?title_type=feature&release_date={year}-01-01,{year}-12-31&count=100&sort=boxoffice_gross_us,desc"

    options = Options()
    options.add_argument("--lang=en-US")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")

    # The driver binary is resolved relative to the current working directory.
    driver_path = "edgedriver.exe"
    service = Service(executable_path=driver_path)
    driver = webdriver.Edge(service=service, options=options)

    try:
        driver.get(url)
        time.sleep(2)

        # Wait for the movie list to be present
        movie_list_xpath = "//ul[contains(@class, 'ipc-metadata-list')]"
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, movie_list_xpath))
        )

        # Load more movies: the counter starts at 100 and grows by 100 per
        # successful click, so at most five clicks are attempted. It is a
        # nominal count, not a measurement of the items actually on the page.
        loaded_data = 100
        while loaded_data < 600:
            if load_more_items(driver):
                loaded_data += 100
            else:
                print("Failed to load more items")
                break

        # Parse the page content
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Find the movie list container
        movie_list = soup.find("ul", class_="ipc-metadata-list")
        if not movie_list:
            print("Movie list container not found!")
            return None

        # Find all movie items
        film_containers = movie_list.find_all(
            "li", class_="ipc-metadata-list-summary-item"
        )

        films_data = []
        for film in film_containers:
            try:
                # Extract title
                title_element = film.find("h3", class_="ipc-title__text")
                title = title_element.text.strip() if title_element else None

                # Extract movie link (href is appended to the site root)
                link_element = film.find("a", class_="ipc-title-link-wrapper")
                movie_link = (
                    f"https://www.imdb.com{link_element['href']}"
                    if link_element and "href" in link_element.attrs
                    else None
                )

                # Extract metadata: the spans are read purely by position
                # (first = year, second = duration, third = MPA rating).
                metadata = film.find_all("span", class_="dli-title-metadata-item")
                year_text = metadata[0].text if len(metadata) > 0 else None
                duration = metadata[1].text if len(metadata) > 1 else None
                mpa = metadata[2].text if len(metadata) > 2 else None

                # Extract rating
                rating_element = film.find("span", class_="ipc-rating-star--rating")
                rating = rating_element.text.strip() if rating_element else None

                # Extract votes
                # NOTE(review): after removing spaces and stripping
                # parentheses, the [2::] slice drops two leading characters —
                # presumably a leftover non-breaking space and "(" — confirm
                # against the live markup before changing this.
                votes_element = film.find("span", class_="ipc-rating-star--voteCount")
                votes = (
                    votes_element.text.replace(" ", "").strip("()")[2::]
                    if votes_element
                    else None
                )
                film_data = {
                    "Title": title,
                    "Movie Link": movie_link,
                    "Year": year_text,
                    "Duration": duration,
                    "MPA": mpa,
                    "Rating": rating,
                    "Votes": votes,
                }
                films_data.append(film_data)

            except Exception as e:
                # One bad list item must not abort the whole year.
                print(f"Error processing film: {str(e)}")
                continue

        if not films_data:
            print("No movies data was collected!")
            return None

        # Create DataFrame and save initial data
        initial_movies_df = pd.DataFrame(films_data)
        initial_movies_path = os.path.join(output_dir, f"imdb_movies_{year}.csv")
        initial_movies_df.to_csv(initial_movies_path, index=False)


        # Without any usable link there is nothing to enrich; return the
        # list-page data as is.
        if (
            "Movie Link" not in initial_movies_df.columns
            or initial_movies_df["Movie Link"].isna().all()
        ):
            return initial_movies_df

        # Extract advanced movie details
        # NOTE: the loop variable reuses the name `url`, overwriting the
        # search URL assigned at the top of the function.
        all_movie_data = []
        for url in initial_movies_df["Movie Link"].dropna():
            try:
                driver.get(url)
                time.sleep(1)
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # Extract all credits at once
                credits = extract_credits(soup)

                advanced_details = {
                    "Movie Link": url,
                    "budget": extract_box_office_data(soup, "budget"),
                    "grossWorldWide": extract_box_office_data(
                        soup, "cumulativeworldwidegross"
                    ),
                    "gross_US_Canada": extract_box_office_data(soup, "grossdomestic"),
                    "opening_weekend_Gross": extract_box_office_data(
                        soup, "openingweekenddomestic"
                    ),
                    "directors": credits["directors"],
                    "writers": credits["writers"],
                    "stars": credits["stars"],
                    "genres": extract_genres(soup),
                    "countries_origin": extract_list_data(
                        soup, "li[data-testid='title-details-origin']"
                    ),
                    "filming_locations": extract_list_data(
                        soup, "li[data-testid='title-details-filminglocations']"
                    ),
                    "production_companies": extract_list_data(
                        soup, "li[data-testid='title-details-companies']"
                    ),
                    "Languages": extract_list_data(
                        soup, "li[data-testid='title-details-languages']"
                    ),
                }

                # Extract awards data; pages without an awards row get
                # explicit zero counts so the columns are always populated.
                awards_element = soup.find("li", {"data-testid": "award_information"})
                if awards_element:
                    awards_text = awards_element.text.strip()
                    awards_dict = parse_awards(awards_text)
                    advanced_details.update(awards_dict)
                else:
                    advanced_details.update({"wins": 0, "nominations": 0, "oscars": 0})

                # Text of the first link whose href contains "releaseinfo".
                advanced_details["release_date"] = safe_extract(
                    soup, "a[href*='releaseinfo']"
                )

                all_movie_data.append(advanced_details)

            except Exception as e:
                # Keep a row with only the link so the later merge still
                # finds a match for this movie.
                print(f"Error processing {url}: {str(e)}")
                all_movie_data.append({"Movie Link": url})

        advanced_movies_df = pd.DataFrame(all_movie_data)
        advanced_movies_path = os.path.join(
            output_dir, f"advanced_movies_details_{year}.csv"
        )
        advanced_movies_df.to_csv(advanced_movies_path, index=False)

        # Left merge keeps every list-page row, including those without a link.
        merged_data = pd.merge(
            initial_movies_df, advanced_movies_df, how="left", on="Movie Link"
        )
        merged_path = os.path.join(output_dir, f"merged_movies_data_{year}.csv")
        merged_data.to_csv(merged_path, index=False)

        return merged_data

    finally:
        # Runs on every exit path above (returns and exceptions alike).
        driver.quit()

In [None]:
# Crawl every release year from 1960 through 2024 (the range end is
# exclusive), one year at a time. The return value of each call is discarded.
years_to_crawl = range(1960, 2025)
for year in years_to_crawl:
    print(f"Crawling data for year {year}")
    crawl_imdb_movies(year)