In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL for IMDb
SITE_URL = 'https://www.imdb.com'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}

# URL for IMDb Top 250 page
URL = SITE_URL + "/chart/top/?ref_=nv_mv_250"
movie_list = []

# Loop to go through pages of IMDb Top 250
while True:
    # Fetch page content
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find movie entries
    content_headers = soup.find_all("div", {"class": "ipc-metadata-list-summary-item__c"})
    
    for content in content_headers:
        movie_item = []
        
        # Movie name
        title_tag = content.find("a", {"class": "ipc-title-link-wrapper"})
        if title_tag:
            name = title_tag.get_text(strip=True)  # Extract text and remove any extra whitespace
        else:
            name = "N/A"
        movie_item.append(name)

        # Movie year, duration, and age rating - check if spans exist
        metadata_items = content.find_all("span", {"class": "sc-5bc66c50-6 OOdsw dli-title-metadata-item"})
        if len(metadata_items) > 0:
            year = metadata_items[0].text
        else:
            year = "N/A"
        movie_item.append(year)

        if len(metadata_items) > 1:
            duration = metadata_items[1].text
        else:
            duration = "N/A"
        movie_item.append(duration)

        if len(metadata_items) > 2:
            age_rating = metadata_items[2].text
        else:
            age_rating = "N/A"
        movie_item.append(age_rating)

        # Append movie details to the list
        movie_list.append(movie_item)

    # Check if there's a next page
    # Find the "Next" page link
    next_page_tag = soup.find("span", {"class": "ipc-see-more__text"})
    if next_page_tag and 'href' in next_page_tag.attrs:
        URL = SITE_URL + next_page_tag['href']
    else:
        URL = None

# Create a DataFrame and save to CSV
df = pd.DataFrame(movie_list, columns=['Name', 'Year', 'Duration', 'Age Rating'])
df.to_csv("imdb_top_250.csv", index=False)


In [17]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Ensure you have the correct WebDriver
driver.get('https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc')

movie_list = []

# Loop to keep clicking "50 más" button until all items are fully loaded
while True:
    try:
        # Scroll to the bottom of the page to trigger lazy loading
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        # Wait for a short period to allow content to load
        time.sleep(2)  # Adjust this delay if necessary
        
        # Check for and click the "50 más" button
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "ipc-see-more__button"))
        )
        load_more_button.click()

        # Wait for additional content to load after clicking
        time.sleep(5)  # Longer wait to ensure movies are fully loaded

    except:
        # Exit loop if "50 más" button is no longer found or clickable
        break

# Parse the page after all content is loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')
content_headers = soup.find_all("div", {"class": "ipc-metadata-list-summary-item__c"})

# Extract movie details from each movie block
for content in content_headers:
    movie_item = []

    # Movie title
    title_tag = content.find("a", {"class": "ipc-title-link-wrapper"})
    name = title_tag.get_text(strip=True) if title_tag else "N/A"
    movie_item.append(name)

    # Movie year, duration, and age rating
    metadata_items = content.find_all("span", {"class": "sc-5bc66c50-6 OOdsw dli-title-metadata-item"})
    year = metadata_items[0].text if len(metadata_items) > 0 else "N/A"
    duration = metadata_items[1].text if len(metadata_items) > 1 else "N/A"
    age_rating = metadata_items[2].text if len(metadata_items) > 2 else "N/A"
    movie_item.extend([year, duration, age_rating])

    # Append movie details to the list
    movie_list.append(movie_item)

# Close the browser
driver.quit()

# Create a DataFrame and save to CSV
df = pd.DataFrame(movie_list, columns=['Name', 'Year', 'Duration', 'Age Rating'])
df.to_csv('top_movie_data.csv', index=False)


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Ensure you have the correct WebDriver
driver.get('https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc')

movie_list = []

# Loop to keep clicking "50 más" button until all items are fully loaded
while True:
    try:
        # Scroll to the bottom of the page to trigger lazy loading
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        # Wait for a short period to allow content to load
        time.sleep(2)  # Adjust this delay if necessary
        
        # Check for and click the "50 más" button
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "ipc-see-more__button"))
        )
        load_more_button.click()

        # Wait for additional content to load after clicking
        time.sleep(5)  # Longer wait to ensure movies are fully loaded

    except:
        # Exit loop if "50 más" button is no longer found or clickable
        break

# Parse the page after all content is loaded
soup = BeautifulSoup(driver.page_source, 'html.parser')
content_headers = soup.find_all("div", {"class": "ipc-metadata-list-summary-item__c"})

# Extract movie details from each movie block
for content in content_headers:
    movie_item = []

    # Movie title
    title_tag = content.find("a", {"class": "ipc-title-link-wrapper"})
    name = title_tag.get_text(strip=True) if title_tag else "N/A"
    movie_item.append(name)

    # Get movie URL to navigate to its details page
    movie_url = "https://www.imdb.com" + title_tag['href'] if title_tag else None

    # Visit the movie's detail page if URL is available
    if movie_url:
        driver.get(movie_url)
        time.sleep(2)  # Wait for page to load

        # Parse the movie page and find similar movies in "más del estilo" section
        movie_soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find "más del estilo" section (update this selector based on structure)
        # Find the section containing similar movies
        similar_movies_section = movie_soup.find_all("div", {"class": "ipc-shoveler ipc-shoveler--base ipc-shoveler--page0"})

        # Extract similar movie names
        similar_movies = []
        for similar_movie in similar_movies_section:
            # Find all span tags with data-testid="title" within the similar_movie div
            titles = similar_movie.find_all("span", {"data-testid": "title"})
            
            # Get the text content of each title span and add it to the similar_movies list
            for title in titles:
                movie_name = title.get_text(strip=True)
                print(movie_name)
                similar_movies.append(movie_name)
        
    # Append similar movies list to movie item
    movie_item.append(similar_movies.to_string(index=False))
    
    # Append movie details to the list
    movie_list.append(movie_item)
    
    # Return to main page to continue with the next movie
    driver.back()
    time.sleep(2)  # Wait for the page to reload
    break

# Close the browser
driver.quit()

# Create a DataFrame and save to CSV
df = pd.DataFrame(movie_list, columns=['Name', 'Similar Movies'])
df.to_csv('top_movie_data_with_similar.csv', index=False)


El caballero oscuro
El padrino
Forrest Gump
El club de la lucha
Pulp Fiction
Origen
La lista de Schindler
La milla verde
Seven
El señor de los anillos: El retorno del rey
El padrino parte II
12 hombres sin piedad


AttributeError: 'list' object has no attribute 'to_string'

In [9]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time

# Set up Selenium WebDriver
driver = webdriver.Chrome()  # Or use webdriver.Firefox() if you have Firefox installed
driver.get('https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc')

movie_list = []

while True:
    # Wait for the page to load
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "ipc-metadata-list-summary-item__c"))
    )

    # Get page source and parse with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    content_headers = soup.find_all("div", {"class": "ipc-metadata-list-summary-item__c"})
    
    # Extract movie details
    for content in content_headers:
        movie_item = []
        
        # Movie title
        title_tag = content.find("a", {"class": "ipc-title-link-wrapper"})
        name = title_tag.get_text(strip=True) if title_tag else "N/A"
        movie_item.append(name)

        # Movie year, duration, and age rating
        metadata_items = content.find_all("span", {"class": "sc-5bc66c50-6 OOdsw dli-title-metadata-item"})
        year = metadata_items[0].text if len(metadata_items) > 0 else "N/A"
        duration = metadata_items[1].text if len(metadata_items) > 1 else "N/A"
        age_rating = metadata_items[2].text if len(metadata_items) > 2 else "N/A"
        movie_item.extend([year, duration, age_rating])

        # Append movie details to the list
        movie_list.append(movie_item)
    
    # Try to find and click the "50 más" button to load the next set of movies
    try:
        load_more_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "ipc-see-more__button"))
        )
        load_more_button.click()
        time.sleep(3)  # Pause to allow content to load
    except Exception as e:
        print("No more pages to load or error:", e)
        break

# Close the browser
driver.quit()

# Create a DataFrame and save to CSV
df = pd.DataFrame(movie_list, columns=['Name', 'Year', 'Duration', 'Age Rating'])
df.to_csv("imdb_top_250_2.csv", index=False)


No more pages to load or error: Message: element click intercepted: Element is not clickable at point (398, 9736)
  (Session info: chrome=130.0.6723.117)
Stacktrace:
	GetHandleVerifier [0x00007FF6988638A5+3004357]
	(No symbol) [0x00007FF6984F9970]
	(No symbol) [0x00007FF6983A582A]
	(No symbol) [0x00007FF6983FD80E]
	(No symbol) [0x00007FF6983FB2AC]
	(No symbol) [0x00007FF6983F8778]
	(No symbol) [0x00007FF6983F798C]
	(No symbol) [0x00007FF6983E996E]
	(No symbol) [0x00007FF69841BBDA]
	(No symbol) [0x00007FF6983E92A6]
	(No symbol) [0x00007FF69841BDF0]
	(No symbol) [0x00007FF69843BA4C]
	(No symbol) [0x00007FF69841B983]
	(No symbol) [0x00007FF6983E7628]
	(No symbol) [0x00007FF6983E8791]
	GetHandleVerifier [0x00007FF69888A00D+3161901]
	GetHandleVerifier [0x00007FF6988DE060+3506048]
	GetHandleVerifier [0x00007FF6988D400D+3465005]
	GetHandleVerifier [0x00007FF698650EEB+830987]
	(No symbol) [0x00007FF69850467F]
	(No symbol) [0x00007FF6985009D4]
	(No symbol) [0x00007FF698500B6D]
	(No symbol) [0x0