In [1]:
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [4]:
# Set up Selenium WebDriver
options = webdriver.ChromeOptions()
options.add_argument("--lang=en")

movie_links = []

driver = webdriver.Chrome(options=options)
try:
    # driver.get('https://www.imdb.com/search/title/?groups=top_250&sort=user_rating,desc') # Best 250 movies
    driver.get('https://www.imdb.com/search/title/?groups=bottom_250&sort=user_rating,desc')

    # Keep clicking the "50 more" button until it stops appearing, so the
    # page source ends up containing every entry of the listing.
    while True:
        try:
            # Scroll to the bottom of the page to trigger lazy loading
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Adjust delay if necessary

            # Click the "50 more" button
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "ipc-see-more__button"))
            )
            load_more_button.click()
            time.sleep(5)  # Wait for additional content to load after clicking

        except TimeoutException:
            # Expected exit: the button did not become clickable within 10 s,
            # which is how the loop detects that nothing is left to load.
            break
        except WebDriverException as exc:
            # Any other Selenium failure still ends the loop (best effort, as
            # before), but it is reported instead of being silently swallowed.
            # Non-Selenium exceptions such as KeyboardInterrupt now propagate.
            print(f"Stopped loading more results early: {type(exc).__name__}: {exc}")
            break

    # Snapshot the fully expanded page while the browser is still open
    listing_html = driver.page_source
finally:
    # Always close the browser, even if loading the listing failed
    driver.quit()

# Collect movie links from the expanded listing page
soup = BeautifulSoup(listing_html, 'html.parser')
content_headers = soup.find_all("div", {"class": "ipc-metadata-list-summary-item__c"})

for content in content_headers:
    title_tag = content.find("a", {"class": "ipc-title-link-wrapper"})
    # Skip entries without a title anchor or without an href attribute
    href = title_tag.get('href') if title_tag else None
    if href:
        movie_url = "https://www.imdb.com" + href
        movie_links.append(movie_url)

In [5]:
# Sanity check: report how many movie links the listing scrape collected
collected_link_count = len(movie_links)
print(collected_link_count)

250


In [6]:
# Initialize a new WebDriver instance for movie details extraction
driver = webdriver.Chrome(options=options)
movie_list = []

try:
    # Visit each movie page and collect details
    for movie_url in movie_links:
        # Navigate to the movie's detail page
        driver.get(movie_url)
        time.sleep(2)  # Adjust if necessary

        # Parse the movie page and extract movie details
        movie_soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Movie title: text of the first <h1>, or "N/A" when the page has none
        title_tag = movie_soup.find("h1")
        name = title_tag.get_text(strip=True) if title_tag else "N/A"
        print(f'We are in movie: {name}')

        # Find the section(s) containing similar movies.
        # NOTE(review): this matches the class attribute as an exact string, so
        # it breaks if IMDb adds or reorders classes on this div - confirm the
        # selector against the live markup.
        similar_movies_section = movie_soup.find_all(
            "div", {"class": "ipc-shoveler ipc-shoveler--base ipc-shoveler--page0"}
        )
        similar_movies = [
            title.get_text(strip=True)
            for section in similar_movies_section
            for title in section.find_all("span", {"data-testid": "title"})
        ]

        # One row per movie: its name and a comma-separated string of similar titles
        movie_list.append([name, ", ".join(similar_movies)])
finally:
    # Close the browser even if one of the page visits raised
    driver.quit()

# Save data to a CSV file.
# The links come from the bottom_250 listing, so the file is named accordingly
# (writing it as 'top_...' would mislabel the data and could overwrite a real
# top-250 export).
df = pd.DataFrame(movie_list, columns=['Name', 'Similar Movies'])
df.to_csv('bottom_movie_data_with_similar.csv', index=False)


We are in movie: Meet the Blacks
We are in movie: Home Alone 3
We are in movie: Fifty Shades Darker
We are in movie: Aliens vs. Predator: Requiem
We are in movie: Alvin and the Chipmunks: The Squeakquel
We are in movie: God's Not Dead
We are in movie: Fifty Shades Freed
We are in movie: Death Note
We are in movie: Sex and the City 2
We are in movie: Little Man
We are in movie: Jason X
We are in movie: Space Jam: A New Legacy
We are in movie: Ouija
We are in movie: Friday the 13th Part VIII: Jason Takes Manhattan
We are in movie: Species II
We are in movie: Rings
We are in movie: Skyline
We are in movie: Kangaroo Jack
We are in movie: xXx: State of the Union
We are in movie: The Butterfly Effect 2
We are in movie: Nutty Professor II: The Klumps
We are in movie: Big Mommas: Like Father, Like Son
We are in movie: Movie 43
We are in movie: The Human Centipede (First Sequence)
We are in movie: Basic Instinct 2
We are in movie: Spy Kids 3: Game Over
We are in movie: He's All That
We are in m

In [7]:
# Build the results table from the scraped rows and write it to CSV
# without the index column.
output_columns = ['Name', 'Similar Movies']
df = pd.DataFrame(data=movie_list, columns=output_columns)
df.to_csv(path_or_buf='bottom_movie_data_with_similar.csv', index=False)