In [1]:
# Importing the libraries needed 
import pandas as pd
import requests
import time
import numpy as np
import re
from bs4 import BeautifulSoup
from time import sleep
from random import randint

In [2]:
# Declaring the headers 
headers = {"Accept-Language": "en-US,en;q=0.5"}

In [3]:
# Initialize lists to store data
movie_name = []
year = []
runtime = []
content_rating = []
genre = []
rating = []
metascore = []
votes = []
gross = []
worldwide_gross = []
directors = []
stars = []

In [4]:
#creating an array of values and passing it in the url for dynamic webpages
pages = np.arange(1,11,1)

In [5]:
for page in pages:
    page = requests.get("https://www.imdb.com/list/ls098063263/?st_dt=&mode=detail&page="+str(page)+"&sort=list_order,asc")
    soup = BeautifulSoup(page.text, 'html.parser')
    sleep(randint(2,8))
    
    movie_data = soup.findAll('div', attrs= {'class': 'lister-item mode-detail'})
    for i in movie_data:

        # Retrieve movie names
        get_name = i.h3.a.text
        movie_name.append(get_name)

        # Retrieve released year
        get_year = i.h3.find('span', class_ = 'lister-item-year text-muted unbold').text
        year_part = ''.join(filter(str.isdigit, get_year))  # Extract only numeric characters
        year.append(year_part)

        # Retrieve movie runtime
        get_time = i.p.find('span', class_= 'runtime').text.split(' ')[0]
        runtime.append(get_time)

        # Retrieve content rating
        get_content_rating = i.p.find('span', class_= 'certificate').text if i.find('span', class_ = 'certificate') else ""
        content_rating.append(get_content_rating)

        # Retrieve movie genre
        get_genre = i.p.find('span', class_= 'genre').text
        cleaned_genre = ', '.join([genre.strip() for genre in get_genre.split(',')])  # Clean and join genres
        genre.append(cleaned_genre)

        # Retrieve rating
        get_rating = i.find('span', class_ = 'ipl-rating-star__rating').text
        rating.append(get_rating)

        # Retrieve metascore
        get_metascore = i.find('div', class_ = 'inline-block ratings-metascore').text.replace('\n', "").split(' ')[0] if i.find('div', class_ = 'inline-block ratings-metascore') else ""
        metascore.append(get_metascore)


        # Retrieve votes
        value = i.find_all('span', attrs = {'name': "nv"})    

        get_vote = value[0].text
        votes.append(get_vote)

        # Retrieve gross
        get_gross = value[1].text if len(value)>1 else ""
        cleaned_gross = get_gross.replace("$", "").replace("M", "")
        gross.append(cleaned_gross)

        # Retrieve worldwide lifetime gross
        get_worldwide_gross = i.h2.b.text.replace("$", "")
        worldwide_gross.append(get_worldwide_gross)

        # Retrieve director names
        try:
            get_director = i.findAll('p')[2].get_text()
                #print(director)
            for word in get_director.split('|'): # Split directors and stars using |
                word = word.strip() 
                if word.startswith('Dire'): 
                    ind = word.index(':') + 1 
                    director =(word[ind:]).strip().replace('\n','') # Remove empty lines
        except:
            director = 'Missing' # Set the default value for entries without director details

        directors.append(director)

        # Retrieve stars
        try:
            get_stars = i.findAll('p')[2].get_text()
                #print(star)
            for word in get_stars.split('|'): # Split directors and stars using |
                word = word.strip() 
                if word.startswith('Star'): 
                    ind = word.index(':') + 1 
                    star =(word[ind:]).strip().replace('\n','') # Remove empty lines
        except:
            star = 'Missing' # Set the default value for entries without star details

        stars.append(star)


In [6]:
# Creating a dataframe 
movie_list = pd.DataFrame({
    "Movie Name": movie_name, 
    "Year of Release" : year,
    "Runtime (min)" : runtime,
    "Content Rating" : content_rating,
    "Metascore" : metascore,
    "Votes" : votes,
    "Directors" : directors,
    "Stars" : stars,
    "Gross (M,$)" : gross,
    "Worldwide Gross ($)" : worldwide_gross
})

In [7]:
movie_list

Unnamed: 0,Movie Name,Year of Release,Runtime (min),Content Rating,Metascore,Votes,Directors,Stars,"Gross (M,$)",Worldwide Gross ($)
0,Avatar,2009,162,13+,83,1378834,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",760.51,2923706026
1,Avengers: Endgame,2019,181,13,78,1253058,"Anthony Russo, Joe Russo","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",858.37,2799439100
2,Avatar: The Way of Water,2022,192,13,67,486613,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",659.68,2320250281
3,Titanic,1997,194,PG-13,75,1271852,James Cameron,"Leonardo DiCaprio, Kate Winslet, Billy Zane, K...",659.33,2264743305
4,Star Wars: Episode VII - The Force Awakens,2015,138,G,80,969664,J.J. Abrams,"Daisy Ridley, John Boyega, Oscar Isaac, Domhna...",936.66,2071310218
...,...,...,...,...,...,...,...,...,...,...
995,Hot Shots!,1991,85,,61,115930,Jim Abrahams,"Charlie Sheen, Cary Elwes, Valeria Golino, Llo...",69.47,181096164
996,Road to Perdition,2002,117,R,72,283198,Sam Mendes,"Tom Hanks, Tyler Hoechlin, Paul Newman",104.45,181001478
997,Kill Bill: Vol. 1,2003,111,18,69,1183772,Quentin Tarantino,"Uma Thurman, David Carradine, Daryl Hannah, Mi...",70.10,180906076
998,The Scorpion King,2002,92,u 13+,45,144926,Chuck Russell,"Dwayne Johnson, Steven Brand, Michael Clarke D...",91.05,180630907


In [8]:
movie_list.head(5)

Unnamed: 0,Movie Name,Year of Release,Runtime (min),Content Rating,Metascore,Votes,Directors,Stars,"Gross (M,$)",Worldwide Gross ($)
0,Avatar,2009,162,13+,83,1378834,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",760.51,2923706026
1,Avengers: Endgame,2019,181,13,78,1253058,"Anthony Russo, Joe Russo","Robert Downey Jr., Chris Evans, Mark Ruffalo, ...",858.37,2799439100
2,Avatar: The Way of Water,2022,192,13,67,486613,James Cameron,"Sam Worthington, Zoe Saldana, Sigourney Weaver...",659.68,2320250281
3,Titanic,1997,194,PG-13,75,1271852,James Cameron,"Leonardo DiCaprio, Kate Winslet, Billy Zane, K...",659.33,2264743305
4,Star Wars: Episode VII - The Force Awakens,2015,138,G,80,969664,J.J. Abrams,"Daisy Ridley, John Boyega, Oscar Isaac, Domhna...",936.66,2071310218


In [9]:
# save data in excel format
movie_list.to_excel("Top 1000 Highest-Grossing Movies of All Time IMDb.xlsx")

In [10]:
# Save data in csv format
movie_list.to_csv("Top 1000 Highest-Grossing Movies of All Time IMDb.csv")