In [51]:
# HTTP headers passed to every requests.get call in this notebook.
# NOTE(review): presumably a browser-like User-Agent is needed because the
# site rejects the default one sent by requests - confirm.
HEADERS = {'User-Agent': 'Mozilla/5.0'}

import re
import requests
import json
from bs4 import BeautifulSoup

# Helper functions for scraping

def extract_html_script_tag(soup):
    """Return the parsed JSON-LD payload embedded in a page, or ``{}``.

    Looks up the first ``<script type="application/ld+json">`` element in
    *soup* and decodes its contents with ``json.loads``.

    Args:
        soup: A parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The decoded JSON value, or an empty dict when the script tag is
        missing, has no string content, or does not hold valid JSON.
    """
    json_script = soup.find('script', type='application/ld+json')
    if not json_script:
        return {}
    try:
        return json.loads(json_script.string)
    except (TypeError, ValueError) as e:
        # TypeError: ``.string`` was None, so there was nothing to decode.
        # ValueError: malformed JSON (json.JSONDecodeError subclasses it).
        # Report the cause instead of a bare, undiagnosable message.
        print(f"Error occurred while parsing the JSON-LD script tag: {e}")
    return {}

def extract_basic_movie_details(movie):
    """Pick the headline fields out of a JSON-LD ``Movie`` record.

    Args:
        movie: Mapping decoded from a JSON-LD item.

    Returns:
        A dict with the keys ``title``, ``movie_url``, ``rating`` and
        ``genre`` (each ``None`` when the source field is absent), or
        ``None`` when the record's ``@type`` is not ``"Movie"``.
    """
    if movie.get("@type") != "Movie":
        return None

    # ``aggregateRating`` may be missing or an explicit null; fall back to an
    # empty mapping in both cases so the nested lookup cannot raise.
    aggregate_rating = movie.get("aggregateRating") or {}

    return {
        'title': movie.get("name"),
        'movie_url': movie.get("url"),
        'rating': aggregate_rating.get("ratingValue"),
        'genre': movie.get("genre"),
    }

"""def extract_release_year(data):
    if "datePublished" in data:
        return data["datePublished"].get("year")
    return None"""

def extract_release_year(soup):
    """Return the first four-digit number in the release-date item, if any.

    Args:
        soup: A parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The year as a string, or ``None`` when the
        ``li[data-testid="title-details-releasedate"]`` element is absent or
        its text holds no standalone four-digit number.
    """
    section = soup.find("li", attrs={"data-testid": "title-details-releasedate"})
    if not section:
        return None

    match = re.search(r"\b(\d{4})\b", section.text)
    if match is None:
        return None
    return match.group(1)

def extract_directors(data):
    """Return the ``name`` of every entry under the ``director`` key.

    Args:
        data: Decoded JSON-LD payload for a title page.

    Returns:
        A list of director names (an entry without a ``name`` contributes
        ``None``); an empty list when ``director`` is absent or null.
    """
    if "director" not in data:
        return []

    entries = data["director"] or []
    # A property may hold a single object rather than a list; iterating a
    # dict would yield its string keys and break the ``.get`` call below.
    if isinstance(entries, dict):
        entries = [entries]
    return [entry.get("name") for entry in entries]

def extract_lead_actors(data):
    """Return the ``name`` of every entry under the ``actor`` key.

    Args:
        data: Decoded JSON-LD payload for a title page.

    Returns:
        A list of actor names (an entry without a ``name`` contributes
        ``None``); an empty list when ``actor`` is absent or null.
    """
    if "actor" not in data:
        return []

    entries = data["actor"] or []
    # A property may hold a single object rather than a list; iterating a
    # dict would yield its string keys and break the ``.get`` call below.
    if isinstance(entries, dict):
        entries = [entries]
    return [entry.get("name") for entry in entries]

def is_box_office_tag(tag):
    """Tell whether *tag* is a ``span`` whose text contains "Gross worldwide"."""
    # Short-circuits, so the text is only read for span elements.
    return tag.name == "span" and "Gross worldwide" in tag.get_text()

def extract_box_office(soup):
    """Return the stripped text of the element following the box-office label.

    The label is the first element in *soup* accepted by
    ``is_box_office_tag``; the value is read from its next sibling element.

    Returns:
        The sibling's text with surrounding whitespace removed, or ``None``
        when either the label or its sibling is not found.
    """
    label = soup.find(is_box_office_tag)
    if not label:
        return None

    value_element = label.find_next_sibling()
    if not value_element:
        return None

    return value_element.text.strip()


In [None]:
import pandas as pd

def get_top_250_movie_list():
    """Scrape the IMDb Top 250 chart and return one record per movie.

    Fetches the chart page, reads its JSON-LD ``itemListElement`` entries,
    and merges each movie's basic details with the extra details scraped
    from the movie's own page.

    Returns:
        A list of dicts. Each dict holds the keys produced by
        ``extract_basic_movie_details`` and, when the entry has a URL, those
        produced by ``get_extra_movie_details``.
    """
    top_250_url = "https://www.imdb.com/chart/top/"
    # Bound the wait so a stalled connection cannot hang the run forever.
    response = requests.get(top_250_url, headers=HEADERS, timeout=30)

    soup = BeautifulSoup(response.text, 'html.parser')
    data = extract_html_script_tag(soup)
    movie_info = []

    for element in data.get("itemListElement", []):
        basic_details = extract_basic_movie_details(element.get("item", {}))
        if basic_details is None:
            # The entry is not typed as a Movie; there is nothing to record.
            continue

        # Only follow the link when there is one; an entry without a URL
        # keeps its basic details instead of failing the whole scrape.
        movie_url = basic_details.get("movie_url")
        if movie_url:
            basic_details.update(get_extra_movie_details(movie_url))

        movie_info.append(basic_details)

    return movie_info

def get_extra_movie_details(movie_url):
    """Scrape a single movie page for the details the chart does not carry.

    Args:
        movie_url: Absolute URL of the movie's page.

    Returns:
        A dict with the keys ``release_year`` and ``box_office`` (read from
        the page's HTML) and ``directors`` and ``lead_actors`` (read from the
        page's JSON-LD payload).
    """
    # Bound the wait so a stalled connection cannot hang the run forever.
    response = requests.get(movie_url, headers=HEADERS, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')
    data = extract_html_script_tag(soup)

    return {
        'release_year': extract_release_year(soup),
        'directors': extract_directors(data),
        'lead_actors': extract_lead_actors(data),
        'box_office': extract_box_office(soup),
    }

# Run the scrape (one request for the chart page plus one request per movie
# page), load the resulting records into a DataFrame, and print it.
movie_list = get_top_250_movie_list()
df = pd.DataFrame(movie_list)
print(df)


https://www.imdb.com/title/tt0111161/
https://www.imdb.com/title/tt0068646/
https://www.imdb.com/title/tt0468569/
https://www.imdb.com/title/tt0071562/
https://www.imdb.com/title/tt0050083/
