<a href="https://colab.research.google.com/github/mab2004/imdb-top-movies-scraper/blob/main/IMDb_Top_250_Movies_Scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Modules


In [21]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import json

## Request page source from URL

In [None]:
# Address of the IMDb top-movies chart page that the cells below download and parse.
url = "https://www.imdb.com/chart/top/"

In [None]:
# Send a browser-like User-Agent with the request.
# NOTE(review): presumably required because the site rejects the default
# python-requests client string — confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}
# requests has no default timeout; without one a stalled connection would
# block this cell (and a Run All) indefinitely.
page = requests.get(url, headers=headers, timeout=30)
# Raise requests.HTTPError on a 4xx/5xx status instead of letting later cells
# try to parse an error page.
page.raise_for_status()
page

<Response [200]>

## Display the page source code


In [None]:
# Show only the first 2,000 bytes of the raw response body; displaying the
# whole page would flood the notebook output and bloat the saved file.
page.content[:2000]

## Display the JSON-LD (linked data) script

In [None]:
# Parse the downloaded HTML with Python's built-in parser, then look up the
# first <script> element whose type attribute marks it as JSON-LD.
soup = BeautifulSoup(page.content, features="html.parser")
script_tag = soup.find(name='script', attrs={'type': 'application/ld+json'})
print(script_tag)

<script type="application/ld+json">{"@type":"ItemList","itemListElement":[{"@type":"ListItem","item":{"@type":"Movie","url":"https://www.imdb.com/title/tt0111161/","name":"The Shawshank Redemption","description":"A banker convicted of uxoricide forms a friendship over a quarter century with a hardened convict, while maintaining his innocence and trying to remain hopeful through simple compassion.","image":"https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_.jpg","aggregateRating":{"@type":"AggregateRating","bestRating":10,"worstRating":1,"ratingValue":9.3,"ratingCount":3097047},"contentRating":"12","genre":"Drama","duration":"PT2H22M"}},{"@type":"ListItem","item":{"@type":"Movie","url":"https://www.imdb.com/title/tt0068646/","name":"The Godfather","description":"The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.","image":"https://m.media-amazon.com/images/M/MV5BNGEwYjg

In [19]:
# Pull the raw JSON-LD text out of the <script> tag. Stop with a clear error
# when the tag (or its text) is missing: otherwise `json_data` would stay
# undefined and the parsing cell would fail with a NameError, or silently
# reuse a stale value left in the kernel by an earlier run.
json_data = script_tag.string if script_tag is not None else None
if json_data is None:
    raise ValueError(
        "No JSON-LD <script type='application/ld+json'> content found in the page"
    )
print(json_data)

{"@type":"ItemList","itemListElement":[{"@type":"ListItem","item":{"@type":"Movie","url":"https://www.imdb.com/title/tt0111161/","name":"The Shawshank Redemption","description":"A banker convicted of uxoricide forms a friendship over a quarter century with a hardened convict, while maintaining his innocence and trying to remain hopeful through simple compassion.","image":"https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_.jpg","aggregateRating":{"@type":"AggregateRating","bestRating":10,"worstRating":1,"ratingValue":9.3,"ratingCount":3097047},"contentRating":"12","genre":"Drama","duration":"PT2H22M"}},{"@type":"ListItem","item":{"@type":"Movie","url":"https://www.imdb.com/title/tt0068646/","name":"The Godfather","description":"The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.","image":"https://m.media-amazon.com/images/M/MV5BNGEwYjgwOGQtYjg5ZS00Njc1LTk2ZGEtM2QwZWQ2Nj

In [22]:
# Decode the JSON-LD text taken from the <script> tag into Python objects
# (dicts/lists) so the movie entries can be accessed by key.
data = json.loads(json_data)
print(data)

{'@type': 'ItemList', 'itemListElement': [{'@type': 'ListItem', 'item': {'@type': 'Movie', 'url': 'https://www.imdb.com/title/tt0111161/', 'name': 'The Shawshank Redemption', 'description': 'A banker convicted of uxoricide forms a friendship over a quarter century with a hardened convict, while maintaining his innocence and trying to remain hopeful through simple compassion.', 'image': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_.jpg', 'aggregateRating': {'@type': 'AggregateRating', 'bestRating': 10, 'worstRating': 1, 'ratingValue': 9.3, 'ratingCount': 3097047}, 'contentRating': '12', 'genre': 'Drama', 'duration': 'PT2H22M'}}, {'@type': 'ListItem', 'item': {'@type': 'Movie', 'url': 'https://www.imdb.com/title/tt0068646/', 'name': 'The Godfather', 'description': 'The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.', 'image': 'https://m.media-amazon.com/images/M/M

## Loop over the JSON-LD items to extract and store each movie's data

In [25]:
def _format_duration(duration_iso):
    """Convert an ISO 8601 time-only duration such as 'PT2H22M' to readable text.

    Parameters
    ----------
    duration_iso : str or None
        Duration string expected to start with 'PT', followed by optional
        '<n>H' and '<n>M' components. Anything after the minutes component
        (e.g. seconds) is ignored.

    Returns
    -------
    str
        Text such as '2 Hours 22 Minutes' or '1 Hour 36 Minutes'. Returns
        'N/A' when the value is missing, is not a string starting with 'PT',
        has a non-integer hour/minute component, or has no positive
        hour/minute component.
    """
    # Verify the prefix before stripping it; blindly dropping two characters
    # would turn a malformed value like '2H22M' into a wrong '22 Minutes'.
    if not isinstance(duration_iso, str) or not duration_iso.startswith('PT'):
        return 'N/A'

    remainder = duration_iso[2:]
    hours = 0
    minutes = 0
    try:
        if 'H' in remainder:
            hours_part, remainder = remainder.split('H', 1)
            hours = int(hours_part)
        if 'M' in remainder:
            minutes = int(remainder.split('M', 1)[0])
    except ValueError:
        # Non-integer component (e.g. 'PT1.5H'): treat the duration as unknown.
        return 'N/A'

    duration_parts = []
    if hours > 0:
        duration_parts.append(f"{hours} Hour{'s' if hours > 1 else ''}")
    if minutes > 0:
        duration_parts.append(f"{minutes} Minute{'s' if minutes > 1 else ''}")

    return " ".join(duration_parts) if duration_parts else 'N/A'


# Build one flat record per movie from the JSON-LD item list; missing fields
# fall back to 'N/A'.
movie_data = []

for item in data['itemListElement']:
    movie_item = item['item']
    # `or {}` also covers an explicit null aggregateRating, which the default
    # argument of .get() would not (it only applies when the key is absent).
    aggregate_rating = movie_item.get('aggregateRating') or {}

    movie_data.append({
        'Title': movie_item.get('name', 'N/A'),
        'Description': movie_item.get('description', 'N/A'),
        'Rating Value': aggregate_rating.get('ratingValue', 'N/A'),
        'Vote Count': aggregate_rating.get('ratingCount', 'N/A'),
        'Content Rating': movie_item.get('contentRating', 'N/A'),
        'Genre': movie_item.get('genre', 'N/A'),
        'Duration': _format_duration(movie_item.get('duration')),
        'IMDb Link': movie_item.get('url', 'N/A')
    })



## Create a dataframe and output the scraped data to a .csv file

In [26]:
# Turn the list of per-movie records into a table, preview the first five
# rows, and save the full table to CSV without the index column.
df = pd.DataFrame(data=movie_data)
display(df.head(n=5))
df.to_csv(path_or_buf='top_movies.csv', index=False)

Unnamed: 0,Title,Description,Rating Value,Vote Count,Content Rating,Genre,Duration,IMDb Link
0,The Shawshank Redemption,A banker convicted of uxoricide forms a friend...,9.3,3097047,12,Drama,2 Hours 22 Minutes,https://www.imdb.com/title/tt0111161/
1,The Godfather,The aging patriarch of an organized crime dyna...,9.2,2158594,16,"Crime, Drama",2 Hours 55 Minutes,https://www.imdb.com/title/tt0068646/
2,The Dark Knight,When a menace known as the Joker wreaks havoc ...,9.1,3072078,16,"Action, Crime, Drama",2 Hours 32 Minutes,https://www.imdb.com/title/tt0468569/
3,The Godfather Part II,The early life and career of Vito Corleone in ...,9.0,1450868,12,"Crime, Drama",3 Hours 22 Minutes,https://www.imdb.com/title/tt0071562/
4,12 Angry Men,The jury in a New York City murder trial is fr...,9.0,947631,Unrated,"Crime, Drama",1 Hour 36 Minutes,https://www.imdb.com/title/tt0050083/
