Objective
Scrape movie data from IMDb, a popular website that lists movie information. In this example, we
extract the titles, URLs, descriptions, ratings, genres, and durations of the movies in the IMDb Top 250 list and save them to a CSV file.

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [8]:
# Page to scrape: the IMDb Top 250 chart.
url = 'https://www.imdb.com/chart/top/'

# Pieces of a desktop Chrome User-Agent string; joined below into one header value.
_USER_AGENT_PARTS = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/91.0.4472.124 Safari/537.36",
)

# Request headers passed along with the GET request.
headers = {"User-Agent": "".join(_USER_AGENT_PARTS)}

In [9]:
# Download the chart page with the custom headers.
# A timeout is set explicitly: requests applies none by default, so a stalled
# connection would otherwise block this cell forever.
response = requests.get(url, headers=headers, timeout=10)
response

<Response [200]>

In [10]:
# Report whether the server answered with HTTP 200; otherwise show the code it returned.
fetched_ok = response.status_code == 200
print(
    "Successfully fetched the page!"
    if fetched_ok
    else f"Failed to retrieve the page. Status code: {response.status_code}"
)

Successfully fetched the page!


Parse the JSON Content

In [15]:
import json

In [18]:
# Parse the downloaded HTML so the embedded JSON-LD <script> block can be located.
# (`soup` was previously used without ever being created in this notebook.)
soup = BeautifulSoup(response.text, 'html.parser')
json_data = soup.find('script', type='application/ld+json')

# Fail loudly here instead of hitting a NameError on `data` (or silently
# reusing a stale `data` from an earlier run) when the tag is missing or empty.
if json_data is None or not json_data.string:
    raise ValueError("Could not find a non-empty application/ld+json script tag in the page")
data = json.loads(json_data.string)

# Create empty lists to store each movie's title, URL, description,
# best/worst rating bounds, rating, genre, and duration
titles = []
urls = []
descriptions = []
best_ratings = []
worst_ratings = []
ratings = []
genres = []
durations = []
data

{'@type': 'ItemList',
 'itemListElement': [{'@type': 'ListItem',
   'item': {'@type': 'Movie',
    'url': 'https://www.imdb.com/title/tt0111161/',
    'name': 'The Shawshank Redemption',
    'description': 'A banker convicted of uxoricide forms a friendship over a quarter century with a hardened convict, while maintaining his innocence and trying to remain hopeful through simple compassion.',
    'image': 'https://m.media-amazon.com/images/M/MV5BMDAyY2FhYjctNDc5OS00MDNlLThiMGUtY2UxYWVkNGY2ZjljXkEyXkFqcGc@._V1_.jpg',
    'aggregateRating': {'@type': 'AggregateRating',
     'bestRating': 10,
     'worstRating': 1,
     'ratingValue': 9.3,
     'ratingCount': 3053005},
    'contentRating': 'R',
    'genre': 'Drama',
    'duration': 'PT2H22M'}},
  {'@type': 'ListItem',
   'item': {'@type': 'Movie',
    'url': 'https://www.imdb.com/title/tt0068646/',
    'name': 'The Godfather',
    'description': 'The aging patriarch of an organized crime dynasty transfers control of his clandestine empire

In [20]:
# Start from empty lists so that re-running this cell does not append the
# same movies a second time.
titles, urls, descriptions = [], [], []
best_ratings, worst_ratings, ratings = [], [], []
genres, durations = [], []

# Check if the data contains the expected structure
if 'itemListElement' in data:
    for item in data['itemListElement']:
        # Each list entry wraps the actual movie record under the 'item' key.
        movie = item['item']

        # Rating details live in a nested object; fall back to an empty mapping
        # so a movie without ratings yields None values instead of a KeyError.
        aggregate = movie.get('aggregateRating') or {}
        rating_value = aggregate.get('ratingValue')

        titles.append(movie['name'])
        urls.append(movie['url'])
        descriptions.append(movie.get('description'))

        # Extract ratings (best, worst, actual rating)
        best_ratings.append(aggregate.get('bestRating'))
        worst_ratings.append(aggregate.get('worstRating'))
        ratings.append(float(rating_value) if rating_value is not None else None)

        # Optional descriptive fields: record None when a movie lacks one so
        # that every list keeps exactly one entry per movie.
        genres.append(movie.get('genre'))
        durations.append(movie.get('duration'))
else:
    # Make the empty result visible rather than silently building an empty table.
    print("No 'itemListElement' key found in the parsed JSON; no movies extracted.")

In [21]:
# Pair each output column name with the list that holds its values
# (order here is the column order of the resulting table).
column_names = [
    'Title', 'URL', 'Description',
    'Best Rating', 'Worst Rating', 'Rating',
    'Genre', 'Duration',
]
column_values = [
    titles, urls, descriptions,
    best_ratings, worst_ratings, ratings,
    genres, durations,
]

# Build the table and preview its first rows.
df = pd.DataFrame(dict(zip(column_names, column_values)))
df.head()

Unnamed: 0,Title,URL,Description,Best Rating,Worst Rating,Rating,Genre,Duration
0,The Shawshank Redemption,https://www.imdb.com/title/tt0111161/,A banker convicted of uxoricide forms a friend...,10,1,9.3,Drama,PT2H22M
1,The Godfather,https://www.imdb.com/title/tt0068646/,The aging patriarch of an organized crime dyna...,10,1,9.2,"Crime, Drama",PT2H55M
2,The Dark Knight,https://www.imdb.com/title/tt0468569/,When a menace known as the Joker wreaks havoc ...,10,1,9.0,"Action, Crime, Drama",PT2H32M
3,The Godfather Part II,https://www.imdb.com/title/tt0071562/,The early life and career of Vito Corleone in ...,10,1,9.0,"Crime, Drama",PT3H22M
4,12 Angry Men,https://www.imdb.com/title/tt0050083/,The jury in a New York City murder trial is fr...,10,1,9.0,"Crime, Drama",PT1H36M


In [23]:
# Write the table to disk as CSV, leaving the DataFrame index out of the file.
output_path = 'topIMBDmovies.csv'
df.to_csv(output_path, index=False)
print("Data saved in CSV format")

Data saved in CSV format


In [None]:
# NOTE(review): stray reference to IPython's last-output variable `_` in a cell
# that was never executed; it looks like a leftover and can probably be deleted.
_