In [None]:
!pip install requests beautifulsoup4 pandas


In [16]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import re

def get_soup(url, params=None, headers=None):
    response = requests.get(url, params=params, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    return soup

def extract_movie_data(movie):
    title = movie.find("h3", class_="lister-item-header").find("a").text
    rating = movie.find("div", class_="ratings-imdb-rating").strong.text
    description = movie.find("div", class_="lister-item-content").find_all("p")[1].text.strip()
    genre_element = movie.find("span", class_="genre")
    genre = genre_element.text.strip() if genre_element else None
    release_date = movie.find("span", class_="lister-item-year text-muted unbold").text.strip()
    director_stars = movie.find("p", class_="text-muted").find_all("a")
    directors = [person.text for person in director_stars[:-1]]
    stars = [person.text for person in director_stars[-1:]]

    movie_data = {
        "Title": title,
        "Rating": rating,
        "Description": description,
        "Genre": genre,
        "Release Date": release_date,
        "Directors": directors,
        "Stars": stars
    }

    return movie_data

def scrape_imdb_movies(year, limit):
    base_url = "https://www.imdb.com/search/title"
    headers = {"Accept-Language": "en-US,en;q=0.9"}

    movies = []
    start = 1
    while len(movies) < limit:
        params = {
            "release_date": year,
            "sort": "num_votes,desc",
            "start": start
        }
        soup = get_soup(base_url, params=params, headers=headers)

        movie_list = soup.find_all("div", class_="lister-item mode-advanced")
        if len(movie_list) == 0:
            break

        for movie in movie_list:
            movie_data = extract_movie_data(movie)
            movies.append(movie_data)

            if len(movies) >= limit:
                break

        start += 50  # IMDb displays 50 movies per page
        time.sleep(1)  # Add a delay to avoid overwhelming the server

    return movies

# Scrape 1000 movies released in 2023 (or as many as available)
movies = scrape_imdb_movies(2023, 1000)



In [None]:
for movie in movies:
    print(movie)

In [23]:
df = pd.DataFrame(movies)

df = df.dropna()
df['Release Year'] = df['Release Date'].str.extract(r'(\d{4})')
df['Release Year'] = pd.to_numeric(df['Release Year'],
                                   errors='coerce').astype('Int64')
df = df.drop(['Release Date'], axis=1)
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')
df['Title'] = df['Title'].apply(lambda x: re.sub(r'\W+', ' ', x))

df.to_csv("imdb_movies_dataset.csv", index=False)

# Print the first few rows of the DataFrame

df.head()


Unnamed: 0,Title,Rating,Description,Genre,Directors,Stars,Release Year
0,The Last of Us,8.8,"After a global pandemic destroys civilization,...","Action, Adventure, Drama",[],[],2023
1,John Wick Chapter 4,7.9,John Wick uncovers a path to defeating The Hig...,"Action, Crime, Thriller",[],[],2023
2,The Last of Us,8.0,When an unknown person approaches his compound...,"Action, Adventure, Drama",[],[],2023
3,Ant Man and the Wasp Quantumania,6.1,Scott Lang and Hope Van Dyne are dragged into ...,"Action, Adventure, Comedy",[],[],2023
4,Guardians of the Galaxy Vol 3,8.2,"Still reeling from the loss of Gamora, Peter Q...","Action, Adventure, Comedy",[],[],2023
