# Scraping imdb Top 250 Info

In [7]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests

In [52]:
url = "https://www.imdb.com/list/ls055386972/"
response = requests.get(url, headers={"Accept-Lanuage":"en-US"})
soup = BeautifulSoup(response.content, "html.parser")


In [57]:
movies = []
for movie in soup.find_all("div", class_="lister-item-content"):
    title = movie.find("h3").find("a").string
    duration = int(movie.find(class_="runtime").string.strip(' min'))
    year = int(re.search(r"\d{4}", movie.find(class_="lister-item-year").string).group(0))
    movies.append({'title': title, 'duration': duration, 'year': year})
movies[0]

{'title': 'Der Pate', 'duration': 175, 'year': 1972}

In [58]:
# converting list to dataframe
movies_df = pd.DataFrame(movies)
movies_df.head()

Unnamed: 0,title,duration,year
0,Der Pate,175,1972
1,Schindlers Liste,195,1993
2,Die 12 Geschworenen,96,1957
3,La vita è bella,116,1997
4,Zwei glorreiche Halunken,161,1966


In [59]:
# scraping all pages of the Top250
def fetch_page(page):
    response = requests.get(
        "https://www.imdb.com/search/title/",
        params={"groups":"top_250", "sort":"user_rating","start": (1 + page * 50)},
        headers={"Accept-Language":"en-US"})
    soup = BeautifulSoup(response.content, "html.parser")
    return soup

In [60]:
def parse_movies(soup):
    movies = []
    for movie in soup.find_all("div", class_="lister-item-content"):
        title = movie.find("h3").find("a").string
        duration = int(movie.find(class_="runtime").string.strip(' min'))
        year = int(re.search(r"\d{4}", movie.find(class_="lister-item-year").string).group(0))
        movies.append({'title': title, 'duration': duration, 'year': year})
    return movies

In [61]:
all_movies = []
for page in range(5):
    print(f"Parsing page {page + 1}...")
    soup = fetch_page(page)
    all_movies += parse_movies(soup)
print("Done")

Parsing page 1...
Parsing page 2...
Parsing page 3...
Parsing page 4...
Parsing page 5...
Done


In [62]:
all_movies

[{'title': 'The Shawshank Redemption', 'duration': 142, 'year': 1994},
 {'title': 'The Godfather', 'duration': 175, 'year': 1972},
 {'title': 'The Dark Knight', 'duration': 152, 'year': 2008},
 {'title': "Schindler's List", 'duration': 195, 'year': 1993},
 {'title': 'The Lord of the Rings: The Return of the King',
  'duration': 201,
  'year': 2003},
 {'title': '12 Angry Men', 'duration': 96, 'year': 1957},
 {'title': 'The Godfather Part II', 'duration': 202, 'year': 1974},
 {'title': 'Pulp Fiction', 'duration': 154, 'year': 1994},
 {'title': 'Inception', 'duration': 148, 'year': 2010},
 {'title': 'Fight Club', 'duration': 139, 'year': 1999},
 {'title': 'The Lord of the Rings: The Fellowship of the Ring',
  'duration': 178,
  'year': 2001},
 {'title': 'Forrest Gump', 'duration': 142, 'year': 1994},
 {'title': 'The Good, the Bad and the Ugly', 'duration': 178, 'year': 1966},
 {'title': 'The Lord of the Rings: The Two Towers',
  'duration': 179,
  'year': 2002},
 {'title': 'Jai Bhim', 'du

In [63]:
all_movies_df = pd.DataFrame(all_movies)

In [65]:
all_movies_df.head()

Unnamed: 0,title,duration,year
0,The Shawshank Redemption,142,1994
1,The Godfather,175,1972
2,The Dark Knight,152,2008
3,Schindler's List,195,1993
4,The Lord of the Rings: The Return of the King,201,2003
