In [300]:
# 1. Import the necessary LIBRARIES
import requests
from urllib import robotparser
from bs4 import BeautifulSoup
import pandas as pd
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 100)

In [304]:
# Robots.txt File check

# robots.txt file url
ROBOT_PROTOCOL_URL = 'https://www.rottentomatoes.com/robots.txt'

# request the robots.txt file from the address
robot_parser = robotparser.RobotFileParser(ROBOT_PROTOCOL_URL)

#read robots.txt 
robot_parser.read()

# Check the address can fetch
print(robot_parser.can_fetch('*','/top/bestofrt/'))

True


In [284]:
# 3. Send get() Request and fetch the webpage contents
url = "https://www.rottentomatoes.com/top/bestofrt/"
base_url = "https://www.rottentomatoes.com"
# Request url for top 100 movies 
response = requests.get(url)

# Parse the response with html parser
soup = BeautifulSoup(response.content, "html.parser")

In [285]:
# Get all rank, rating, title and review of the movies according to html class and tags
rank = soup.find_all("td", class_ = "bold")
rating = soup.find_all("span", class_ = "tMeterScore")
title = soup.find("table", class_="table").find_all("a", class_ = "unstyled articleLink")
review = soup.find_all("td", class_ = "right hidden-xs")

In [286]:
# Create lists for movie informations
ranks = []
ratings = []
titles = []
reviews = []
genres = []
languages = []
directors = []
release_dates_s = []
runtimes = []

# Loop for 100 movies
for idx in range(100):
    # Append informations to the lists
    ranks.append(rank[idx].text)
    ratings.append(rating[idx].text)
    titles.append((title[idx].text).strip()[:-7]) # Delete blanks and last 7 character for the year value
    reviews.append(review[idx].text)
    
    # To access other movie informations get href and combine with base url
    movie_url = base_url + title[idx]["href"]
    print(idx, movie_url)
    
    # Request movie url
    response_movie = requests.get(movie_url)
    source = BeautifulSoup(response_movie.content, "html.parser")
    
    # Get labels and values of movies according to html attributes
    movie_label = source.find_all("div",attrs={"data-qa":"movie-info-item-label"})
    movie_value = source.find_all("div",attrs={"data-qa":"movie-info-item-value"})
    
    # Append genre, language etc. to the lists
    for idx, item in enumerate(movie_value):
        fixed_item = item.text.replace("\n", "").replace(" ", "").replace(",", ", ")
        fixed_value = movie_label[idx].text
        if fixed_value == "Genre:":
            genres.append(fixed_item)
        elif fixed_value == "Original Language:":
            languages.append(fixed_item)
        elif fixed_value == "Director:":
            directors.append(fixed_item)
        elif fixed_value == "Release Date (Streaming):":
            release_dates_s.append(fixed_item)
        elif fixed_value == "Runtime:":
            runtimes.append(fixed_item)

0 https://www.rottentomatoes.com/m/it_happened_one_night
1 https://www.rottentomatoes.com/m/citizen_kane
2 https://www.rottentomatoes.com/m/the_wizard_of_oz_1939
3 https://www.rottentomatoes.com/m/modern_times
4 https://www.rottentomatoes.com/m/black_panther_2018
5 https://www.rottentomatoes.com/m/parasite_2019
6 https://www.rottentomatoes.com/m/avengers_endgame
7 https://www.rottentomatoes.com/m/1003707-casablanca
8 https://www.rottentomatoes.com/m/knives_out
9 https://www.rottentomatoes.com/m/us_2019
10 https://www.rottentomatoes.com/m/toy_story_4
11 https://www.rottentomatoes.com/m/lady_bird
12 https://www.rottentomatoes.com/m/mission_impossible_fallout
13 https://www.rottentomatoes.com/m/blackkklansman
14 https://www.rottentomatoes.com/m/get_out
15 https://www.rottentomatoes.com/m/the_irishman
16 https://www.rottentomatoes.com/m/godfather
17 https://www.rottentomatoes.com/m/mad_max_fury_road
18 https://www.rottentomatoes.com/m/spider_man_into_the_spider_verse
19 https://www.rottent

In [297]:
# Create a data frame for combine all lists
data = pd.DataFrame()

#Combine all lists
data["Rank"] = ranks
data["Rating"] = ratings
data["Title"] = titles
data["No. of Reviews"] = reviews
data["Genre"] = genres
data["Original Language"] = languages
data["Director"] = directors
data["Release Date (Streaming)"] = release_dates_s
data["Runtime"] = runtimes

In [290]:
data

Unnamed: 0,Rank,Rating,Title,No. of Reviews,Genre,Original Language,Director,Release Date (Streaming),Runtime
0,1.0,79%,It Happened One Night,98,"Romance, Comedy",English,FrankCapra,"Dec28, 1999",1h45m
1,2.0,83%,Citizen Kane,121,Drama,English,OrsonWelles,"Feb23, 2010",1h59m
2,3.0,92%,The Wizard of Oz,160,"Fantasy, Musical, Kids&family",English,VictorFleming,"Aug12, 2003",1h41m
3,4.0,100%,Modern Times,109,Comedy,English,CharlieChaplin,"Nov16, 2010",1h29m
4,5.0,90%,Black Panther,525,"Action, Fantasy, Adventure",English,RyanCoogler,"May2, 2018",2h14m
5,6.0,89%,Parasite (Gisaengchung),464,"Drama, Mystery&thriller, Comedy",Korean,BongJoonHo,"Oct11, 2019",2h12m
6,7.0,82%,Avengers: Endgame,547,"Action, Sci-fi, Fantasy, Adventure",English,"AnthonyRusso, JoeRusso","Jul30, 2019",3h1m
7,8.0,96%,Casablanca,124,Drama,English,MichaelCurtiz,"Nov17, 1998",1h42m
8,9.0,82%,Knives Out,468,"Crime, Drama, Mystery&thriller, Comedy",English,RianJohnson,"Jun12, 2020",2h10m
9,10.0,52%,Us,553,"Horror, Mystery&thriller",English,JordanPeele,"Jun4, 2019",1h56m
