<a href="https://colab.research.google.com/github/mohamed0998/scraping-rotten-tomatoes/blob/master/Extracting_movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set-up

In [1]:
# load packages
import requests
from bs4 import BeautifulSoup

In [13]:
# URL of the Rotten Tomatoes editorial guide page that is requested and parsed below
base_site = "https://editorial.rottentomatoes.com/guide/140-essential-action-movies-to-watch-now/"

In [14]:
# Send a GET request to the webpage.
# An explicit timeout keeps the cell from hanging forever if the server never
# answers (requests applies no timeout by default).
response = requests.get(base_site, timeout=30)
# Fail fast on a 4xx/5xx answer so later cells never parse an error page by mistake.
response.raise_for_status()
response.status_code

200

In [15]:
# Keep the raw body of the response (bytes); this is the HTML handed to the parsers below
html = response.content

## Choosing a parser

### html.parser

In [16]:
# Build a Beautiful Soup object from the HTML using Python's built-in parser
soup = BeautifulSoup(html, features='html.parser')

In [17]:
# Save the prettified parse tree to disk so it can be inspected in an editor
with open('Rotten_tomatoes_page_2_HTML_Parser.html', mode='wb') as file:
    file.write(soup.prettify(encoding='utf-8'))

In [7]:
# When inspecting the file we see that the HTML element is closed at the beginning -- it parsed incorrectly!
# Let's check another parser

### lxml

In [18]:
# Build a BeautifulSoup object from the same HTML, this time with the lxml parser
soup = BeautifulSoup(html, features='lxml')

In [19]:
# Save the lxml-parsed tree to disk so it can be compared with the html.parser export
with open('Rotten_tomatoes_page_2_LXML_Parser.html', mode='wb') as file:
    file.write(soup.prettify(encoding='utf-8'))

In [None]:
# On first inspection of the file, everything seems fine

### A word of caution

In [20]:
# Beautiful Soup ranks the lxml parser as the best one.

# If a parser is not explicitly stated in the Beautiful Soup constructor,
# the best one available on the current machine is chosen.

# This means that the same piece of code can give different results on different computers.

## Finding an element containing all the data

In [21]:
# Find all div tags on the webpage containing the information we want to scrape.
# Match on the single distinguishing class instead of the full class string:
# passing "col-sm-18 col-full-xs countdown-item-content" as one value only matches
# when the attribute is exactly that string, so the search would silently return
# nothing if the site reordered or changed its layout classes.
divs = soup.find_all("div", class_="countdown-item-content")
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">57%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#140</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>58273% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="to

# Extracting the title, year and score of each movie

In [22]:
# The title, year and score of each movie are contained in the 'h2' tags

In [None]:
# For instance, look at the first <h2> inside the first div
divs[0].h2

<h2><a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a> <span class="subtle start-year">(2011)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2>

In [23]:
# Take the first <h2> tag out of every movie container
headings = [container.find("h2") for container in divs]
headings

[<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">57%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/equilibrium">Equilibrium</a> <span class="subtle start-year">(2002)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">41%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/hero">Hero</a> <span class="subtle start-year">(2002)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">94%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/1017666-road_house">Road House</a> <span class="subtle start-year">(1989)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">40%</span></h2>,
 <h2><a href="https://www.rottentomatoes.com/m/unstoppable-2010">Unstoppable</a> <span class="subtle start

In [24]:
# Look at the plain text contained in each heading
[h2.get_text() for h2 in headings]

['Running Scared (1986)  57%',
 'Equilibrium (2002)  41%',
 'Hero (2002)  94%',
 'Road House (1989)  40%',
 'Unstoppable (2010)  87%',
 'Shaft (1971)  88%',
 'The Villainess (2017)  85%',
 'Highlander (1986)  70%',
 'Die Hard 2 (1990)  69%',
 'National Treasure (2004)  46%',
 'The Protector (2005)  53%',
 'Revenge (2017)  93%',
 'El Mariachi (1992)  91%',
 'A Touch of Zen (1971)  97%',
 'Top Gun (1986)  58%',
 'Con Air (1997)  56%',
 'The Expendables 2 (2012)  67%',
 'The Mummy (1999)  61%',
 'Mr. & Mrs. Smith (2005)  60%',
 'Rush Hour (1998)  61%',
 'The Equalizer (2014)  60%',
 'Captain America: Civil War (2016)  90%',
 'Air Force One (1997)  78%',
 'Bloodsport (1988)  40%',
 'Blade (1998)  57%',
 'Bad Boys (1995)  42%',
 'Die Hard With a Vengeance (1995)  59%',
 'The Running Man (1987)  66%',
 'Code of Silence (1985)  70%',
 "Shoot 'Em Up (2007)  67%",
 'Crank (2006)  61%',
 'Machete (2010)  72%',
 'Drive (2011)  93%',
 'Batman (1989)  71%',
 'Under Siege (1992)  79%',
 'Independenc

In [25]:
# The heading text does contain the info we want to extract.
# However, we need to obtain the title, year and score as separate values.
# Inspect the markup of one heading to see how the three parts can be told apart.
headings[0]

<h2><a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a> <span class="subtle start-year">(1986)</span> <span class="icon tiny rotten" title="Rotten"></span> <span class="tMeterScore">57%</span></h2>

In [None]:
# We notice that:

# The movie title is in the 'a' tag
# The year is in a 'span' with class 'start-year'
# The score is in a 'span' with class 'tMeterScore'

## Title

In [26]:
# Preview the first <a> tag of every heading
[h2.a for h2 in headings]

[<a href="https://www.rottentomatoes.com/m/1018009-running_scared">Running Scared</a>,
 <a href="https://www.rottentomatoes.com/m/equilibrium">Equilibrium</a>,
 <a href="https://www.rottentomatoes.com/m/hero">Hero</a>,
 <a href="https://www.rottentomatoes.com/m/1017666-road_house">Road House</a>,
 <a href="https://www.rottentomatoes.com/m/unstoppable-2010">Unstoppable</a>,
 <a href="https://www.rottentomatoes.com/m/1018699-shaft">Shaft</a>,
 <a href="https://www.rottentomatoes.com/m/the_villainess">The Villainess</a>,
 <a href="https://www.rottentomatoes.com/m/highlander">Highlander</a>,
 <a href="https://www.rottentomatoes.com/m/die_hard_2_1990">Die Hard 2</a>,
 <a href="https://www.rottentomatoes.com/m/national_treasure">National Treasure</a>,
 <a href="https://www.rottentomatoes.com/m/protector">The Protector</a>,
 <a href="https://www.rottentomatoes.com/m/revenge_2018">Revenge</a>,
 <a href="https://www.rottentomatoes.com/m/el_mariachi">El Mariachi</a>,
 <a href="https://www.rotten

In [27]:
# The movie title is the string inside each heading's link
movie_names = [link.string for link in (h2.find('a') for h2 in headings)]
movie_names

['Running Scared',
 'Equilibrium',
 'Hero',
 'Road House',
 'Unstoppable',
 'Shaft',
 'The Villainess',
 'Highlander',
 'Die Hard 2',
 'National Treasure',
 'The Protector',
 'Revenge',
 'El Mariachi',
 'A Touch of Zen',
 'Top Gun',
 'Con Air',
 'The Expendables 2',
 'The Mummy',
 'Mr. & Mrs. Smith',
 'Rush Hour',
 'The Equalizer',
 'Captain America: Civil War',
 'Air Force One',
 'Bloodsport',
 'Blade',
 'Bad Boys',
 'Die Hard With a Vengeance',
 'The Running Man',
 'Code of Silence',
 "Shoot 'Em Up",
 'Crank',
 'Machete',
 'Drive',
 'Batman',
 'Under Siege',
 'Independence Day',
 'Bullitt',
 'Wanted',
 'Superman: The Movie',
 'Ronin',
 'They Live',
 'Cliffhanger',
 "Marvel's the Avengers",
 'Hot Fuzz',
 'The Warriors',
 'Starship Troopers',
 'Elite Squad 2',
 'Point Break',
 'The Long Kiss Goodnight',
 'The Guest',
 'Taken',
 '300',
 'True Lies',
 'Demolition Man',
 'Hardcore Henry',
 'Police Story',
 'Brotherhood of the Wolf',
 'Kingsman: The Secret Service',
 'The Fifth Element',
 

## Year

In [28]:
# Keep only the <span> carrying the 'start-year' class in each heading
[h2.find("span", attrs={"class": "start-year"}) for h2 in headings]

[<span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(2002)</span>,
 <span class="subtle start-year">(1989)</span>,
 <span class="subtle start-year">(2010)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1990)</span>,
 <span class="subtle start-year">(2004)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(2017)</span>,
 <span class="subtle start-year">(1992)</span>,
 <span class="subtle start-year">(1971)</span>,
 <span class="subtle start-year">(1986)</span>,
 <span class="subtle start-year">(1997)</span>,
 <span class="subtle start-year">(2012)</span>,
 <span class="subtle start-year">(1999)</span>,
 <span class="subtle start-year">(2005)</span>,
 <span class="subtle start-year">(1998)</span>,
 <span class="subtle start-year">(2014)<

In [29]:
# Read the string inside each heading's 'start-year' span
years = [
    span.string
    for span in (h2.find("span", class_='start-year') for h2 in headings)
]
years

['(1986)',
 '(2002)',
 '(2002)',
 '(1989)',
 '(2010)',
 '(1971)',
 '(2017)',
 '(1986)',
 '(1990)',
 '(2004)',
 '(2005)',
 '(2017)',
 '(1992)',
 '(1971)',
 '(1986)',
 '(1997)',
 '(2012)',
 '(1999)',
 '(2005)',
 '(1998)',
 '(2014)',
 '(2016)',
 '(1997)',
 '(1988)',
 '(1998)',
 '(1995)',
 '(1995)',
 '(1987)',
 '(1985)',
 '(2007)',
 '(2006)',
 '(2010)',
 '(2011)',
 '(1989)',
 '(1992)',
 '(1996)',
 '(1968)',
 '(2008)',
 '(1978)',
 '(1998)',
 '(1988)',
 '(1993)',
 '(2012)',
 '(2007)',
 '(1979)',
 '(1997)',
 '(2010)',
 '(1991)',
 '(1996)',
 '(2014)',
 '(2008)',
 '(2006)',
 '(1994)',
 '(1993)',
 '(2015)',
 '(1985)',
 '(2001)',
 '(2014)',
 '(1997)',
 '(1986)',
 '(2017)',
 '(1995)',
 '(2004)',
 '(1984)',
 '(2003)',
 '(2004)',
 '(1993)',
 '(1981)',
 '(2000)',
 '(2004)',
 '(2010)',
 '(1992)',
 '(1989)',
 '(2004)',
 '(1986)',
 '(2008)',
 '(2018)',
 '(2017)',
 '(1964)',
 '(1976)',
 '(2017)',
 '(1972)',
 '(2014)',
 '(2003)',
 '(1971)',
 '(2015)',
 '(1990)',
 '(1992)',
 '(1971)',
 '(2014)',
 '(2003)',

In [30]:
# Look at a single extracted year string
years[0]

'(1986)'

### Removing the brackets

In [31]:
# One way to remove the brackets is to slice off the first and last character of the string
years[0][1:-1]

'1986'

In [32]:
# However, this will break if the format of the year changes

In [33]:
# Alternatively, we can do it with the help of the strip() method (this is robust)

# It removes leading and trailing symbols from a string
# By default, it removes whitespace, but we can specify other symbols to strip

In [34]:
# Removing '(' -- only '(' characters at the ends of the string are stripped
years[0].strip('(')

'1986)'

In [35]:
# Removing ')' -- only ')' characters at the ends of the string are stripped
years[0].strip(')')

'(1986'

In [36]:
# Combining both: every character listed in the argument is stripped from either end
years[0].strip('()')

'1986'

In [37]:
# Replace every year string with its bracket-free version
years = [
    raw_year.strip('()')
    for raw_year in years
]
years

['1986',
 '2002',
 '2002',
 '1989',
 '2010',
 '1971',
 '2017',
 '1986',
 '1990',
 '2004',
 '2005',
 '2017',
 '1992',
 '1971',
 '1986',
 '1997',
 '2012',
 '1999',
 '2005',
 '1998',
 '2014',
 '2016',
 '1997',
 '1988',
 '1998',
 '1995',
 '1995',
 '1987',
 '1985',
 '2007',
 '2006',
 '2010',
 '2011',
 '1989',
 '1992',
 '1996',
 '1968',
 '2008',
 '1978',
 '1998',
 '1988',
 '1993',
 '2012',
 '2007',
 '1979',
 '1997',
 '2010',
 '1991',
 '1996',
 '2014',
 '2008',
 '2006',
 '1994',
 '1993',
 '2015',
 '1985',
 '2001',
 '2014',
 '1997',
 '1986',
 '2017',
 '1995',
 '2004',
 '1984',
 '2003',
 '2004',
 '1993',
 '1981',
 '2000',
 '2004',
 '2010',
 '1992',
 '1989',
 '2004',
 '1986',
 '2008',
 '2018',
 '2017',
 '1964',
 '1976',
 '2017',
 '1972',
 '2014',
 '2003',
 '1971',
 '2015',
 '1990',
 '1992',
 '1971',
 '2014',
 '2003',
 '1993',
 '2018',
 '2010',
 '1995',
 '2002',
 '2019',
 '2012',
 '2002',
 '2008',
 '1997',
 '1985',
 '2008',
 '2011',
 '2011',
 '1987',
 '1996',
 '1987',
 '2017',
 '2006',
 '2017',
 

In [38]:
# Turn each year string into an integer
years = list(map(int, years))
years

[1986,
 2002,
 2002,
 1989,
 2010,
 1971,
 2017,
 1986,
 1990,
 2004,
 2005,
 2017,
 1992,
 1971,
 1986,
 1997,
 2012,
 1999,
 2005,
 1998,
 2014,
 2016,
 1997,
 1988,
 1998,
 1995,
 1995,
 1987,
 1985,
 2007,
 2006,
 2010,
 2011,
 1989,
 1992,
 1996,
 1968,
 2008,
 1978,
 1998,
 1988,
 1993,
 2012,
 2007,
 1979,
 1997,
 2010,
 1991,
 1996,
 2014,
 2008,
 2006,
 1994,
 1993,
 2015,
 1985,
 2001,
 2014,
 1997,
 1986,
 2017,
 1995,
 2004,
 1984,
 2003,
 2004,
 1993,
 1981,
 2000,
 2004,
 2010,
 1992,
 1989,
 2004,
 1986,
 2008,
 2018,
 2017,
 1964,
 1976,
 2017,
 1972,
 2014,
 2003,
 1971,
 2015,
 1990,
 1992,
 1971,
 2014,
 2003,
 1993,
 2018,
 2010,
 1995,
 2002,
 2019,
 2012,
 2002,
 2008,
 1997,
 1985,
 2008,
 2011,
 2011,
 1987,
 1996,
 1987,
 2017,
 2006,
 2017,
 1994,
 1989,
 2014,
 1973,
 1985,
 1982,
 2015,
 1984,
 2000,
 2003,
 1994,
 1994,
 1994,
 2014,
 2000,
 1987,
 2007,
 1990,
 1981,
 1995,
 2011,
 2018,
 1981,
 1986,
 1992,
 1999,
 1991,
 1988,
 2015]