# (1) Importing required packages

In [87]:
# Loading all the required packages
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd

# (2) Making a GET request

In [2]:
# URL of the website which is to be scraped
base_url = 'https://en.wikipedia.org/wiki/Superhero_film'

# A timeout is passed explicitly: without one, requests can wait forever on a stalled server
response = requests.get(base_url, timeout=10) # Making request
response.status_code # Obtaining the status code (200 means the request succeeded)

200

# (3) Converting HTML to Beautiful Soup object

- `.content` holds the raw HTML of the response, which is then passed to the Beautiful Soup library together with the **lxml** parser <br>
- **lxml** is generally the fastest of the 3 parsers commonly used with Beautiful Soup (`html.parser`, `lxml` and `html5lib`). <br>
- `.prettify()` provides the parse tree in a readable, indented form. The return type is **string**.

In [3]:
# Parsing the HTML of the response with the lxml parser
soup = BeautifulSoup(response.content, 'lxml')
# prettify() returns the whole page as one indented string;
# only its beginning is printed so that the cell output stays readable
print(soup.prettify()[:1000])

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Superhero film - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"Xos@6wpAIEIAAGLnX4UAAADK","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Superhero_film","wgTitle":"Superhero film","wgCurRevisionId":947133432,"wgRevisionId":947133432,"wgArticleId":1275470,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with dead external links","Articles with dead external links from September 2010","Articles with short description","Use mdy dates from February 20

# (4) Searching elements from the parse tree

### Searching by tag names

In [4]:
soup.find('h2') # Returns the first occurrence of the given tag (a single tag, not a list)

<h2 id="mw-toc-heading">Contents</h2>

In [5]:
soup.find_all('h2') # Returns all occurrences of given tag
# Return type = ResultSet, which behaves like a list

[<h2 id="mw-toc-heading">Contents</h2>,
 <h2><span class="mw-headline" id="History">History</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Superhero_film&amp;action=edit&amp;section=1" title="Edit section: History">edit</a><span class="mw-editsection-bracket">]</span></span></h2>,
 <h2><span class="mw-headline" id="Animated">Animated</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Superhero_film&amp;action=edit&amp;section=6" title="Edit section: Animated">edit</a><span class="mw-editsection-bracket">]</span></span></h2>,
 <h2><span class="mw-headline" id="Parody">Parody</span><span class="mw-editsection"><span class="mw-editsection-bracket">[</span><a href="/w/index.php?title=Superhero_film&amp;action=edit&amp;section=7" title="Edit section: Parody">edit</a><span class="mw-editsection-bracket">]</span></span></h2>,
 <h2><span class="mw-headline" id="Box_office_recep

In [6]:
# If given tag is absent in the HTML, then find() returns None
# Note: find_all() behaves differently - it returns an empty list instead of None
print(soup.find('unvalid'))

None


In [7]:
type(soup.find('a')) # A single search result is a bs4 Tag object

bs4.element.Tag

In [71]:
isinstance(soup.find_all('a'), list) # The result of find_all() can be used like a regular list

True

In [73]:
# Taking the first <div> of the page as an example element
var = soup.find('div')
var.name # Used to find the name of tag stored in an element

'div'

In [17]:
# Storing all tables in a variable
# (despite its singular name, `table` holds every <table> found on the page)
table = soup.find_all('table')
table[0] # Displaying the first table

<table class="box-Unreferenced_section plainlinks metadata ambox ambox-content ambox-Unreferenced" role="presentation"><tbody><tr><td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This section <b>does not <a href="/wiki/Wikipedia:Citing_sources" title="Wikipedia:Citing sources">cite</a> any <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">sources</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="ht

In [69]:
len(table) # Number of <table> tags found on the page

3

## Searching by attribute names

In [23]:
# Attributes can be passed as keyword arguments: first <div> whose id is 'mw-head-base'
soup.find('div', id = 'mw-head-base')

<div class="noprint" id="mw-head-base"></div>

In [24]:
# `class_` is written with a trailing underscore because `class` is a reserved word in Python
soup.find('div', class_ = 'thumb tright')

<div class="thumb tright"><div class="thumbinner" style="width:222px;"><a class="image" href="/wiki/File:CaptainMarvelLobbyCard002A.jpg"><img alt="" class="thumbimage" data-file-height="555" data-file-width="781" decoding="async" height="156" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/220px-CaptainMarvelLobbyCard002A.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/330px-CaptainMarvelLobbyCard002A.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/440px-CaptainMarvelLobbyCard002A.jpg 2x" width="220"/></a> <div class="thumbcaption"><div class="magnify"><a class="internal" href="/wiki/File:CaptainMarvelLobbyCard002A.jpg" title="Enlarge"></a></div><i><a href="/wiki/Adventures_of_Captain_Marvel" title="Adventures of Captain Marvel">Adventures of Captain Marvel</a></i>, <a href="/wiki/Republic_Pictures" title="Republic Pictures">Republic Pictures</a>, 1941</d

## Passing attributes in a dictionary

In [27]:
# Every attribute/value pair of the dictionary has to match
soup.find('h1', attrs = {'id': 'firstHeading', 'class': 'firstHeading', 'lang': 'en'})

<h1 class="firstHeading" id="firstHeading" lang="en">Superhero film</h1>

# (5) Navigating the tree

In [11]:
# Returns the first <td> tag itself, together with everything nested inside it
soup.find('td') 

<td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td>

In [12]:
# .contents returns only what is inside the given tag (its direct children), as a list
table_content = soup.find('td').contents
print(table_content)

[<div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div>]


### `.parent` Method

In [19]:
# Returns the parent tag of the given tag, i.e. the tag that directly encloses it
# (here: the <tr> around the first <td>)
soup.find('td').parent

<tr><td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This section <b>does not <a href="/wiki/Wikipedia:Citing_sources" title="Wikipedia:Citing sources">cite</a> any <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">sources</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Superhero_film&amp;action=edit">improve this section</a> by <a href="/wiki/Help:Intro

### `.children` Method

In [14]:
# Similarly, .children can be used to find its child tags
# It gives an iterator rather than a list, so the children are not displayed directly
soup.find('td').children

<list_iterator at 0x7fb5b492cf90>

To get a list of all children of an element, we should use `.contents`. <br>
`.children` is an iterator over the list obtained by the above attribute. <br>
Due to this, we can use it to iterate over all the children of an element using a for loop.

In [15]:
# Looping over the iterator prints each direct child of the first <tbody> in turn
for i in soup.find('tbody').children:
    print(i)

<tr><td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This section <b>does not <a href="/wiki/Wikipedia:Citing_sources" title="Wikipedia:Citing sources">cite</a> any <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">sources</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Superhero_film&amp;action=edit">improve this section</a> by <a href="/wiki/Help:Intro

## Chaining `.parent` Method

In [16]:
# .parent can be chained: <td> -> <tr> -> <tbody>
soup.find('td').parent.parent

<tbody><tr><td class="mbox-image"><div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" decoding="async" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div></td><td class="mbox-text"><div class="mbox-text-span">This section <b>does not <a href="/wiki/Wikipedia:Citing_sources" title="Wikipedia:Citing sources">cite</a> any <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">sources</a></b>.<span class="hide-when-compact"> Please help <a class="external text" href="https://en.wikipedia.org/w/index.php?title=Superhero_film&amp;action=edit">improve this section</a> by <a href="/wiki/Hel

# (6) Extracting data from the HTML parse tree

In [38]:
# Picking one link as an example
# NOTE: the index 16 is hard-coded, so which link is selected depends on the page's current HTML
a = soup.find_all('a')[16]
a

<a href="/wiki/Superhero_comics" title="Superhero comics">superhero comics</a>

In [53]:
# .attrs returns a dictionary of the tag's attributes along with their values
a.attrs

{'href': '/wiki/Superhero_comics', 'title': 'Superhero comics'}

## Extracting attribute values

**Method 1**: 
- For multi-valued attributes (such as `class`), return type is a **list**
- If the passed attribute is absent, then it raises a `KeyError`.

In [48]:
a['title'] # Dictionary-style access; raises KeyError if the attribute is absent

'Superhero comics'

**Method 2**: **`.get()`** Method *{More preferred method}*
- For multi-valued attributes (such as `class`), return type is a **list**
- If the passed attribute is absent, then it returns `None`.

In [50]:
a.get('title') # Same value as a['title'] when the attribute exists

'Superhero comics'

In [52]:
print(a.get('class')) # This link has no "class" attribute, so None is printed

None


In [51]:
# A default value can be passed as the second argument;
# it is returned in case the attribute is absent
a.get('class', 'Attribute is not found')

'Attribute is not found'

# (7) Extracting the text from the HTML parse tree

In [57]:
# Picking one paragraph as an example
# NOTE: the index 10 is hard-coded, so which paragraph is selected depends on the page's current HTML
p = soup.find_all('p')[10]
p

<p>After the comic book boom and the success of several comic book adaptation films (including superhero films) in the 1990s,<sup class="reference" id="cite_ref-16"><a href="#cite_note-16">[16]</a></sup> the first decade of the 21st century brought increased interest in superhero films and some of their most profitable franchises, many from Marvel Enterprises. The success of the <i><a href="/wiki/X-Men_(TV_series)" title="X-Men (TV series)">X-Men</a></i> TV series had made <a class="mw-redirect" href="/wiki/20th_Century_Fox" title="20th Century Fox">20th Century Fox</a> license the film rights in 1994.<sup class="reference" id="cite_ref-17"><a href="#cite_note-17">[17]</a></sup> After the success of <i>Men in Black</i> in 1997, <a href="/wiki/Columbia_Pictures" title="Columbia Pictures">Columbia Pictures</a> licensed the film rights of <i>Spider-Man</i> in 1999.<sup class="reference" id="cite_ref-18"><a href="#cite_note-18">[18]</a></sup> <a class="mw-redirect" href="/wiki/20th_Century

### `.text` Method
**`.text`** returns all the text present inside the given tag, including the text of its nested tags, as a single string. **`.get_text()`** does the same work

In [63]:
# The markup is dropped; only the text of the paragraph and of its nested tags remains
print(p.text) # p.get_text() yields the exact same result

After the comic book boom and the success of several comic book adaptation films (including superhero films) in the 1990s,[16] the first decade of the 21st century brought increased interest in superhero films and some of their most profitable franchises, many from Marvel Enterprises. The success of the X-Men TV series had made 20th Century Fox license the film rights in 1994.[17] After the success of Men in Black in 1997, Columbia Pictures licensed the film rights of Spider-Man in 1999.[18] 20th Century Fox's X-Men (2000) became a film franchise by its surprise hit,[19] and M. Night Shyamalan's Unbreakable (2000)[20] also succeeded and added an element of more urban naturalism.[21] Later, one of the largest blockbusters of all time was released with Sam Raimi's Spider-Man (2002).[22] With high ticket and DVD sales, several new superhero films were released every year in the 2000s, including Daredevil (2003), The League of Extraordinary Gentlemen (2003), Hulk (2003), Catwoman (2004), H

### `.string` Method
**`.string`** returns **`None`** if the given tag has more than one child (for example, text mixed with nested tags), because it is then unclear which string is meant

In [62]:
print(p.string) # None, because this paragraph contains several children

None


### `.strings` Method
All strings inside an element can be accessed separately by using the **`.strings` iterator**

In [67]:
# Each piece of text is yielded separately, with its surrounding whitespace kept
for j in p.strings:
    print(j)

After the comic book boom and the success of several comic book adaptation films (including superhero films) in the 1990s,
[16]
 the first decade of the 21st century brought increased interest in superhero films and some of their most profitable franchises, many from Marvel Enterprises. The success of the 
X-Men
 TV series had made 
20th Century Fox
 license the film rights in 1994.
[17]
 After the success of 
Men in Black
 in 1997, 
Columbia Pictures
 licensed the film rights of 
Spider-Man
 in 1999.
[18]
 
20th Century Fox
's 
X-Men
 (2000) became 
a film franchise
 by its surprise hit,
[19]
 and 
M. Night Shyamalan
's 
Unbreakable
 (2000)
[20]
 also succeeded and added an element of more urban 
naturalism
.
[21]
 Later, one of the largest blockbusters of all time was released with 
Sam Raimi
's 
Spider-Man
 (2002).
[22]
 With high ticket and DVD sales, several new superhero films were released every year in the 2000s, including 
Daredevil
 (2003), 
The League of Extraordinary Gentle

### `.stripped_strings` Method
The extra whitespace can be removed by using the **`.stripped_strings` iterator** instead.

In [68]:
# Same pieces of text, but with leading/trailing whitespace removed
# (pieces consisting only of whitespace are skipped)
for k in p.stripped_strings:
    print(k)

After the comic book boom and the success of several comic book adaptation films (including superhero films) in the 1990s,
[16]
the first decade of the 21st century brought increased interest in superhero films and some of their most profitable franchises, many from Marvel Enterprises. The success of the
X-Men
TV series had made
20th Century Fox
license the film rights in 1994.
[17]
After the success of
Men in Black
in 1997,
Columbia Pictures
licensed the film rights of
Spider-Man
in 1999.
[18]
20th Century Fox
's
X-Men
(2000) became
a film franchise
by its surprise hit,
[19]
and
M. Night Shyamalan
's
Unbreakable
(2000)
[20]
also succeeded and added an element of more urban
naturalism
.
[21]
Later, one of the largest blockbusters of all time was released with
Sam Raimi
's
Spider-Man
(2002).
[22]
With high ticket and DVD sales, several new superhero films were released every year in the 2000s, including
Daredevil
(2003),
The League of Extraordinary Gentlemen
(2003),
Hulk
(2003),
Catwoma

# (8) Extracting data from the links present in the HTML parse tree

In [74]:
# Find all links (<a> tags) on the page
links = soup.find_all('a')
links

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a href="/wiki/Superhero_Movie" title="Superhero Movie">Superhero Movie</a>,
 <a class="image" href="/wiki/File:CaptainMarvelLobbyCard002A.jpg"><img alt="" class="thumbimage" data-file-height="555" data-file-width="781" decoding="async" height="156" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/220px-CaptainMarvelLobbyCard002A.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/330px-CaptainMarvelLobbyCard002A.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/440px-CaptainMarvelLobbyCard002A.jpg 2x" width="220"/></a>,
 <a class="internal" href="/wiki/File:CaptainMarvelLobbyCard002A.jpg" title="Enlarge"></a>,
 <a href="/wiki/Adventures_of_Captain_Marvel" title="Adventures of Captain Marvel">Adventures of Captain Mar

In [75]:
# Dropping the links without "href" attribute
# (.get() returns None for a missing attribute; None is compared with `is not`, as PEP 8 recommends)
clean_links = [link for link in links if link.get('href') is not None]
clean_links

[<a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#p-search">Jump to search</a>,
 <a href="/wiki/Superhero_Movie" title="Superhero Movie">Superhero Movie</a>,
 <a class="image" href="/wiki/File:CaptainMarvelLobbyCard002A.jpg"><img alt="" class="thumbimage" data-file-height="555" data-file-width="781" decoding="async" height="156" src="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/220px-CaptainMarvelLobbyCard002A.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/330px-CaptainMarvelLobbyCard002A.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/c4/CaptainMarvelLobbyCard002A.jpg/440px-CaptainMarvelLobbyCard002A.jpg 2x" width="220"/></a>,
 <a class="internal" href="/wiki/File:CaptainMarvelLobbyCard002A.jpg" title="Enlarge"></a>,
 <a href="/wiki/Adventures_of_Captain_Marvel" title="Adventures of Captain Marvel">Adventures of Captain Marvel</a>,
 <a href="

In [78]:
# Collecting the "title" attribute of every link that has one
# (each attribute is looked up only once, and None is compared with `is not`)
clean_titles = [
    title
    for title in (link.get('title') for link in clean_links)
    if title is not None
]
clean_titles

['Superhero Movie',
 'Enlarge',
 'Adventures of Captain Marvel',
 'Republic Pictures',
 'Superhero',
 'Action film',
 'Adventure film',
 'Fantasy film',
 'Science fiction film',
 'Origin story',
 'Supervillain',
 'Archenemy',
 'Superhero comics',
 'RoboCop (franchise)',
 'The Meteor Man (film)',
 'Unbreakable (film series)',
 'The Incredibles',
 'Hancock (film)',
 'They Call Me Jeeg',
 'The Green Hornet',
 'The Green Hornet (radio series)',
 'The Green Hornet (TV series)',
 'Underdog (2007 film)',
 'The Powerpuff Girls (film)',
 'Animated television series',
 'Tokusatsu',
 'Anime',
 'Manga',
 '20th Century Fox',
 'X-Men (film series)',
 'Sam Raimi',
 'Spider-Man in film',
 'Pixar',
 'The Incredibles',
 'Christopher Nolan',
 'The Dark Knight Trilogy',
 'Marvel Cinematic Universe',
 'Iron Man (2008 film)',
 'DC Extended Universe',
 'Man of Steel (film)',
 'Academy Award',
 'Action film',
 'Horror film',
 'Fantasy film',
 'Comedy film',
 'Edit section: History',
 'Edit section: Early year

# (9) Getting absolute links to all the links present in the page

In [83]:
# Relative URL of the link to MCU wiki
# NOTE: the index 39 is hard-coded, so which link is selected depends on the page's current HTML
clean_links[39].get('href')

'/wiki/Marvel_Cinematic_Universe'

In [86]:
# urljoin() combines the URL of the page with the relative link found on it
mcu_link = urljoin(base_url, clean_links[39].get('href'))
mcu_link # Absolute URL of the link to MCU wiki

'https://en.wikipedia.org/wiki/Marvel_Cinematic_Universe'

# (10) Storing data from tables in a structured manner

In [120]:
# New URL with many tables
base_url2 = 'https://en.wikipedia.org/wiki/List_of_highest-grossing_films'
# A timeout is passed explicitly: without one, requests can wait forever on a stalled server
response2 = requests.get(base_url2, timeout=10)
response2.status_code # 200 means the request succeeded

200

In [137]:
soup2 = BeautifulSoup(response2.content, 'lxml') # Parse tree of the second page

### Method 1 (Tedious method)

In [122]:
# Collecting every <table> of the page and counting them
tables = soup2.find_all('table')
len(tables)

92

In [126]:
tables[0] # First table of the page, captioned "Highest-grossing films"

<table class="wikitable sortable plainrowheaders" style="margin:auto; margin:auto;">
<caption>Highest-grossing films<sup class="reference" id="cite_ref-13"><a href="#cite_note-13">[13]</a></sup>
</caption>
<tbody><tr>
<th scope="col">Rank
</th>
<th scope="col">Peak
</th>
<th scope="col">Title
</th>
<th scope="col">Worldwide gross
</th>
<th scope="col">Year
</th>
<th class="unsortable" scope="col">Reference(s)
</th></tr>
<tr>
<td>1
</td>
<td>1
</td>
<th scope="row"><i><a href="/wiki/Avengers:_Endgame" title="Avengers: Endgame">Avengers: Endgame</a></i>
</th>
<td align="right">$2,797,800,564
</td>
<td data-sort-value="2019-04" style="text-align:center;">2019
</td>
<td style="text-align:center;"><sup class="reference" id="cite_ref-endgame_14-0"><a href="#cite_note-endgame-14">[# 1]</a></sup><sup class="reference" id="cite_ref-endgame_peak_15-0"><a href="#cite_note-endgame_peak-15">[# 2]</a></sup>
</td></tr>
<tr>
<td>2
</td>
<td>1
</td>
<th scope="row"><i><a href="/wiki/Avatar_(2009_film)"

In [131]:
# Text of the first five header cells (<th>) of the first table;
# these will serve as column headings for our required table
headings = [th.get_text() for th in tables[0].find_all('th')[:5]]
headings

['Rank\n', 'Peak\n', 'Title\n', 'Worldwide gross\n', 'Year\n']

In [132]:
# Stripping the "\n" characters from both ends of every heading
headings = list(map(lambda text: text.strip('\n'), headings))
headings

['Rank', 'Peak', 'Title', 'Worldwide gross', 'Year']

### Method 2: Using Pandas
`pd.read_html()` is the Pandas function used for table extraction from HTML. <br>
By default it parses the page with **lxml** and falls back to Beautiful Soup (with **html5lib**) if that fails. <br>
It is a fast and efficient method, with an added benefit of replacing all missing values with `NaN`. <br>

In [134]:
# read_html() is given the URL here, so pandas fetches the page itself (independently of response2)
all_tables = pd.read_html(base_url2)

In [135]:
# read_html() returns a list holding one DataFrame per parsed table
print(type(all_tables)) # "List" type
print(type(all_tables[0])) # "DataFrame" type

<class 'list'>
<class 'pandas.core.frame.DataFrame'>


In [138]:
# First table which stores highest worldwide gross
# (the name refers to the same DataFrame object as all_tables[0]; no copy is made)
highest_gross_table = all_tables[0]
highest_gross_table.head()

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year,Reference(s)
0,1,1,Avengers: Endgame,"$2,797,800,564",2019,[# 1][# 2]
1,2,1,Avatar,"$2,790,439,000",2009,[# 3][# 4]
2,3,1,Titanic,"$2,194,439,542",1997,[# 5][# 6]
3,4,3,Star Wars: The Force Awakens,"$2,068,223,624",2015,[# 7][# 8]
4,5,4,Avengers: Infinity War,"$2,048,359,754",2018,[# 9][# 10]


In [143]:
# Dropping the "Reference(s)" column
# A new DataFrame is built from all_tables[0] instead of using drop(..., inplace=True):
# the parsed table stays intact and this cell can be re-run without raising a KeyError
highest_gross_table = all_tables[0].drop(columns='Reference(s)')
highest_gross_table.head()

Unnamed: 0,Rank,Peak,Title,Worldwide gross,Year
0,1,1,Avengers: Endgame,"$2,797,800,564",2019
1,2,1,Avatar,"$2,790,439,000",2009
2,3,1,Titanic,"$2,194,439,542",1997
3,4,3,Star Wars: The Force Awakens,"$2,068,223,624",2015
4,5,4,Avengers: Infinity War,"$2,048,359,754",2018


We can add filters to select only tables with given attributes and values, by using <br>
`pd.read_html(url, attrs = {'class': 'some_class_name'})`