In [2]:
import requests
from bs4 import BeautifulSoup

In [4]:
# Sample document parsed into `soup` for the examples in this notebook: one
# "title" paragraph, two "story" paragraphs and three <a class="sister"> links.
# Note there is no closing </body> tag; the prettify() output further down
# shows the parser closing <body> on its own.
html_doc = """
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>
"""

In [6]:
# Parse the sample HTML string with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:
# prettify() renders the parsed tree as an indented string, one tag or text
# node per line; print() is needed so the newlines are shown rather than escaped.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  <p class="story">
   ...
  </p>
 </body>
</html>



# Using HTML tags to select elements in the document

In [8]:
# Attribute access by tag name returns the <title> tag itself (markup included),
# not just its text.
soup.title

<title>The Dormouse's story</title>

In [11]:
# Same attribute-style navigation: the whole <body> element with all its children.
soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [9]:
# find_all() collects every <p> tag in the document (three in the sample HTML).
p_tags = soup.find_all('p')

In [10]:
# Display the collected tags; each element is still a full tag, markup included.
p_tags

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [12]:
# get_text() drops the markup and keeps only the human-readable text, including
# the text of nested tags such as the <a> links inside the story paragraph.
for p in p_tags:
    print(p.get_text())

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...


In [15]:
# Collect every <a> (link) tag in the document.
a_tags = soup.find_all('a')

In [16]:
# Print only the visible link text of each <a> tag.
for a in a_tags:
    print(a.get_text())

Elsie
Lacie
Tillie


In [17]:
# get('href') reads an attribute of the tag (the link target) instead of its text.
for a in a_tags:
    print(a.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [20]:
# .parent moves up from <title> to <head>; .string still yields the title text
# here because <head> contains nothing but that single <title> child.
soup.title.parent.string

"The Dormouse's story"

In [21]:
# .name gives the tag name of the parent element.
soup.title.parent.name

'head'

In [25]:
# <head> -> <html> -> the BeautifulSoup object itself, whose name is reported
# as '[document]' (the root of the tree).
soup.head.parent.parent.name

'[document]'

In [27]:
# soup.text is the document's text with markup removed; str.count() counts
# substring occurrences, so words merely containing 'were' would also match.
soup.text.count('were')

2

# Simple website query

In [32]:
import re
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() makes an HTTP error fail loudly instead of silently
# counting words on an error page.
ironhack_page = requests.get('https://www.ironhack.com/en', timeout=10)
ironhack_page.raise_for_status()
# \w+ tokenises the raw page source (markup included); the comparison is exact
# and case-sensitive, so only the lowercase token 'bootcamp' is counted.
re.findall(r'\w+', ironhack_page.text).count('bootcamp')

63

# CSS method
Each kind of HTML attribute has its own selector symbol: a class is prefixed with '.', an id with '#', etc. If we don't pass any symbol, we're searching for tags.

In [35]:
# '.sister' is a CSS class selector: every element whose class is "sister".
soup.select('.sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [36]:
# '#link1' is a CSS id selector; select() still returns a list, here with one element.
soup.select('#link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [37]:
# A bare name with no prefix symbol selects by tag: all <a> elements.
for a in soup.select('a'):
    print(a.get_text())

Elsie
Lacie
Tillie


In [42]:
# Index into the list returned by select() to work with a single element.
print(soup.select('a')[0].get_text())

Elsie


In [46]:
# Tag and class combined: only <p> elements that carry class "story".
soup.select('p.story')

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [45]:
# Text of the first "story" paragraph; shown as a repr, so the line breaks
# appear as literal \n in the output.
soup.select('p.story')[0].get_text()

'Once upon a time there were three little sisters; and their names were\nElsie,\nLacie and\nTillie;\nand they lived at the bottom of a well.'

# Checking for understanding
You are provided with the HTML code below.

Write code to print the following contents (not including the HTML tags, only human-readable text):

- All the "fun facts".
- The names of all the places.
- The content (name and fact) of all the cities (only cities, not countries!)
- The names (not facts!) of all the cities (not countries!)

In [48]:
# Exercise document: two <div class="city"> blocks and one <div class="country">,
# each holding an <h2> name and a <p> fact. Note that <head> contains bare text
# ("Geography") rather than a <title> element.
x = """
<!DOCTYPE html>
<html>
  <head>
    Geography
  </head>
  <body>
    <div class="city">
      <h2>London</h2>
      <p>London is the most popular tourist destination in the world.</p>
    </div>

    <div class="city">
      <h2>Paris</h2>
      <p>Paris was originally a Roman City called Lutetia.</p>
    </div>

    <div class="country">
      <h2>Spain</h2>
      <p>Spain produces 43,8% of all the world's Olive Oil.</p>
    </div>
  </body>
</html>
"""

In [49]:
# Parse the exercise HTML into its own tree, separate from `soup`.
soup1 = BeautifulSoup(x, 'html.parser')

In [55]:
# Fun facts: in this document every fact sits in a <p> tag, for cities and
# countries alike, so selecting all <p> elements is enough.
fun_facts = soup1.find_all('p')
for f in fun_facts:
    print(f.get_text())

London is the most popular tourist destination in the world.
Paris was originally a Roman City called Lutetia.
Spain produces 43,8% of all the world's Olive Oil.


In [58]:
# Place names: each name is the <h2> heading of its block.
places = soup1.find_all('h2')
for p in places:
    print(p.get_text())

London
Paris
Spain


In [62]:
# Name & fact of all cities
# 'div.city' keeps only the city blocks and leaves out <div class="country">.
facts_citites = soup1.select('div.city')
for fc in facts_citites:
    # The div's text includes the whitespace between its child tags, which is
    # why blank lines surround each entry in the output.
    print(fc.text)


London
London is the most popular tourist destination in the world.


Paris
Paris was originally a Roman City called Lutetia.



In [90]:
# Names of the cities
# fc.h2 navigates to the <h2> inside each city div; get_text() drops the tags.
for fc in facts_citites:
    print(fc.h2.get_text())

London
Paris


In [91]:
# For comparison: select() returns a list of matching tags, so each line of
# output is a bracketed one-element list rather than plain text.
for fc in facts_citites:
    print(fc.select('h2'))

[<h2>London</h2>]
[<h2>Paris</h2>]


In [92]:
# For comparison: attribute access returns the single tag itself, markup included.
for fc in facts_citites:
    print(fc.h2)

<h2>London</h2>
<h2>Paris</h2>


# Scraping IMDb Top 250 Movies

In [93]:
import pandas as pd

In [109]:
url = 'https://www.imdb.com/chart/top/'
# Sent with the request; presumably so the chart is served in English — TODO confirm.
headers = {"Accept-Language": "en-US,en;q=0.5"}
# Without a timeout a stalled connection blocks the kernel indefinitely.
page = requests.get(url, headers=headers, timeout=10)
# Fail here on a 4xx/5xx response rather than parsing an error page and
# getting empty selections in the cells below.
page.raise_for_status()

In [110]:
# Parse the raw response bytes of the chart page into a third, separate tree.
soup2 = BeautifulSoup(page.content, 'html.parser')

In [135]:
# Getting the first title / relevant content
# 'td.titleColumn > a' matches <a> tags that are direct children of a
# <td class="titleColumn">; the link text is the movie title.
soup2.select('td.titleColumn > a')[0].text

'The Shawshank Redemption'

In [134]:
# Getting the director and stars
# Both live in the link's title attribute as a single comma-separated string.
soup2.select('td.titleColumn > a')[0]['title']

'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'

In [180]:
# Getting the rank
# Index 98 is the 99th cell; its text holds the rank followed by a dot.
rank = soup2.select('td.titleColumn')[98].text
# group(0) is the whole match, so the trailing dot is kept ('99.');
# the digits alone are in capture group 1.
rank1 = re.search(r'(\d+)\.', rank).group(0)
rank1

'99.'

In [137]:
# Getting the year
# The <span> that is a direct child of the title cell holds the year, parentheses included.
soup2.select('td.titleColumn > span')[0].text

'(1994)'

## For loop to get movies

In [199]:
# One empty accumulator list per scraped field; the scraping loop appends to these.
rank, titles, year, dir_actor = [], [], [], []
# Number of td.titleColumn cells on the page, used as the loop bound.
length = len(soup2.select('td.titleColumn'))

In [200]:
# Run each CSS query once up front. Calling soup2.select() inside the loop
# re-scanned the whole document four times per movie for the same results.
title_cells = soup2.select('td.titleColumn')
title_links = soup2.select('td.titleColumn > a')
year_spans = soup2.select('td.titleColumn > span')
for i in range(length):
    # group(0) is the full match, trailing dot included (e.g. '99.').
    rank.append(re.search(r'(\d+)\.', title_cells[i].text).group(0))
    titles.append(title_links[i].text)
    year.append(year_spans[i].text)
    # The link's title attribute carries "Director (dir.), Star, Star".
    dir_actor.append(title_links[i]['title'])

## Creating a dataframe from movies data

In [206]:
# Cleaning: remove the parentheses around each year and the dot from each rank.
# (Directors and actors are separated in a later cell.)
year_cl = [text.strip(')').strip('(') for text in year]
rank_cl = [''.join(text.split('.')) for text in rank]

In [209]:
# Split each "Director (dir.), Star 1, Star 2" string into three parallel lists.
director = []
star1 = []
star2 = []
for movie in dir_actor:
    # Strip every piece: splitting on ',' alone leaves a leading space on each
    # name after the first (' Tim Robbins').
    split_list = [part.strip() for part in movie.split(',')]
    # Pad with None so an entry listing fewer than two stars yields a missing
    # value instead of an IndexError that aborts the whole loop.
    split_list += [None] * (3 - len(split_list))
    director.append(split_list[0].replace(' (dir.)', ''))
    star1.append(split_list[1])
    star2.append(split_list[2])

In [210]:
# Assemble the cleaned lists into one table; each keyword becomes a column
# name, in the order given.
movies = pd.DataFrame(data=dict(
    rank=rank_cl,
    movie_title=titles,
    director=director,
    actor1=star1,
    actor2=star2,
    year=year_cl,
))

In [211]:
# Display the assembled table (the output below elides the middle rows).
movies

Unnamed: 0,rank,movie_title,director,actor1,actor2,year
0,1,The Shawshank Redemption,Frank Darabont,Tim Robbins,Morgan Freeman,1994
1,2,The Godfather,Francis Ford Coppola,Marlon Brando,Al Pacino,1972
2,3,The Godfather: Part II,Francis Ford Coppola,Al Pacino,Robert De Niro,1974
3,4,The Dark Knight,Christopher Nolan,Christian Bale,Heath Ledger,2008
4,5,12 Angry Men,Sidney Lumet,Henry Fonda,Lee J. Cobb,1957
...,...,...,...,...,...,...
245,246,Miracle in Cell No. 7,Mehmet Ada Öztekin,Aras Bulut Iynemli,Nisa Sofiya Aksongur,2019
246,247,Hera Pheri,Priyadarshan,Akshay Kumar,Suniel Shetty,2000
247,248,Neon Genesis Evangelion: The End of Evangelion,Hideaki Anno,Megumi Ogata,Megumi Hayashibara,1997
248,249,The Battle of Algiers,Gillo Pontecorvo,Brahim Hadjadj,Jean Martin,1966
