In [1]:
html_doc = """
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
</html>
"""

In [4]:
from bs4 import BeautifulSoup

In [6]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [7]:
type(soup)

bs4.BeautifulSoup

# Option 1: Using Beautiful soup "the html way"

In [32]:
soup.title

<title>The Dormouse's story</title>

In [13]:
soup.title.name

'title'

In [14]:
soup.title.string

"The Dormouse's story"

In [15]:
soup.title.parent

<head><title>The Dormouse's story</title></head>

In [16]:
soup.title.parent.name

'head'

In [17]:
soup.title.parent.string

"The Dormouse's story"

In [18]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [19]:
soup.p['class']

['title']

In [21]:
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [27]:
soup.find_all('a')[0].get('href')

'http://example.com/elsie'

In [31]:
for atag in soup.find_all('a'):
    print(atag.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


# Option 2: Using Beautiful soup "the CSS way"

In [36]:
soup.select('#link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [39]:
soup.select('a')[0].text

'Elsie'

In [40]:
soup.select('a')[0].get_text()

'Elsie'

In [41]:
for atag in soup.select('a'):
    print(atag.text)

Elsie
Lacie
Tillie


In [47]:
soup.select('a')[0].text

'Elsie'

A css selector address:

```html
.lister-list > tr:nth-child(7) > td:nth-child(2) > a:nth-child(1)
```

# Let's scrape the imdb Top 250

In [51]:
# 1. importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

# 2. find url and store it in avariable
url = "https://www.imdb.com/chart/top"

# 3. download html with a get request
response = requests.get(url)

In [52]:
response.status_code

200

In [54]:
soup = BeautifulSoup(response.content, 'html.parser')

In [57]:
soup.select(".lister-list > tr:nth-child(1) > td:nth-child(2) > a:nth-child(1)")

[<a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">Die Verurteilten</a>]

In [60]:
soup.select("td.titleColumn a")

[<a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">Die Verurteilten</a>,
 <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">Der Pate</a>,
 <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">Der Pate 2</a>,
 <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>,
 <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">Die zwölf Geschworenen</a>,
 <a href="/title/tt0108052/" title="Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes">Schindlers Liste</a>,
 <a href="/title/tt0167260/" title="Peter Jackson (dir.), Elijah Wood, Viggo Mortensen">Der Herr der Ringe: Die Rückkehr des Königs</a>,
 <a href="/title/tt0110912/" title="Quentin Tarantino (dir.), John Travolta, Uma Thurman">Pulp Fiction</a>,
 <a href="/title/tt0060196/" title="Sergio Leone (dir.), Clint Eastwood, Eli Wallach">Zwei glorre

In [63]:
# Movie title
soup.select("td.titleColumn a")[0].text

'Die Verurteilten'

In [64]:
# Actors / actresses and dir' stored in 'title' attribute
soup.select("td.titleColumn a")[0]['title']

'Frank Darabont (dir.), Tim Robbins, Morgan Freeman'

In [67]:
soup.select("td.titleColumn span.secondaryInfo")[0].text

'(1994)'

## Now make it work for all the movies

In [69]:
# movie_lst = soup.select("td.titleColumn a")
# yr_lst = soup.select("td.titleColumn span.secondaryInfo")

In [73]:
from tqdm.notebook import tqdm

title = []
dir_stars = []
year = []

len_movies = len(movie_lst)

for i in tqdm(range(len_movies)):
    title.append(soup.select("td.titleColumn a")[i].text)
    dir_stars.append(soup.select("td.titleColumn a")[i]['title'])
    year.append(soup.select("td.titleColumn span.secondaryInfo")[i].text)

HBox(children=(FloatProgress(value=0.0, max=250.0), HTML(value='')))




In [74]:
title

['Die Verurteilten',
 'Der Pate',
 'Der Pate 2',
 'The Dark Knight',
 'Die zwölf Geschworenen',
 'Schindlers Liste',
 'Der Herr der Ringe: Die Rückkehr des Königs',
 'Pulp Fiction',
 'Zwei glorreiche Halunken',
 'Der Herr der Ringe: Die Gefährten',
 'Fight Club',
 'Forrest Gump',
 'Inception',
 'Der Herr der Ringe: Die zwei Türme',
 'Das Imperium schlägt zurück',
 'Matrix',
 'GoodFellas - Drei Jahrzehnte in der Mafia',
 'Einer flog über das Kuckucksnest',
 'Die sieben Samurai',
 'Sieben',
 'Das Leben ist schön',
 'City of God',
 'Das Schweigen der Lämmer',
 'Ist das Leben nicht schön?',
 'Der Soldat James Ryan',
 'Krieg der Sterne',
 'The Green Mile',
 'Chihiros Reise ins Zauberland',
 'Interstellar',
 'Parasite',
 'Léon: Der Profi',
 'Harakiri',
 'Der König der Löwen',
 'Die üblichen Verdächtigen',
 'Der Pianist',
 'Terminator 2: Tag der Abrechnung',
 'Zurück in die Zukunft',
 'American History X',
 'Moderne Zeiten',
 'Gladiator',
 'Psycho',
 'Departed: Unter Feinden',
 'Lichter der G

In [75]:
dir_stars

['Frank Darabont (dir.), Tim Robbins, Morgan Freeman',
 'Francis Ford Coppola (dir.), Marlon Brando, Al Pacino',
 'Francis Ford Coppola (dir.), Al Pacino, Robert De Niro',
 'Christopher Nolan (dir.), Christian Bale, Heath Ledger',
 'Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb',
 'Steven Spielberg (dir.), Liam Neeson, Ralph Fiennes',
 'Peter Jackson (dir.), Elijah Wood, Viggo Mortensen',
 'Quentin Tarantino (dir.), John Travolta, Uma Thurman',
 'Sergio Leone (dir.), Clint Eastwood, Eli Wallach',
 'Peter Jackson (dir.), Elijah Wood, Ian McKellen',
 'David Fincher (dir.), Brad Pitt, Edward Norton',
 'Robert Zemeckis (dir.), Tom Hanks, Robin Wright',
 'Christopher Nolan (dir.), Leonardo DiCaprio, Joseph Gordon-Levitt',
 'Peter Jackson (dir.), Elijah Wood, Ian McKellen',
 'Irvin Kershner (dir.), Mark Hamill, Harrison Ford',
 'Lana Wachowski (dir.), Keanu Reeves, Laurence Fishburne',
 'Martin Scorsese (dir.), Robert De Niro, Ray Liotta',
 'Milos Forman (dir.), Jack Nicholson, Louise Fletch

In [76]:
year

['(1994)',
 '(1972)',
 '(1974)',
 '(2008)',
 '(1957)',
 '(1993)',
 '(2003)',
 '(1994)',
 '(1966)',
 '(2001)',
 '(1999)',
 '(1994)',
 '(2010)',
 '(2002)',
 '(1980)',
 '(1999)',
 '(1990)',
 '(1975)',
 '(1954)',
 '(1995)',
 '(1997)',
 '(2002)',
 '(1991)',
 '(1946)',
 '(1998)',
 '(1977)',
 '(1999)',
 '(2001)',
 '(2014)',
 '(2019)',
 '(1994)',
 '(1962)',
 '(1994)',
 '(1995)',
 '(2002)',
 '(1991)',
 '(1985)',
 '(1998)',
 '(1936)',
 '(2000)',
 '(1960)',
 '(2006)',
 '(1931)',
 '(2014)',
 '(2011)',
 '(1988)',
 '(2006)',
 '(1968)',
 '(1942)',
 '(1988)',
 '(1954)',
 '(1979)',
 '(1979)',
 '(2000)',
 '(1940)',
 '(1981)',
 '(2012)',
 '(2006)',
 '(1957)',
 '(2008)',
 '(2020)',
 '(2019)',
 '(1980)',
 '(2018)',
 '(1950)',
 '(1957)',
 '(2018)',
 '(2003)',
 '(1997)',
 '(1964)',
 '(2012)',
 '(1984)',
 '(2016)',
 '(2017)',
 '(1986)',
 '(2019)',
 '(2018)',
 '(1999)',
 '(1995)',
 '(1963)',
 '(1995)',
 '(1981)',
 '(2009)',
 '(1984)',
 '(2009)',
 '(1997)',
 '(1983)',
 '(2007)',
 '(1992)',
 '(1968)',
 '(2000)',

In [80]:
# clean the year column
year_clean = [yr.strip(')').strip('(') for yr in year]

In [81]:
year_clean

['1994',
 '1972',
 '1974',
 '2008',
 '1957',
 '1993',
 '2003',
 '1994',
 '1966',
 '2001',
 '1999',
 '1994',
 '2010',
 '2002',
 '1980',
 '1999',
 '1990',
 '1975',
 '1954',
 '1995',
 '1997',
 '2002',
 '1991',
 '1946',
 '1998',
 '1977',
 '1999',
 '2001',
 '2014',
 '2019',
 '1994',
 '1962',
 '1994',
 '1995',
 '2002',
 '1991',
 '1985',
 '1998',
 '1936',
 '2000',
 '1960',
 '2006',
 '1931',
 '2014',
 '2011',
 '1988',
 '2006',
 '1968',
 '1942',
 '1988',
 '1954',
 '1979',
 '1979',
 '2000',
 '1940',
 '1981',
 '2012',
 '2006',
 '1957',
 '2008',
 '2020',
 '2019',
 '1980',
 '2018',
 '1950',
 '1957',
 '2018',
 '2003',
 '1997',
 '1964',
 '2012',
 '1984',
 '2016',
 '2017',
 '1986',
 '2019',
 '2018',
 '1999',
 '1995',
 '1963',
 '1995',
 '1981',
 '2009',
 '1984',
 '2009',
 '1997',
 '1983',
 '2007',
 '1992',
 '1968',
 '2000',
 '2012',
 '1958',
 '1931',
 '2004',
 '1941',
 '2016',
 '1985',
 '1921',
 '1952',
 '1948',
 '1987',
 '1952',
 '2000',
 '1959',
 '1983',
 '1971',
 '2020',
 '2019',
 '2010',
 '1976',
 

In [86]:
# we're going to split the dir_star list into two actor lists
# and one director list
actor1 = []
actor2 = []
dir1 = []

for dir_star in dir_stars:
    temporary_people = dir_star.split(",")  # only three
    
    # now append the individual people to their corresponding lists
    # (already cleaned!)
    dir1.append(temporary_people[0].replace(' (dir.)', ''))
    actor1.append(temporary_people[1][1:])
    actor2.append(temporary_people[2][1:])

In [87]:
# create the resulting dataframe
movies = pd.DataFrame({'year': year_clean,
                       'title': title,
                       'dir': dir1,
                       'actor1': actor1,
                       'actor2': actor2,
                      })

In [88]:
movies

Unnamed: 0,year,title,dir,actor1,actor2
0,1994,Die Verurteilten,Frank Darabont,Tim Robbins,Morgan Freeman
1,1972,Der Pate,Francis Ford Coppola,Marlon Brando,Al Pacino
2,1974,Der Pate 2,Francis Ford Coppola,Al Pacino,Robert De Niro
3,2008,The Dark Knight,Christopher Nolan,Christian Bale,Heath Ledger
4,1957,Die zwölf Geschworenen,Sidney Lumet,Henry Fonda,Lee J. Cobb
...,...,...,...,...,...
245,2016,Koe no katachi,Naoko Yamada,Miyu Irino,Saori Hayami
246,1966,Schlacht um Algier,Gillo Pontecorvo,Brahim Hadjadj,Jean Martin
247,1997,Neon Genesis Evangelion - The End of Evangelion,Hideaki Anno,Megumi Ogata,Megumi Hayashibara
248,1994,Drei Farben - Rot,Krzysztof Kieslowski,Irène Jacob,Jean-Louis Trintignant


# Sraping many pages

In [5]:
url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5'

In [6]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [7]:
response = requests.get(url)
response.status_code

200

In [8]:
response.content

b'\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>Feature Film,\nReleased between 1990-01-01 and 1992-12-31,\nUser Rating of 7.5\n(Sorted by Popularity Ascending) - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n     

In [9]:
# html 
soup = BeautifulSoup(response.content, 'html.parser')
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Feature Film,
Released between 1990-01-01 and 1992-12-31,
User Rating of 7.5
(Sorted by Popularity Ascending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
      u

In [19]:
# define a start variable that goes into the url
iterations = range(1,501, 50)
#s_var = 101
f_url = [f"https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start={i}&ref_=adv_nxt" for i in iterations]

In [15]:
for i in iterations:
    print(i)

1
51
101
151
201
251
301
351
401
451


In [20]:
f_url = [f"https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start={i}&ref_=adv_nxt" for i in iterations]

# Respectful scraping!

In [None]:
from time import sleep 
for i in range (5):
    print(i)
    wait_time = randint(1,4)
    print("I will sleep for" + str(wait_time)+ "seconds")
    sleep(wait_time)

In [None]:
pages.append(reponse)
wait_time

In [22]:
from time import sleep 
for i in range (5):
    print(i)
    wait_time = randint(1,4)
    print("I will sleep for" + str(wait_time)+ "seconds")
    sleep(wait_time)

0


NameError: name 'randint' is not defined

In [23]:
pages = []
for i in iterations:
    start_at = str(i)
    url = f"https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start={start_at}&ref_=adv_nxt"
#download the url
    response = requests.get(url)
# print the status code
    print("Status code: " + str(response.status_code))
    pages.append(response)
    wait_time = randint(0.300, 2)
    sleep(wait_time)

Status code: 200


NameError: name 'randint' is not defined

# Put everything together 

In [None]:
pages = []

for i in iterations:
    start_at = str(i)
    url = f_url = f"https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start={i}&ref_=adv_nxt"
     #download the url
    response = requests.gets    