# IMDB Web Scraping

In [1]:
import requests
import re
from bs4 import BeautifulSoup

### Web Scrape IMDB Results Page

In [None]:
# results page - filters used year 2008 - 2017 (10 years), feature films, country - US, 
# language: English, popularity: ascending —> 100 titles per page

# features being extracted from movie results page
'''
- Movie title     - str
- MPAA rating     - list of str
- Metascore  /100 - float
- User Rating /10 - float
- Genre           - str
- Runtime in mins - int
- *Director       - str
- *Stars          - list of str
- US Gross        - int
- # Votes         - int
'''

In [2]:
url_results_pg = 'https://www.imdb.com/search/title?title_type=feature&release_date=2007-01-01,2017-12-31&countries=us&languages=en&count=100'
response1 = requests.get(url_results_pg)
page1 = response1.text
soup1 = BeautifulSoup(page1,'html.parser')  

In [3]:
# movie fitlered results page
main_section = soup1.find(id='main')
single_movie_section = main_section.select(".article .lister-item-content")[0]

In [4]:
# serial number
sno = single_movie_section.find("span", class_="lister-item-index unbold text-primary")
sno_clean = sno.get_text().strip()
sno_clean = int(sno_clean.replace(',', '').replace('.', ''))
print(sno_clean)

# title
title = single_movie_section.find("h3", class_="lister-item-header").find("a").get_text().strip()
print(title)

# mpaa rating
mpaa_rating = single_movie_section.find("span", class_="certificate").get_text().strip()
print(mpaa_rating)

# metascore
metascore = single_movie_section.find("span", class_="metascore").get_text().strip()
metascore_clean = int(metascore.replace(',', '').replace('.', ''))
print(metascore_clean)

# user rating
user_rating = single_movie_section.find("div", class_="inline-block ratings-imdb-rating")
user_rating_num = user_rating.find("strong").get_text().strip()
user_rating_num = float(user_rating_num.replace(',', ''))
print(user_rating_num)
 
# genre    
genre = single_movie_section.find("span", class_="genre").get_text().strip()
print(genre)

1
The Shape of Water
R
87
7.4
Adventure, Drama, Fantasy


In [5]:
# runtime
runtime = single_movie_section.find("span", class_="runtime").get_text().strip()
runtimeNum = int(runtime.split()[0].replace(',', '').replace('.', ''))
print(runtimeNum)

# Votes
votes = single_movie_section.find("p", class_="sort-num_votes-visible").get_text().strip()
votes_count = int(votes.split()[1].replace(',', ''))
print(votes_count)

# Box office gross
# us_box_gross = single_movie_section.find("p", class_="sort-num_votes-visible").split()[-1]
# print(us_box_gross)

# Director
director = single_movie_section.find('a', href=re.compile('adv_li_dr_0')).get_text().strip()
print(director)

# Stars
star1 = single_movie_section.find('a', href=re.compile('adv_li_st_0')).get_text().strip()
star2 = single_movie_section.find('a', href=re.compile('adv_li_st_1')).get_text().strip()
star3 = single_movie_section.find('a', href=re.compile('adv_li_st_2')).get_text().strip()
print(star1, star2, star3)

# Movie url
movie_url = single_movie_section.find('a', href=re.compile('/title/'))['href']   
print("Movie url: ", movie_url)


123
253005
Guillermo del Toro
Sally Hawkins Octavia Spencer Michael Shannon
Movie url:  /title/tt5580390/?ref_=adv_li_tt


In [6]:
# parsing through a single results page and gathering all features
# need to fix for None types in the .py file functions

for i in range(1, 101):
    single_movie_section = main_section.select(".article .lister-item-content")[i]
    sno = single_movie_section.find("span", class_="lister-item-index unbold text-primary")
    sno_clean = sno.get_text().strip()
    sno_clean = int(sno_clean.replace(',', '').replace('.', ''))
    print(sno_clean)
    
    # title
    title = single_movie_section.find("h3", class_="lister-item-header").find("a").get_text().strip()
    print(title)

    # mpaa rating
    mpaa_rating = single_movie_section.find("span", class_="certificate").get_text().strip()
    print(mpaa_rating)

    # metascore
    metascore = single_movie_section.find("span", class_="metascore").get_text().strip()
    metascore_clean = int(metascore.replace(',', '').replace('.', ''))
    print(metascore_clean)

    # user rating
    user_rating = single_movie_section.find("div", class_="inline-block ratings-imdb-rating")
    user_rating_num = user_rating.find("strong").get_text().strip()
    user_rating_num = float(user_rating_num.replace(',', ''))
    print(user_rating_num)
 
    # genre    
    genre = single_movie_section.find("span", class_="genre").get_text().strip()
    print(genre)

2
The Greatest Showman
PG
48
7.7
Biography, Drama, Musical
3
Scott Pilgrim vs. the World
PG-13
69
7.5
Action, Comedy, Fantasy
4
It
R
69
7.4
Drama, Horror, Thriller
5
Thor: Ragnarok
PG-13
74
7.9
Action, Adventure, Comedy
6
Justice League
PG-13
45
6.6
Action, Adventure, Fantasy
7
Blade Runner 2049
R
81
8.1
Drama, Mystery, Sci-Fi
8
Sicario
R
82
7.6
Action, Crime, Drama
9
Spider-Man: Homecoming
PG-13
73
7.5
Action, Adventure, Sci-Fi
10
Suicide Squad
PG-13
40
6.1
Action, Adventure, Fantasy
11
Murder on the Orient Express
PG-13
52
6.5
Crime, Drama, Mystery
12
The VVitch: A New-England Folktale
R
83
6.8
Horror, Mystery
13
X-Men: Apocalypse
PG-13
52
7.0
Action, Adventure, Sci-Fi
14
Hostiles
R
65
7.2
Action, Adventure, Drama
15
The Dark Knight
PG-13
84
9.0
Action, Crime, Drama
16
Star Wars: Episode VIII - The Last Jedi
PG-13
85
7.2
Action, Adventure, Fantasy
17
Jumanji: Welcome to the Jungle
PG-13
58
7.0
Action, Adventure, Comedy
18
Fantastic Beasts and Where to Find Them
PG-13
66
7.4
Adventure

AttributeError: 'NoneType' object has no attribute 'get_text'

### Web Scrape Individual Movie Page

In [None]:
# individual movie page
# features being extracted from individual movie page
'''
- Budget           - int
- Release date     - dt
- Writer 1         - str
- Writer 2         - str
'''

In [7]:
url_movie_pg = 'https://www.imdb.com/title/tt5580390/?ref_=adv_li_tt'
response2 = requests.get(url_movie_pg)
page2 = response2.text
soup2 = BeautifulSoup(page2,'html.parser')  

In [8]:
# individual movie page
movie_section = soup2.find(id="pagecontent")
mv = movie_section.select(".flatland")[0]

# budget
budget = mv.find(text=re.compile("Budget:")).next.strip()
print(budget)

# release dt
release_dt = ' '.join(mv.find(text=re.compile("Release Date:")).next.split()[:3])
print(release_dt)

# writers
writer1 = mv.find('a', href=re.compile('tt_ov_wr')).get_text().strip()
writer2 = mv.find_all('a', href=re.compile('tt_ov_wr'))[1].get_text().strip()
print(writer1, writer2)

$19,400,000
22 December 2017
Guillermo del Toro Vanessa Taylor
