pip install beautifulsoup4

Data Wrangling

In [2]:
import bs4
import pandas as pd
import requests

In [5]:
def numeric_value(movie, tag, class_=None, order=None):
    if order:
        if len(movie.findAll(tag, class_)) > 1:
            to_extract = movie.findAll(tag, class_)[order]['data-value']
        else:
            to_extract = None
    else:
        to_extract = movie.find(tag, class_)['data-value']

    return to_extract


def text_value(movie, tag, class_=None):
    if movie.find(tag, class_):
        return movie.find(tag, class_).text
    else:
        return


def nested_text_value(movie, tag_1, class_1, tag_2, class_2, order=None):
    if not order:
        return movie.find(tag_1, class_1).find(tag_2, class_2).text
    else:
        return [val.text for val in movie.find(tag_1, class_1).findAll(tag_2, class_2)[order]]


def extract_attribute(soup, tag_1, class_1='', tag_2='', class_2='',
                      text_attribute=True, order=None, nested=False):
    movies = soup.findAll('div', class_='lister-item-content')
    data_list = []
    for movie in movies:
        if text_attribute:
            if nested:
                data_list.append(nested_text_value(movie, tag_1, class_1, tag_2, class_2, order))
            else:
                data_list.append(text_value(movie, tag_1, class_1))
        else:
            data_list.append(numeric_value(movie, tag_1, class_1, order))

    return data_list

In [7]:
url = 'https://www.imdb.com/search/title/?count=100&groups=top_1000&sort=user_rating'
def get_page_contents(url):
    page = requests.get(url, headers={"Accept-Language": "en-US"})
    return bs4.BeautifulSoup(page.text, "html.parser")

soup = get_page_contents(url)

In [8]:
titles = extract_attribute(soup, 'a')
release = extract_attribute(soup, 'span', 'lister-item-year text-muted unbold')
audience_rating = extract_attribute(soup, 'span', 'certificate')
runtime = extract_attribute(soup, 'span', 'runtime')
genre = extract_attribute(soup, 'span', 'genre')
imdb_rating = extract_attribute(soup, 'div', 'inline-block ratings-imdb-rating', False)
votes = extract_attribute(soup, 'span' , {'name' : 'nv'}, False, 0)
earnings = extract_attribute(soup, 'span' , {'name' : 'nv'}, False, 1)
directors = extract_attribute(soup, 'p', '', 'a', '', True, 0, True)
actors = extract_attribute(soup, 'p', '', 'a', '', True, slice(1, 5, None), True)


df_dict = {'Title': titles, 'Relase': release, 'Audience Rating': audience_rating,
           'Runtime': runtime, 'Genre': genre, 'IMDB Rating': imdb_rating,
           'Votes': votes, 'Box Office Earnings': earnings, 'Director': directors,
           'Actors': actors}
df = pd.DataFrame(df_dict)
df

Unnamed: 0,Title,Relase,Audience Rating,Runtime,Genre,IMDB Rating,Votes,Box Office Earnings,Director,Actors
0,The Shawshank Redemption,(1994),R,142 min,\nDrama,\n\n9.3\n,2200425,2200425,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will..."
1,The Godfather,(1972),R,175 min,"\nCrime, Drama",\n\n9.2\n,1516103,1516103,Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan, Diane K..."
2,The Dark Knight,(2008),PG-13,152 min,"\nAction, Crime, Drama",\n\n9.0\n,2182036,2182036,Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart, ..."
3,The Godfather: Part II,(1974),R,202 min,"\nCrime, Drama",\n\n9.0\n,1062090,1062090,Francis Ford Coppola,"[Al Pacino, Robert De Niro, Robert Duvall, Dia..."
4,The Lord of the Rings: The Return of the King,(2003),PG-13,201 min,"\nAdventure, Drama, Fantasy",\n\n8.9\n,1564021,1564021,Peter Jackson,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O..."
...,...,...,...,...,...,...,...,...,...,...
95,Good Will Hunting,(1997),R,126 min,"\nDrama, Romance",\n\n8.3\n,807984,807984,Gus Van Sant,"[Robin Williams, Matt Damon, Ben Affleck, Stel..."
96,Children of Heaven,(1997),PG,89 min,"\nDrama, Family, Sport",\n\n8.3\n,58439,58439,Majid Majidi,"[Mohammad Amir Naji, Amir Farrokh Hashemian, B..."
97,The Bandit,(1996),TV-MA,128 min,"\nCrime, Drama, Thriller",\n\n8.3\n,58915,58915,Yavuz Turgul,"[Sener Sen, Ugur Yücel, Sermin Hürmeriç, Yesim..."
98,Toy Story,(1995),G,81 min,"\nAnimation, Adventure, Comedy",\n\n8.3\n,839929,839929,John Lasseter,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney]"
