In [74]:
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd

# Scraping information from the Top 250 Rated TV Shows list (IMDB)

In [75]:
# Download the Top 250 TV chart page and parse its HTML.
url = 'https://www.imdb.com/chart/toptv'
# Time out instead of hanging forever on a stalled connection, and stop on an HTTP
# error so that an error page is never parsed as if it were the chart.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [76]:
# Finding the right extraction method on the first data row (row 0 is the header):
# the text of its second cell, split on newlines, carries rank, title and year in positions 1-3
first_show_row = soup.find_all('tr')[1]
title_cell = first_show_row.find_all('td')[1]
title_cell.get_text().split('\n')[1:4]

['      1.', '      Planet Earth II', '(2016)']

In [77]:
# Count the chart rows, leaving out the header row. We expect 250; the count is also
# reused as the loop bound by the list comprehensions later on.
chart_rows = soup.find_all('tr')[1:]
num_rows = len(chart_rows)
num_rows

250

In [78]:
# Retrieve the rank, title and year for every show on the chart.
# The rows are looked up once and iterated directly: calling soup.find_all('tr') inside
# the comprehension would search the whole document again for every single row.
show_rows = soup.find_all('tr')[1:]  # skip the header row
rank_title_year = [
    # the second cell holds "rank / title / (year)" on separate lines
    row.find_all('td')[1].get_text().split('\n')[1:4]
    for row in show_rows
]

# preview of results
rank_title_year[:5]

[['      1.', '      Planet Earth II', '(2016)'],
 ['      2.', '      Planet Earth', '(2006)'],
 ['      3.', '      Band of Brothers', '(2001)'],
 ['      4.', '      Breaking Bad', '(2008)'],
 ['      5.', '      Chernobyl', '(2019)']]

In [79]:
# Build the dataframe from the scraped [rank, title, year] triples, one row per show
column_names = ['rank', 'title', 'release_year']
df = pd.DataFrame(data=rank_title_year, columns=column_names)

In [80]:
# Preview the raw scraped values; they are still uncleaned strings at this point
df.head()

Unnamed: 0,rank,title,release_year
0,1.0,Planet Earth II,(2016)
1,2.0,Planet Earth,(2006)
2,3.0,Band of Brothers,(2001)
3,4.0,Breaking Bad,(2008)
4,5.0,Chernobyl,(2019)


In [81]:
# Every replacement below targets a literal character. '.', '(' and ')' are regex
# metacharacters, so regex=False is passed explicitly instead of relying on the pandas
# default for `regex`, which is not the same in every pandas version.

# stripping whitespace and removing the '.' character from rank
df['rank'] = df['rank'].str.strip().str.replace('.', '', regex=False)

# stripping whitespace from title
df['title'] = df['title'].str.strip()

# stripping whitespace and removing the '(' and ')' characters from release_year
df['release_year'] = (
    df['release_year']
    .str.strip()
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

In [82]:
# Retrieve the rating for every show on the chart.
# The rows are looked up once and iterated directly: calling soup.find_all('tr') inside
# the comprehension would search the whole document again for every single row.
show_rows = soup.find_all('tr')[1:]  # skip the header row
rating = [
    # the third cell holds the rating, padded with newlines
    row.find_all('td')[2].get_text().replace('\n', '')
    for row in show_rows
]

# preview of results
rating[:5]

['9.5', '9.4', '9.4', '9.4', '9.4']

In [83]:
# Add the rating values to our dataframe as a new column. `rating` was collected by
# walking the chart rows in the same order as the rows of df, so the values line up.
# The ratings are still strings here; they are cast to float further down.
df['rating'] = rating

In [89]:
# Small sanity check that the ratings are aligned with the right shows: rank 218 should be
# the first show with a rating below 8.5 (8.4), so look at the rows around that position
df.iloc[215:219]

Unnamed: 0,rank,title,release_year,rating
215,216,Luther,2010,8.5
216,217,Feud,2017,8.5
217,218,Boku no hîrô akademia,2016,8.4
218,219,Naruto: Shippûden,2007,8.4


# Sub-requests for additional information (from each TV show's IMDb page)

In [63]:
# Lists which will hold the genres and the number of seasons of each show, in chart
# order, so they can be added to the dataframe as columns.
genres = []
num_seasons = []

# Look the chart rows up once (skipping the header row) and iterate over them directly.
# This follows however many rows the chart really has instead of assuming exactly 250,
# and avoids searching the whole document again on every iteration.
show_rows = soup.find_all('tr')[1:]

# A Session reuses the connection to imdb.com across the per-show requests.
with requests.Session() as session:
    for row in show_rows:
        # The first link in the title cell is the relative URL of the show's own IMDB page.
        show_link = row.find_all('td')[1].find_all('a', href=True)[0]['href']
        sub_url = 'https://www.imdb.com' + show_link

        # Time out rather than hang on a stalled request, and stop on an HTTP error
        # rather than parse an error page as if it were the show page.
        sub_r = session.get(sub_url, timeout=30)
        sub_r.raise_for_status()

        # a new soup object which parses the HTML of the show's IMDB page
        sub_soup = BeautifulSoup(sub_r.text, 'lxml')

        title_wrapper = sub_soup.find('div', {'class': 'title_wrapper'})
        seasons_nav = sub_soup.find('div', {'class': 'seasons-and-year-nav'})
        if title_wrapper is None or seasons_nav is None:
            # find() returns None when the div is absent; name the offending page
            # instead of failing later with an opaque AttributeError.
            raise ValueError('Expected layout not found on show page: ' + sub_url)

        # each genre is the text of an 'a' tag in this div, excluding the last one
        genre_links = title_wrapper.find_all('a', href=True)[:-1]
        genres.append([link.get_text() for link in genre_links])

        # the latest (or final) season number is the text of the first 'a' tag in this div
        num_seasons.append(seasons_nav.find_all('a')[0].get_text())

In [65]:
# Checking that we have 250 items in the genres list, i.e. one list of genres per show,
# so that it can be assigned to the dataframe as a column
len(genres)

250

In [66]:
# Checking that we have 250 items in the num_seasons list, i.e. one season count per show,
# so that it can be assigned to the dataframe as a column
len(num_seasons)

250

In [68]:
# Peek at the last five season counts; they come from getText(), so they are still strings
num_seasons[-5:]

['14', '1', '3', '1', '1']

In [90]:
# Peek at the genre lists of the last five shows; each show has a list of one or more genres
genres[-5:]

[['Comedy', 'Talk-Show'],
 ['Biography', 'Drama', 'History'],
 ['Crime', 'Drama'],
 ['Drama', 'Romance'],
 ['Drama']]

In [91]:
# Add the values scraped from the individual show pages as new columns.
# Each 'genre' cell holds a list of genre names and each 'num_seasons' cell holds the
# season count as a string; both columns are cleaned up further down.
df['genre'] = genres
df['num_seasons'] = num_seasons

# Cleaning the dataframe now that all information has been added

In [117]:
# Checking the first 5 rows now that the genre and num_seasons columns have been added
df.head()

Unnamed: 0,rank,title,release_year,rating,genre,num_seasons
0,1,Planet Earth II,2016,9.5,[Documentary],1
1,2,Planet Earth,2006,9.4,[Documentary],1
2,3,Band of Brothers,2001,9.4,"[Action, Drama, History]",1
3,4,Breaking Bad,2008,9.4,"[Crime, Drama, Thriller]",5
4,5,Chernobyl,2019,9.4,"[Drama, History, Thriller]",1


In [118]:
# Checking the last 5 rows as well; everything seems right
df.tail()

Unnamed: 0,rank,title,release_year,rating,genre,num_seasons
245,246,The Angry Video Game Nerd,2004,8.4,"[Comedy, Talk-Show]",14
246,247,Jesus of Nazareth,1977,8.4,"[Biography, Drama, History]",1
247,248,Happy Valley,2014,8.4,"[Crime, Drama]",3
248,249,Brideshead Revisited,1981,8.4,"[Drama, Romance]",1
249,250,House of Cards,1990,8.4,[Drama],1


In [126]:
# Cast the columns that hold numbers (scraped as text) to numeric types so they can be aggregated
numeric_dtypes = {
    'rank': int,
    'release_year': int,
    'rating': float,
    'num_seasons': int,
}
for column, dtype in numeric_dtypes.items():
    df[column] = df[column].astype(dtype)

In [127]:
# Setting the index of the dataframe to be the rank, this is going to make it look way better!
# The result is assigned back instead of using inplace=True, and the guard makes the cell
# safe to re-run: once 'rank' is the index it is no longer a column, so calling
# set_index('rank') a second time would raise a KeyError.
if df.index.name != 'rank':
    df = df.set_index('rank')

In [130]:
# Previewing the results now that rank is the index
df.head()

Unnamed: 0_level_0,title,release_year,rating,genre,num_seasons
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Planet Earth II,2016,9.5,[Documentary],1
2,Planet Earth,2006,9.4,[Documentary],1
3,Band of Brothers,2001,9.4,"[Action, Drama, History]",1
4,Breaking Bad,2008,9.4,"[Crime, Drama, Thriller]",5
5,Chernobyl,2019,9.4,"[Drama, History, Thriller]",1


### Cleaning up the genre column 

In [131]:
# Changing the genre column to string type so we can use the .str methods and do some manipulations on it.
# Each list is replaced by its printed form (for example "['Drama']"), which is the
# text the .str.contains() and equality checks in the next cell match against.
df['genre'] = df['genre'].astype(str)

In [250]:
# Collapse each show's list of genres into one label.
#
# The order of the rules matters. Every rule overwrites 'genre' with a plain label, and a
# plain label no longer contains the other genre names, so a show matched by an earlier
# keyword rule is not picked up again by a later one. Title-based overrides are placed
# either before a keyword rule (to keep a show out of that group) or after it (to add a
# show to the group).

# categorizing all animated shows as 'Animation'
df.loc[df.genre.str.contains('Animation'), 'genre'] = 'Animation'

# categorizing all documentaries as 'Documentary'
df.loc[df.genre.str.contains('Documentary'), 'genre'] = 'Documentary'

# categorizing all the talk-shows as 'Talk-Show'
df.loc[df.genre.str.contains('Talk-Show'), 'genre'] = 'Talk-Show'

# categorizing all the historical shows as 'History'
df.loc[df.genre.str.contains('History'), 'genre'] = 'History'
df.loc[df.title == 'Das Boot', 'genre'] = 'History'

# omitting The Handmaid's Tale from the sci-fi group and categorizing the rest as 'Sci-Fi'
df.loc[df.title == "The Handmaid's Tale", 'genre'] = 'Drama'
df.loc[df.genre.str.contains('Sci-Fi'), 'genre'] = 'Sci-Fi'
df.loc[df.title.str.contains('Battlestar'), 'genre'] = 'Sci-Fi'
df.loc[df.title.str.contains('Star Trek'), 'genre'] = 'Sci-Fi'

# omitting a few shows from the Crime Drama group and categorizing the rest as 'Crime Drama'
df.loc[df.title == 'The Boys', 'genre'] = 'Action'
df.loc[df.title == 'Daredevil', 'genre'] = 'Action'
df.loc[df.title == 'The Punisher', 'genre'] = 'Action'
df.loc[df.title == 'Hannibal', 'genre'] = 'Horror'
df.loc[df.title.str.contains('Spartacus'), 'genre'] = 'Action'
df.loc[df.genre.str.contains('Crime'), 'genre'] = 'Crime Drama'
df.loc[df.title.str.contains('Aranyélet'), 'genre'] = 'Crime Drama'

# categorizing all comedies as 'Comedy'
df.loc[df.genre.str.contains('Comedy'), 'genre'] = 'Comedy'

# categorizing all shows with only the drama tag as 'Drama'
df.loc[df.genre == "['Drama']", 'genre'] = 'Drama'

# categorizing The Haunting of Hill House as 'Horror'
df.loc[df.title == 'The Haunting of Hill House', 'genre'] = 'Horror'

# categorizing Mr. Rogers Neighborhood as a 'Family' show and fixing its title
df.loc[df.title == "MisteRogers' Neighborhood", 'genre'] = 'Family'
df.loc[df.title == "MisteRogers' Neighborhood", 'title'] = 'Mr. Rogers Neighborhood'

# categorizing Queer Eye as a 'Reality-TV' show
df.loc[df.title.str.contains('Queer'), 'genre'] = 'Reality-TV'

# categorizing the remaining shows as 'Drama', which is a tag that they all include.
# Anything that still contains '[' is an unprocessed list. The bracket is matched
# literally (regex=False) rather than with the pattern '\[', which is an invalid escape
# sequence in a normal string literal and triggers a warning in recent Python versions.
df.loc[df.genre.str.contains('[', regex=False), 'genre'] = 'Drama'

In [251]:
# Tally how many shows ended up under each genre label
df.genre.value_counts()

Comedy         55
Animation      52
Crime Drama    51
Drama          29
Documentary    27
History        12
Sci-Fi          9
Talk-Show       6
Action          5
Horror          2
Family          1
Reality-TV      1
Name: genre, dtype: int64

# Outputting the final dataframe to Excel

In [258]:
# Final look at the first 15 rows of the cleaned dataframe before exporting it
df.head(15)

Unnamed: 0_level_0,title,release_year,rating,genre,num_seasons
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Planet Earth II,2016,9.5,Documentary,1
2,Planet Earth,2006,9.4,Documentary,1
3,Band of Brothers,2001,9.4,History,1
4,Breaking Bad,2008,9.4,Crime Drama,5
5,Chernobyl,2019,9.4,History,1
6,Blue Planet II,2017,9.3,Documentary,1
7,The Wire,2002,9.3,Crime Drama,5
8,Game of Thrones,2011,9.3,Drama,8
9,Our Planet,2019,9.3,Documentary,1
10,Cosmos,2014,9.2,Documentary,1


In [253]:
# Write the cleaned dataframe to the sheet 'top_250' of imdb_top_250_tv.xlsx.
# The path is relative, so the file is created in the current working directory.
df.to_excel('imdb_top_250_tv.xlsx', sheet_name='top_250')