In [2]:
from bs4 import BeautifulSoup
from datetime import datetime
import json
import pandas as pd
import requests

In [73]:
# Common prefix of every IMDb title page; a title id from `url_list` is appended to it
base_url = 'https://www.imdb.com/title/'

# IMDb title ids of the series to scrape (the trailing comment names each show)
url_list = [
    'tt0417299',   # Avatar: The Last Airbender
    'tt1695360',   # The Legend Of Korra
    'tt1865718',   # Gravity Falls
    'tt8050756',   # The Owl House
    'tt8050740',   # Amphibia
    'tt2758770',   # Star Vs. The Forces Of Evil
    'tt3061046',   # Steven Universe
    'tt13714610',  # Steven Universe Future
    'tt4163486',   # Bee and PuppyCat
    'tt7745956',   # She-Ra And The Princesses Of Power
    'tt10482560',  # Kipo And The Age Of Wonderbeasts
    'tt3718778',   # Over The Garden Wall
    'tt11126994',  # Arcane: League of Legends
    'tt10384610',  # Scissor Seven
    'tt3398228',   # Bojack Horseman
    'tt9248538',   # Kid Cosmic
    'tt9894516',   # Oni: Thunder God's Tale
    'tt8688814',   # The Dragon Prince
    'tt0108783',   # Gargoyles
    'tt0278238',   # Samurai Jack
    'tt1942683',   # The Amazing World of Gumball
    'tt1710308',   # Regular Show
    'tt0088500',   # Count Duckula
    'tt9561862',   # Love, Death & Robots
]

# Three independent, initially empty accumulators: per-show auxiliary data,
# user reviews, and per-episode data. Each scraped show is appended to them.
global_aux_df, global_reviews_df, global_main_df = (pd.DataFrame() for _ in range(3))

# Browser-like User-Agent sent with every request made by this cell
# (main page, season pages, and every reviews request, including pagination).
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}


def _parse_vote_count(rating_text):
    """Return the vote count found between parentheses in an episode rating label.

    Assumes the label looks like '7.9/10 (5.8K)' -- TODO confirm against the live page.
    Handles a 'K' or 'M' suffix as well as plain counts such as '(853)', which the
    previous slicing-based parser inflated a thousandfold.

    Raises IndexError if the text has no '(' and ValueError if the count is not numeric.
    """
    raw_count = rating_text.split('(')[1].split(')')[0].strip()
    multipliers = {'K': 1000, 'M': 1000000}
    suffix = raw_count[-1:].upper()
    if suffix in multipliers:
        # round() guards against float artefacts such as 5.8 * 1000 -> 5799.999...
        return int(round(float(raw_count[:-1]) * multipliers[suffix]))
    return int(raw_count.replace(',', ''))


def _parse_episode_number(episode_label):
    """Return the episode number from a label such as 'S1.E12' or 'S10.E3'.

    Takes everything after the last 'E', so it does not depend on how many
    digits the season number has.
    """
    return int(episode_label.rsplit('E', 1)[1])


for url_index, url in enumerate(url_list):

    # Get main page HTML information
    main_page = requests.get(base_url + url, headers=REQUEST_HEADERS)
    # Fail with a clear HTTP error here instead of an obscure IndexError on a missing tag below
    main_page.raise_for_status()
    main_soup = BeautifulSoup(main_page.content, 'html.parser')

    # JSON embedded in the main page; parsed once and reused for the season
    # fallback and for the auxiliary data
    data_json = main_soup.find_all('script', id="__NEXT_DATA__")[0]
    page_props = json.loads(data_json.text)['props']['pageProps']

    # Number of Episodes (not used further down in this cell)
    n_episodes = main_soup.find_all('span', class_='ipc-title__subtext')[0].text

    # Number of Seasons
    seasons_html = main_soup.find('label', class_='ipc-simple-select__label')

    # Fix issue with Over the Garden Wall number of seasons:
    # when the season selector label is missing, fall back to the embedded JSON
    try:
        n_seasons = int(seasons_html.text.split(' ')[0])
    except AttributeError as attribute_error:
        print('\nEncountered the following issue for Number of Seasons:', attribute_error, '\n')
        n_seasons = page_props['mainColumnData']['episodes']['seasons'][0]['number']

    # List each season's URL
    seasons_urls = [base_url + url + '/episodes?season={}'.format(n + 1) for n in range(n_seasons)]

    ######################################################################################################
    ####                                 AUXILIARY DATA
    ######################################################################################################

    auxiliary_data = {}

    # General show information from the embedded JSON
    data_attributes = page_props['aboveTheFoldData']

    # Original Title
    original_title = data_attributes['originalTitleText']['text']
    print(original_title)
    auxiliary_data['Original Show Title'] = original_title

    print('Processing series', str(url_index + 1), 'of', str(len(url_list)) + ':', original_title)

    # Overall Rating
    overall_rating = data_attributes['ratingsSummary']['aggregateRating']
    auxiliary_data['Overall Rating'] = overall_rating

    # Overall Votes
    overall_votes = data_attributes['ratingsSummary']['voteCount']
    auxiliary_data['Total Votes'] = overall_votes

    # Episode Runtime (seconds converted to whole minutes)
    runtime = str(int(data_attributes['runtime']['seconds'] / 60)) + 'min.'
    auxiliary_data['Runtime'] = runtime

    # Series Genres (wrapped in a list so the whole list lands in a single cell)
    genres = [item['text'] for item in data_attributes['genres']['genres']]
    auxiliary_data['Genres'] = [genres]

    # Country of Origin
    country_of_origin = data_attributes['countriesOfOrigin']['countries'][0]['id']
    auxiliary_data['Country of Origin'] = country_of_origin

    # Air Dates (Years)
    start_year = data_attributes['releaseYear']['year']
    end_year = data_attributes['releaseYear']['endYear']
    auxiliary_data['Start Year'] = start_year
    auxiliary_data['End Year'] = end_year

    # Poster Image
    poster_image = data_attributes['primaryImage']['url']
    auxiliary_data['Poster Image'] = poster_image

    # Credits (Creators//Production Company/Writers)
    # Fix issue with Arcane missing show creators
    try:
        show_creators = [credit['name']['nameText']['text'] for credit in data_attributes['creatorsPageTitle'][0]['credits']]
    except IndexError as index_error:
        print('\nEncountered the following issue in', original_title, 'for Show Creators:', index_error, '\n')
        show_creators = ['Not Found']

    production_company = data_attributes['production']['edges'][0]['node']['company']['companyText']['text']
    auxiliary_data['Show Creators'] = [show_creators]
    auxiliary_data['Production Company'] = production_company

    # Total Amount of Reviews
    n_reviews = data_attributes['reviews']['total']
    auxiliary_data['Total Reviews'] = n_reviews

    # Plot
    plot_text = data_attributes['plot']['plotText']['plainText']
    auxiliary_data['Plot'] = plot_text

    # Save current series data (one row per show)
    auxiliary_df = pd.DataFrame.from_dict(auxiliary_data)

    # Save data to global DataFrame
    global_aux_df = pd.concat([global_aux_df, auxiliary_df], axis=0)

    ######################################################################################################
    ####                                 REVIEWS DATA
    ######################################################################################################

    start_url = base_url + url + '/reviews'
    link = base_url + url + '/reviews/_ajax'

    params = {
            'ref_': 'undefined',
            'paginationKey': ''
            }

    reviews = {
            'Original Show Title': [],
            'Review Title': [],
            'Review Content': [],
            'Review Date': []
            }

    with requests.Session() as session:
        # Set the User-Agent on the session so the paginated "_ajax" requests carry it too;
        # previously only the first reviews request sent it
        session.headers.update(REQUEST_HEADERS)
        result = session.get(start_url)

        while True:

            # Get reviews page HTML information
            soup = BeautifulSoup(result.text, 'html.parser')
            for item in soup.select('.review-container'):

                # Review Title, Date, and Content
                review_title = item.find_all('a', class_='title')[0].text.strip()
                review_content = item.find_all('div', class_='text show-more__control')[0].text.strip().replace("n\\'t", "n't")
                review_date = item.find_all('span', class_='review-date')[0].text

                # Save data
                reviews['Original Show Title'].append(original_title)
                reviews['Review Title'].append(review_title)
                reviews['Review Content'].append(review_content)
                reviews['Review Date'].append(review_date)

            # Load more reviews into the page; stop when there is no "load more" key left
            load_more = soup.select_one('.load-more-data[data-key]')
            if load_more is None:
                break
            params['paginationKey'] = load_more.get('data-key')
            result = session.get(link, params=params)

    # Save current series data
    reviews_df = pd.DataFrame(data=reviews)

    # Save data to global DataFrame
    global_reviews_df = pd.concat([global_reviews_df, reviews_df], axis=0)

    ######################################################################################################
    ####                                 MAIN DATA
    ######################################################################################################

    # Per-season frames are collected here and concatenated once after the loop
    season_frames = []

    # Season numbers come from the position in `seasons_urls` (built for seasons 1..n_seasons),
    # not from the last character of the URL, which is wrong from season 10 onwards
    for season_num, season in enumerate(seasons_urls, start=1):

        # Get current season page HTML information
        season_page = requests.get(season, headers=REQUEST_HEADERS)
        season_soup = BeautifulSoup(season_page.content, 'html.parser')

        # Episode links. The filter receives None for anchors without an href,
        # so that case is rejected explicitly instead of raising TypeError.
        episode_title_html = season_soup.find_all('a', href=lambda x: x is not None and '?ref_=ttep_ep' in x)

        # Only every second matching anchor (odd positions) is used, as before.
        # Assumes its text looks like 'S1.E1 ∙ Title' -- TODO confirm against the live page.
        episode_labels = [block.text.split(' ∙ ', 1) for block in episode_title_html[1::2]]

        # Episode Title
        episodes_title = [label_parts[1] for label_parts in episode_labels]

        # Episode Number
        episodes_number = [_parse_episode_number(label_parts[0]) for label_parts in episode_labels]

        # Episode Rating
        episode_rating_html = season_soup.find_all('span', class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating')
        episodes_rating = [round(float(block.text[:3]), 1) for block in episode_rating_html]

        # Episode Votes
        episodes_votes = [_parse_vote_count(block.text) for block in episode_rating_html]

        # Episode Air Date
        episode_airdate_html = season_soup.find_all('span', class_='sc-9115db22-10 jAfkDE')

        # Fix issue with Gargoyles and The Amazing World of Gumball air dates:
        # short texts (under 16 characters) are treated as 'Mon YYYY' with the day missing
        episodes_airdate_text = []
        for b, block in enumerate(episode_airdate_html):
            if len(block.text) < 16:
                print('\nEcountered an issue in', original_title, 'for S' + str(season_num) + '.E' + str(episodes_number[b]), 'Air Date:', block.text, '\n')
                month = block.text.split(' ')[0]
                if episodes_airdate_text:
                    # Heuristic kept from the original: the day after the previous episode's day
                    day = str(int(episodes_airdate_text[-1][3:6].strip()) + 1)
                else:
                    # First episode of the season: there is no previous date to derive the day from
                    day = '1'
                year = block.text.split(' ')[1]
                reconstructed_date = ' '.join([month, day, year])
                episodes_airdate_text.append(reconstructed_date)
            else:
                # Drop the leading 5 characters and the commas so '%b %d %Y' can parse the rest
                episodes_airdate_text.append(block.text[5:].replace(',', ''))
        episodes_airdate_date = [datetime.strptime(text, '%b %d %Y').strftime('%d/%m/%Y') for text in episodes_airdate_text]

        # Episode Description
        episode_description_html = season_soup.find_all('div', class_='ipc-html-content-inner-div')
        episodes_description = [block.text for block in episode_description_html]

        # Remove "Episode 0" from season 1 of ATLA
        if original_title == 'Avatar: The Last Airbender' and season_num == 1:
            episodes_number = episodes_number[1:]
            episodes_title = episodes_title[1:]
            episodes_rating = episodes_rating[1:]
            episodes_votes = episodes_votes[1:]
            episodes_airdate_date = episodes_airdate_date[1:]
            episodes_description = episodes_description[1:]

        # Save current series data
        data = {
                'Original Show Title': original_title,
                'Season': season_num,
                'Episode Number': episodes_number,
                'Episode Title': episodes_title,
                'Rating': episodes_rating,
                'Votes': episodes_votes,
                'Air Date': episodes_airdate_date,
                'Description': episodes_description
                }

        # Fix issue with unaired season with unavailable data for The Amazing World of Gumball season 7 and Arcane season 2:
        # a ValueError from the DataFrame constructor stops processing this show's remaining seasons
        try:
            season_df = pd.DataFrame.from_dict(data)
        except ValueError as value_error:
            print('\nEncountered the following issue in', original_title, 'for season', str(season_num), 'data:', value_error, '\n')
            break

        season_frames.append(season_df)

    # pd.concat raises on an empty list, so fall back to an empty frame when no season was usable
    full_dataset_df = pd.concat(season_frames) if season_frames else pd.DataFrame()

    # Save data to global DataFrame
    global_main_df = pd.concat([global_main_df, full_dataset_df], axis=0)

# Write each accumulated DataFrame to its own Excel workbook
# (column headers included, row index omitted)
for workbook_name, frame in (
    ('aux_data.xlsx', global_aux_df),
    ('reviews_data.xlsx', global_reviews_df),
    ('main_data.xlsx', global_main_df),
):
    frame.to_excel(workbook_name, header=True, index=False)

# Show the per-episode data as the cell output
global_main_df

Avatar: The Last Airbender
Processing series 1 of 24: Avatar: The Last Airbender
The Legend of Korra
Processing series 2 of 24: The Legend of Korra
Gravity Falls
Processing series 3 of 24: Gravity Falls
The Owl House
Processing series 4 of 24: The Owl House
Amphibia
Processing series 5 of 24: Amphibia
Star vs. the Forces of Evil
Processing series 6 of 24: Star vs. the Forces of Evil
Steven Universe
Processing series 7 of 24: Steven Universe
Steven Universe Future
Processing series 8 of 24: Steven Universe Future
Bee and PuppyCat
Processing series 9 of 24: Bee and PuppyCat
She-Ra and the Princesses of Power
Processing series 10 of 24: She-Ra and the Princesses of Power
Kipo and the Age of Wonderbeasts
Processing series 11 of 24: Kipo and the Age of Wonderbeasts

Ecountered the following issue for Number of Seasons: 'NoneType' object has no attribute 'text' 

Over the Garden Wall
Processing series 12 of 24: Over the Garden Wall
Arcane: League of Legends
Processing series 13 of 24: Arcane

Unnamed: 0,Original Show Title,Season,Episode Number,Episode Title,Rating,Votes,Air Date,Description
0,Avatar: The Last Airbender,1,1.0,The Boy in the Iceberg,7.9,5800.0,21/02/2005,The legend of the Avatar is told. Katara and S...
1,Avatar: The Last Airbender,1,2.0,The Avatar Returns,8.1,5200.0,21/02/2005,Aang and Katara inadvertently set off a trap t...
2,Avatar: The Last Airbender,1,3.0,The Southern Air Temple,8.4,5100.0,25/02/2005,After his departure from the Southern Air Temp...
3,Avatar: The Last Airbender,1,4.0,The Warriors of Kyoshi,8.0,4900.0,04/03/2005,The gang arrives at Kyoshi Island and are capt...
4,Avatar: The Last Airbender,1,5.0,The King of Omashu,8.1,4900.0,18/03/2005,Aang's abilities as an airbender are challenge...
...,...,...,...,...,...,...,...,...
4,"Love, Death & Robots",3,5.0,Kill Team Kill,6.4,9700.0,20/05/2022,US Special Forces are trained to neutralize an...
5,"Love, Death & Robots",3,6.0,Swarm,6.9,9500.0,20/05/2022,Two human scientists study the secrets of an a...
6,"Love, Death & Robots",3,7.0,Mason's Rats,7.6,10000.0,20/05/2022,Welcome to the Ratpocalypse! Farmer Mason know...
7,"Love, Death & Robots",3,8.0,In Vaulted Halls Entombed,7.2,9300.0,20/05/2022,"Deep in the mountains of Afghanistan, a squad ..."
