In [1]:
import json
import requests
import logging
import re
from datetime import datetime
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [2]:
class BaseFetcher():
    """
    Utility class to provide methods that allow fetching data from the web,
    logging procedures and converting fetched responses to BeautifulSoup
    objects for further parsing.

    Uses fake-useragent library to rotate headers to avoid being blocked.
    """
    # Seconds to wait for the server before giving up. Without a timeout
    # requests.get() may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30

    # Accepted (case-insensitive) level names for set_logger_level().
    _LEVELS = {
        'notset': logging.NOTSET,
        'debug': logging.DEBUG,
        'info': logging.INFO,
        'warning': logging.WARNING,
        'error': logging.ERROR,
        'critical': logging.CRITICAL,
    }

    # Lazily created UserAgent instance, shared by all requests of a fetcher.
    _user_agent = None

    def __init__(self, logging_level='info'):
        """
        Parameters
        ----------
        logging_level : str
            Name of the logging level to start with, see set_logger_level().
        """
        self._init_logger()
        self.set_logger_level(logging_level)

    def _init_logger(self):
        """
        Create (or reuse) the logger named after the concrete class.

        A stream handler is attached only when the logger has none yet, so
        creating several fetchers (or re-running a notebook cell) does not
        duplicate every log line.
        """
        self.logger = logging.getLogger(self.__class__.__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                fmt='%(asctime)s - %(name)s - %(levelname)s.%(funcName)s - %(message)s',
                datefmt='%Y-%m-%d %H:%M:%S'
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def _request(self, url):
        """
        Send a GET request to `url` with a randomly chosen User-Agent header
        and return the response. Gives up after REQUEST_TIMEOUT seconds.
        """
        # Build the UserAgent helper once; `.random` still yields a fresh
        # user-agent string on every access, so headers keep rotating.
        if self._user_agent is None:
            self._user_agent = UserAgent()

        headers = {'User-Agent': self._user_agent.random}
        return requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT)

    def _soup(self, r):
        """Parse the text of response `r` with BeautifulSoup's 'html.parser'."""
        return BeautifulSoup(r.text, 'html.parser')

    def _format_date(self, date_string, expected_format, desired_format='%Y-%m-%d'):
        """
        Re-format a date string.

        Parameters
        ----------
        date_string : str
            The date to convert.
        expected_format : str
            strptime format that `date_string` is written in.
        desired_format : str
            strftime format of the returned string.

        Raises
        ------
        ValueError
            If `date_string` does not match `expected_format`.
        """
        return datetime.strptime(date_string, expected_format).strftime(desired_format)

    def set_logger_level(self, level:str='info'):
        """
        Set the level of this fetcher's logger by name.

        `level` is matched case-insensitively against 'notset', 'debug',
        'info', 'warning', 'error' and 'critical'. Any other name falls back
        to logging.NOTSET and a warning is logged.
        """
        key = level.lower()
        if key in self._LEVELS:
            self.logger.setLevel(self._LEVELS[key])
            return

        self.logger.setLevel(logging.NOTSET)
        self.logger.warning(f'Unknown logging level {level!r}, falling back to NOTSET.')

    def request_and_soup(self, url):
        '''
        Fires request to given url and converts the content to BeautifulSoup for parsing.
        Throw HTTPError if page was not found, to be dealt with on the client side.

        Returns the parsed page when the status code is 200. For any other
        status a warning is logged and raise_for_status() is called, which
        raises for 4xx/5xx responses; for the remaining non-200 statuses
        None is returned.
        '''
        r = self._request(url)
        if r.status_code == requests.codes.ok:
            self.logger.debug(f'GET request to {url} successful.')
            return self._soup(r)

        self.logger.warning(f'GET request to {url} failed. Status Code: {r.status_code}')
        r.raise_for_status()
        return None

In [3]:
class IMDBFetcher(BaseFetcher):
    """
    Fetcher that scrapes title pages and per-season episode pages from IMDb.
    """
    def __init__(self, logging_level='info'):
        """
        Parameters
        ----------
        logging_level : str
            Logging level name forwarded to the base class.
        """
        super().__init__(logging_level)
        self.imdb = 'https://www.imdb.com'

    def by_id(self, id, fetch_episodes=False):
        """
        Fetch data for a single title.

        Parameters
        ----------
        id : str
            Title identifier as used in the IMDb url, e.g. 'tt5180504'.
        fetch_episodes : bool
            When True, the episodes of every season are fetched as well and
            stored under the 'episodes' key.

        Returns
        -------
        dict
            Keys 'title' and 'rating' (each None if it could not be
            extracted) and, if requested, 'episodes' (a list of dicts).

        Raises
        ------
        requests.HTTPError
            Propagated from request_and_soup() for 4xx/5xx responses.
        """
        endpoint = f'{self.imdb}/title/{id}/'
        soup = self.request_and_soup(endpoint)

        results = {
            'title': self._get_title(soup),
            'rating': self._get_rating(soup)
        }

        if fetch_episodes:
            results['episodes'] = self._get_episodes(id)

        return results

    def _get_title(self, soup):
        """
        Extract the original title from a title page.

        Best effort: any failure is logged as a warning and None is returned.
        """
        try:
            result = soup.select("div[class*='OriginalTitle']")[0].text.replace('Original title: ', '')
        except Exception as e:
            self.logger.warning(f'Could not extract original title. Error: {e}')
            result = None

        return result

    def _get_rating(self, soup):
        """
        Extract the aggregate rating from a title page as a float.

        Best effort: any failure is logged as a warning and None is returned.
        """
        try:
            # The second matching element is used.
            # NOTE(review): presumably the first match is a duplicate of the
            # rating widget - confirm against the current page layout.
            result = float(soup.select("div[class*='AggregateRatingButton']")[1].text.replace('/10', ''))
        except Exception as e:
            self.logger.warning(f'Could not extract score. Error: {e}')
            result = None

        return result

    def _get_episodes(self, id, results=None, current_season=1, latest_season=None):
        """
        Collect the episodes of all seasons from `current_season` up to and
        including the latest season offered by the page's season selector.

        Parameters
        ----------
        id : str
            Title identifier as used in the IMDb url.
        results : list or None
            List to append the episode dicts to; a new list is created when
            None.
        current_season : int
            First season to fetch.
        latest_season : int or None
            Last season to fetch. When None it is read from the first
            fetched page; it is refreshed from every fetched page.

        Returns
        -------
        list of dict
            One dict per episode, see _parse_episode().

        Raises
        ------
        requests.HTTPError
            Propagated from request_and_soup() for 4xx/5xx responses.
        IndexError, ValueError
            If a page does not contain the expected elements or values.
        """
        if results is None:
            results = []

        # The latest season is unknown until the first page has been read.
        # The bound is inclusive so that the latest season itself is fetched.
        while latest_season is None or current_season <= latest_season:
            endpoint = f'{self.imdb}/title/{id}/episodes?season={current_season}'
            soup = self.request_and_soup(endpoint)

            # Get selectable seasons
            season_options = soup.select("select[id='bySeason']")[0].find_all('option')
            seasons = [int(option.text.strip()) for option in season_options]

            for episode in soup.select('div[class="list detail eplist"] > div'):
                results.append(self._parse_episode(episode, current_season))

            # max() instead of the last entry: does not rely on option order.
            latest_season = max(seasons)
            current_season += 1

        return results

    def _parse_episode(self, episode, season):
        """Convert one episode element of a season page into a dict."""
        airdate_text = episode.select('div[class="airdate"]')[0].text.strip()
        votes_text = episode.select('span[class*="star__total-votes"]')[0].text.strip()

        return {
            'season': season,
            'episode': int(episode.select('meta[content]')[0]['content']),
            'airdate': self._parse_airdate(airdate_text),
            'title': episode.select('strong')[0].text.strip(),
            'rating': float(episode.select('span[class*="star__rating"]')[0].text.strip()),
            # Votes are rendered like "(12,345)"; drop brackets and commas.
            'votes': int(re.sub(r'\(|\)|,', '', votes_text)),
            'description': episode.select('div[class="item_description"]')[0].text.strip()
        }

    def _parse_airdate(self, airdate_text):
        """
        Convert an air date such as '20 Dec. 2019' to 'YYYY-MM-DD'.

        Periods are removed before parsing so that dates written with or
        without a period after the month name are both accepted.
        """
        return self._format_date(airdate_text.replace('.', ''), '%d %b %Y')

In [4]:
# Create the IMDb fetcher used by the following cells.
fetcher = IMDBFetcher()

In [5]:
# Fetch title, rating and episode data for the IMDb title id 'tt5180504'.
# Note: this performs live HTTP requests against imdb.com.
witcher = fetcher.by_id('tt5180504', fetch_episodes=True)

In [6]:
# Persist the fetched result as JSON, indented by two spaces.
with open('witcher-episodes.json', 'w') as f:
    f.write(json.dumps(witcher, indent=2))