# tvtime.py

# -*- coding: utf-8 -*-
"""TvTime.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1W-6vMDcK4tFkFU3oByDgf4lfEg9zxp6I
"""

import logging
from urllib.parse import urljoin
import requests
import re
import pandas as pd
import json

from bs4 import BeautifulSoup

# Configure the root logger once at import time: INFO and above, with each
# record rendered as "<timestamp> <LEVEL>:<message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s:%(message)s',
)

class Crawler:
    """Collect show metadata reachable from a single profile page.

    The crawler downloads the page at ``url``, follows every ``a.show-link``
    anchor found inside the element with id ``all-shows``, and extracts the
    ``imdb_id``, ``name`` and ``genres`` fields from the JSON blob embedded in
    each linked page's inline script. Results accumulate in ``self.watched``.
    """

    # Host used to resolve site-relative show links.
    BASE_URL = "https://tvtime.com"

    # Matches the ``show : '<json>'`` line inside the page's inline script.
    _SHOW_PATTERN = re.compile(r"show\s+\:(.*)")

    def __init__(self, url='', timeout=30):
        """Create a crawler.

        Args:
            url: Address of the profile page the crawl starts from.
            timeout: Seconds to wait for each HTTP request before giving up.
        """
        self.url = url
        self.timeout = timeout
        # Show URLs already processed; used to avoid fetching a page twice.
        self.visited_urls = []
        # One dict per successfully parsed show (imdb_id, name, genres).
        self.watched = []
        # The header *value* must not repeat the "User-Agent:" header name.
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/96.0.4664.110 Safari/537.36'
            ),
        }

    def download_url(self, url):
        """Return the body of ``url`` as text.

        Raises:
            requests.RequestException: On connection problems, on timeout, or
                when the server answers with a 4xx/5xx status.
        """
        # Without a timeout a stalled server would hang the crawl forever.
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # Fail here rather than trying to parse an error page later on.
        response.raise_for_status()
        return response.text

    def get_linked_urls(self, html):
        """Yield the URL of every show link found in ``html``.

        Site-relative paths (starting with ``/``) are resolved against
        ``BASE_URL``; other values are yielded unchanged. Anchors without an
        ``href`` are skipped.

        Raises:
            ValueError: If the page has no element with id ``all-shows``.
        """
        soup = BeautifulSoup(html, 'html.parser')
        container = soup.find(id="all-shows")
        if container is None:
            raise ValueError('no element with id "all-shows" found in page')
        for link in container.find_all('a', {'class': 'show-link'}):
            path = link.get('href')
            if not path:
                continue
            if path.startswith('/'):
                path = urljoin(self.BASE_URL, path)
            yield path

    @classmethod
    def _parse_show(cls, script):
        """Extract and decode the show JSON embedded in an inline script.

        Args:
            script: Text of the script element containing a ``show : '...'``
                line.

        Returns:
            The decoded JSON value.

        Raises:
            ValueError: If no ``show :`` line is present, or if the cleaned-up
                payload is not valid JSON (``json.JSONDecodeError``).
        """
        match = cls._SHOW_PATTERN.search(script)
        if match is None:
            raise ValueError('no "show :" assignment found in page script')
        payload = (
            match.group(0)
            .replace("show : ", '')
            # Backslash-prefixed HTML entities stand in for double quotes.
            .replace('\\&quot;', '"')
            # Escaped quotes inside values become backticks ...
            .replace('\\"', "`")
            # ... and backslash-backtick pairs left over are dropped.
            .replace("\\`", '')
            # Backslash-prefixed apostrophe entities are dropped.
            .replace("\\&#039;", '')
        )
        # The payload is wrapped in single quotes in the script source.
        return json.loads(payload.strip("'"))

    def get_info(self, url):
        """Download a show page and append its metadata to ``self.watched``.

        Any failure (network, unexpected markup, missing keys) is logged with
        its traceback and otherwise ignored so one bad page does not stop the
        crawl.
        """
        try:
            html = self.download_url(url)
            soup = BeautifulSoup(html, 'html.parser')
            container = soup.find("div", {"class": 'main-block-container'})
            if container is None:
                raise ValueError('no "main-block-container" div found in page')
            script_tag = container.find("script", {"type": 'text/javascript'})
            if script_tag is None:
                raise ValueError('no inline script found in main block')
            show = self._parse_show(script_tag.text)
            self.watched.append({
                'imdb_id': show['imdb_id'],
                'name': show['name'],
                'genres': show['genres'],
            })
            logging.info('Crawl: %s', url)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # SystemExit still terminate the crawl.
            logging.exception('Failed to Craw: %s', url)

    def crawl(self, url):
        """Fetch ``url`` and process every show page it links to.

        Each show URL is processed at most once per crawler instance;
        processed URLs are recorded in ``self.visited_urls``.
        """
        html = self.download_url(url)
        for show_url in self.get_linked_urls(html):
            if show_url in self.visited_urls:
                continue
            self.visited_urls.append(show_url)
            self.get_info(show_url)

    def run(self):
        """Crawl ``self.url``, write the results to ``watched.csv`` and print them.

        Errors raised while crawling or writing are logged, not propagated.
        """
        url = self.url
        try:
            self.crawl(url)
            watched = pd.DataFrame(self.watched)
            watched.to_csv("watched.csv")
            print(watched)
        except Exception:
            logging.exception('Failed to crawl: %s', url)

# Script entry point: crawl one hard-coded profile page when the file is
# executed directly (not when it is imported).
if __name__ == '__main__':
    profile_crawler = Crawler(
        url='https://www.tvtime.com/pt_BR/user/4311595/profile',
    )
    profile_crawler.run()