In [109]:
import logging
from urllib.parse import urljoin
import requests
import re
import pandas as pd
import json

from bs4 import BeautifulSoup

In [110]:
logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

In [111]:
class Crawler:

    def __init__(self, url=''):
        self.url = url
        self.visited_urls = []
        self.watched = []
        self.headers = {'User-Agent':'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'}
    def download_url(self, url):
        return requests.get(url, headers=self.headers).text

    def get_linked_urls(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        for link in soup.find(id="all-shows").find_all('a',{'class':'show-link'}):
            path = link.get('href')
            if path and path.startswith('/'):
              path = urljoin("https://tvtime.com", path)
            yield path

    def get_info(self, url):
       try:
          html = self.download_url(url);
          soup = BeautifulSoup(html, 'html.parser')
          script = soup.find("div",{"class":'main-block-container'}).find("script",{"type":'text/javascript'}).text
          result = re.search(r"show\s+\:(.*)",script).group(0).replace("show : ",'').replace('\&quot;','"').replace('\\"',"`").replace("\`",'').replace("\&#039;",'')
          result_json = json.loads(result.lstrip("'").rstrip("'"))
          self.watched.append({
            'imdb_id':result_json['imdb_id'],
            'name':result_json['name'],
            'genres':result_json['genres']
          })
          logging.info(f'Crawl: {url}')
       except:
          logging.exception(f'Failed to Craw: {url}')

    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(html):
            self.get_info(url)


    def run(self):
      url = self.url
      try:
          self.crawl(url)
          watched = pd.DataFrame(self.watched) 
          watched.to_csv("watched.csv")
          print(watched)
      except Exception:
          logging.exception(f'Failed to crawl: {url}')

if __name__ == '__main__':
    Crawler(url='https://www.tvtime.com/pt_BR/user/4311595/profile').run()

2022-04-06 14:17:27,967 INFO:Crawl: https://tvtime.com/pt_BR/show/272644
2022-04-06 14:17:28,621 INFO:Crawl: https://tvtime.com/pt_BR/show/306304
2022-04-06 14:17:29,144 INFO:Crawl: https://tvtime.com/pt_BR/show/248646
2022-04-06 14:17:30,708 INFO:Crawl: https://tvtime.com/pt_BR/show/250487
2022-04-06 14:17:31,193 INFO:Crawl: https://tvtime.com/pt_BR/show/253350
2022-04-06 14:17:31,799 INFO:Crawl: https://tvtime.com/pt_BR/show/322971
2022-04-06 14:17:33,466 INFO:Crawl: https://tvtime.com/pt_BR/show/257655
2022-04-06 14:17:34,136 INFO:Crawl: https://tvtime.com/pt_BR/show/267440
2022-04-06 14:17:34,515 INFO:Crawl: https://tvtime.com/pt_BR/show/196921
2022-04-06 14:17:35,096 INFO:Crawl: https://tvtime.com/pt_BR/show/273181
2022-04-06 14:17:35,669 INFO:Crawl: https://tvtime.com/pt_BR/show/253463
2022-04-06 14:17:36,655 INFO:Crawl: https://tvtime.com/pt_BR/show/74796
2022-04-06 14:17:37,116 INFO:Crawl: https://tvtime.com/pt_BR/show/248035
2022-04-06 14:17:37,892 ERROR:Failed to Craw: https:

       imdb_id                            name  \
0    tt3148266                      12 Monkeys   
1    tt4834206  A Series of Unfortunate Events   
2    tt1728102                        Alcatraz   
3    tt1844624           American Horror Story   
4    tt1986770                Anger Management   
..         ...                             ...   
134  tt0475784                       Westworld   
135  tt2288064             Witches of East End   
136  tt4168956                  Wrecked (2016)   
137  tt6226232                   Young Sheldon   
138  tt0185133                   Yu Yu Hakusho   

                                                genres  
0    [Science Fiction, Suspense, Thriller, Mystery,...  
1    [Mystery, Family, Adventure, Children, Comedy,...  
2    [Science Fiction, Thriller, Mystery, Crime, Ac...  
3                            [Horror, Thriller, Drama]  
4                                    [Romance, Comedy]  
..                                                 ...  


In [112]:
watched = pd.read_csv("./watched.csv")

In [113]:
watched

Unnamed: 0.1,Unnamed: 0,imdb_id,name,genres
0,0,tt3148266,12 Monkeys,"['Science Fiction', 'Suspense', 'Thriller', 'M..."
1,1,tt4834206,A Series of Unfortunate Events,"['Mystery', 'Family', 'Adventure', 'Children',..."
2,2,tt1728102,Alcatraz,"['Science Fiction', 'Thriller', 'Mystery', 'Cr..."
3,3,tt1844624,American Horror Story,"['Horror', 'Thriller', 'Drama']"
4,4,tt1986770,Anger Management,"['Romance', 'Comedy']"
...,...,...,...,...
134,134,tt0475784,Westworld,"['Western', 'Science Fiction', 'Mystery', 'Adv..."
135,135,tt2288064,Witches of East End,"['Horror', 'Suspense', 'Mystery', 'Fantasy', '..."
136,136,tt4168956,Wrecked (2016),"['Thriller', 'Mystery', 'Adventure', 'Action',..."
137,137,tt6226232,Young Sheldon,"['Family', 'Comedy']"
