<a href="https://colab.research.google.com/github/parkrye/Python/blob/main/Others/WebCrawler_ThreadPoolExecutor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

지정된 웹사이트의 모든 페이지를 크롤링하는 웹크롤러

출처: https://github.com/elliotforbes/python-crawler

## Page Class

In [19]:
class Page:
    """Plain record describing one fetched page.

    The three constructor arguments are stored unchanged as public
    attributes: ``statusCode``, ``requestTime`` and ``url``.
    """

    def __init__(self, statusCode, requestTime, url):
        # Single tuple assignment; attributes are created in the same order
        # as the parameters.
        self.statusCode, self.requestTime, self.url = statusCode, requestTime, url

## Checkable Queue Class

In [20]:
import queue

In [21]:
class CheckableQueue(queue.Queue): # or OrderedSetQueue
  """A ``queue.Queue`` that also supports ``item in q`` and ``len(q)``.

  Both operations read the underlying container while holding the queue's
  own mutex, so they observe a consistent state even while other threads
  call ``put``/``get``.

  Note: because ``__len__`` is defined, an empty queue is falsy
  (``bool(q)`` is ``False``); use ``q.empty()`` for explicit checks.
  """

  def __contains__(self, item):
    """Return True if ``item`` is currently waiting in the queue."""
    with self.mutex:
      return item in self.queue

  def __len__(self):
    """Return the number of queued items."""
    # qsize() acquires self.mutex before measuring the container, which
    # keeps this consistent with __contains__ above.
    return self.qsize()

## Crawler Class

In [22]:
from urllib.request import Request, urlopen, urljoin, URLError
from urllib.parse import urlparse
import ssl
from bs4 import BeautifulSoup

In [23]:
class Crawler:
  """Same-host web crawler whose state is shared at class level.

  All state (``base_url``, the SSL context, the sets of crawled and failed
  links) lives on the class itself, so every caller and every thread sees
  the same data. Instantiating ``Crawler(base_url)`` only (re)configures
  that shared state.
  """

  base_url = ''
  # NOTE(security): hostname checking and certificate verification are
  # switched off, so HTTPS responses are accepted without authentication.
  myssl = ssl.create_default_context()
  myssl.check_hostname = False
  myssl.verify_mode = ssl.CERT_NONE
  crawledLinks = set()   # absolute URLs that were opened successfully
  errorLinks = set()     # absolute URLs whose fetch or parse failed

  def __init__(self, base_url):
    """Set the shared start URL and rebuild the shared SSL context."""
    Crawler.base_url = base_url
    Crawler.myssl = ssl.create_default_context()
    Crawler.myssl.check_hostname = False
    Crawler.myssl.verify_mode = ssl.CERT_NONE

  @staticmethod
  def crawl(thread_name, url, linksToCrawl):
    """Fetch one URL and enqueue the links found on the page.

    Args:
      thread_name: Identifier of the calling thread (not used here).
      url: Absolute or relative URL; resolved against ``Crawler.base_url``.
      linksToCrawl: Queue-like object supporting ``in`` and ``put`` that
        receives newly discovered links.

    Returns:
      ``None`` when the URL is on another host or was already crawled;
      otherwise ``(url, status)``. ``status`` is the HTTP status code when
      one is known (including the code of an ``HTTPError``) and ``None``
      when the request failed before any response was received.
    """
    # Bound before the try block so the handlers below can always use them,
    # even when the failure happens before a response exists.
    link = url
    status = None
    try:
      link = urljoin(Crawler.base_url, url)
      same_host = urlparse(link).netloc == urlparse(Crawler.base_url).netloc
      if not same_host or link in Crawler.crawledLinks:
        return None

      request = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
      # The context manager closes the connection once the body is parsed.
      with urlopen(request, context=Crawler.myssl) as response:
        status = response.getcode()
        Crawler.crawledLinks.add(link)
        print("> Url {} Crawled with Status: {} : {} Crawled In Total".format(response.geturl(), status, len(Crawler.crawledLinks)))
        soup = BeautifulSoup(response.read(), "html.parser")

      Crawler.enqueueLinks(soup.find_all('a'), linksToCrawl)
      return url, status
    except URLError as e:
      print("URL {} threw this error when trying to parse: {}".format(link, e.reason))
      Crawler.errorLinks.add(link)
      # HTTPError (a URLError subclass) carries the server's status code.
      return url, getattr(e, 'code', status)
    except Exception as e:
      # Best effort: record and report the failure instead of aborting.
      print("URL {} threw this error when trying to parse: {}".format(link, e))
      Crawler.errorLinks.add(link)
      return url, status

  @staticmethod
  def enqueueLinks(links, linksToCrawl):
    """Queue every not-yet-seen link from ``links``.

    Args:
      links: Iterable of objects exposing ``get('href')`` (e.g. the ``<a>``
        tags returned by BeautifulSoup).
      linksToCrawl: Queue-like object supporting ``in`` and ``put``.

    Each href is resolved against ``Crawler.base_url`` and the resolved URL
    is what gets queued, so the membership test compares like with like and
    the same page is not queued twice. Anchors without an ``href`` are
    skipped.
    """
    for link in links:
      href = link.get('href')
      if not href:
        continue
      target = urljoin(Crawler.base_url, href)
      if target in Crawler.crawledLinks or target in linksToCrawl:
        continue
      linksToCrawl.put(target)

## Main Script

In [24]:
import threading
import queue
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import as_completed
import csv

In [25]:
# Worker-thread limit passed to ThreadPoolExecutor(max_workers=...) in main().
THREAD_COUNT = 20
# Shared queue of URLs still to be crawled; seeded by main() and extended by
# Crawler.enqueueLinks() as new links are discovered.
linksToCrawl = CheckableQueue()

def run(url):
  """Crawl a single URL taken from ``linksToCrawl``.

  Args:
    url: The URL previously obtained with ``linksToCrawl.get()``.

  Returns:
    Whatever ``Crawler.crawl`` returns for this URL.

  Raises:
    Exception: If crawling fails; the original error is chained as
      ``__cause__`` so the root cause is not lost.
  """
  try:
    return Crawler.crawl(threading.current_thread(), url, linksToCrawl)
  except Exception as exc:
    # Only real errors are wrapped; KeyboardInterrupt/SystemExit pass through.
    raise Exception("Exception thrown with link: {}".format(url)) from exc
  finally:
    # Balance the get() that produced this URL, whether or not it succeeded.
    linksToCrawl.task_done()


def appendToCSV(result):
  """Append one result row to ``results.csv`` in the working directory.

  Args:
    result: Sequence of values written as a single row, separated by
      spaces; fields containing a space are quoted with ``|``.
  """
  print("{} Appending result to CSV File: {}".format(threading.current_thread(), result))
  # newline='' lets the csv module control line endings (otherwise blank
  # rows appear on Windows); an explicit encoding keeps non-ASCII URLs
  # writable regardless of the platform's default locale.
  with open('results.csv', 'a', newline='', encoding='utf-8') as csvfile:
    resultwriter = csv.writer(csvfile, delimiter=' ', quotechar='|', quoting=csv.QUOTE_MINIMAL)
    resultwriter.writerow(result)
    

def main():
  """Prompt for a start URL, crawl the site, and log results to CSV.

  The queue is processed in rounds: everything currently waiting in
  ``linksToCrawl`` is submitted to one shared thread pool, the round is
  awaited, and links discovered during it form the next round. This lets up
  to ``THREAD_COUNT`` pages be fetched concurrently instead of creating a
  new single-task executor for every URL.
  """
  # Local import keeps this function self-contained.
  from urllib.parse import urljoin

  url = input("Website > ")
  Crawler(url)
  linksToCrawl.put(url)

  with ThreadPoolExecutor(max_workers=THREAD_COUNT) as executor:
    while not linksToCrawl.empty():
      futures = []
      # Resolved URLs already submitted in this round, so the same page is
      # not fetched by two workers at once.
      batch = set()

      # Only this thread consumes the queue, so get() cannot block here.
      while not linksToCrawl.empty():
        url = linksToCrawl.get()
        target = None if url is None else urljoin(Crawler.base_url, url)
        if target is None or target in batch:
          linksToCrawl.task_done()
          continue
        batch.add(target)
        futures.append(executor.submit(run, url))

      for future in as_completed(futures):
        try:
          result = future.result()
          if result is not None:
            appendToCSV(result)
        except Exception as exc:
          # Best effort: report the failure and keep crawling.
          print(exc)

  print("Total Links Crawled: {}".format(len(Crawler.crawledLinks)))
  print("Total Errors: {}".format(len(Crawler.errorLinks)))

# Start the crawl only when this code is executed directly, not when imported.
if __name__ == '__main__':
  main()

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
> Url https://tutorialedge.net/python/python-modules-tutorial/ Crawled with Status: 200 : 678 Crawled In Total
<_MainThread(MainThread, started 139829578938240)> Appending result to CSV File: ('https://tutorialedge.net/python/python-modules-tutorial/', 200)
> Url https://tutorialedge.net/python/python-logging-best-practices/ Crawled with Status: 200 : 679 Crawled In Total
<_MainThread(MainThread, started 139829578938240)> Appending result to CSV File: ('https://tutorialedge.net/python/python-logging-best-practices/', 200)
> Url https://tutorialedge.net/python/python-decorators-tutorial/ Crawled with Status: 200 : 680 Crawled In Total
<_MainThread(MainThread, started 139829578938240)> Appending result to CSV File: ('https://tutorialedge.net/python/python-decorators-tutorial/', 200)
> Url https://tutorialedge.net/python/python-generator-tutorial/ Crawled with Status: 200 : 681 Crawled In Total
<_MainThread(MainThread, started 139829578938