In [31]:
%run ../utils.ipynb
from bs4 import BeautifulSoup
import requests
import sys
import time
import pandas as pd
import numpy as np
import urllib.robotparser as urobot
from tqdm import tqdm_notebook as tqdm
import validators
import os
import threading
import logging
import random

# HTTP headers sent with every request (desktop Safari user agent).
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15',
}

# Root URLs of the sites to crawl; one crawler is created per entry.
urls = [
    "https://www.magazineluiza.com.br",
    "https://www.colombo.com.br",
    "https://www.amazon.com.br",
    "https://www.taqi.com.br",
    "https://www.kabum.com.br",
    "https://www.ricardoeletro.com.br",
    "https://www.cissamagazine.com.br",
    "https://www.promobit.com.br",
    "https://www.havan.com.br",
    "https://www.avenida.com.br",
]

# Single-site alternative for quick experiments:
# urls = ["https://www.avenida.com.br"]

# Shared lock used by the crawler threads around console output and the shared stats file.
LOCK = threading.Lock()

# Root logger at INFO level so that thread progress messages are emitted.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [34]:
#Class definition of the heuristic_crawler
class heuristic_crawler:
    """Breadth-first crawler that collects links matching a keyword heuristic.

    Starting at a site's root URL, pages are fetched one at a time. Every
    anchor found is normalised with ``format_link`` and classified as
    invalid (fails ``validators.url``), disallowed by robots.txt, or
    crawlable. Crawlable links are queued for a later visit, and those that
    also pass ``heuristic_check`` are stored in ``links_list``.

    Attributes:
        url: root URL of the site being crawled.
        robotParser: object returned by ``getRobot(url)``; only its
            ``can_fetch`` method is used here.
        links_list: links accepted by the heuristic, in discovery order.
        invalid_links: malformed links and pages whose fetch failed.
        file_name: base name (from ``set_file_name(url)``) of output files.
    """

    # Seconds to wait on a server before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Lower-case substrings that mark a link as relevant.
    KEYWORDS = ("celular", "smartphone", "iphone", "telefon")
    # Lower-case substrings that disqualify an otherwise relevant link.
    BADWORDS = ("televenda", "sac", "carr", "javascript", "acessorio")

    def __init__(self, url):
        """Prepare a crawler for the site rooted at ``url``.

        NOTE(review): ``getRobot`` and ``set_file_name`` are defined outside
        this cell; ``getRobot`` presumably downloads robots.txt, so
        construction may raise on network errors -- confirm.
        """
        self.url = url
        self.robotParser = getRobot(url)           # robots.txt rules for this site
        self.links_list = []                       # links accepted by the heuristic
        self.invalid_links = []                    # malformed links / failed pages
        self.file_name = set_file_name(url)        # base name of the generated files

    def get_links(self, number_of_links=1000):
        """Crawl the site until enough links are collected or no page is left.

        Appends one line ``site,valid_links,invalid_links,total_links`` to
        ``./stats.csv`` when the crawl ends. ``total_links`` counts each
        distinct link discovered during this call once.

        Args:
            number_of_links: stop once ``links_list`` holds this many links.
        """
        # Every link already handled; prevents re-queuing visited pages and
        # storing the same link twice.
        seen = {self.url}
        seen.update(self.links_list)
        already_known = len(seen)
        frontier = []                              # FIFO queue of pages to visit
        actual_link = self.url

        with LOCK:
            print("Starts crawler on: {}".format(self.url))
            pbar = tqdm(total=number_of_links)

        try:
            while len(self.links_list) < number_of_links:
                try:
                    self._scan_page(actual_link, seen, frontier, pbar, number_of_links)
                except Exception as error:
                    # Best effort: a page that cannot be processed is recorded
                    # and the crawl moves on to the next queued page.
                    logging.warning("failed to process {}: {}".format(actual_link, error))
                    if actual_link not in self.invalid_links:
                        self.invalid_links.append(actual_link)

                if not frontier or len(self.links_list) >= number_of_links:
                    break
                actual_link = frontier.pop(0)
                # Random pause between requests to avoid hammering the server.
                time.sleep(random.randint(1, 5))
        finally:
            # Always release the progress bar, even if the crawl is interrupted.
            with LOCK:
                pbar.close()

        logging.info("****END: {}****".format(self.url))
        total_links = len(seen) - already_known
        with LOCK:
            # stats.csv columns: site,valid_links,invalid_links,total_links
            content = "{},{},{},{}\n".format(self.url.split(".")[1],
                                             len(self.links_list),
                                             len(self.invalid_links),
                                             total_links)
            save_file(content, "./", "stats.csv", mode="a")

    def _scan_page(self, page_url, seen, frontier, pbar, number_of_links):
        """Fetch one page and classify every not-yet-seen link found on it."""
        response = requests.get(page_url, headers=header, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return

        # An explicit parser keeps results independent of which optional
        # parsers happen to be installed.
        soup = BeautifulSoup(response.text, "html.parser")
        for anchor in soup.find_all("a", href=True):
            # Normalise the href in an attempt to obtain a valid absolute link.
            link = format_link(self.url, anchor["href"])
            if link in seen:
                continue
            seen.add(link)

            if not validators.url(link):
                if link not in self.invalid_links:
                    self.invalid_links.append(link)
                continue
            if not self.robotParser.can_fetch("*", link):
                # Respect robots.txt: neither store nor visit this link.
                continue

            frontier.append(link)
            if self.heuristic_check(link):
                self.links_list.append(link)
                pbar.update(1)
                if len(self.links_list) >= number_of_links:
                    return

    def save_as_csv(self):
        """Write the collected links to ``links/`` and ``invalid_links/`` as CSV.

        Each file is named ``<file_name>.csv`` and has the columns ``id``
        and ``links``.
        """
        outputs = (("links", self.links_list),
                   ("invalid_links", self.invalid_links))
        for folder, links in outputs:
            # exist_ok avoids a race when several threads create the folder.
            os.makedirs(folder, exist_ok=True)
            frame = pd.DataFrame({'id': np.arange(len(links)), 'links': links})
            frame.to_csv(os.path.join(folder, self.file_name + '.csv'),
                         header=True, index=False, encoding='utf-8')

    def save_robot(self):
        """Download the site's robots.txt and store it via ``save_file``."""
        robot_url = self.url + "/robots.txt"
        req = requests.get(robot_url, headers=header, timeout=self.REQUEST_TIMEOUT)
        save_file(req.text, "robots", (self.file_name + ".txt"))

    def heuristic_check(self, link):
        """Return True when ``link`` looks relevant to the crawl.

        A link passes when it contains at least one of ``KEYWORDS`` and none
        of ``BADWORDS``. Matching is case-insensitive, so e.g. "iPhone" and
        "iphone" are treated alike.
        """
        lowered = link.lower()
        if not any(word in lowered for word in self.KEYWORDS):
            return False
        return not any(bad_word in lowered for bad_word in self.BADWORDS)

    # getters and setters
    def get_url(self):
        """Return the root URL of the crawled site."""
        return self.url

    def set_url(self, url):
        """Replace the root URL of the crawled site."""
        self.url = url

    def get_links_list(self):
        """Return the list of links accepted by the heuristic."""
        return self.links_list

    def set_links_list(self, links):
        """Replace the list of links accepted by the heuristic."""
        self.links_list = links

In [35]:
#Saves the links from each site
def save_links(crawler):
    """Run the full pipeline for one site: robots.txt, crawl, CSV export.

    Args:
        crawler: the heuristic_crawler to run. For backward compatibility a
            site URL string is also accepted, in which case a crawler is
            built from it.
    """
    # Use the crawler that was passed in rather than rebuilding one from the
    # global `url`, which the launching loop changes while threads are running.
    if isinstance(crawler, str):
        crawler = heuristic_crawler(crawler)
    crawler.save_robot()
    crawler.get_links()
    crawler.save_as_csv()

# Save the links to .csv files in parallel

In [None]:
# Configure the root logger so the progress of each thread is reported.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Launch one crawler thread per site.
threads = []
for url in urls:
    thread_name = url.split(".")[1]
    try:
        crawler = heuristic_crawler(url)
    except Exception:
        # Building the crawler calls getRobot(url) (presumably a network
        # request -- confirm); a failure for one site must not prevent the
        # remaining sites from being crawled.
        logging.exception("could not create crawler for: {}".format(url))
        continue
    thread = threading.Thread(target=save_links, args=(crawler,), name=thread_name)
    threads.append(thread)
    print("Starts crawler on: {}".format(url))
    thread.start()
    # Short pause so the threads do not all start at the same instant.
    time.sleep(0.2)

# Wait for every started crawler to finish.
for thread in tqdm(threads):
    thread.join()
    logging.info("finished thread: {}".format(thread.name))

# Save the links from a single site

In [36]:
# Crawl a single site (index 5 of `urls`) without threads.
url = urls[5]
crawler = heuristic_crawler(url)
# Hand over the crawler built above; save_links' parameter is a crawler, not a URL.
save_links(crawler)

SSLError: HTTPSConnectionPool(host='www.ricardoeletro.com.br', port=443): Max retries exceeded with url: /robots.txt (Caused by SSLError(SSLError("bad handshake: SysCallError(54, 'ECONNRESET')")))

[2, 3, 4]
