In [11]:
%run ../utils.ipynb
from bs4 import BeautifulSoup
import requests
import sys
import time
import pandas as pd
import numpy as np
import urllib.robotparser as urobot
from tqdm import tqdm_notebook as tqdm
import validators
import os
import threading

# Browser-like User-Agent; passed as `headers=` to the requests.get calls below.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) '
        'AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15'
    ),
}

# Root URLs of the sites to crawl, one crawler per entry.
urls = [
    "https://www.magazineluiza.com.br",
    "https://www.saraiva.com.br",
    "https://www.amazon.com.br",
    "https://www.taqi.com.br",
    "https://www.kabum.com.br",
    "https://www.ibyte.com.br",
    "https://www.cissamagazine.com.br",
    "https://www.promobit.com.br",
    "https://www.havan.com.br",
    "https://www.avenida.com.br",
]

#urls = ["https://www.avenida.com.br"]

In [12]:
#Class definition of the bfs_crawler
class bfs_crawler:
    """Breadth-first link crawler for a single site.

    Starting from ``url``, pages are fetched in the order their links were
    discovered. Every ``<a href>`` on a fetched page is normalised with
    ``format_link``; links that fail ``validators.url`` are recorded in
    ``invalid_links``, and links that are new and permitted by the robots
    parser are appended to ``links_list`` (which doubles as the BFS queue).

    NOTE(review): ``getRobot``, ``set_file_name``, ``format_link`` and
    ``save_file`` are not defined in this cell; they are presumably provided
    by ``%run ../utils.ipynb`` -- confirm.

    Attributes:
        url: Root URL the crawl starts from.
        robotParser: Object returned by ``getRobot(url)``; only its
            ``can_fetch(agent, link)`` method is used here.
        links_list: Accepted links, in discovery order.
        invalid_links: Links rejected by ``validators.url`` plus pages whose
            fetch or parse raised an exception.
        file_name: Base name (no extension) for the files this crawler writes.
    """

    # Seconds to wait for a single HTTP response; without a timeout one
    # unresponsive host would block the crawl (or its thread) indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        """Create a crawler rooted at ``url`` (builds the robots parser)."""
        self.url = url
        self.robotParser = getRobot(url)      # used to check whether a link may be fetched
        self.links_list = []                  # accepted links, also the BFS queue
        self.invalid_links = []               # malformed links and pages that failed to load
        self.file_name = set_file_name(url)   # base name of the files generated by the crawler

    def get_links(self, number_of_links=1000, delay=1):
        """Crawl breadth-first until enough links are collected or the queue runs dry.

        Args:
            number_of_links: Stop once ``links_list`` holds at least this many
                links. All links of the last fetched page are kept, so the
                final list may be longer than this value.
            delay: Seconds to sleep between two page requests.

        Returns:
            None. Results accumulate in ``links_list`` and ``invalid_links``.

        A page whose download or parsing raises is recorded in
        ``invalid_links`` under its own URL and the crawl moves on.
        """
        visited_links = {self.url}   # set: O(1) membership test per candidate link
        actual_link = self.url
        index = 0                    # position in links_list of the next page to fetch

        while len(self.links_list) < number_of_links:
            try:
                self._collect_links(actual_link, visited_links)
            except Exception:
                # Record the page that actually failed, not whatever link
                # happened to be parsed last on a previous page.
                self.invalid_links.append(actual_link)

            if index >= len(self.links_list) or len(self.links_list) >= number_of_links:
                print("END")
                return
            actual_link = self.links_list[index]
            print("Link Atual: {}".format(actual_link))
            index += 1
            time.sleep(delay)   # be polite: pause between consecutive requests

    def _collect_links(self, page_url, visited_links):
        """Fetch ``page_url`` and classify every anchor found on it.

        Non-200 responses are ignored. ``visited_links`` is updated in place
        with every newly accepted link.
        """
        req = requests.get(page_url, headers=header, timeout=self.REQUEST_TIMEOUT)
        if req.status_code != 200:
            return

        # Name the parser explicitly so results do not depend on which
        # optional parsers are installed (and no "guessed parser" warning).
        soup = BeautifulSoup(req.text, "html.parser")
        for a in soup.find_all("a", href=True):
            # normalise the href in an attempt to obtain a valid absolute link
            link = format_link(self.url, a["href"])
            if not validators.url(link):
                self.invalid_links.append(link)
            # keep it only if it is new and robots.txt allows fetching it
            elif link not in visited_links and self.robotParser.can_fetch("*", link):
                self.links_list.append(link)
                visited_links.add(link)

    def save_as_csv(self):
        """Write ``links/<file_name>.csv`` and ``invalid_links/<file_name>.csv``.

        Each file has two columns: ``id`` (0..n-1) and ``links``.
        """
        self._write_links_csv("links", self.links_list)
        self._write_links_csv("invalid_links", self.invalid_links)

    def _write_links_csv(self, folder, links):
        """Write ``links`` with a running ``id`` column to ``<folder>/<file_name>.csv``."""
        # exist_ok avoids a FileExistsError when several crawler threads
        # reach this point at the same time.
        os.makedirs(folder, exist_ok=True)
        df = pd.DataFrame({'id': np.arange(len(links)), 'links': links})
        df.to_csv(os.path.join(folder, self.file_name + '.csv'),
                  header=True, index=False, encoding='utf-8')

    def save_robot(self):
        """Download the site's robots.txt and hand its text to ``save_file``.

        NOTE(review): ``save_file(content, "robots", name)`` presumably writes
        the text into a ``robots`` folder -- confirm against utils.
        """
        # rstrip so a root URL given with a trailing slash does not yield "//robots.txt"
        robot_url = self.url.rstrip("/") + "/robots.txt"
        req = requests.get(robot_url, headers=header, timeout=self.REQUEST_TIMEOUT)
        save_file(req.text, "robots", (self.file_name + ".txt"))

    # getters and setters
    def get_url(self):
        """Return the root URL."""
        return self.url

    def set_url(self, url):
        """Replace the root URL (robots parser and file name are not rebuilt)."""
        self.url = url

    def get_links_list(self):
        """Return the list of accepted links."""
        return self.links_list

    def set_links_list(self, links):
        """Replace the list of accepted links."""
        self.links_list = links

In [13]:
#Saves the links from each site
def save_links(crawler):
    """Run the full pipeline for one site: robots.txt, crawl, CSV export.

    Args:
        crawler: A ``bfs_crawler`` instance. A root URL string is also
            accepted, in which case a crawler is built for it.

    The crawler passed in is the one that is used; the function no longer
    reads the notebook-global ``url``, which the threaded launcher reassigns
    while worker threads are running.
    """
    if isinstance(crawler, str):
        crawler = bfs_crawler(crawler)
    crawler.save_robot()
    crawler.get_links()
    crawler.save_as_csv()

# Saves the links from all sites in parallel as .csv

In [None]:
# Launch one worker thread per site listed in `urls`.
threads = []
for url in urls:
    # `url` and `crawler` are notebook-level names reassigned on every iteration.
    crawler = bfs_crawler(url)
    # Each thread is started with save_links as its target and this crawler as its argument.
    thread = threading.Thread(target=save_links, args=(crawler,))
    threads.append(thread)
    thread.start()
    # Short pause so the threads do not all start at the same instant.
    time.sleep(0.2)
# NOTE(review): the threads are not joined in this cell, so the cell finishes
# while the crawls are still running; join the entries of `threads` before
# relying on the CSV files being complete.

# Saves the links from one site

In [14]:
# Crawl a single site: index 3 of `urls` is https://www.taqi.com.br.
url = urls[3]
crawler = bfs_crawler(url)
# Pass the crawler built above; save_links' parameter is a crawler, and this
# matches how the threaded cell calls it.
save_links(crawler)

Link Atual: http://www.taqi.com.br/
Link Atual: http://www.taqi.com.br/sub/central-atendimento
Link Atual: http://www.taqi.com.br/javascript:void(0);
Link Atual: http://www.taqi.com.br/perfil/login?redirectURL=/index.jsp
Link Atual: http://www.taqi.com.br/perfil/login?newUser=true&redirectURL=/index.jsp
Link Atual: http://www.taqi.com.br/listadepresentes
Link Atual: http://www.taqi.com.br/newTemplate/perfil/login;jsessionid=0zzKdPZT9ZBQWw2Nhg0QDLXCp1qlJYTs8MsX5h2HtMvvv5w2kTqW!764376371?newUser=true
Link Atual: http://www.taqi.com.br/taqi/material-de-construcao/cat60087
Link Atual: http://www.taqi.com.br/taqi/construcao/portas-janelas/cat60143
Link Atual: http://www.taqi.com.br/taqi/construcao/portas-janelas/porta/cat60145
Link Atual: http://www.taqi.com.br/taqi/construcao/portas-janelas/janela/cat60144
Link Atual: http://www.taqi.com.br/taqi/construcao/portas-janelas/portao/cat60147
Link Atual: http://www.taqi.com.br/taqi/construcao/portas-janelas/fechadura-e-puxador/cat60109
Link Atua