In [7]:
%run ../utils.ipynb
from sklearn.preprocessing import MinMaxScaler
from bs4 import BeautifulSoup
import requests
import sys
import time
import pandas as pd
import numpy as np
import urllib.robotparser as urobot
from tqdm import tqdm_notebook as tqdm
import validators
import os
import threading
import logging
import random
import pickle

#HTTP headers sent with every request; the User-Agent presents the crawler as desktop Safari
header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Safari/605.1.15',
}

#seed URL of every site to crawl; commented-out entries are currently disabled
urls = [
    "https://www.magazineluiza.com.br",
    "https://www.colombo.com.br",
    #"https://www.amazon.com.br",
    "https://www.taqi.com.br",
    #"https://www.kabum.com.br",
    "https://www.ibyte.com.br",
    #"https://www.cissamagazine.com.br",
    #"https://www.ricardoeletro.com.br",
    "https://www.havan.com.br",
    #"https://www.avenida.com.br"
]

#urls = ["https://www.avenida.com.br"]

#lock shared by the crawler threads to serialize console output and writes to shared files
LOCK = threading.Lock()

#root logger at INFO level so progress messages from the threads are emitted
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [8]:
class DenseTransformer(MinMaxScaler):
    """Transformer whose ``transform`` returns the dense form of its input.

    Both ``fit`` and ``transform`` are overridden, so no min-max scaling is
    performed by these methods despite the base class.

    NOTE(review): presumably X is a scipy sparse matrix and the pickled model
    loaded below refers to this class by name - confirm before renaming or
    changing the base class.
    """
 
    def fit(self, X, y=None, **fit_params):
        """Learn nothing and return ``self``; all arguments are ignored."""
        return self
 
    def transform(self, X, y=None, **fit_params):
        """Return ``X.todense()``; ``y`` and ``fit_params`` are ignored."""
        return X.todense()

#loads the link classifier model
filename = "link_clf3.sav"
#SECURITY: pickle.load can execute arbitrary code - only load model files from a trusted source
#the context manager closes the file handle, which the bare open() call used to leak
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)



In [9]:
#Class definition of the classifier_crawler
class classifier_crawler:
    """Crawls one site and keeps the links accepted by the link classifier.

    Attributes:
        url: seed URL of the site.
        robotParser: object returned by ``getRobot(url)``; its ``can_fetch``
            decides whether a link may be kept.
        links_list: links accepted so far, in discovery order.
        invalid_links: links that failed validation or raised while being processed.
        file_name: value returned by ``set_file_name(url)``, used as the base
            name of the files written by this crawler.
    """

    #seconds to wait for a server before giving up, so a stalled site cannot hang a thread forever
    REQUEST_TIMEOUT = 30

    def __init__(self, url):
        self.url = url
        self.robotParser = getRobot(url)           #robot parser to check if a link is allowed or not
        self.links_list = []                       #list of allowed links
        self.invalid_links = []                    #list of invalid links
        self.file_name = set_file_name(url)        #name of the files generated by the crawler

    def _site_name(self):
        """Return the second dot-separated piece of the URL (e.g. 'havan' for https://www.havan.com.br)."""
        return self.url.split(".")[1]

    #saves the relevant links and pages from the site
    def get_links(self, number_of_links=500, delay=2):
        """Crawl breadth-first from ``self.url`` until enough links are accepted.

        A link is accepted when it is a valid URL, was not seen before, is
        allowed by robots.txt, differs from the seed URL and is approved by
        the classifier. Accepted links go to ``self.links_list``, rejected or
        failing ones to ``self.invalid_links``. When the crawl ends (target
        reached, frontier empty, or an error propagates) one line
        ``site,valid_links,invalid_links,total_links`` is appended to stats.csv.

        Args:
            number_of_links: how many accepted links to collect before stopping.
            delay: seconds to sleep between two page requests.
        """
        actual_link = self.url
        next_links = []
        total_links = 0
        #sets give O(1) membership tests; the public lists keep the discovery order
        visited_links = {self.url}
        seen_invalid = set(self.invalid_links)

        with LOCK:
            print("Starts crawler on: {}".format(self.url))
            pbar = tqdm(total=number_of_links)

        try:
            while len(self.links_list) < number_of_links:
                #link blamed if this iteration raises: the page itself until one of its links is being processed
                culprit = actual_link
                try:
                    req = requests.get(actual_link, headers=header, timeout=self.REQUEST_TIMEOUT)
                    if req.status_code == 200:
                        #NOTE(review): no parser is named, so bs4 picks one itself - pin it if results must be reproducible
                        soup = BeautifulSoup(req.text)
                        for a in soup.findAll("a", href=True):
                            #formats the href in an attempt to obtain a valid link
                            link = format_link(self.url, a["href"])
                            culprit = link
                            if not validators.url(link):
                                if link not in seen_invalid:
                                    seen_invalid.add(link)
                                    self.invalid_links.append(link)
                            elif link not in visited_links:
                                next_links.append(link)
                                total_links += 1
                                #keeps the link only if robots.txt allows it, it is not the seed and the classifier approves it
                                if (self.robotParser.can_fetch("*", link)
                                        and link != self.url
                                        and self.classifier_check([link])):
                                    self.links_list.append(link)
                                    pbar.update(1)
                                    if len(self.links_list) >= number_of_links:
                                        break
                            visited_links.add(link)
                except Exception as exc:
                    #best effort: one failing page or link must not stop the whole crawl
                    logging.info("EXCEPT_GET_LINKS: {} - {}".format(exc, culprit))
                    if culprit not in seen_invalid:
                        seen_invalid.add(culprit)
                        self.invalid_links.append(culprit)

                if not next_links or len(self.links_list) >= number_of_links:
                    break
                actual_link = next_links.pop(0)
                time.sleep(delay)
        finally:
            #no return here, so KeyboardInterrupt and unexpected errors still propagate after the bookkeeping
            logging.info("****END: {}****".format(self.url))
            with LOCK:
                pbar.close()
                #saves the results to a file stats.csv; columns: site,valid_links,invalid_links,total_links
                content = "{},{},{},{}\n".format(self._site_name(), len(self.links_list), len(self.invalid_links), total_links)
                save_file(content, "./", "stats.csv", mode="a")

    #saves the links as csv
    def save_as_csv(self):
        """Write links/<file_name>.csv and invalid_links/<file_name>.csv with columns id,links."""
        outputs = {"links": self.links_list, "invalid_links": self.invalid_links}
        for folder, links in outputs.items():
            #exist_ok avoids a FileExistsError when two crawler threads create the folder at once
            os.makedirs(folder, exist_ok=True)
            df = pd.DataFrame({'id': np.arange(len(links)), 'links': links})
            df.to_csv(os.path.join(folder, self.file_name + '.csv'), header=True, index=False, encoding='utf-8')

    #save the robots from a url
    def save_robot(self):
        """Download <url>/robots.txt and store its text as robots/<file_name>.txt."""
        robot_url = self.url + "/robots.txt"
        req = requests.get(robot_url, headers=header, timeout=self.REQUEST_TIMEOUT)
        save_file(req.text, "robots", (self.file_name + ".txt"))

    def classifier_check(self, link):
        """Return the classifier verdict for the first element of ``link``.

        Args:
            link: sequence of links passed to ``loaded_model.predict``
                (``get_links`` passes a one-element list).
        """
        result = loaded_model.predict(link).astype(bool)
        return result[0]

    def save_page(self, identifier, link):
        """Download ``link`` and save it as pages/<site>/page_<identifier>.html.

        Sleeps a random 1-5 seconds before the request.

        Returns:
            True when the page was fetched with status 200 and saved, False otherwise.
        """
        page_name = "page_{}.html".format(identifier)
        pages_directory = "pages/" + self._site_name()

        time.sleep(random.randint(1, 5))
        try:
            req = requests.get(link, headers=header, timeout=self.REQUEST_TIMEOUT)
            if req.status_code != 200:
                logging.info("SAVE_PAGE:status_code: {} - {}".format(req.status_code, link))
                return False
            save_file(req.text, pages_directory, page_name)
            return True
        except Exception as exc:
            #logs the exception itself: no response object exists when the request failed
            logging.info("EXCEPT_SAVE_PAGE: {} - {}".format(exc, link))
            return False

    #getters and setters methods
    def get_url(self):
        return self.url
    def set_url(self, url):
        self.url = url
    def get_links_list(self):
        return self.links_list
    def set_links_list(self, links):
        self.links_list = links

## Auxiliary functions

In [10]:
#Saves the links from each site
def save_links(crawler):
    """Run the full link-collection pipeline for one site.

    Saves the site's robots.txt, crawls its links and writes them to CSV.

    Args:
        crawler: the crawler object to run. A URL string is also accepted,
            in which case a ``classifier_crawler`` is built for it.
    """
    #uses the argument instead of the global `url`, which another thread's loop may already have changed
    if isinstance(crawler, str):
        crawler = classifier_crawler(crawler)
    crawler.save_robot()
    crawler.get_links()
    crawler.save_as_csv()
    
def classifier_check(link):
    """Return the classifier verdict for the first element of ``link``.

    ``link`` is handed to ``loaded_model.predict`` unchanged; the predictions
    are cast to bool and the first one is returned.
    """
    predictions = loaded_model.predict(link)
    return predictions.astype(bool)[0]

## Save the links and pages from all sites in parallel

In [6]:
#starts one crawler thread per site
threads = []
for url in urls:
    crawler = classifier_crawler(url)
    thread = threading.Thread(target=save_links, args=(crawler,))
    threads.append(thread)
    thread.start()

#waits for every crawler, so the cell only finishes once all threads are done
for thread in threads:
    thread.join()

Starts crawler on: https://www.magazineluiza.com.br


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts crawler on: https://www.colombo.com.br


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts crawler on: https://www.taqi.com.br


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts crawler on: https://www.havan.com.br


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

Starts crawler on: https://www.ibyte.com.br


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))

## Save the links and pages from one site

In [None]:
#crawls a single site; the crawler built here is the one handed to save_links
url = urls[1]
crawler = classifier_crawler(url)
save_links(crawler)