In [None]:
import os
import re
from urllib.parse import urljoin

import justext
import nltk
import urllib3
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize

# Implementing a web crawler to get corpora data

In [None]:
# Shared HTTP client used by every Crawler instance in this notebook.
# NOTE(review): cert_reqs='CERT_NONE' turns off TLS certificate verification and
# the InsecureRequestWarning that urllib3 would emit for it is silenced below --
# confirm this is still required for the sites being crawled.
user_agent = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)',
}
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

http = urllib3.PoolManager(
    num_pools=10,
    headers=user_agent,
    cert_reqs='CERT_NONE',
)


class Crawler:
    """Breadth-first crawler that stores the main text of each page as a corpus file.

    Starting from ``seed_url``, every fetched page is cleaned with jusText
    (Portuguese stoplist) and, when at least one non-boilerplate paragraph
    remains, written to ``<corpus_path>/<number>_<url>.txt``. Links whose
    ``href`` matches ``url_pattern`` are queued and visited in the order in
    which they were discovered.

    Requests go through the module-level ``http`` pool manager.

    Args:
        corpus_path: Directory that receives the text files (created if missing).
        max_files: Highest file number to produce; the crawl loop stops once
            ``corpus_number`` exceeds it.
        seed_url: First page to fetch.
        url_pattern: Regular expression an ``href`` must match (searched, not
            anchored) for the link to be followed.
        corpus_start_number: Number used for the first saved file.
    """

    def __init__(self, corpus_path, max_files, seed_url, url_pattern, corpus_start_number=1):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        # Used as an insertion-ordered set: keys are URLs, values are always None.
        self.visited_links = {}
        # FIFO queue of URLs still to fetch.
        self.to_be_visited = []
        self.corpus_number = corpus_start_number

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(self.corpus_path, exist_ok=True)

    def crawl(self):
        """Fetch the seed page, then follow queued links until the file limit is hit.

        The queue and the visited set are cleared when the crawl ends, while
        ``corpus_number`` keeps its value.
        """
        # The seed is fetched directly rather than through the queue, so it has
        # to be recorded here or a link back to it would be crawled again.
        self.visited_links[self.seed_url] = None
        self.add_links(self.get_page(self.seed_url))

        next_link = self.get_next_link()
        while next_link and self.corpus_number <= self.max_files:
            self.add_links(self.get_page(next_link))
            next_link = self.get_next_link()

        self.to_be_visited = []
        self.visited_links = {}

    def get_page(self, url):
        """Download ``url``, save its text content and return the links to follow.

        Args:
            url: Absolute URL of the page to fetch.

        Returns:
            A list of absolute URLs built from the ``<a href>`` values matching
            ``url_pattern``. The list is empty when the request fails, the
            server answers with an error status, or the body is empty.
        """
        print("getting page {}".format(url))
        try:
            response = http.request('GET', url)
        except urllib3.exceptions.HTTPError as error:
            # One unreachable page must not abort the whole crawl.
            print("failed to get page {}: {}".format(url, error))
            return []

        if response.status >= 400 or not response.data:
            # Error pages would pollute the corpus with "not found" text.
            print("skipping page {} (status {})".format(url, response.status))
            return []

        # store text content
        paragraphs = [
            paragraph
            for paragraph in justext.justext(response.data, justext.get_stoplist("Portuguese"))
            if not paragraph.is_boilerplate
        ]

        if paragraphs:  # only pages with at least one real paragraph are saved
            corpus_name = str(self.corpus_number) + '_' + url.replace(".", "_").replace("/", "-")
            corpus_path_name = "{}/{}.txt".format(self.corpus_path, corpus_name)
            with open(corpus_path_name, "w", encoding='utf-8') as output_file:
                for paragraph in paragraphs:
                    output_file.write(paragraph.text.replace('\n', ' ') + ' ')
            self.corpus_number += 1

        # get links
        soup = BeautifulSoup(response.data, 'html.parser')
        anchors = soup.find_all('a', attrs={'href': re.compile(self.url_pattern)})
        # urljoin leaves absolute hrefs untouched and resolves relative ones
        # (including "../" segments) against the page they were found on, so the
        # same target always yields the same URL for the visited check.
        links = [urljoin(url, anchor.get('href')) for anchor in anchors]

        print("Visited links: " + str(self.visited_links))
        return links

    def add_links(self, links):
        """Queue every link that is neither visited nor already waiting.

        Discovery order is preserved, which keeps the crawl order reproducible.

        Args:
            links: Iterable of URLs found on a page.
        """
        queued = set(self.to_be_visited)
        for link in links:
            if link in self.visited_links or link in queued:
                continue
            queued.add(link)
            self.to_be_visited.append(link)

    def get_next_link(self):
        """Pop the oldest queued URL and mark it as visited.

        Returns:
            The next URL to fetch, or ``None`` when the queue is empty.
        """
        if not self.to_be_visited:
            return None
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

# Getting corpus "ciencia_e_tecnologia"

In [None]:
# Science & technology corpus, taken from the "Inovação Tecnológica" site.
tec_path = "../data/corpora/ciencia_e_tecnologia"
tec_seed = "https://www.inovacaotecnologica.com.br/"
# Follows hrefs containing "noticias/". The dots are unescaped, so each "../"
# group matches any two characters followed by a slash.
tec_reg_exp = "(../)*noticias/.*"
crawler_ciencia_tec = Crawler(
    corpus_path=tec_path,
    max_files=500,
    seed_url=tec_seed,
    url_pattern=tec_reg_exp,
)

In [None]:
# Uncomment to run the crawl (fetches pages over the network and writes corpus files to disk).
#crawler_ciencia_tec.crawl()

# Getting corpus "esporte"

In [None]:
# Sports corpus: only absolute links into numeric paths of blogdoesporte.net are followed.
# The pattern is a raw string so that \d reaches the regex engine instead of being
# treated as an (invalid) string escape, and the dots in the host name are escaped
# so they match a literal "." rather than any character.
crawler_esporte = Crawler("../data/corpora/esporte", 500, "http://www.blogdoesporte.net/",
                          r"^http://www\.blogdoesporte\.net/\d+")



In [None]:
# Uncomment to run the crawl (fetches pages over the network and writes corpus files to disk).
#crawler_esporte.crawl()