In [None]:
!pip3 install beautifulsoup4
!pip3 install lxml
!pip3 install html5lib
!pip3 install grequests
!pip3 install gevent

In [1]:
#csv function definitions:

import csv


def saveSetToFile(my_set, csv_file_name):
    for url in my_set:
        appendRowToCsvFile(url, csv_file_name)
        
        
def appendRowToCsvFile(csvRow, csv_file_name):
    with open(csv_file_name, 'a') as f:
        writer = csv.writer(f)
        writer.writerow([csvRow])

        
def loadCsvToSet(my_set, csv_file_name):
    mycsv = csv.reader(open(csv_file_name))
    for row in mycsv:
        url = row[0]
        my_set.add(url)

        
#saveSetToFile(crawled_links, csv_file_name)



In [2]:
#url helper functions
from bs4 import BeautifulSoup
import requests
import urllib.robotparser as robotparser
from urllib.parse import urlparse
from urllib.parse import urljoin


def does_url_exist(url):
    try: 
        r = requests.head(url)
        if r.status_code < 400:
            return True
        else:
            return False
    except requests.exceptions.RequestException as e:
        #print(e)
        return False

    
def getUrlBase(url):
    url_parsed = urlparse(url)
    return (url_parsed.scheme + '://' + url_parsed.netloc)
    
        
def hasRobotText(url):
    return does_url_exist(urljoin(getUrlBase(url), 'robots.txt'))
                          

def canFetchUrl(url):
    rp = robotparser.RobotFileParser()
    can_fetch = False
    robotUrl = urljoin(getUrlBase(url), 'robots.txt')          
    if hasRobotText(robotUrl):
        try:
            rp.set_url(robotUrl) #robotparser get address
            rp.read() #robotparser read  
            can_fetch = rp.can_fetch("*",url) # true if we can fetch the url according to robot.txt
        except BaseException as e:
            print("An exception ocurred : %s" % str(e)) 
            #print(url)
            #print("")
            #print(". . .") 
    else: can_fetch = True #no robot means everything allowed
    return can_fetch


def getLinksFromPage(url):
    page_links = [] #links in this page
        
    resp = requests.get(url)
    html = BeautifulSoup(resp.text, 'html.parser') #get html
    for a in html.find_all('a'):
        link = a.get('href')
        if isinstance(link, str) and link.startswith('http'):
            page_links.append(link.lower()) #extract href url
    return page_links
    

        

In [3]:
import time
#import grequests


#init:
initial_url_address = 'https://en.wikipedia.org/wiki/Web_intelligence' 
current_batch = [initial_url_address] #initialize crawl batch
crawled_links = {initial_url_address} #hold the overall url links repo
max_crawl_depth = 5 #how deep is the breadth-first approach
max_crawl_pages = 100 #limit of web pages to crawl

csv_file_name = "crawlResults_" + str(round(time.time())) + ".csv" #csv unique name

print("starting the crawler...")
print("saving results on %s" %csv_file_name)
        

#TODO: 
# 1. Cache robots.txt, otherwise it is constatntly requesting!
# 2. Make asyncronous
  
    
crawled_pages = 0
    
# main loop
for i in range(max_crawl_depth):
    print("")
    print(". . .")
    print("step: %s" % (i+1))
    print("pages to crawl in this step: %s" % len(current_batch))
    print("crawled unique links so far: %s" % len(crawled_links))
    print(". . .")
    print("")

    next_batch = []
    
    for url in current_batch:
        page_links = getLinksFromPage(url)
        crawled_pages +=1
        
        if (crawled_pages > max_crawl_pages):
            current_batch = []
            break
        else:
            print("")
            print("fetching page %s" % crawled_pages + ", it might take a while : %s" % url)
            new_links = 0
            for link in page_links:
                if link not in crawled_links:
                    if canFetchUrl(link):
                        new_links +=1
                        #Here we only save the allowed links for the next batch
                        crawled_links.add(link)
                        appendRowToCsvFile(link, csv_file_name)
                        next_batch.append(link) #since it was a new link, we will crawl it in the next batch                    
                    #else: print("not allowed to fetch : %s" % link)
            print("allowed new links found in page: %s" % new_links)
        current_batch = []
        current_batch = next_batch
    

    
print("")
print(". . .")
print("FINISH!")
print("unique links size: %s" % len(crawled_links))

starting the crawler...
saving results on crawlResults_1549563162.csv

. . .
step: 1
pages to crawl in this step: 1
crawled unique links so far: 1
. . .


fetching page 1, it might take a while : https://en.wikipedia.org/wiki/Web_intelligence
allowed new links found in page: 15

. . .
step: 2
pages to crawl in this step: 15
crawled unique links so far: 16
. . .


fetching page 2, it might take a while : http://ieeexplore.ieee.org/xpl/freeabs_all.jsp?arnumber=884768
allowed new links found in page: 15

fetching page 3, it might take a while : http://www.iospress.nl/journal/web-intelligence
allowed new links found in page: 38

. . .
step: 3
pages to crawl in this step: 0
crawled unique links so far: 69
. . .


. . .
step: 4
pages to crawl in this step: 0
crawled unique links so far: 69
. . .


. . .
step: 5
pages to crawl in this step: 0
crawled unique links so far: 69
. . .


. . .
FINISH!
unique links size: 69


In [4]:
# Optional, for loading the results CSV:
test_set={initial_url_address}
loadCsvToSet(test_set, csv_file_name)

print(len(test_set))


69


In [5]:
## test robots.txt
from urllib.parse import urlparse

test1 = 'https://www.buzzfeed.com/buzzfeed/api/' #not allowed by robot.txt
test2 = 'https://github.com/lluissuros?tab=overview&from=2019-02-01&to=2019-02-07' # not allowed?
test3 = 'https://slate.com/culture/sports' #allowed
test4 = 'http://www.capcom.co.jp/newproducts/consumer/value/' #no robots.txt, therefore allowed



print(hasRobotText(test1))
print(hasRobotText(test2))
print(hasRobotText(test3))
print(hasRobotText(test4))

print("..")

print(canFetchUrl(test1))
print(canFetchUrl(test2))
print(canFetchUrl(test3))
print(canFetchUrl(test4))




True
True
True
False
..
False
False
True
True


In [7]:
len(crawled_links)

1383

In [9]:
len(csv_file_name)

30

In [12]:
'https://help.imdb.com/imdb?ref_=cons_ftr_imdb' not in crawled_links

False