Web Crawler Libraries:

HTML parser - BeautifulSoup

HTTP requests - Python requests lib

In [0]:
try: 
    from BeautifulSoup import BeautifulSoup # html parser
except ImportError:
    from bs4 import BeautifulSoup
import requests #http request handling library
import pandas as pd #data structure for intermediate storage of links and output of CSV
import numpy as np
import re 
import math
import urllib
import urllib.robotparser as urobot #robot.text
from urllib.parse import urljoin
from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urlsplit
from urllib.parse import urldefrag
from tabulate import tabulate
from tqdm import tqdm #progress bar creation

# **Utility Functions**

HTTP request functions

In [0]:
def get_html(url):
  try:
    r = requests.get(url) #add verify=False to handle the SSL Error
    if r.status_code == requests.codes.ok and 'text/html' in r.headers['content-type']:
      return r.text
    else:
      return False
  except Exception as e:
    return False

**HTML parsing functions**

In [0]:
def parse_html(html_doc,base_url) :
  #first create a BeautifulSoup object
  soup = BeautifulSoup(html_doc, 'html.parser')
  links = set()

  #parse all links
  for link in soup.find_all('a',href=True):
    link = urlsplit(link.get('href'))
    url=link.geturl().rstrip('/') #get the href and remove slash (/) if '/'is at the end of the link
    if (url !='/') and (url)  and ('@' not in url) :
      if ('http:'not in link.scheme) and ('https:' not in link.scheme):
        if url.startswith('//'):        #handle the link that starts with '//'  
          url=link._replace(scheme='https').geturl()
        elif url not in base_url : # check if the url and href has the same top level domain(TLD)
          url=urljoin(base_url,url)
      if '#' in url:
        url = urldefrag(url)[0]
      if 'index.shtml' in url or 'index.html' in url:
        url = url.split('/index.')[0] 
      links.add(url)  
      
  #urls to add to queue, list of urls
  return links 

**Add row and column to the outlink table**

In [0]:
def updates_outlink_table(outlink,domain,codomain):
 
  if codomain > len(outlink)-1:
    outlink.append([[0]*(codomain),0])
    
    for row in range (len(outlink)):
      outlink[row][0].append(0)
      if row == codomain:
        outlink[codomain][0][domain]=1
        outlink[domain][1]+=1       
      row+=1
  elif outlink[codomain][0][domain] != 1:
    outlink[domain][1]+=1
    outlink[codomain][0][domain]=1
#   print (*outlink,sep='\n')   

**Proportional division of outlinks**

In [0]:
def divide_outlinks(outlinks): 
  for domain in range(len(outlinks)):
    for codomain in range(len(outlinks)):
      if outlinks[domain][1] != 0:
        outlinks[codomain][0][domain]=outlinks[codomain][0][domain]/outlinks[domain][1]
#   print (*outlinks,sep='\n') 

In [0]:
# #test updates_outlink_table and divide outlinks

# list1=[[[0],0]]
# updates_outlink_table(list1,0,1)
# updates_outlink_table(list1,1,0)
# updates_outlink_table(list1,1,2)
# updates_outlink_table(list1,1,3)
# updates_outlink_table(list1,2,0)
# updates_outlink_table(list1,2,1)
# updates_outlink_table(list1,2,3)
# updates_outlink_table(list1,3,0)
# updates_outlink_table(list1,3,2)
# divide_outlinks(list1)
# print(list1)

# #Example 2 from lecture
# list= [[[1],1]]
# updates_outlink_table(list,0,1)
# print('\n'+str(list))
# updates_outlink_table(list,1,0)
# print(list)
# updates_outlink_table(list,1,2)
# print(list)
# updates_outlink_table(list,2,1)
# divide_outlinks(list)
# print('\n')
# print(list)

**Check if a link is in a list**

In [0]:
def has_link(list, link,pos):
  for page in list:
    if link in page[0]:
      page[1].append(pos)
      return True
  return False  


In [0]:
def has_same_base_url(domain, url):
  return domain in url

**Calcuate PageRank**

In [0]:
def calculate_page_rank(page_rank,outlinks,_lambda):
  for row in range (len(page_rank)):
    page_rank[row][1]=1/len(page_rank)
  
  temp= page_rank
  i=0
  
  while True:
    page_rank = temp
    temp= [page[:] for page in page_rank]
    
    for row in range (len(temp)):
      col=0
      result=0
      for page in page_rank:
        result=result+page[1]*outlinks[row][0][col]
        col+=1
      temp[row][1]=_lambda/len(page_rank)+ (1-_lambda)*result
      row+=1
    if temp == page_rank:
      break;
#     print('\nSum All Page Ranks: ',sum(row[1] for row in temp))  
    i+=1

  print('Number of iteration: '+str(i)+'\n') 
  return page_rank

In [0]:
# #test calculate_page_rank

# #example 2
# list1=[[[0,0.5,1/3,.5],2],[[1,0,1/3,0],2],[[0,0,0,.5],3],[[0,0.5,1/3,0],2]]
# print(*list1, sep='\n')

# page={'A':0,'B':0,'C':0,'D':0}
# page=calculate_page_rank(page,list1,.15)
# # page_rank=calculate_page_rank(page_rank,0.15,outlinks)
# print (page)

# **Main Driver**

In [0]:
#constants
MAX_PAGES = 2500
SEED = 'https://www.cpp.edu'

queue = [(SEED,[0])]
i_pages = 0
visited_links = {} #serves as a history of visited links and creation of url
outlinks= [[[0],0]]
page_rank= []

with tqdm(total=MAX_PAGES) as pbar:
  while len(queue) != 0 and i_pages<MAX_PAGES:
    url,positions= queue.pop(0)
    html = get_html(url)
 
    if html != False:
      result = parse_html(html,url) #returns (links) 
      page_rank.append([url,0]) 
      pbar.update(1)
#       print('url: ',url)
      if i_pages > 0:
        for position in positions:
          updates_outlink_table(outlinks, position,i_pages)
      visited_links[url]=i_pages

      #check if the outlink has already been explored, if not add to queue and dataframe.
      for link in result:
        if has_same_base_url(SEED,link):
          if link not in visited_links:
              #print(link)
              if not has_link(queue, link,i_pages): 
                queue.append((link,[i_pages]))
          else:
            updates_outlink_table(outlinks, i_pages,visited_links[link]) 
#             print(url, i_pages,"-",link,visited_links[link])
#             print(*outlinks, sep='\n')
      #print(queue)
      i_pages += 1
divide_outlinks(outlinks)
#print(*outlinks, sep='\n')

100%|██████████| 2500/2500 [19:43<00:00,  1.11it/s]


#** Print PageRank **



In [0]:

page_rank=calculate_page_rank(page_rank,outlinks,_lambda=0.20)
sorted_index = sorted(page_rank, key=lambda x: x[1], reverse=True) #create a list that is the page_rank sorted by value

num_rows=100
print('-'*140)
print ('{:^5} {:^80} {:^18}'.format('Rank','Link','PageRank'))
print('-'*140)
for row in range(num_rows):
  print ('{:^5}| {:<80}| {:03.20f}'.format((row+1),sorted_index[row][0],sorted_index[row][1],'|'))
  
print('\nSum All Page Ranks: ',sum(row[1] for row in page_rank))  

--------------------------------------------------------------------------------------------------------------------------------------------
Rank                                        Link                                            PageRank     
--------------------------------------------------------------------------------------------------------------------------------------------
  1  | https://www.cpp.edu                                                             | 0.04525294055587124331
  2  | https://www.cpp.edu/~aboutcpp                                                   | 0.04525294055587124331
  3  | https://www.cpp.edu/file-viewers.shtml                                          | 0.04521133778471275733
  4  | https://www.cpp.edu/~library                                                    | 0.02367918239143323653
  5  | https://www.cpp.edu/accessibility.shtml                                         | 0.02236785926170458197
  6  | https://www.cpp.edu/website-feedback.php     

In [0]:
link_df=pd.DataFrame(sorted_index, columns=['Link', 'Position'])
link_df.to_csv("report.csv") 