In [16]:
#import required libraries
import requests as rq
import pandas as pd
from bs4 import BeautifulSoup as bs
import json as js

In [17]:
#DEFINE ALL CONSTANTS
#Shared configuration used by every function defined in the cells below.

#base URL of the crawled site; prepended to site-relative hrefs
main_url = 'https://simple.wikipedia.org'

#first page of the crawl
init_url = main_url + '/wiki/Climate_change'

#processing keys
#keys of the {code, data} response dictionary returned by the helper functions
key_data = 'data'
key_code = 'code'
#HTML tag names passed to BeautifulSoup lookups
key_a = 'a'
key_div = 'div'
key_script = 'script'
key_footer = 'footer'
#keys of the link-category dictionary built by identifyLinks
key_internal = 'internal'
key_external = 'external'
key_local = 'local'
key_extra = 'extra'
#keys of the per-page report (they become the dataframe / CSV columns)
key_page_cnt = 'Pagecount'
key_int_cnt = 'INTcount'
key_ext_cnt = 'EXTcount'
key_url_cnt = 'URLfragments'
key_tmstp = 'timestamp'

#HTML attributes
#attribute filters used to locate each page section
#main content container
attr_cnt = {'class':'mw-body', 'id':'content', 'role':'main'}
#page footer
attr_ftr = {'class':'mw-footer', 'role':'contentinfo'}
#side navigation panel
attr_nav = {'id':'mw-panel'}
#top navigation header
attr_nhead = {'id':'mw-head'}
#JSON-LD script block from which the timestamp is read
attr_scr = {'type':'application/ld+json'}

#message stored in a response when an unexpected exception was caught
run_err_msg = 'Runtime Processing Error'

#response codes
#code_success doubles as the expected HTTP status code,
#code_err marks a caught runtime error,
#code_missing marks data that was looked for but not found
code_success = 200
code_err = 999
code_missing = 404

#HTML section names (used only in the printed failure messages)
main_body = 'Main content'
side_nav = 'Side navigation'
top_nav = 'Top navigation'
footer = 'footer'

In [18]:
#helper that reports an unexpected runtime error on standard output
def printRuntimeError(err):
    """Print the given error together with its type.

    Args:
        err: the caught exception (any object is accepted).

    Returns:
        None. The message is written to standard output only.
    """
    #the '=' specifier renders the expression text followed by its repr
    message = f'Unexpected {err=}, {type(err)=}'
    print(message)

In [19]:
#defining function for getting HTML content of provided URL
def getHTMLPage(web_url, timeout=10):
    """Download a page and parse it into a BeautifulSoup object.

    Args:
        web_url: absolute URL of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        A response dictionary:
        * code_success and the parsed page when the server answered 200,
        * the HTTP status code and a failure message for any other status,
        * code_err and run_err_msg when the request or parsing raised.
    """
    #wrapping code in try except block for runtime error handling
    try:

        #retrieving HTML content of given URL using requests
        result = rq.get(web_url, timeout=timeout)

        #any status other than success is reported with its own code
        if result.status_code != code_success:
            return {key_code: result.status_code, key_data: 'Unable to process given URL'}

        #converting HTML text into a BeautifulSoup object
        html_content = bs(result.text, 'html.parser')
        return {key_code: code_success, key_data: html_content}

    #Exception (not BaseException) so that KeyboardInterrupt / SystemExit
    #still stop the crawl instead of being reported as a failed page
    except Exception as err:
        #print runtime error
        printRuntimeError(err)

        #runtime error response
        return {key_code: code_err, key_data: run_err_msg}

In [20]:
#defining function for retrieving specified section from given HTML content
def getHTMLSection(bs_object, tag_name, tag_attrs):
    """Look up the first element matching a tag name and attribute filter.

    Args:
        bs_object: parsed HTML object exposing ``find`` (a BeautifulSoup page).
        tag_name: name of the tag to search for, e.g. 'div'.
        tag_attrs: dictionary of attributes the tag has to carry.

    Returns:
        A response dictionary: code_success with whatever ``find`` returned
        (callers must expect that value to be empty when nothing matched), or
        code_err with run_err_msg when the lookup raised.
    """
    #wrapping code in try except block for runtime error handling
    try:

        #finding HTML section using tag and attributes
        section = bs_object.find(tag_name, attrs=tag_attrs)

    #Exception (not BaseException) so that KeyboardInterrupt / SystemExit
    #are not swallowed and turned into an error response
    except Exception as err:
        #print runtime error
        printRuntimeError(err)

        #runtime error response
        return {key_code: code_err, key_data: run_err_msg}

    #success response
    return {key_code: code_success, key_data: section}

In [21]:
#defining function for retrieving anchor tags from given HTML content
def getAnchorTags(bs_object, tag_name):
    """Collect every ``tag_name`` element that carries an href attribute.

    Args:
        bs_object: parsed HTML object exposing ``find_all``.
        tag_name: name of the tag to collect (the callers pass 'a').

    Returns:
        A response dictionary: code_success with the result of ``find_all``,
        or code_err with run_err_msg when the lookup raised (for example
        because ``bs_object`` is None).
    """
    #wrapping code in try except block for runtime error handling
    try:

        #href=True keeps only the tags that actually have an href attribute
        tags = bs_object.find_all(tag_name, href=True)

    #Exception (not BaseException) so that KeyboardInterrupt / SystemExit
    #are not swallowed and turned into an error response
    except Exception as err:
        #print runtime error
        printRuntimeError(err)

        #runtime error response
        return {key_code: code_err, key_data: run_err_msg}

    #success response
    return {key_code: code_success, key_data: tags}

In [30]:
#defining function for extracting timestamp from given content
def getTimeStamp(scr_data):
    """Read the 'dateModified' value from a JSON script element.

    Args:
        scr_data: element whose ``string`` attribute holds JSON text, or None
            when the page had no such element.

    Returns:
        A response dictionary:
        * code_success with the 'dateModified' value when it is present,
        * code_missing with None when there is no element, the JSON is not an
          object, or the key is absent,
        * code_err with run_err_msg when reading or decoding raised.
    """
    #setting initial response
    response = {key_code: code_missing, key_data: None}

    #key of the last page modification inside the JSON data
    date_key = 'dateModified'

    #wrapping code in try except block for runtime error handling
    try:

        #identity check: None is the only value meaning "no script element"
        if scr_data is not None:

            #converting plain text into JSON using the json library
            json_data = js.loads(scr_data.string)

            #only a JSON object can be indexed by key; arrays/strings are skipped
            if isinstance(json_data, dict) and date_key in json_data:

                #setting response
                response = {key_code: code_success, key_data: json_data[date_key]}

    #Exception (not BaseException) so that KeyboardInterrupt / SystemExit
    #are not swallowed and turned into an error response
    except Exception as err:
        #print runtime error
        printRuntimeError(err)

        #setting runtime error message
        response = {key_code: code_err, key_data: run_err_msg}

    #returning response
    return response

In [31]:
#defining function for categorising URLs into specified category
def identifyLinks(data, cur_url):
    """Sort anchor tags into local, internal, external and extra links.

    Categories, checked in this order:
    * extra    - href equal to ``cur_url`` (the page being crawled); stored as
                 a full URL so the page is not queued for crawling again,
    * internal - href starting with '/wiki'; stored as a full URL,
    * local    - href starting with '#'; stored unchanged,
    * external - href starting with 'http' (this also covers 'https') or
                 with '//'; stored unchanged,
    * extra    - any other href starting with '/'; stored as a full URL.
    Hrefs matching none of the rules are ignored.

    Args:
        data: iterable of anchor tags that can be indexed with 'href'.
        cur_url: site-relative URL of the page the anchors were taken from.

    Returns:
        A response dictionary: code_success with the category dictionary, or
        code_err with run_err_msg when processing an anchor raised.
    """
    #category buckets, filled in document order
    links = {key_local: [], key_internal: [], key_external: [], key_extra: []}

    #wrapping code in try except block for runtime error handling
    try:

        #iterating over all the given anchor tags
        for a in data:

            #extracting HREF from anchor tag
            href = a['href']

            #link back to the crawled page itself
            if href == cur_url:
                links[key_extra].append(f'{main_url}{href}')

            #internal article link
            elif href.startswith('/wiki'):
                links[key_internal].append(f'{main_url}{href}')

            #fragment link inside the same page
            elif href.startswith('#'):
                links[key_local].append(f'{href}')

            #absolute or protocol-relative link
            elif href.startswith(('http', '//')):
                links[key_external].append(f'{href}')

            #same-domain link that is not an article; kept in EXTRA
            elif href.startswith('/'):
                links[key_extra].append(f'{main_url}{href}')

    #Exception (not BaseException) so that KeyboardInterrupt / SystemExit
    #are not swallowed and turned into an error response
    except Exception as err:
        #print runtime error
        printRuntimeError(err)

        #runtime error response
        return {key_code: code_err, key_data: run_err_msg}

    #returning response
    return {key_code: code_success, key_data: links}

In [32]:
#helper that turns a section lookup response into an anchor tag response
def getAnchorFromSection(sec_dict, sec_name, url):
    """Extract the anchor tags of a section when its lookup succeeded.

    Args:
        sec_dict: response dictionary of the section lookup.
        sec_name: section name, used in the failure message only.
        url: URL of the page, used in the failure message only.

    Returns:
        The response of getAnchorTags for the section, or None (after
        printing a message) when the section lookup had not succeeded.
    """
    #guard clause: nothing to extract from a failed lookup
    if sec_dict[key_code] != code_success:
        print(f'{sec_name} section extraction failed for URL: {url}')
        return None

    #collecting the anchor tags of the section
    return getAnchorTags(sec_dict[key_data], key_a)

In [33]:
#defining function for getting links data from given data dictionary
def getIdentifiedLinks(link_dict, cur_url, sec_name, url):
    """Categorise the anchor tags of a section when they were extracted.

    Args:
        link_dict: response dictionary holding the anchor tags, or None when
            the anchor extraction step produced nothing.
        cur_url: site-relative URL of the page being crawled.
        sec_name: section name, used in the failure message only.
        url: URL of the page, used in the failure message only.

    Returns:
        The response of identifyLinks, or None (after printing a message)
        when ``link_dict`` is None or does not carry a success code.
    """
    #setting initial response
    response = None

    #link_dict is None when the previous step failed; indexing it would raise
    #a TypeError and abort the whole crawl, so it is checked first
    if link_dict is not None and link_dict[key_code] == code_success:

        #identifying links and setting response
        response = identifyLinks(link_dict[key_data], cur_url)
    else:

        #printing message for failed response
        print(f'{sec_name} link extraction failed for URL: {url}')

    #returning response
    return response

In [34]:
#helper that folds the links of one section into the page report
def generateReport(report, url_list, lnk_dict, sec_name, url):
    """Add the link counts of one section to ``report`` and queue its pages.

    Both ``report`` and ``url_list`` are modified in place.

    Args:
        report: page report whose link counters are increased.
        url_list: list of URLs to crawl; the internal links are appended.
        lnk_dict: response dictionary holding the categorised links, or None.
        sec_name: section name, used in the failure message only.
        url: URL of the page, used in the failure message only.

    Returns:
        None.
    """
    #guard clause: nothing to count without a successful link response
    if lnk_dict is None or lnk_dict[key_code] != code_success:
        print(f'{sec_name} link identification failed for URL: {url}')
        return

    #categorised links of the section
    links = lnk_dict[key_data]
    internal_links = links[key_internal]

    #internal pages are queued for crawling
    url_list.extend(internal_links)

    #INTERNAL count covers the queued pages and the EXTRA same-site links
    report[key_int_cnt] += len(internal_links) + len(links[key_extra])

    #EXTERNAL count
    report[key_ext_cnt] += len(links[key_external])

    #LOCAL (fragment) count
    report[key_url_cnt] += len(links[key_local])

In [35]:
#defining function for web page crawling process
def processCrawling(start_url, max_pages=200):
    """Crawl pages starting at ``start_url`` and build one report per page.

    URLs are taken from a first-in-first-out list. Every successfully
    downloaded page contributes one report and appends its internal links to
    the end of that list. The crawl stops after ``max_pages`` reports or when
    no URL is left, whichever comes first.

    Args:
        start_url: absolute URL of the first page.
        max_pages: maximum number of pages to report on (default 200).

    Returns:
        A response dictionary: code_success with the list of page reports
        (keys: page number, internal / external / fragment link counts and
        timestamp), or code_err with run_err_msg when an unexpected error
        occurred.
    """
    #setting initial response
    response = {key_code: code_success, key_data: []}

    #(tag, attributes, name) of every section whose links are counted, in the
    #order in which their internal links are queued
    link_sections = [
        (key_div, attr_cnt, main_body),
        (key_footer, attr_ftr, footer),
        (key_div, attr_nav, side_nav),
        (key_div, attr_nhead, top_nav),
    ]
    section_names = [name for _, _, name in link_sections]

    #wrapping code in try except block for runtime error handling
    try:
        #number of the page currently being reported (1-based)
        total_crawl = 1

        #position of the next URL to read from the list
        li_index = 0

        #URLs waiting to be crawled, in discovery order
        url_list = [start_url]

        #set gives constant time lookups of already processed URLs
        visited_urls = set()

        #stop at the page limit OR when the list is exhausted; without the
        #second check an IndexError would discard every collected report
        while total_crawl <= max_pages and li_index < len(url_list):

            #accessing URL from list and moving to the next position
            url = url_list[li_index]
            li_index += 1

            #skipping URLs that were already processed
            if url in visited_urls:
                continue
            visited_urls.add(url)

            #printing message for current processing URL
            print(f'Processing Index: {total_crawl}, URL is: {url}')

            #retrieving HTML page of URL; failed pages produce no report
            html_page = getHTMLPage(url)
            if html_page[key_code] != code_success:
                continue
            data = html_page[key_data]

            #accessing the link sections and the SCRIPT section (timestamp)
            sections = [getHTMLSection(data, tag, attrs)
                        for tag, attrs, _ in link_sections]
            scr_sec = getHTMLSection(data, key_script, attr_scr)

            #timestamp stays None unless it could be extracted
            last_modified = None
            if scr_sec[key_code] == code_success:
                ts_result = getTimeStamp(scr_sec[key_data])
                if ts_result[key_code] == code_success:
                    last_modified = ts_result[key_data]

            #accessing the anchor tags of every section
            anchor_results = [getAnchorFromSection(sec, name, url)
                              for sec, name in zip(sections, section_names)]

            #separating current page URI from FULL URL
            cur_url = url.replace(main_url, '')

            #categorising the links; a missing anchor result is passed on as
            #None so that generateReport reports the failure
            link_results = [
                getIdentifiedLinks(anchors, cur_url, name, url)
                if anchors is not None else None
                for anchors, name in zip(anchor_results, section_names)
            ]

            #defining page report data dictionary
            page_report = {key_page_cnt: total_crawl, key_int_cnt: 0,
                           key_ext_cnt: 0, key_url_cnt: 0, key_tmstp: last_modified}

            #counting links per section and queueing INTERNAL links
            for links, name in zip(link_results, section_names):
                generateReport(page_report, url_list, links, name, url)

            #appending page report to response
            response[key_data].append(page_report)

            #incrementing page count
            total_crawl += 1

    #Exception (not BaseException) so that KeyboardInterrupt / SystemExit
    #stop the crawl instead of being reported as a processing error
    except Exception as err:
        #print runtime error
        printRuntimeError(err)

        #setting runtime error message
        response = {key_code: code_err, key_data: run_err_msg}

    #returning response
    return response

In [36]:
#defining function for converting dictionary into dataframe and writing csv file
def dictToDataframe(data, file_path='data.csv'):
    """Build a dataframe from ``data`` and write it to a CSV file.

    Args:
        data: records accepted by ``pd.DataFrame`` (the crawler passes a list
            of page report dictionaries).
        file_path: destination of the CSV file (default 'data.csv').

    Returns:
        A response dictionary: code_success with a completion message, or
        code_err with run_err_msg when building or writing the dataframe
        raised.
    """
    #setting initial response
    response = {key_code: code_success, key_data: 'Process completed successfully...'}

    #wrapping code in try except block for runtime error handling
    try:

        #generating dataframe from given data
        df = pd.DataFrame(data)

        #writing dataframe to csv without the index column
        df.to_csv(file_path, index=False)

    #Exception (not BaseException) so that KeyboardInterrupt / SystemExit
    #are not swallowed and turned into an error response
    except Exception as err:
        #print runtime error
        printRuntimeError(err)

        #setting runtime error message
        response = {key_code: code_err, key_data: run_err_msg}

    #returning response
    return response

In [37]:
#entry point: crawl from the INITIAL url and store the reports as CSV
if __name__ == '__main__':

    #start crawling from INITIAL url
    data_dict = processCrawling(init_url)

    #a failed crawl is reported and nothing is written
    if data_dict[key_code] != code_success:
        print(f'Unable to generate dataframe due to {data_dict[key_data]}')

    #otherwise the reports are written and the final message is printed
    else:
        print(dictToDataframe(data_dict[key_data])[key_data])

Processing Index: 1, URL is: https://simple.wikipedia.org/wiki/Climate_change
Processing Index: 2, URL is: https://simple.wikipedia.org/wiki/Global_warming
Processing Index: 3, URL is: https://simple.wikipedia.org/wiki/Climate
Processing Index: 4, URL is: https://simple.wikipedia.org/wiki/Earth
Processing Index: 5, URL is: https://simple.wikipedia.org/wiki/Ice_age
Processing Index: 6, URL is: https://simple.wikipedia.org/wiki/Earth%27s_orbit
Processing Index: 7, URL is: https://simple.wikipedia.org/wiki/Sun
Processing Index: 8, URL is: https://simple.wikipedia.org/wiki/Greenhouse_gas
Processing Index: 9, URL is: https://simple.wikipedia.org/wiki/Joseph_Fourier
Processing Index: 10, URL is: https://simple.wikipedia.org/wiki/John_Tyndall
Processing Index: 11, URL is: https://simple.wikipedia.org/wiki/Svante_Arrhenius
Processing Index: 12, URL is: https://simple.wikipedia.org/wiki/Nils_Gustaf_Ekholm
Processing Index: 13, URL is: https://simple.wikipedia.org/wiki/Sunspot
Processing Index: 

Processing Index: 103, URL is: https://simple.wikipedia.org/wiki/Wind_power
Processing Index: 104, URL is: https://simple.wikipedia.org/wiki/Nanotechnology
Processing Index: 105, URL is: https://simple.wikipedia.org/wiki/Ethanol
Processing Index: 106, URL is: https://simple.wikipedia.org/wiki/File:Joseph_Fourier.jpg
Processing Index: 107, URL is: https://simple.wikipedia.org/wiki/File:Arrhenius2.jpg
Processing Index: 108, URL is: https://simple.wikipedia.org/wiki/Celsius
Processing Index: 109, URL is: https://simple.wikipedia.org/wiki/Wikipedia:Citing_sources
Processing Index: 110, URL is: https://simple.wikipedia.org/wiki/Antarctica
Processing Index: 111, URL is: https://simple.wikipedia.org/wiki/Science_(journal)
Processing Index: 112, URL is: https://simple.wikipedia.org/wiki/Bangladesh
Processing Index: 113, URL is: https://simple.wikipedia.org/wiki/Florida
Processing Index: 114, URL is: https://simple.wikipedia.org/wiki/Netherland
Processing Index: 115, URL is: https://simple.wiki