<a href="https://colab.research.google.com/github/neslhan00/dsai301/blob/main/Web_Crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Neslihan Gül

In [None]:
# WEB CRAWLER PROJECT
# As part of my project, I designed a web crawler to index a website, extract links, and rank pages based on their relevance.
# This tool simulates how a search engine works by crawling through a given website, gathering all the linked pages,
# and ranking them using a modified version of Google's PageRank algorithm.

In [1]:
# Helper Functions:
# I wrote the get_page function to fetch the page content
def get_page(url, timeout=10):
    """Download ``url`` and return its body decoded as UTF-8 text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait on the network before giving up (default 10), so a
        single unresponsive server cannot stall the whole crawl.

    Returns
    -------
    str
        The decoded page content, or "" when the page cannot be fetched or
        is not valid UTF-8 (the crawler treats "" as "nothing to index").
    """
    import http.client
    import urllib.request

    try:
        # The context manager closes the connection even if reading or
        # decoding fails part-way through.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            return response.read().decode("utf-8")
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError/HTTPError and timeouts; ValueError covers
        # malformed URLs and UnicodeDecodeError; HTTPException covers broken
        # HTTP responses. Anything else (e.g. KeyboardInterrupt or a
        # programming error) is deliberately allowed to propagate.
        return ""

# I wrote the get_next_target function to extract links from the page content
def get_next_target(page):
    """Locate the first double-quoted ``<a href="...">`` target in ``page``.

    Parameters
    ----------
    page : str
        HTML text to scan.

    Returns
    -------
    tuple
        ``(url, end_quote)`` where ``end_quote`` is the index of the closing
        quote in ``page``, or ``(None, 0)`` when there is no link or the link
        is malformed (opening or closing quote missing).
    """
    start_link = page.find('<a href=')  # Find the start of the link
    if start_link == -1:
        return None, 0  # No link at all
    start_quote = page.find('"', start_link)
    if start_quote == -1:
        return None, 0  # href value is not double-quoted
    end_quote = page.find('"', start_quote + 1)
    if end_quote == -1:
        return None, 0  # Unterminated href value; slicing would yield garbage
    return page[start_quote + 1:end_quote], end_quote


# Collect every link target found in the page content
def get_all_links(page):
    """Return all non-empty link targets in ``page``, in document order.

    Parameters
    ----------
    page : str
        HTML text to scan.

    Returns
    -------
    list of str
        The href values found by ``get_next_target``. Duplicates are kept;
        empty ``href=""`` values are skipped.
    """
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url is None:
            break  # No further well-formed links
        if url:
            links.append(url)
        # An empty href must not end the scan: keep going past it so later
        # links on the page are still collected.
        page = page[endpos:]
    return links

def union(p, q):
    """Extend list ``p`` in place with the items of ``q`` it does not yet hold.

    Items are appended in the order they appear in ``q``; nothing is returned.
    """
    for candidate in q:
        if candidate in p:
            continue  # already present, keep p free of repeats
        p.append(candidate)

# I wrote the add_toIndex function to add page URLs associated with a keyword to the index
def add_toIndex(index, keyword, url):
    """Record that ``url`` contains ``keyword``.

    ``index`` maps each keyword to a list of URLs. The list is created on
    first use, and ``url`` is appended every time (repeats are kept).
    """
    index.setdefault(keyword, []).append(url)

# I wrote the getclearpage function to clean up the content by removing unnecessary HTML tags
def getclearpage(content):
    """Extract the indexable text (title plus tag-free body) from an HTML page.

    Parameters
    ----------
    content : str
        Raw HTML of the page; may be "" when the download failed.

    Returns
    -------
    str
        The title text, a space, then the body text with every tag replaced
        by a space. Missing ``<title>``/``<body>`` sections contribute "".
        Callers are expected to split the result on whitespace.
    """
    import re  # local import keeps this helper self-contained in the notebook

    flags = re.IGNORECASE | re.DOTALL
    # Allow attributes on the opening tags (e.g. <body class="...">).
    title_match = re.search(r"<title\b[^>]*>(.*?)</title\s*>", content, flags)
    # Tolerate a missing </body> by reading to the end of the document.
    body_match = re.search(r"<body\b[^>]*>(.*?)(?:</body\s*>|\Z)", content, flags)

    title = title_match.group(1) if title_match else ""
    body = body_match.group(1) if body_match else ""

    # Replace each tag with a space so words on either side of a tag do not
    # fuse ("<p>foo</p><p>bar</p>" -> "foo bar", not "foobar"). A single
    # regex pass also terminates on stray '<' or '>' characters in the text.
    body_text = re.sub(r"<[^>]*>", " ", body)

    # Separate title and body so the last title word stays its own keyword.
    return title + " " + body_text

# I wrote the addPageToIndex function to add cleaned page content to the index
def addPageToIndex(index, url, content):
    """Index every whitespace-separated word of a page under ``url``.

    The raw ``content`` is first reduced to text by ``getclearpage``; each
    resulting word is then registered in ``index`` via ``add_toIndex``.
    """
    for token in getclearpage(content).split():
        add_toIndex(index, token, url)


# I wrote the crawlWeb function to start crawling from a seed URL and build the index and graph
def crawlWeb(seed, max_pages=None):
    """Crawl outward from ``seed``, building a keyword index and a link graph.

    Parameters
    ----------
    seed : str
        URL of the first page to fetch.
    max_pages : int or None, optional
        Upper bound on the number of pages to crawl. ``None`` (the default)
        keeps crawling until no unvisited links remain.

    Returns
    -------
    tuple
        ``(index, graph)`` where ``index`` maps each word to the list of
        URLs it was found on and ``graph`` maps each crawled URL to the list
        of links extracted from it.
    """
    tocrawl = [seed]  # Stack of pages still to visit (popped from the end)
    crawled = set()   # Set gives constant-time "already visited?" checks
    index = {}
    graph = {}
    while tocrawl and (max_pages is None or len(crawled) < max_pages):
        page = tocrawl.pop()
        if page in crawled:
            continue
        content = get_page(page)
        addPageToIndex(index, page, content)
        outlinks = get_all_links(content)
        graph[page] = outlinks
        # NOTE(review): links are queued exactly as written in the HTML;
        # relative hrefs are not resolved against the page URL.
        union(tocrawl, outlinks)
        crawled.add(page)
    return index, graph

# I wrote the lookup function to look up pages associated with a keyword in the index
def lookup(index, keyword):
    """Return the list of pages stored for ``keyword``, or None if absent.

    Note: a later cell in this notebook defines another ``lookup`` with a
    different signature, which replaces this one once that cell is run.
    """
    return index[keyword] if keyword in index else None


# Crawl the site starting from the seed URL. index1 maps each word to the
# pages it was found on; graph1 maps each crawled page to its outgoing links.
# NOTE: this performs live network requests, so the results depend on the
# site being reachable when the cell is run.
index1, graph1 = crawlWeb("https://searchengineplaces.com.tr/")

In [2]:
# Print every crawled page (numbered from 1) together with its list of
# outgoing links, to visualize the connections between pages.
print(f"The graph has {len(graph1)} elements. These are:")
for i, (page, outlinks) in enumerate(graph1.items(), 1):
    print(f"{i}.\t[{page}] : {outlinks}")

The graph has 10 elements. These are:
1.	[https://searchengineplaces.com.tr/] : ['http://www.searchengineplaces.com.tr/travel_guide.html']
2.	[http://www.searchengineplaces.com.tr/travel_guide.html] : ['http://www.searchengineplaces.com.tr/ankara.html', 'http://www.searchengineplaces.com.tr/konya.html', 'http://www.searchengineplaces.com.tr/istanbul.html', 'http://www.searchengineplaces.com.tr/oktayrecommends.html', 'http://www.searchengineplaces.com.tr/seymarecommends.html']
3.	[http://www.searchengineplaces.com.tr/seymarecommends.html] : ['http://www.searchengineplaces.com.tr/oktayrecommends.html', 'http://www.searchengineplaces.com.tr/konya.html']
4.	[http://www.searchengineplaces.com.tr/oktayrecommends.html] : ['http://www.searchengineplaces.com.tr/istanbul.html']
5.	[http://www.searchengineplaces.com.tr/istanbul.html] : ['http://www.searchengineplaces.com.tr/maidens_tower.html', 'http://www.searchengineplaces.com.tr/galata_tower.html']
6.	[http://www.searchengineplaces.com.tr/gala

In [3]:
# I wrote the computeRanks function to calculate the page ranks based on the graph structure
def computeRanks(graph, d=0.8, numloops=10):
    """Compute a PageRank-style score for every page in ``graph``.

    Parameters
    ----------
    graph : dict
        Maps each page to the list of pages it links to.
    d : float, optional
        Damping factor (default 0.8): the share of a page's rank that is
        passed along its outgoing links.
    numloops : int, optional
        Number of refinement iterations (default 10).

    Returns
    -------
    dict
        Maps each page in ``graph`` to its rank. Pages without outlinks pass
        no rank on, so the ranks may sum to less than 1.
    """
    N = len(graph)  # Number of pages
    if N == 0:
        return {}

    # Pre-compute, once, who links to whom. Repeated links from one page to
    # the same target count as a single link, so the share handed to each
    # distinct target is rank / (number of distinct targets) and no rank is
    # lost to duplicates.
    outdegree = {}
    inlinks = {page: [] for page in graph}
    for node, outlinks in graph.items():
        targets = set(outlinks)
        outdegree[node] = len(targets)
        for target in targets:
            if target in inlinks:  # ignore links to pages outside the graph
                inlinks[target].append(node)

    # Every page starts with an equal share of the total rank.
    ranks = {page: 1 / N for page in graph}

    for _ in range(numloops):
        newranks = {}
        for page in graph:
            newrank = (1 - d) / N  # Base rank every page receives
            for node in inlinks[page]:
                newrank += d * (ranks[node] / outdegree[node])
            newranks[page] = newrank
        ranks = newranks  # Swap only after the full pass (synchronous update)

    return ranks

# Compute the rank of every crawled page and print one line per page,
# in the same order as the pages appear in graph1.
ranks1 = computeRanks(graph1)
for page, rank in ranks1.items():
    print(f"The rank of the page {page} : {rank}")

The rank of the page https://searchengineplaces.com.tr/ : 0.019999999999999997
The rank of the page http://www.searchengineplaces.com.tr/travel_guide.html : 0.14780429869056003
The rank of the page http://www.searchengineplaces.com.tr/seymarecommends.html : 0.073060618584064
The rank of the page http://www.searchengineplaces.com.tr/oktayrecommends.html : 0.073060618584064
The rank of the page http://www.searchengineplaces.com.tr/istanbul.html : 0.17460832634470402
The rank of the page http://www.searchengineplaces.com.tr/galata_tower.html : 0.09025810358272002
The rank of the page http://www.searchengineplaces.com.tr/maidens_tower.html : 0.09025810358272002
The rank of the page http://www.searchengineplaces.com.tr/konya.html : 0.073060618584064
The rank of the page http://www.searchengineplaces.com.tr/mevlana.html : 0.04928445079552
The rank of the page http://www.searchengineplaces.com.tr/ankara.html : 0.043776167788544


In [5]:
# I wrote the rankedLookup function to sort pages by their ranks for a specific keyword
def rankedLookup(index, keyword, graph):
    """Return the distinct pages containing ``keyword``, best-ranked first.

    Ranks are recomputed from ``graph`` on every call via ``computeRanks``;
    a page with no rank entry is scored 0. An unknown keyword yields [].
    """
    matches = index.get(keyword, [])
    scores = computeRanks(graph)

    def score_of(url):
        return scores.get(url, 0)

    ordered = list(set(matches))  # drop repeated pages before sorting
    ordered.sort(key=score_of, reverse=True)
    return ordered

# Look up the keyword "in" and print the matching pages, one per line,
# ordered from highest to lowest rank.
results = rankedLookup(index1, "in", graph1)
for result in results:
    print(result)

http://www.searchengineplaces.com.tr/istanbul.html
http://www.searchengineplaces.com.tr/travel_guide.html
http://www.searchengineplaces.com.tr/maidens_tower.html
http://www.searchengineplaces.com.tr/galata_tower.html
http://www.searchengineplaces.com.tr/konya.html
http://www.searchengineplaces.com.tr/mevlana.html


In [6]:
# I wrote an updated lookup function to handle multiple argument cases
def lookup(index, keyword, *args):
    """Look up ``keyword`` in ``index``, optionally ordering pages by rank.

    Call forms:
      * ``lookup(index, keyword)`` returns the distinct pages as a list, in
        no particular order.
      * ``lookup(index, keyword, graph, computeProcedure)`` returns the
        distinct pages sorted by ``computeProcedure(graph)`` scores, highest
        first; pages without a score count as 0.

    With exactly one extra argument a warning *string* is returned instead
    of a list; any other number of extra arguments raises ``ValueError``.
    NOTE(review): returning a string where callers expect a list looks
    unintended; confirm before relying on it.
    """
    extra = len(args)
    if extra == 0:
        return list(set(index.get(keyword, [])))
    if extra == 1:
        return "Warning: Unexpected number of arguments. Provide either 2 or 4 arguments."
    if extra != 2:
        raise ValueError("Invalid number of arguments provided to the lookup function.")

    graph, computeProcedure = args
    scores = computeProcedure(graph)
    candidates = list(set(index.get(keyword, [])))
    candidates.sort(key=lambda url: scores.get(url, 0), reverse=True)
    return candidates

In [10]:
# Ranked form of lookup: passing the graph and the ranking procedure makes
# the pages for "in" come back ordered from highest to lowest rank.
see = lookup(index1, "in", graph1, computeRanks)
for e in see:
    print(e)

http://www.searchengineplaces.com.tr/istanbul.html
http://www.searchengineplaces.com.tr/travel_guide.html
http://www.searchengineplaces.com.tr/maidens_tower.html
http://www.searchengineplaces.com.tr/galata_tower.html
http://www.searchengineplaces.com.tr/konya.html
http://www.searchengineplaces.com.tr/mevlana.html


In [9]:
# Unranked form of lookup: with no extra arguments the pages for "in" are
# de-duplicated through a set, so their order is not rank-based.
see1 = lookup(index1, "in")
for e in see1:
    print(e)

http://www.searchengineplaces.com.tr/istanbul.html
http://www.searchengineplaces.com.tr/maidens_tower.html
http://www.searchengineplaces.com.tr/galata_tower.html
http://www.searchengineplaces.com.tr/travel_guide.html
http://www.searchengineplaces.com.tr/konya.html
http://www.searchengineplaces.com.tr/mevlana.html


In [11]:
# Sanity check: rankedLookup and the four-argument form of lookup must
# return the same pages in the same order for the same keyword and graph.
assert rankedLookup(index1, "in", graph1) == lookup(index1, "in", graph1, computeRanks)