In [6]:
%%capture
pip install openpyxl

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
%%capture
import requests
from bs4 import BeautifulSoup
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from urllib.parse import urljoin
import time
import math
from collections import defaultdict
from nltk import download
import openpyxl
from concurrent.futures import ThreadPoolExecutor
import networkx as nx

# Download necessary NLTK data files (if not already installed)
download('wordnet')
download('stopwords')

In [23]:
class MultiPageSearchEngine:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.page_indexes = {}
        self.global_index = defaultdict(set)
        self.word_frequency = defaultdict(int)  # Tracks overall word frequency
        self.total_documents = 0
        self.relevant_pages = []
        self.graph_links = defaultdict(set)
        self.count = 0

    def fetch_page_in_chunks(self, url):
        """Fetch page in chunks to handle truncation."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
                "Range": "bytes=0-"
            }
            response = requests.get(url, headers=headers, timeout=10, stream=True)

            if response.status_code in [200, 206]:
                content = b""
                for chunk in response.iter_content(chunk_size=1024):
                    content += chunk

                return BeautifulSoup(content.decode('utf-8', errors='ignore'), 'html.parser')
            else:
                print(f"Failed to fetch {url}: Status code {response.status_code}")
                return None
        except Exception as e:
            print(f"Error fetching {url}: {str(e)}")
            return None

    def get_links(self, soup, base_url):
        """Extract links from the page."""
        links = set()
        if soup:
            for link in soup.find_all('a', href=True):
                url = link['href']
                absolute_url = urljoin(base_url, url)
                # Allow more comprehensive link filtering
                if absolute_url.startswith(base_url) and re.search(r'/app/|/game/', absolute_url):
                    links.add(absolute_url)
        return links

    def clean_text(self, text):
        """Remove stopwords, non-alphanumeric characters, and apply lemmatization."""
        words = re.findall(r'\w+', text.lower())
        filtered_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return filtered_words

    def get_synonyms(self, word):
        """Get synonyms for a given word using WordNet."""
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name().lower())
        return synonyms

    def expand_query_with_synonyms(self, query_terms):
        """Expand the query with synonyms for each term."""
        expanded_query = set(query_terms)
        for term in query_terms:
            expanded_query.update(self.get_synonyms(term))
        return list(expanded_query)

    def index_words(self, soup, url):
        """Create word index for a page and calculate term frequencies."""
        index = defaultdict(int)
        if soup:
            main_content = soup.get_text()
            words = self.clean_text(main_content)
            for word in words:
                index[word] += 1
                self.global_index[word].add(url)
                self.word_frequency[word] += 1  # Update global word frequency
        return dict(index)

    def crawl_and_index_query_based(self, start_url, query, max_relevant_pages=20):
        """Crawl pages based on query relevance using multithreading."""
        visited = set()
        to_visit = {start_url}
        relevant_pages = 0
        query_terms = set(self.clean_text(query))

        def process_url(url):
            if url in visited:
                return None

            print(f"Crawling: {url}")
            soup = self.fetch_page_in_chunks(url)
            if soup:
                # Index the current page
                page_index = self.index_words(soup, url)
                if query_terms.intersection(page_index.keys()):
                    self.page_indexes[url] = page_index
                    nonlocal relevant_pages
                    relevant_pages += 1
                    self.total_documents += 1
                    print(f"Relevant page found: {url}")
                    self.relevant_pages.append(url)
                    self.count += 1

                # Add new links to visit
                links = self.get_links(soup, start_url)
                self.graph_links[url] = links
                return links
            return None

        with ThreadPoolExecutor(max_workers=5) as executor:
            while to_visit and relevant_pages < max_relevant_pages:
                futures = {executor.submit(process_url, url): url for url in to_visit}
                to_visit.clear()

                for future in futures:
                    url = futures[future]
                    try:
                        result = future.result()
                        if result:
                            visited.add(url)
                            to_visit.update(result - visited)
                    except Exception as e:
                        print(f"Error processing {url}: {e}")

        print(f"\nCrawling complete. Indexed {len(self.page_indexes)} relevant pages.")
        return visited

    def compute_tf(self, term, page_index):
        """Calculate normalized term frequency (TF)."""
        term_count = page_index.get(term, 0)
        total_terms = sum(page_index.values())
        if total_terms == 0:
            return 0
        return term_count / total_terms

    def compute_idf(self, term):
        """Calculate smoothed inverse document frequency (IDF)."""
        doc_count = len(self.global_index[term])
        if doc_count > 0:
            return math.log((self.total_documents / doc_count) + 1)
        return 0

    # def extract_relevant_pages_from_all(self, all_pages):
    #     """Extract relevant pages from all pages."""
    #     relevant_pages = defaultdict(set)
    #     page_keys = all_pages.keys()
    #     for page in page_keys:
    #         if page in self.relevant_pages:
    #             relevant_pages[page] = all_pages[page]
    #     return relevant_pages

    def search(self, query, max_relevant_pages=20):
        """Search across all indexed pages using phrase search, synonyms, and enhanced TF-IDF scoring."""
        query_terms, phrase_terms = self.parse_query(query)
        expanded_query_terms = self.expand_query_with_synonyms(query_terms)
        results = defaultdict(float)

        # Process expanded terms
        for term in expanded_query_terms:
            if term in self.global_index:
                idf = self.compute_idf(term)
                for url in self.global_index[term]:
                    page_index = self.page_indexes.get(url, {})
                    tf = self.compute_tf(term, page_index)
                    results[url] += tf * idf

        # Process phrases
        for phrase in phrase_terms:
            for url, page_index in self.page_indexes.items():
                if self.is_phrase_in_page(phrase, url):
                    results[url] += 1.0  # Boost score for phrase matches

        # Sort by aggregated score in descending order
        sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)
        sorted_results = sorted_results[:max_relevant_pages]
        return sorted_results

    def parse_query(self, query):
        """Parse the query to extract regular terms and phrases."""
        terms = query.split()
        query_terms = []
        phrase_terms = []

        for term in terms:
            if term.startswith('"') and term.endswith('"'):
                phrase_terms.append(term[1:-1])  # Extract phrase without quotes
            else:
                query_terms.append(term)

        return query_terms, phrase_terms

    def is_phrase_in_page(self, phrase, url):
        """Check if a phrase exists in the indexed page content."""
        page_index = self.page_indexes.get(url, {})
        words = list(page_index.keys())
        phrase_words = phrase.split()

        # Search for the phrase sequence in the page words
        for i in range(len(words) - len(phrase_words) + 1):
            if words[i:i + len(phrase_words)] == phrase_words:
                return True
        return False

    def print_results(self, results):
        """Print search results in a formatted way."""
        if not results:
            print("No results found.")
            return

        print("\nSearch Results:")
        print("-" * 50)
        for url, score in results:
            print(f"URL: {url}")
            print(f"Score: {score:.4f}")
            print("-" * 50)

    def get_most_common_words(self, top_n=15):
        """Get the most common words across all indexed pages."""
        sorted_words = sorted(self.word_frequency.items(), key=lambda x: x[1], reverse=True)
        return sorted_words[:top_n]

    def build_inverted_index(self, most_common_words, max_pages_per_word=20):
        """Build an inverted index for the most common words."""
        inverted_index = defaultdict(list)

        for word, _ in most_common_words:
            pages = list(self.global_index[word])
            inverted_index[word] = pages[:max_pages_per_word]  # Limit to the first 20 pages

        return inverted_index

    def save_to_excel(self, visited_pages, most_common_words, search_results, pagerank_values, filename="search_engine_results.xlsx"):
        """Save visited pages, most common words, and search results to an Excel file."""
        wb = openpyxl.Workbook()

        # Sheet 1: Visited Pages
        ws1 = wb.active
        ws1.title = "Visited Pages"
        ws1.append(["Visited Pages"])
        for page in visited_pages:
            ws1.append([page])

        # Sheet 2: Most Common Words
        ws2 = wb.create_sheet(title="Most Common Words")
        ws2.append(["Word", "Frequency"])
        for word, frequency in most_common_words:
            ws2.append([word, frequency])

        # Sheet 3: Search Results
        ws3 = wb.create_sheet(title="Search Results")
        ws3.append(["URL", "Score"])
        for url, score in search_results:
            ws3.append([url, score])

        # Sheet 4: Page Rank Results
        ws4 = wb.create_sheet(title="Page Rank Results")
        ws4.append(["URL", "Page Rank"])
        for url, score in pagerank_values.items():
            ws4.append([url, score])

        # Save the Excel file
        wb.save(filename)
        print(f"\nResults saved to {filename}")

In [24]:
def calculate_new_pagerank(current_ranks, links):
    new_ranks = {}

    # Calculate new PageRank for each page
    for page in current_ranks:
        # Find who links to this page
        incoming_links = [p for p, outgoing in links.items() if page in outgoing]

        # Sum up PageRank contributions
        rank_sum = 0
        for source_page in incoming_links:
            # Get number of outgoing links from source page
            num_outgoing = len(links[source_page])
            # Add contribution from this source page
            rank_sum += current_ranks[source_page] / num_outgoing

        new_ranks[page] = rank_sum

    return new_ranks

def print_ranks(ranks):
    print("-" * 35)
    print("Page  |  PageRank Value")
    print("-" * 35)
    for page, rank in ranks.items():
        print(f"  {page}   |     {rank:.3f}")
    print("-" * 35)

def page_rank():
    # Define the web structure
    links = {
        'https://store.steampowered.com/': ['https://store.steampowered.com/app/2507950/Delta_Force/', 'https://store.steampowered.com/app/1149460/ICARUS/', 'https://store.steampowered.com/app/730/CounterStrike_2/', 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/'],
        'https://store.steampowered.com/app/2507950/Delta_Force/': ['https://store.steampowered.com/', 'https://store.steampowered.com/app/730/CounterStrike_2/'],
        'https://store.steampowered.com/app/1149460/ICARUS/': ['https://store.steampowered.com/'],
        'https://store.steampowered.com/app/730/CounterStrike_2/': ['https://store.steampowered.com/','https://store.steampowered.com/app/2507950/Delta_Force/'],
        'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/': ['https://store.steampowered.com/', 'https://store.steampowered.com/app/2507950/Delta_Force/']
    }
    # Initialize PageRank values (1/5 for each page)
    pages = ['https://store.steampowered.com/', 'https://store.steampowered.com/app/2507950/Delta_Force/', 'https://store.steampowered.com/app/1149460/ICARUS/', 'https://store.steampowered.com/app/730/CounterStrike_2/', 'https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/']
    current_ranks = {page: 1/len(pages) for page in pages}

    print("\n\nWeb structure:")
    for page, outlinks in links.items():
        print(f"Page {page} links to: {', '.join(outlinks)}")

    # Print initial values
    print("\nInitial PageRank Values:")
    print_ranks(current_ranks)

    for iteration in range(10):
        new_ranks = calculate_new_pagerank(current_ranks, links)
        current_ranks = new_ranks

    # print values after iterations
    print("PageRank Values:")
    print_ranks(current_ranks)
    # Find highest PageRank after all iterations
    highest_page = max(current_ranks.items(), key=lambda x: x[1])
    print(f"\nHighest PageRank:")
    print(f"Page {highest_page[0]} with PageRank value of {highest_page[1]:.3f}")
    return current_ranks

In [25]:
def main():
    search_engine = MultiPageSearchEngine()
    start_url = 'https://store.steampowered.com/'
    query = input("Enter your search query (use quotes for phrases): ")
    print("\nCrawling and indexing pages based on query '" + query + "' ...\n")

    visited_pages = search_engine.crawl_and_index_query_based(
        start_url,
        query,
        max_relevant_pages=20
    )

    print(f"\nIn total indexed {len(visited_pages)} pages.")
    for page in visited_pages:
        print(f"- {page}")

    # Get 15 most common words and build the inverted index
    most_common_words = search_engine.get_most_common_words()
    inverted_index = search_engine.build_inverted_index(most_common_words)

    print("\nTop 15 Most Common Words and Indexed Pages:")
    for word, pages in inverted_index.items():
        print(f"\nWord: {word}")
        print(f"Pages: {len(pages)}")
        for page in pages:
            print(f"- {page}")


    # Perform search
    results = search_engine.search(query, max_relevant_pages=20)
    search_engine.print_results(results)

    pagerank_values = page_rank()

    # Save outputs to an Excel file
    search_engine.save_to_excel(visited_pages, most_common_words, results, pagerank_values)

if __name__ == "__main__":
    main()

Enter your search query (use quotes for phrases): AMD Ryzen 9

Crawling and indexing pages based on query 'AMD Ryzen 9' ...

Crawling: https://store.steampowered.com/
Relevant page found: https://store.steampowered.com/
Crawling: https://store.steampowered.com/app/2890830/Streamer_Life_Simulator_2/?snr=1_4_4__tab-UpcomingCrawling: https://store.steampowered.com/app/2753600/Vambrace_Dungeon_Monarch/?snr=1_4_4__tab-Upcoming

Crawling: https://store.steampowered.com/app/2157210/Dead_of_Darkness/?snr=1_4_4__tab-Upcoming
Crawling: https://store.steampowered.com/app/3301060/Desktop_Mate/?snr=1_4_4__145Crawling: https://store.steampowered.com/app/2634950/Tokyo_Xtreme_Racer/?snr=1_4_4__tab-Upcoming

Crawling: https://store.steampowered.com/app/2216770/JOY_OF_PROGRAMMING__Software_Engineering_Simulator/?snr=1_4_4__145
Crawling: https://store.steampowered.com/app/553850/HELLDIVERS_2/?snr=1_4_4__tab-TopGrossing
Relevant page found: https://store.steampowered.com/app/2157210/Dead_of_Darkness/?snr=