<a href="https://colab.research.google.com/github/mituchowdhury1/data-mining/blob/main/Poject_3_Domain_Specific_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import requests
from bs4 import BeautifulSoup


In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

STOPWORDS = stopwords.words('english')
print(STOPWORDS)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:

custom_STOPWORDS = [] # Add your own stopwords here
STOPWORDS.extend(custom_STOPWORDS)

In [11]:
from collections import defaultdict

# Create an inverted index mapping each word to a set of URLs
inverted_index = defaultdict(lambda: set())

# Initialize an empty set to store unique URLs
url_list = set()

In [12]:
# Dictionary to store link connections with source and target lists
web_connection = {
    "source": [],
    "target": []
}

In [13]:
import re

# This function will clean the content of web page in order to build the inverted index.
def clean_and_tokenize(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())  # Remove punctuation and lowercase
    tokens = text.split()
    return [t for t in tokens if t not in STOPWORDS and len(t) > 1]

In [14]:
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup

def crawl(url, base_domain, visited, visit_limit, limit):
    # Stop crawling if limit reached or visit limit exceeded
    if limit == 0 or len(visited) >= visit_limit:
        return

    try:
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            return
    except requests.RequestException:
        return

    visited.add(url)
    print("-" * (10 - limit), end=" ")
    print(f"Crawled: {url}")

    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ', strip=True)
    words = clean_and_tokenize(text)  # Assume this function exists

    # Update inverted index and URL list with words from the current page
    for word in words:
        inverted_index[word].add(url)
        url_list.add(url)

    # Find all hyperlinks on the page
    for tag in soup.find_all('a', href=True):
        link = urljoin(url, tag['href'])
        parsed_link = urlparse(link)

        # Track the web connection from current url to the link
        web_connection['source'].append(url)
        web_connection['target'].append(link)

        # Only crawl links within the base domain and not visited yet
        if parsed_link.netloc == base_domain and link not in visited:
            crawl(link, base_domain, visited, visit_limit, limit - 1)


In [15]:
def crawl_roots(root_urls, max_per_root=2, visit_limit=50):
    for root_url in root_urls:
        print(f"\nStarting crawl from: {root_url}")
        domain = urlparse(root_url).netloc
        visited_urls = set()
        crawl(root_url, domain, visited_urls, visit_limit, max_per_root)

In [16]:

seed_urls = [
   'https://www.allrecipes.com/',
    'https://www.foodnetwork.com/',
    'https://www.epicurious.com/',
    'https://www.bbcgoodfood.com/',
    'https://www.tasteofhome.com/',
    'https://www.delish.com/',
    'https://www.seriouseats.com/',
    'https://www.food.com/',
    'https://www.simplyrecipes.com/',
    'https://www.yummly.com/',
    'https://www.cookinglight.com/',
    'https://www.bonappetit.com/',
    'https://www.myrecipes.com/',
    'https://www.marthastewart.com/1503760/recipes',
    'https://www.thekitchn.com/',
    'https://www.saveur.com/',
    'https://www.culinaryhill.com/',
    'https://www.deliciousmagazine.co.uk/',
    'https://www.tasty.co/',
    'https://www.geniuskitchen.com/'
]

crawl_roots(seed_urls, max_per_root=10)


Starting crawl from: https://www.allrecipes.com/
 Crawled: https://www.allrecipes.com/
- Crawled: https://www.allrecipes.com/#main
-- Crawled: https://www.allrecipes.com/authentication/login?regSource=3675&relativeRedirectUrl=%2F
-- Crawled: https://www.allrecipes.com/account/profile
-- Crawled: https://www.allrecipes.com/account/add-recipe
-- Crawled: https://www.allrecipes.com/account/settings
-- Crawled: https://www.allrecipes.com/authentication/logout?relativeRedirectUrl=%2F
--- Crawled: https://www.allrecipes.com/authentication/logout?relativeRedirectUrl=%2F#main
---- Crawled: https://www.allrecipes.com/authentication/login?regSource=3675&relativeRedirectUrl=%2F%3Fbanner%3Dlogout
---- Crawled: https://www.allrecipes.com/authentication/logout?relativeRedirectUrl=%2F%3Fbanner%3Dlogout
----- Crawled: https://www.allrecipes.com/authentication/logout?relativeRedirectUrl=%2F%3Fbanner%3Dlogout#main
------ Crawled: https://www.allrecipes.com/sweepstakes
------- Crawled: https://www.allre

In [28]:

def search_engine(query, index, scores):
    query_terms = query.lower().split()
    results = set()
    for term in query_terms:
        if term in index:
            if not results:
                results = set(index[term])
            else:
                results = results.intersection(index[term])  # Find common websites

    # Sort results based on score
    ranked_results = []
    for website in results:
        if website in scores:
          ranked_results.append((website, scores[website]))
    ranked_results.sort(key=lambda x: x[1], reverse=True)

    return ranked_results

In [30]:

# Query and display results
query = "pizza"
print(f"\nSearch Results for '{query}' using PageRank:")
results = search_engine(query, inverted_index, pagerank_scores) # Change index to inverted_index

for page, score in results:
    print(f"{page}: {score}")  # Print page and score. web_content isn't defined. # Remove reference to undefined web_content

print(f"\nSearch Results for '{query}' using HITS (Authorities):")
#results = search_engine(query, index, authorities)  # This part is commented out as 'authorities' is also not defined.
#for page, score in results:
#    print(f"{page}: {score}")# Remove reference to undefined authorities and web_content


Search Results for 'pizza' using PageRank:
https://www.saveur.com/: 0.00011898057861712579
https://www.saveur.com/category/culture/: 0.00011606247520679511
https://www.saveur.com/category/recipes-by-cuisine/: 0.00011606247520679511
https://www.saveur.com/category/features/: 0.00011606247520679511
https://www.bonappetit.com/cooking: 9.384207414002757e-05
https://www.bonappetit.com/podcast/dinner-sos: 9.384207414002757e-05
https://www.delish.com/kitchen-tools/kitchen-secrets/: 8.856042924641888e-05
https://www.delish.com/kitchen-tools/: 8.856042924641888e-05
https://www.delish.com/4th-july-recipes/: 8.856042924641888e-05
https://www.delish.com/food/: 8.856042924641888e-05
https://www.delish.com/weeknight-dinners/: 8.856042924641888e-05
https://www.delish.com/cooking/recipe-ideas/: 8.856042924641888e-05
https://www.delish.com/cooking/menus/: 8.856042924641888e-05
https://www.simplyrecipes.com/recipes-5090746: 8.633453761103275e-05
https://www.tasty.co/article/mikespohr/16-pizza-hacks-you