In [None]:
import subprocess
import requests
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
from nltk.util import bigrams, trigrams
import nltk
from googlesearch import search

# Fetch the NLTK 'punkt' tokenizer data when this cell/module runs.
# NOTE(review): the visible code only imports nltk.util.bigrams/trigrams and
# never calls an NLTK tokenizer, so this download may be unnecessary —
# confirm before removing.
nltk.download('punkt')

# Lowercase words and phrases that are ignored when counting keywords.
EXCLUSION_LIST = set(
    # Single-word entries, one whitespace-separated token each.
    """
    the and a of to in with for is that
    this it best as you on was are by an
    at be from or if not have had has but
    they their we our his her which about will
    would can all more what when who one do
    he she them out so up no into my your
    there use how time just like some other than
    also could new any very only get see because
    been people these over its even most me back
    here after us such where go way many those
    js disable ad blocker
    privacy content rights settings went policy own
    pick contact edition
    """.split()
) | {
    # Multi-word entries (two- and three-word phrases).
    'full review', 'enable js', 'js disable', 'disable ad', 'ad blocker',
    'skip content', 'digital edition', 'privacy policy', 'enable js disable',
    'js disable ad', 'disable ad blocker',
}

# Look up a query on Google and collect the first 10 result URLs
def search_google(query):
    """Return the URLs yielded by ``search`` for *query* as a list.

    The call asks for 10 results, stops after 10, and passes ``pause=2``
    to the search helper. The collected URLs are also printed.
    """
    hits = search(query, num=10, stop=10, pause=2)
    found = [*hits]
    print(f"Extracted URLs: {found}")
    return found

# Function to fetch the content of each page
def fetch_page_content(url, timeout=10):
    """Download *url* and return the text of the page without scripts/styles.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a single unresponsive server would block the
            whole run indefinitely.

    Returns:
        The text of the parsed HTML, with the text of separate elements
        joined by a single space.

    Raises:
        requests.exceptions.RequestException: if the request fails or times
            out. The HTTP status code is not checked, so the body of an
            error response is parsed like any other page.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Drop non-visible content so code and CSS do not end up in the text.
    for tag in soup(["script", "style"]):
        tag.decompose()
    # Join element texts with a space: the default (empty) separator fuses
    # the text of adjacent elements ("<p>a</p><p>b</p>" -> "ab"), which
    # would create bogus merged words for the downstream word counts.
    return soup.get_text(separator=" ")

# Function to count words, bigrams, and trigrams in a text
def count_words_bigrams_trigrams(text, exclusions=None):
    """Count single words, bigrams and trigrams in *text*.

    The text is lower-cased and split on whitespace. Punctuation around each
    token is stripped, then tokens that are not purely alphabetic or that
    appear in *exclusions* are dropped. Bigrams and trigrams are built from
    the remaining word sequence, and any n-gram whose space-joined form
    appears in *exclusions* is dropped as well.

    Args:
        text: Raw text to analyse.
        exclusions: Container of lowercase words/phrases to ignore.
            Defaults to the module-level ``EXCLUSION_LIST``.

    Returns:
        A ``(word_counts, bigram_counts, trigram_counts)`` tuple of
        ``Counter`` objects; n-gram keys are tuples of words.
    """
    import string  # local import keeps this block self-contained

    if exclusions is None:
        exclusions = EXCLUSION_LIST

    words = []
    for raw in text.lower().split():
        # "yachts," / "(boats)" would otherwise fail isalpha() and be
        # silently discarded instead of being counted as the word itself.
        token = raw.strip(string.punctuation)
        if token.isalpha() and token not in exclusions:
            words.append(token)

    def ngram_counts(size):
        # Sliding window of `size` consecutive words, as tuples.
        windows = zip(*(words[offset:] for offset in range(size)))
        # Multi-word exclusions are stored as space-joined strings, so the
        # tuple has to be joined before the membership test.
        return Counter(
            gram for gram in windows if " ".join(gram) not in exclusions
        )

    return Counter(words), ngram_counts(2), ngram_counts(3)

# Entry point: search, fetch every result, count, then report
def main(query, min_sites=1):
    """Print the most frequent words, bigrams and trigrams for *query*.

    Every URL returned by ``search_google`` is fetched and counted. For each
    word/bigram/trigram the number of distinct URLs it occurred on and its
    total number of occurrences are accumulated. Items are then printed in
    descending order of total occurrences, keeping only those seen on at
    least *min_sites* URLs. Any exception raised while handling a URL is
    printed and that URL is skipped.
    """
    urls = search_google(query)

    # item -> {url: occurrences on that url}, one table per n-gram size.
    words_by_site = defaultdict(lambda: defaultdict(int))
    bigrams_by_site = defaultdict(lambda: defaultdict(int))
    trigrams_by_site = defaultdict(lambda: defaultdict(int))

    for url in urls:
        try:
            print(f"Fetching content from: {url}")
            text = fetch_page_content(url)
            page_words, page_bigrams, page_trigrams = count_words_bigrams_trigrams(text)

            for table, page_counts in (
                (words_by_site, page_words),
                (bigrams_by_site, page_bigrams),
                (trigrams_by_site, page_trigrams),
            ):
                for item, occurrences in page_counts.items():
                    table[item][url] += occurrences
        except Exception as e:
            print(f"Failed to fetch {url}: {e}")

    def summarize(by_site):
        # item -> (number of sites containing it, total occurrences)
        return {
            item: (len(per_url), sum(per_url.values()))
            for item, per_url in by_site.items()
        }

    word_summary = summarize(words_by_site)
    bigram_summary = summarize(bigrams_by_site)
    trigram_summary = summarize(trigrams_by_site)

    def report(summary, label):
        ranked = sorted(summary.items(), key=lambda entry: entry[1][1], reverse=True)
        print(f"Most common {label} in the top 10 search results (appearing in at least {min_sites} sites):")
        for item, (site_count, total_count) in ranked:
            if site_count < min_sites:
                continue
            # n-grams are tuples of words; single words are plain strings.
            shown = ' '.join(item) if isinstance(item, tuple) else item
            print(f"{shown}: {total_count} (in {site_count} sites)")

    report(word_summary, "words")
    print("\n")
    report(bigram_summary, "bigrams")
    print("\n")
    report(trigram_summary, "trigrams")

# Run configuration
# Search phrase sent to Google.
query = "Best sailing yachts"
# A keyword is listed only if it appears on at least this many sites.
min_sites = 3
main(query, min_sites=min_sites)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Extracted URLs: ['https://www.sailmagazine.com/boats/best-boats-2023', 'https://www.reddit.com/r/sailing/comments/17r2eu0/what_brands_make_strong_reliable_boats_for_actual/', 'https://www.reddit.com/r/sailing/comments/vou859/best_sailboat_type_for_a_circumnavigation/', 'https://www.reddit.com/r/sailing/comments/zo9v9u/best_starter_boat/', 'https://www.reddit.com/r/sailing/comments/p3fou5/best_large_sail_boat_to_sail_the_world/', 'https://www.reddit.com/r/sailing/comments/1631696/what_brand_of_yacht_is_equivalent_to_a_rolls/', 'https://www.yachtingworld.com/yachts-and-gear/5-best-family-cruising-yachts-of-2022-137286', 'https://www.cruisingworld.com/sailboats/40-best-sailboats/', 'https://itboat.com/explore/14-luxury-sailing-yachts', 'https://www.boatinternational.com/yachts/the-register/top-largest-sailing-yachts--25057']
Fetching content from: https://www.sailmagazine.com/boats/best-boats-2023
Fetching content from: https://www.reddit.com/r/sailing/comments/17r2eu0/what_brands_make_st