In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re

# Function to get all href links from the main URL
def get_links_from_main_page(main_url, timeout=5):
    """Collect the unique absolute http(s) links found on a page.

    Every ``<a href=...>`` on ``main_url`` is resolved against ``main_url``
    and kept only if ``is_valid_url`` accepts the resolved URL.

    Args:
        main_url: URL of the page whose anchors are harvested.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        A list of unique URLs in the order they first appear on the page,
        or an empty list if the page could not be fetched (the error is
        printed rather than raised).
    """
    try:
        response = requests.get(main_url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching main page: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # A dict is used as an ordered set: it drops duplicates while keeping
    # first-seen order, so repeated runs list the links in the same sequence.
    links = {}
    for tag in soup.find_all('a', href=True):
        try:
            # Resolve relative URLs to absolute ones
            full_url = urljoin(main_url, tag['href'])
        except ValueError:
            # Malformed href (e.g. unmatched '[' in the host); skip it
            # instead of aborting the whole crawl.
            continue
        if is_valid_url(full_url):
            links[full_url] = None

    return list(links)

# Function to check if a URL is valid
def is_valid_url(url):
    """Return True if ``url`` starts with an http(s) scheme and a dotted domain.

    The check is a prefix match only: anything after the domain (port, path,
    query, fragment) is accepted unchecked. Hosts without a dot and hosts
    whose last label is not alphabetic (e.g. ``localhost``, bare IPv4
    addresses) are rejected.

    Matching is case-insensitive because URL schemes and host names are
    case-insensitive; ``https://Example.COM`` is as valid as its lowercase
    form.

    Args:
        url: The URL string to test.

    Returns:
        True when the pattern matches at the start of ``url``, else False.
    """
    pattern = (
        r'^(https?://)'  # http:// or https://
        r'(\w+(\-\w+)*\.)+[a-z]{2,}'  # dotted labels ending in an alphabetic TLD
    )
    # The re module caches compiled patterns, so passing the string is cheap.
    return re.match(pattern, url, re.IGNORECASE) is not None

# Function to scrape text from a given URL
def scrape_homepage_text(url):
    """Download ``url`` and return the text content of its HTML.

    The page is fetched with a 5-second timeout and parsed with the
    ``html.parser`` backend. Text nodes are stripped of surrounding
    whitespace and joined with newlines.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text, or an empty string if the request failed (the
        error is printed rather than raised).
    """
    try:
        page = requests.get(url, timeout=5)
        page.raise_for_status()
        # Parse and flatten in one step: one stripped text node per line.
        return BeautifulSoup(page.text, 'html.parser').get_text(
            separator='\n',
            strip=True,
        )
    except requests.RequestException as err:
        print(f"Error fetching {url}: {err}")
    return ""

# Main function
def main():
    """Scrape every link listed on the AI2 Incubator companies page.

    Fetches the links from the companies page, then downloads each one and
    prints the first 500 characters of its text, or a notice when nothing
    could be retrieved.
    """
    main_url = "https://www.ai2incubator.com/companies"
    print(f"Fetching links from: {main_url}")
    links = get_links_from_main_page(main_url)
    print(f"Found {len(links)} links. Scraping text from each:")

    for idx, link in enumerate(links, 1):
        print(f"\n[{idx}/{len(links)}] Scraping: {link}")
        text = scrape_homepage_text(link)

        # An empty string means the fetch failed or the page had no text.
        if not text:
            print("No text found or error occurred.")
            continue

        print(f"Scraped Text (First 500 characters):\n{text[:500]}")

# Run the scraper only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Fetching links from: https://www.ai2incubator.com/companies
Found 44 links. Scraping text from each:

[1/44] Scraping: https://www.surveilx.ai/
Scraped Text (First 500 characters):
Home | My Site
top of page
Try SurveilX for Free
Sign Up
Superior Security.
Powered with AI.
Intelligent security system solution for asset protection
Get in touch
SurveilX is on a mission to secure the world’s physical spaces and improve economic outcomes.
Reduce Loss
Realtime alerts sent with precision accuracy.
Detect theft and threats in real time, and take action instantly, saving thousands of dollars at your store operations. Realtime prevention–real security.
Maximize Insights
Dive deeper

[2/44] Scraping: https://app.yoodli.ai
Scraped Text (First 500 characters):
Yoodli | Free Communication Coach
See the case study 👀
Ace your next
sales pitch
work presentation
job interview
crucial conversation
sales pitch
Improve your communication skills with private, real-time, and judgment-free coaching — powered