In [6]:
import time
from urllib.parse import urldefrag, urljoin, urlsplit

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

In [7]:
def setup_driver():
    """Build the Chrome WebDriver used by the scraping helpers.

    Returns:
        The object produced by ``webdriver.Chrome`` when given Chrome options
        that carry the ``--headless`` argument.
    """
    chrome_options = webdriver.ChromeOptions()
    # Headless mode: the browser runs without opening a visible window.
    chrome_options.add_argument('--headless')
    return webdriver.Chrome(options=chrome_options)


In [8]:
def get_page_content_selenium(driver, url, wait_seconds=2):
    """Load ``url`` with ``driver`` and return the rendered page HTML.

    Args:
        driver: Object exposing ``get(url)`` and a ``page_source`` attribute
            (the Selenium WebDriver created by ``setup_driver`` in this
            notebook).
        url: Address of the page to load.
        wait_seconds: Seconds to pause after ``driver.get`` returns so that
            client-side JavaScript can finish rendering. Defaults to 2, the
            delay this helper has always used; pass 0 to skip the pause.

    Returns:
        The value of ``driver.page_source`` after the pause, or ``None`` if
        loading the page raised an exception.
    """
    try:
        driver.get(url)  # Load the page
        if wait_seconds > 0:
            time.sleep(wait_seconds)  # Allow time for JavaScript to execute
        return driver.page_source  # Get the rendered HTML
    except Exception as e:
        # Deliberately best-effort: one unreachable page must not abort a
        # whole crawl, so the failure is reported and signalled with None.
        print(f"Error loading {url}: {e}")
        return None


In [9]:
def scrape_genz_marketing_with_selenium(base_url):
    """Scrape every page linked from the homepage at ``base_url``.

    The homepage is loaded in a WebDriver, each ``<a href=...>`` found on it
    is resolved against ``base_url``, and every distinct http(s) target is
    loaded once and reduced to its title and visible text.

    Args:
        base_url: URL of the homepage whose links should be followed.

    Returns:
        A list of dicts with the keys ``'title'``, ``'url'`` and
        ``'content'``, in the order the links first appear on the homepage.
        The list is empty when the homepage itself could not be loaded.
    """
    driver = setup_driver()  # Initialize the WebDriver
    scraped_data = []

    try:
        # Get homepage content
        homepage_content = get_page_content_selenium(driver, base_url)
        if not homepage_content:
            return scraped_data

        soup = BeautifulSoup(homepage_content, 'html.parser')

        seen_urls = set()
        for link in soup.find_all('a', href=True):
            # urljoin resolves relative, root-relative and protocol-relative
            # hrefs the way a browser would, and leaves absolute URLs alone.
            # The fragment is dropped because "#section" variants of one
            # address all render the same document.
            page_url = urldefrag(urljoin(base_url, link['href'].strip())).url

            # mailto:, javascript:, tel: and similar links are not pages.
            if urlsplit(page_url).scheme not in ('http', 'https'):
                continue

            # Navigation links usually repeat on a page; load each target once.
            if page_url in seen_urls:
                continue
            seen_urls.add(page_url)

            # Get content from the linked page; skip it if loading failed
            page_content = get_page_content_selenium(driver, page_url)
            if not page_content:
                continue

            page_soup = BeautifulSoup(page_content, 'html.parser')

            # Extract title and main content. ``.string`` is None for an
            # empty <title> or one with nested markup, so fall back then too.
            title_tag = page_soup.title
            title = (title_tag.string if title_tag is not None else None) or "No Title"
            content = ' '.join(page_soup.stripped_strings)

            scraped_data.append({
                'title': title,
                'url': page_url,
                'content': content
            })

    finally:
        driver.quit()  # Ensure the driver is closed

    return scraped_data


In [10]:
# Entry point: crawl the site starting from its homepage.
base_url = "https://genzmarketing.xyz/"
data = scrape_genz_marketing_with_selenium(base_url)

# Write the CSV only when the crawl produced at least one record.
if not data:
    print("No data scraped.")
else:
    df = pd.DataFrame(data)
    output_file = "genz_marketing_data.csv"
    df.to_csv(output_file, index=False)
    print(f"Data scraping completed. File saved as {output_file}")


Data scraping completed. File saved as genz_marketing_data.csv
