In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import json
from typing import List, Dict
import logging
import re
from urllib.parse import urlparse

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class HerkeyGroupScraper:
    def __init__(self, headless=True):
        """Initialize the group scraper with optional headless mode"""
        self.chrome_options = Options()
        if headless:
            self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--window-size=1920,1080")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
    
    def scrape_groups(self, url: str = "https://www.herkey.com/groups", max_scroll=5, scroll_pause=2) -> List[Dict]:
        """
        Scrape group listings from Herkey with URL extraction
        
        Args:
            url: The URL to scrape
            max_scroll: Maximum number of scroll actions to perform (to load more groups)
            scroll_pause: Time to pause between scrolls (seconds)
            
        Returns:
            List of group dictionaries containing details of each group
        """
        driver = webdriver.Chrome(options=self.chrome_options)
        
        try:
            # Load the page
            driver.get(url)
            logger.info("Page loaded, waiting for group elements...")
            
            # Wait for group elements to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "[data-test-id='featured-group'], .MuiGrid-container.css-1d3bbye"))
            )
            
            # Scroll down to load more groups
            for i in range(max_scroll):
                logger.info(f"Scrolling iteration {i+1}/{max_scroll}...")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scroll_pause)
            
            # Get count of total groups first (try multiple selectors)
            group_elements = driver.find_elements(By.CSS_SELECTOR, ".MuiGrid-container.css-1d3bbye")
            if not group_elements:
                group_elements = driver.find_elements(By.CSS_SELECTOR, "[data-test-id='featured-group']")
            if not group_elements:
                group_elements = driver.find_elements(By.CSS_SELECTOR, ".MuiGrid-item")
            
            total_groups = len(group_elements)
            logger.info(f"Found {total_groups} group listings")
            
            groups = []
            
            # Process each group by index to avoid stale element references
            for index in range(total_groups):
                try:
                    logger.info(f"Processing group {index + 1}/{total_groups}...")
                    
                    # Re-find all group elements to avoid stale reference
                    group_elements = driver.find_elements(By.CSS_SELECTOR, ".MuiGrid-container.css-1d3bbye")
                    if not group_elements:
                        group_elements = driver.find_elements(By.CSS_SELECTOR, "[data-test-id='featured-group']")
                    if not group_elements:
                        group_elements = driver.find_elements(By.CSS_SELECTOR, ".MuiGrid-item")
                    
                    # Check if we still have enough groups (in case page changed)
                    if index >= len(group_elements):
                        logger.warning(f"Group index {index} out of range, skipping...")
                        continue
                    
                    group_element = group_elements[index]
                    
                    # Extract basic group details first
                    group_name = ""
                    group_type = ""
                    member_count = 0
                    featured = False
                    icon_url = ""
                    banner_url = ""
                    category = ""
                    joinable = False
                    join_button_text = ""
                    
                    try:
                        # Extract group name
                        name_elem = group_element.find_element(By.CSS_SELECTOR, "[data-test-id='group-name']")
                        group_name = name_elem.text.strip()
                        clickable_element = name_elem  # Use group name as clickable element
                    except:
                        # Try alternative selector for group name
                        try:
                            name_elem = group_element.find_element(By.CSS_SELECTOR, ".MuiTypography-root.MuiTypography-h6")
                            group_name = name_elem.text.strip()
                            clickable_element = name_elem
                        except:
                            logger.warning(f"Could not find group name for group {index + 1}")
                            continue
                    
                    # Extract other group details
                    try:
                        type_elem = group_element.find_element(By.CSS_SELECTOR, "[data-test-id='group-type']")
                        group_type = type_elem.text.strip().lower()
                    except:
                        pass
                    
                    try:
                        members_elem = group_element.find_element(By.CSS_SELECTOR, "[data-test-id='group-members-count']")
                        members_text = members_elem.text.strip()
                        members_match = re.search(r'(\d+)\s+Members?', members_text)
                        if members_match:
                            member_count = int(members_match.group(1))
                    except:
                        pass
                    
                    try:
                        featured_elem = group_element.find_element(By.CSS_SELECTOR, "[data-test-id='featured-icon']")
                        featured = True
                    except:
                        featured = False
                    
                    try:
                        icon_elem = group_element.find_element(By.CSS_SELECTOR, "[data-test-id='group-icon']")
                        icon_url = icon_elem.get_attribute("src")
                    except:
                        pass
                    
                    try:
                        banner_elem = group_element.find_element(By.CSS_SELECTOR, ".css-12c20jy img")
                        banner_url = banner_elem.get_attribute("src")
                    except:
                        pass
                    
                    try:
                        category_elem = group_element.find_element(By.CSS_SELECTOR, ".MuiTypography-root.capitalize")
                        category = category_elem.text.strip()
                    except:
                        pass
                    
                    try:
                        join_btn = group_element.find_element(By.CSS_SELECTOR, "[data-test-id='join-btn'] button")
                        joinable = True
                        join_button_text = join_btn.text.strip()
                    except:
                        joinable = False
                    
                    # Now get the group URL by clicking on the group name
                    group_url = ""
                    current_url = driver.current_url
                    
                    try:
                        # Click on group name to navigate to group details
                        driver.execute_script("arguments[0].click();", clickable_element)
                        
                        # Wait a moment for navigation
                        time.sleep(2)
                        
                        # Check if we're on a new page or if URL changed
                        WebDriverWait(driver, 10).until(
                            lambda driver: driver.current_url != current_url
                        )
                        
                        # Get the group URL
                        group_url = driver.current_url
                        logger.info(f"Group URL: {group_url}")
                        
                        # Navigate back to the groups listings page
                        driver.back()
                        
                        # Wait for the group listings to load again
                        WebDriverWait(driver, 15).until(
                            EC.presence_of_element_located((By.CSS_SELECTOR, ".MuiGrid-container.css-1d3bbye, [data-test-id='featured-group'], .MuiGrid-item"))
                        )
                        
                        # Wait for the page to be stable
                        time.sleep(2)
                        
                        # Verify we're back on the main groups page
                        if driver.current_url != url:
                            logger.info(f"Not on main groups page, navigating back to {url}")
                            driver.get(url)
                            WebDriverWait(driver, 15).until(
                                EC.presence_of_element_located((By.CSS_SELECTOR, ".MuiGrid-container.css-1d3bbye, [data-test-id='featured-group'], .MuiGrid-item"))
                            )
                            time.sleep(2)
                        
                    except Exception as e:
                        logger.error(f"Error getting group URL for group {index + 1}: {e}")
                        # Try to navigate back if we're stuck
                        try:
                            if driver.current_url != url:
                                logger.info("Attempting to navigate back to main groups page...")
                                driver.get(url)
                                WebDriverWait(driver, 15).until(
                                    EC.presence_of_element_located((By.CSS_SELECTOR, ".MuiGrid-container.css-1d3bbye, [data-test-id='featured-group'], .MuiGrid-item"))
                                )
                                time.sleep(2)
                        except Exception as nav_error:
                            logger.error(f"Error navigating back: {nav_error}")
                    
                    # Extract group ID from URL if available
                    group_id = ""
                    if group_url:
                        try:
                            # Extract group ID from URL (last part after the last '/')
                            group_id = group_url.split('/')[-1]
                            if not group_id.isdigit():
                                # If last part is not a number, try to find ID in URL
                                id_match = re.search(r'/(\d+)/?$', group_url)
                                if id_match:
                                    group_id = id_match.group(1)
                        except:
                            pass
                    
                    # Create group dictionary
                    group = {
                        "name": group_name,
                        "type": group_type,
                        "member_count": member_count,
                        "featured": featured,
                        "icon_url": icon_url,
                        "banner_url": banner_url,
                        "category": category,
                        "joinable": joinable,
                        "join_button_text": join_button_text,
                        "group_url": group_url,
                        "group_id": group_id
                    }
                    
                    groups.append(group)
                    
                except Exception as e:
                    logger.error(f"Error extracting group details for group {index + 1}: {e}")
                    # Try to ensure we're on the main page before continuing
                    try:
                        if driver.current_url != url:
                            driver.get(url)
                            WebDriverWait(driver, 15).until(
                                EC.presence_of_element_located((By.CSS_SELECTOR, ".MuiGrid-container.css-1d3bbye, [data-test-id='featured-group'], .MuiGrid-item"))
                            )
                            time.sleep(2)
                    except:
                        pass
                    continue
            
            return groups
            
        except TimeoutException:
            logger.error("Timeout waiting for page to load")
            return []
        
        except Exception as e:
            logger.error(f"Error: {e}")
            return []
            
        finally:
            driver.quit()
    
    def scrape_groups_alternative_method(self, url: str = "https://www.herkey.com/groups", max_scroll=5, scroll_pause=2) -> List[Dict]:
        """
        Alternative method to scrape groups by looking for clickable elements or href attributes
        """
        driver = webdriver.Chrome(options=self.chrome_options)
        
        try:
            driver.get(url)
            logger.info("Page loaded, waiting for group elements...")
            
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, ".MuiGrid-container.css-1d3bbye, [data-test-id='featured-group']"))
            )
            
            # Scroll to load more groups
            for i in range(max_scroll):
                logger.info(f"Scrolling iteration {i+1}/{max_scroll}...")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scroll_pause)
            
            # Get page source after all content is loaded
            page_content = driver.page_source
            soup = BeautifulSoup(page_content, 'html.parser')
            
            # Find all group container elements
            group_elements = soup.select("div.MuiGrid-container.css-1d3bbye")
            if not group_elements:
                group_elements = soup.select("[data-test-id='featured-group']")
            
            logger.info(f"Found {len(group_elements)} group elements")
            
            groups = []
            for group_element in group_elements:
                try:
                    group = {}
                    
                    # Extract group name
                    name_elem = group_element.select_one("[data-test-id='group-name']")
                    if name_elem:
                        group["name"] = name_elem.text.strip()
                    
                    # Look for clickable parent element or any element that might contain href
                    group_url = ""
                    group_id = ""
                    
                    # Method 1: Look for parent anchor tag
                    try:
                        parent_link = group_element.find_parent("a")
                        if parent_link and parent_link.has_attr('href'):
                            group_url = parent_link['href']
                    except:
                        pass
                    
                    # Method 2: Look for any anchor tag within the group element
                    if not group_url:
                        try:
                            link_elements = group_element.select("a[href]")
                            for link in link_elements:
                                href = link.get('href', '')
                                if "/groups/" in href and href != url:
                                    group_url = href
                                    break
                        except:
                            pass
                    
                    # Method 3: Check if group element has data attributes
                    if not group_url:
                        try:
                            # Look for data attributes that might contain group info
                            group_id_attr = group_element.get('data-group-id')
                            if group_id_attr:
                                group_id = group_id_attr
                                # Construct URL from group name and ID
                                if group.get("name"):
                                    name_slug = group["name"].lower().replace(" ", "-").replace("--", "-")
                                    group_url = f"https://www.herkey.com/groups/{name_slug}/{group_id}"
                        except:
                            pass
                    
                    # Extract group ID from URL if we have it
                    if group_url and not group_id:
                        try:
                            group_id = group_url.split('/')[-1]
                            if not group_id.isdigit():
                                id_match = re.search(r'/(\d+)/?$', group_url)
                                if id_match:
                                    group_id = id_match.group(1)
                        except:
                            pass
                    
                    # Extract other group details
                    type_elem = group_element.select_one("[data-test-id='group-type']")
                    if type_elem:
                        group["type"] = type_elem.text.strip().lower()
                    
                    members_elem = group_element.select_one("[data-test-id='group-members-count']")
                    if members_elem:
                        members_text = members_elem.text.strip()
                        members_match = re.search(r'(\d+)\s+Members?', members_text)
                        if members_match:
                            group["member_count"] = int(members_match.group(1))
                    
                    featured_elem = group_element.select_one("[data-test-id='featured-icon']")
                    group["featured"] = featured_elem is not None
                    
                    icon_elem = group_element.select_one("[data-test-id='group-icon']")
                    if icon_elem and icon_elem.has_attr('src'):
                        group["icon_url"] = icon_elem['src']
                    
                    banner_elem = group_element.select_one(".css-12c20jy img")
                    if banner_elem and banner_elem.has_attr('src'):
                        group["banner_url"] = banner_elem['src']
                    
                    category_elem = group_element.select_one(".MuiTypography-root.capitalize")
                    if category_elem:
                        group["category"] = category_elem.text.strip()
                    
                    join_btn = group_element.select_one("[data-test-id='join-btn'] button")
                    if join_btn:
                        group["joinable"] = True
                        group["join_button_text"] = join_btn.text.strip()
                    else:
                        group["joinable"] = False
                    
                    # Add URL and ID to group
                    group["group_url"] = group_url
                    group["group_id"] = group_id
                    
                    if group.get("name"):  # Only add if we have at least a name
                        groups.append(group)
                        
                except Exception as e:
                    logger.error(f"Error extracting group data: {e}")
            
            return groups
            
        except Exception as e:
            logger.error(f"Error: {e}")
            return []
            
        finally:
            driver.quit()
    
    def save_to_json(self, groups: List[Dict], filename: str = "herkey_groups.json"):
        """Save scraped groups to a JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(groups, f, ensure_ascii=False, indent=4)
        logger.info(f"Saved {len(groups)} groups to {filename}")
    
    def process_groups_for_recommendation(self, groups: List[Dict]) -> List[Dict]:
        """
        Process group data to make it suitable for AI recommendation engine
        Extracts and standardizes relevant fields
        """
        processed_groups = []
        
        for group in groups:
            processed_group = group.copy()
            
            # Extract topics/skills from group name where possible
            name = group.get("name", "").lower()
            
            # Extract potential skills/technologies
            tech_keywords = [
                "javascript", "python", "java", "c++", "ruby", "php", "golang", "react", 
                "angular", "vue", "node", "express", "django", "flask", "spring", "bootstrap",
                "html", "css", "sql", "nosql", "mongodb", "mysql", "postgresql", "oracle",
                "aws", "azure", "gcp", "cloud", "devops", "data science", "machine learning",
                "ai", "artificial intelligence", "blockchain", "iot", "mobile", "android",
                "ios", "swift", "kotlin", "flutter", "react native", "full stack", "frontend",
                "backend", "ui", "ux", "design", "product", "agile", "scrum", "kanban",
                "mern", "mean", "lamp", "microservices", "docker", "kubernetes", "jenkins",
                "ci/cd", "testing", "qa", "security", "cyber security", "data engineering",
                "big data", "hadoop", "spark", "tableau", "power bi", "data visualization"
            ]
            
            # Look for tech keywords in the group name
            found_keywords = [keyword for keyword in tech_keywords if keyword in name]
            processed_group["tech_keywords"] = found_keywords
            
            # Determine if group is for beginners, intermediate, or advanced
            level_indicators = {
                "beginner": ["beginner", "basic", "fundamental", "101", "intro", "start"],
                "intermediate": ["intermediate", "mid-level"],
                "advanced": ["advanced", "expert", "professional", "master", "senior"]
            }
            
            for level, indicators in level_indicators.items():
                if any(indicator in name for indicator in indicators):
                    processed_group["level"] = level
                    break
            else:
                processed_group["level"] = "all"  # Default if no level is detected
            
            # Add group category based on name (simplified)
            if any(term in name for term in ["developer", "coding", "programming", "engineer", "mern", "stack"]):
                processed_group["category"] = "development"
            elif any(term in name for term in ["design", "ui", "ux", "user experience"]):
                processed_group["category"] = "design"
            elif any(term in name for term in ["data", "analytics", "science", "machine learning", "ai"]):
                processed_group["category"] = "data_science"
            elif any(term in name for term in ["management", "leader", "agile", "scrum", "product"]):
                processed_group["category"] = "management"
            elif any(term in name for term in ["testing", "qa", "quality"]):
                processed_group["category"] = "testing"
            elif any(term in name for term in ["devops", "cloud", "aws", "azure", "gcp"]):
                processed_group["category"] = "devops"
            else:
                processed_group["category"] = "other"
            
            processed_groups.append(processed_group)
        
        return processed_groups

if __name__ == "__main__":
    # Example usage
    scraper = HerkeyGroupScraper(headless=False)  # Set to True for headless mode
    
    logger.info("Trying main method (clicking on group names)...")
    groups = scraper.scrape_groups(max_scroll=3)
    
    # If main method doesn't work well, try alternative method
    if not groups or not any(group.get('group_url') for group in groups):
        logger.info("Main method didn't get URLs, trying alternative method...")
        scraper2 = HerkeyGroupScraper(headless=False)
        groups = scraper2.scrape_groups_alternative_method(max_scroll=3)
    
    # Save raw group data
    scraper.save_to_json(groups)
    logger.info('Saved groups to JSON')
    
    # Print sample of URLs found
    urls_found = [group for group in groups if group.get('group_url')]
    logger.info(f"Found URLs for {len(urls_found)} out of {len(groups)} groups")
    if urls_found:
        logger.info("Sample URLs:")
        for group in urls_found[:3]:
            logger.info(f"- {group['name']}: {group['group_url']}")

2025-05-19 22:38:17,504 - __main__ - INFO - Scraping events from https://events.herkey.com/events
2025-05-19 22:38:19,533 - __main__ - INFO - Scraped 8 events
2025-05-19 22:38:19,535 - __main__ - INFO - Saved 8 events to herkey_events.json



Recommended events for Jane Doe:
1. Advance Your Career with a Premier Executive MBA from Great Lakes
   Date: 16th Apr, 2025 to 31st May, 2025 | Time: 10:00am to 6:00pm
   Type: Online | Price: Free
   Categories: Career Development, event

2. #STEMTheBias Scholarship - Avail Scholarships up to Rs. 80,000
   Date: 3rd Mar, 2025 to 25th Mar, 2025 | Time: 10:00am to 11:59pm
   Type: Online | Price: Free
   Categories: Career Development

3. LeadHERs in Tech: Tech Meets Talent , Innovation Meets Inspiration
   Date: 21st Mar, 2025 | Time: 8:30am to 4:00pm
   Type: Offline | Price: Free
   Categories: Career Development, Women In Tech, networking

4. SkillReBoot program: Restart Your Career Journey
   Date: 21st Apr, 2025 to 20th May, 2025 | Time: 6:00pm to 11:00pm
   Type: Online | Price: ₹ 1
₹ 30000
   Categories: Career Development

5. HerFreshStart: Scholarships of up to 75% for Mothers Returning to Work
   Date: 8th May, 2025 to 25th May, 2025 | Time: 3:43pm to 6:00pm
   Type: Offli

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import json
from typing import List, Dict
import logging
import re

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class HerkeyGroupScraper:
    def __init__(self, headless=True):
        """Initialize the group scraper with optional headless mode"""
        self.chrome_options = Options()
        if headless:
            self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--window-size=1920,1080")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
    
    def scrape_groups(self, url: str = "https://www.herkey.com/groups", max_scroll=5, scroll_pause=2) -> List[Dict]:
        """
        Scrape group listings from Herkey
        
        Args:
            url: The URL to scrape
            max_scroll: Maximum number of scroll actions to perform (to load more groups)
            scroll_pause: Time to pause between scrolls (seconds)
            
        Returns:
            List of group dictionaries containing details of each group
        """
        driver = webdriver.Chrome(options=self.chrome_options)
        
        try:
            # Load the page
            driver.get(url)
            logger.info("Page loaded, waiting for group elements...")
            
            # Wait for group elements to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "[data-test-id='featured-group']"))
            )
            
            # Scroll down to load more groups
            for i in range(max_scroll):
                logger.info(f"Scrolling iteration {i+1}/{max_scroll}...")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scroll_pause)
            
            # Get page source after all content is loaded
            page_content = driver.page_source
            soup = BeautifulSoup(page_content, 'html.parser')
            
            # Find all group container elements
            group_elements = soup.select("div.MuiGrid-container.css-1d3bbye")
            logger.info(f"Found {len(group_elements)} group elements")
            
            groups = []
            for group_element in group_elements:
                try:
                    group = {}
                    
                    # Extract group name
                    name_elem = group_element.select_one("[data-test-id='group-name']")
                    if name_elem:
                        group["name"] = name_elem.text.strip()
                    
                    # Extract group type (private/public)
                    type_elem = group_element.select_one("[data-test-id='group-type']")
                    if type_elem:
                        group["type"] = type_elem.text.strip().lower()
                    
                    # Extract member count
                    members_elem = group_element.select_one("[data-test-id='group-members-count']")
                    if members_elem:
                        members_text = members_elem.text.strip()
                        # Extract number from text like "43 Members"
                        members_match = re.search(r'(\d+)\s+Members?', members_text)
                        if members_match:
                            group["member_count"] = int(members_match.group(1))
                    
                    # Check if it's a featured group
                    featured_elem = group_element.select_one("[data-test-id='featured-icon']")
                    group["featured"] = featured_elem is not None
                    
                    # Extract group icon URL
                    icon_elem = group_element.select_one("[data-test-id='group-icon']")
                    if icon_elem and icon_elem.has_attr('src'):
                        group["icon_url"] = icon_elem['src']
                    
                    # Extract banner image if available
                    banner_elem = group_element.select_one(".css-12c20jy img")
                    if banner_elem and banner_elem.has_attr('src'):
                        group["banner_url"] = banner_elem['src']
                    
                    # Extract group category/topic if available
                    # Note: This might not be directly visible in the HTML snippet provided
                    # We'll use a more generic approach to try and find it
                    category_elem = group_element.select_one(".MuiTypography-root.capitalize")
                    if category_elem:
                        group["category"] = category_elem.text.strip()
                    
                    # Extract join button status
                    join_btn = group_element.select_one("[data-test-id='join-btn'] button")
                    if join_btn:
                        group["joinable"] = True
                        group["join_button_text"] = join_btn.text.strip()
                    else:
                        group["joinable"] = False
                    
                    if "name" in group:  # Only add if we have at least a name
                        groups.append(group)
                        
                except Exception as e:
                    logger.error(f"Error extracting group data: {e}")
            
            return groups
            
        except TimeoutException:
            logger.error("Timeout waiting for page to load")
            return []
        
        except Exception as e:
            logger.error(f"Error: {e}")
            return []
            
        finally:
            driver.quit()
    
    def save_to_json(self, groups: List[Dict], filename: str = "herkey_groups.json"):
        """Save scraped groups to a JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(groups, f, ensure_ascii=False, indent=4)
        logger.info(f"Saved {len(groups)} groups to {filename}")
    
    def process_groups_for_recommendation(self, groups: List[Dict]) -> List[Dict]:
        """
        Process group data to make it suitable for AI recommendation engine
        Extracts and standardizes relevant fields
        """
        processed_groups = []
        
        for group in groups:
            processed_group = group.copy()
            
            # Extract topics/skills from group name where possible
            name = group.get("name", "").lower()
            
            # Extract potential skills/technologies
            tech_keywords = [
                "javascript", "python", "java", "c++", "ruby", "php", "golang", "react", 
                "angular", "vue", "node", "express", "django", "flask", "spring", "bootstrap",
                "html", "css", "sql", "nosql", "mongodb", "mysql", "postgresql", "oracle",
                "aws", "azure", "gcp", "cloud", "devops", "data science", "machine learning",
                "ai", "artificial intelligence", "blockchain", "iot", "mobile", "android",
                "ios", "swift", "kotlin", "flutter", "react native", "full stack", "frontend",
                "backend", "ui", "ux", "design", "product", "agile", "scrum", "kanban",
                "mern", "mean", "lamp", "microservices", "docker", "kubernetes", "jenkins",
                "ci/cd", "testing", "qa", "security", "cyber security", "data engineering",
                "big data", "hadoop", "spark", "tableau", "power bi", "data visualization"
            ]
            
            # Look for tech keywords in the group name
            found_keywords = [keyword for keyword in tech_keywords if keyword in name]
            processed_group["tech_keywords"] = found_keywords
            
            # Determine if group is for beginners, intermediate, or advanced
            level_indicators = {
                "beginner": ["beginner", "basic", "fundamental", "101", "intro", "start"],
                "intermediate": ["intermediate", "mid-level"],
                "advanced": ["advanced", "expert", "professional", "master", "senior"]
            }
            
            for level, indicators in level_indicators.items():
                if any(indicator in name for indicator in indicators):
                    processed_group["level"] = level
                    break
            else:
                processed_group["level"] = "all"  # Default if no level is detected
            
            # Add group category based on name (simplified)
            if any(term in name for term in ["developer", "coding", "programming", "engineer", "mern", "stack"]):
                processed_group["category"] = "development"
            elif any(term in name for term in ["design", "ui", "ux", "user experience"]):
                processed_group["category"] = "design"
            elif any(term in name for term in ["data", "analytics", "science", "machine learning", "ai"]):
                processed_group["category"] = "data_science"
            elif any(term in name for term in ["management", "leader", "agile", "scrum", "product"]):
                processed_group["category"] = "management"
            elif any(term in name for term in ["testing", "qa", "quality"]):
                processed_group["category"] = "testing"
            elif any(term in name for term in ["devops", "cloud", "aws", "azure", "gcp"]):
                processed_group["category"] = "devops"
            else:
                processed_group["category"] = "other"
            
            processed_groups.append(processed_group)
        
        return processed_groups

if __name__ == "__main__":
    # Example usage
    scraper = HerkeyGroupScraper(headless=False)  # Set to True for headless mode
    groups = scraper.scrape_groups(max_scroll=3)
    
    # Save raw group data
    scraper.save_to_json(groups)
    

2025-05-19 22:38:23,431 - __main__ - INFO - Page loaded, waiting for group elements...
2025-05-19 22:38:24,586 - __main__ - INFO - Scrolling iteration 1/3...
2025-05-19 22:38:26,706 - __main__ - INFO - Scrolling iteration 2/3...
2025-05-19 22:38:28,825 - __main__ - INFO - Scrolling iteration 3/3...
2025-05-19 22:38:30,942 - __main__ - INFO - Found 103 group elements
2025-05-19 22:38:33,173 - __main__ - INFO - Saved 102 groups to herkey_groups.json



Recommended groups for Jane Doe:
1. Women Engineers
   Type: public | Members: 485
   Category: development | Level: all
   Featured: No

2. Full Stack (MERN) Developer Program
   Type: private | Members: N/A
   Category: development | Level: all
   Featured: Yes

3. Ambassadors Club
   Type: private | Members: 4129
   Category: other | Level: all
   Featured: Yes

4. HerRising
   Type: public | Members: 39405
   Category: other | Level: all
   Featured: Yes

5. Open Ceiling Club
   Type: private | Members: 134
   Category: other | Level: all
   Featured: Yes



In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import time
import json
import logging
import re
from datetime import datetime
from typing import List, Dict

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class HerkeySessionScraper:
    def __init__(self, headless=True):
        """Initialize the session scraper with optional headless mode"""
        self.chrome_options = Options()
        if headless:
            self.chrome_options.add_argument("--headless")
        self.chrome_options.add_argument("--disable-gpu")
        self.chrome_options.add_argument("--window-size=1920,1080")
        self.chrome_options.add_argument("--no-sandbox")
        self.chrome_options.add_argument("--disable-dev-shm-usage")
    
    def scrape_sessions(self, url: str = "https://www.herkey.com/sessions", max_scroll=5, scroll_pause=2) -> List[Dict]:
        """
        Scrape session listings from Herkey
        
        Args:
            url: The URL to scrape
            max_scroll: Maximum number of scroll actions to perform (to load more sessions)
            scroll_pause: Time to pause between scrolls (seconds)
            
        Returns:
            List of session dictionaries containing details of each session
        """
        driver = webdriver.Chrome(options=self.chrome_options)
        
        try:
            # Load the page
            driver.get(url)
            logger.info("Page loaded, waiting for session elements...")
            
            # Wait for session elements to load
            WebDriverWait(driver, 20).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "[datatestid='session-card']"))
            )
            
            # Scroll down to load more sessions
            for i in range(max_scroll):
                logger.info(f"Scrolling iteration {i+1}/{max_scroll}...")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(scroll_pause)
            
            # Get all session cards
            session_elements = driver.find_elements(By.CSS_SELECTOR, "[datatestid='session-card'], [data-test-id='session-card']")
            logger.info(f"Found {len(session_elements)} session listings")
            
            sessions = []
            for session_element in session_elements:
                try:
                    session = {}
                    
                    # Extract session time, date
                    try:
                        time_data = session_element.find_element(By.CSS_SELECTOR, "[data-test-id='time-data']").text
                        if time_data:
                            # Parse date and time (format: "24 Apr 25 | 10:30 AM")
                            date_time_parts = time_data.split('|')
                            if len(date_time_parts) > 0:
                                session["date"] = date_time_parts[0].strip()
                            if len(date_time_parts) > 1:
                                session["time"] = date_time_parts[1].strip()
                    except:
                        logger.debug("Could not extract time data")
                    
                    # Extract session title
                    try:
                        discussion_subject = session_element.find_element(By.CSS_SELECTOR, "[data-test-id='discussion-subject'] p").text
                        if discussion_subject:
                            session["title"] = discussion_subject
                    except:
                        logger.debug("Could not extract session title")
                    
                    # Extract host info
                    try:
                        host_element = session_element.find_element(By.CSS_SELECTOR, "[data-test-id='nav-to-user-profile'] h6")
                        if host_element:
                            session["host"] = host_element.text
                    except:
                        logger.debug("Could not extract host name")
                    
                    # Extract host headline/role if available
                    try:
                        headline_element = session_element.find_element(By.CSS_SELECTOR, "[data-test-id='headline']")
                        if headline_element and headline_element.text.strip():
                            session["host_headline"] = headline_element.text
                    except:
                        logger.debug("Could not extract host headline")
                    
                    # Get host stage/level
                    try:
                        stage_element = session_element.find_element(By.CSS_SELECTOR, "[data-test-id='profile-stage'] span")
                        if stage_element:
                            session["host_stage"] = stage_element.text
                    except:
                        logger.debug("Could not extract host stage")
                    
                    # Get participant count
                    try:
                        participant_element = session_element.find_element(By.CSS_SELECTOR, "[data-test-id='profile-pic']")
                        participant_text = participant_element.text
                        if participant_text and participant_text.startswith("+"):
                            session["participant_count"] = int(participant_text.replace("+", "").strip())
                    except:
                        logger.debug("Could not extract participant count")
                    
                    # Get session type (past/upcoming)
                    try:
                        session_type = session_element.get_attribute("data-sessiontype")
                        if session_type:
                            session["session_type"] = session_type
                    except:
                        logger.debug("Could not extract session type")
                    
                    # Get session ID
                    try:
                        session_id = session_element.get_attribute("data-id")
                        if session_id:
                            session["id"] = session_id
                            # Construct URL
                            if "title" in session:
                                slug = session["title"].lower().replace(" ", "-")
                                session["url"] = f"{url}/{slug}/{session_id}"
                    except:
                        logger.debug("Could not extract session ID")
                    
                    # Check if it's a video session
                    try:
                        video_element = session_element.find_element(By.CSS_SELECTOR, "div[style*='youtube.com']")
                        if video_element:
                            session["is_video"] = True
                            video_url = video_element.get_attribute("style")
                            # Extract YouTube video ID
                            youtube_match = re.search(r'youtube.com/vi/([^/]+)/', video_url)
                            if youtube_match:
                                session["youtube_id"] = youtube_match.group(1)
                    except:
                        session["is_video"] = False
                    
                    # Get session status/action button text
                    try:
                        status_button = session_element.find_element(By.CSS_SELECTOR, "[data-test-id='discussion-status-button'] button p")
                        if status_button:
                            session["action"] = status_button.text.strip()
                    except:
                        logger.debug("Could not extract action button text")
                    
                    if session:  # Only add if we found some data
                        sessions.append(session)
                except Exception as e:
                    logger.error(f"Error extracting session data: {e}")
                    continue
            
            return sessions
            
        except TimeoutException:
            logger.error("Timeout waiting for page to load")
            return []
        
        except Exception as e:
            logger.error(f"Error: {e}")
            return []
            
        finally:
            driver.quit()
    
    def save_to_json(self, sessions: List[Dict], filename: str = "herkey_sessions.json"):
        """Save scraped sessions to a JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(sessions, f, ensure_ascii=False, indent=4)
        logger.info(f"Saved {len(sessions)} sessions to {filename}")
    
    def process_sessions_for_recommendation(self, sessions: List[Dict]) -> List[Dict]:
        """
        Process session data to make it suitable for AI recommendation engine
        Extracts and standardizes dates, topics and other relevant fields
        """
        processed_sessions = []
        
        for session in sessions:
            processed_session = session.copy()
            
            # Process date to datetime object if possible
            date_str = session.get("date", "")
            time_str = session.get("time", "")
            datetime_str = f"{date_str} {time_str}".strip()

            # In the process_sessions_for_recommendation method
            try:
                # Try various date formats
                date_obj = None  # Initialize date_obj to avoid the UnboundLocalError
                
                if len(date_str.split()) == 3:  # Format: "24 Apr 25"
                    date_obj = datetime.strptime(datetime_str, "%d %b %y %I:%M %p")
                else:
                    # Try other formats if the first one fails
                    date_formats = [
                        "%d %b %Y %I:%M %p",  # 24 Apr 2025 10:30 AM
                        "%d %B %Y %I:%M %p",  # 24 April 2025 10:30 AM
                        "%b %d, %Y %I:%M %p"  # Apr 24, 2025 10:30 AM
                    ]
                    
                    for date_format in date_formats:
                        try:
                            date_obj = datetime.strptime(datetime_str, date_format)
                            break
                        except ValueError:
                            continue
                
                # Check if date_obj was successfully set
                if date_obj:
                    processed_session["datetime_obj"] = date_obj
                    processed_session["is_upcoming"] = date_obj > datetime.now()
                else:
                    # Handle case where no format worked
                    raise ValueError("Could not parse date string")
                    
            except (ValueError, TypeError):
                # If we can't parse the date, determine upcoming from session_type
                if session.get("session_type") == "upcoming":
                    processed_session["is_upcoming"] = True
                else:
                    processed_session["is_upcoming"] = False
            
            # Extract topics from title
            title = session.get("title", "").lower()
            topics = []
            
            # Common tech topics to check in the title
            tech_topics = [
                "python", "javascript", "react", "angular", "vue", "node", "java", "c++", "c#", 
                "php", "ruby", "golang", "rust", "swift", "kotlin", "sql", "nosql", "mongodb",
                "database", "cloud", "aws", "azure", "gcp", "docker", "kubernetes", "devops",
                "ai", "machine learning", "data science", "deep learning", "nlp", "computer vision",
                "blockchain", "iot", "mobile", "web", "frontend", "backend", "fullstack", "ui", "ux",
                "testing", "qa", "security", "agile", "scrum", "kanban", "product management",
                "power bi", "tableau", "excel", "analytics", "big data", "hadoop", "spark", 
                "cybersecurity", "networking", "seo", "digital marketing", "dax"
            ]
            
            for topic in tech_topics:
                if topic in title:
                    topics.append(topic)
            
            processed_session["extracted_topics"] = topics
            
            # Determine if session is technical or soft-skills
            tech_indicators = ["programming", "code", "developer", "software", "tech", "data", 
                               "engineering", "algorithm", "system", "database", "cloud", "devops"]
            
            softskill_indicators = ["career", "leadership", "management", "communication",
                                   "soft skill", "interview", "resume", "cv", "personal",
                                   "growth", "mindset", "wellbeing", "mental health"]
            
            tech_score = sum(1 for indicator in tech_indicators if indicator in title)
            softskill_score = sum(1 for indicator in softskill_indicators if indicator in title)
            
            if tech_score > softskill_score:
                processed_session["session_category"] = "technical"
            elif softskill_score > tech_score:
                processed_session["session_category"] = "soft skills"
            else:
                processed_session["session_category"] = "general"
            
            # Determine experience level based on title
            if any(x in title for x in ["beginner", "basic", "introduction", "101", "fundamentals"]):
                processed_session["experience_level"] = "beginner"
            elif any(x in title for x in ["advanced", "expert", "mastery", "professional"]):
                processed_session["experience_level"] = "advanced"
            elif any(x in title for x in ["intermediate", "part 2", "level 2"]):
                processed_session["experience_level"] = "intermediate"
            else:
                processed_session["experience_level"] = "all levels"
            
            processed_sessions.append(processed_session)
        
        return processed_sessions



if __name__ == "__main__":
    # Example usage
    scraper = HerkeySessionScraper(headless=False)  # Set to True for headless mode
    sessions = scraper.scrape_sessions(max_scroll=3)
    
    # Save raw session data
    scraper.save_to_json(sessions)
    

2025-05-26 11:52:52,320 - __main__ - INFO - Page loaded, waiting for session elements...
2025-05-26 11:52:52,342 - __main__ - INFO - Scrolling iteration 1/3...
2025-05-26 11:52:54,357 - __main__ - INFO - Scrolling iteration 2/3...
2025-05-26 11:52:56,368 - __main__ - INFO - Scrolling iteration 3/3...
2025-05-26 11:52:58,407 - __main__ - INFO - Found 36 session listings
2025-05-26 11:53:07,777 - __main__ - INFO - Saved 36 sessions to herkey_sessions.json



Recommended sessions for John Doe:
1. Lookups Unleashed: Your Ultimate Excel Guide
   Date: 30 May 25 | Time: 2:00 PM
   Host: Rajashree Das
   Type: upcoming | Category: general
   Topics: ui, excel

2. Power BI Master Class (Forage Project)
   Date: 27 May 25 | Time: 10:30 AM
   Host: Swati Agarwal
   Type: upcoming | Category: general
   Topics: power bi

3. Power BI master class (Forage project)
   Date: Live | Time: N/A
   Host: Swati Agarwal
   Type: live | Category: general
   Topics: power bi

4. SQL Training for beginners by Intellipaat
   Date: Today | Time: 1:00 PM
   Host: Priya
   Type: upcoming | Category: general
   Topics: sql, ai

5. SQL Training for beginners by Intellipaat
   Date: 25 May 25 | Time: 1:00 PM
   Host: Priya
   Type: past | Category: general
   Topics: sql, ai



In [9]:
import requests
from bs4 import BeautifulSoup
from typing import List, Dict
import json
import logging
import re
from datetime import datetime

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class HerkeyEventScraper:
    def __init__(self):
        """Initialize the event scraper"""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
    
    def _get_page_content(self, url: str) -> str:
        """Get HTML content from a URL"""
        try:
            response = requests.get(url, headers=self.headers)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching URL {url}: {e}")
            return ""
    
    def scrape_events(self, url: str = "https://events.herkey.com/events") -> List[Dict]:
        """Scrape events from HerKey events page."""
        logger.info(f"Scraping events from {url}")
        html_content = self._get_page_content(url)
        if not html_content:
            return []
            
        soup = BeautifulSoup(html_content, 'html.parser')
        events = []
        
        # Extract event listings based on the provided HTML structure
        event_cards = soup.select(".event-details-card")
        
        for card in event_cards:
            try:
                event = {}
                
                # Extract event details based on the actual HTML structure
                title_elem = card.select_one(".card-heading")
                date_elem = card.select_one(".card-body-data:has(img[src*='calendar'])")
                time_elem = card.select_one(".card-body-data:has(img[src*='clock'])")
                location_elem = card.select_one(".card-body-data:has(img[src*='placeholder'])")
                event_type_elem = card.select_one("span.mr-1, .card-body-data:has(i.fa-bullseye)")
                category_elem = card.select_one(".card-body-data:has(img[src*='tag'])")
                
                if title_elem:
                    title_text = title_elem.get_text(strip=True)
                    # Remove any icon text from the title
                    if title_text:
                        event["title"] = title_text.split("Featured")[0].strip()
                
                if date_elem:
                    date_text = date_elem.get_text(strip=True)
                    if date_text:
                        event["date"] = date_text
                
                if time_elem:
                    time_text = time_elem.get_text(strip=True)
                    if time_text:
                        event["time"] = time_text
                
                if location_elem:
                    location_text = location_elem.get_text(strip=True)
                    if location_text:
                        event["location"] = location_text
                
                if event_type_elem:
                    event_type = event_type_elem.get_text(strip=True)
                    if event_type and "Online" in event_type or "Offline" in event_type:
                        event["event_type"] = event_type
                
                if category_elem:
                    # Parse categories (they're in a special format)
                    category_links = category_elem.select("a")
                    if category_links:
                        event["categories"] = [link.text.strip() for link in category_links]
                
                # Get the registration button type
                register_btn = card.select_one(".register")
                if register_btn:
                    event["registration_status"] = "Open"
                    event["registration_text"] = register_btn.text.strip()
                
                # Check if it's a paid event
                price_elem = card.select_one(".card-body-data:contains('₹')")
                if price_elem:
                    event["price"] = price_elem.text.strip()
                else:
                    event["price"] = "Free"
                
                # Get event ID and URL
                event_id_span = card.select_one("span[id]")
                if event_id_span and event_id_span.get('id'):
                    event_id = event_id_span.get('id')
                    if "title" in event:
                        event["url"] = f"{url}/{event['title'].lower().replace(' ', '-')}/{event_id}"
                
                # Check if it's a featured event
                featured_elem = card.select_one("img[src*='Featured'], img[width='22'][height='22']")
                if featured_elem:
                    event["featured"] = True
                
                if event:  # Only add if we found some data
                    events.append(event)
            except Exception as e:
                logger.error(f"Error extracting event data: {e}")
                
        logger.info(f"Scraped {len(events)} events")
        return events
    
    def save_to_json(self, events: List[Dict], filename: str = "herkey_events.json"):
        """Save scraped events to a JSON file"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(events, f, ensure_ascii=False, indent=4)
        logger.info(f"Saved {len(events)} events to {filename}")
    
    def process_events_for_recommendation(self, events: List[Dict]) -> List[Dict]:
        """
        Process event data to make it suitable for AI recommendation engine
        Extracts and standardizes categories, dates and other relevant fields
        """
        processed_events = []
        
        for event in events:
            processed_event = event.copy()
            
            # Process date to datetime object if possible
            date_str = event.get("date", "")
            try:
                # Example format: "25 Apr 2023" - adjust pattern as needed
                date_obj = datetime.strptime(date_str, "%d %b %Y")
                processed_event["datetime_obj"] = date_obj
                processed_event["is_upcoming"] = date_obj > datetime.now()
            except (ValueError, TypeError):
                # If we can't parse the date, keep it as is
                processed_event["is_upcoming"] = True  # Assume upcoming by default
            
            # Standardize event type
            event_type = event.get("event_type", "").lower()
            if "online" in event_type:
                processed_event["mode"] = "online"
            elif "offline" in event_type or "in-person" in event_type:
                processed_event["mode"] = "offline"
            else:
                processed_event["mode"] = "unknown"
            
            # Standardize categories
            categories = event.get("categories", [])
            processed_event["categories_lower"] = [cat.lower() for cat in categories]
            
            # Determine if event is free
            price = event.get("price", "")
            processed_event["is_free"] = price == "Free" or "free" in price.lower()
            
            processed_events.append(processed_event)
        
        return processed_events


def recommend_events(candidate_profile: Dict, events: List[Dict], num_recommendations: int = 5) -> List[Dict]:
    """
    Recommend events based on candidate profile
    
    Args:
        candidate_profile: Dictionary containing candidate interests, career stage, etc.
        events: List of event dictionaries
        num_recommendations: Number of events to recommend
        
    Returns:
        List of recommended event dictionaries
    """
    # Process events for recommendation if they haven't been processed
    if events and "categories_lower" not in events[0]:
        scraper = HerkeyEventScraper()
        events = scraper.process_events_for_recommendation(events)
    
    scored_events = []
    
    # Get candidate preferences
    interests = [interest.lower() for interest in candidate_profile.get("interests", [])]
    preferred_mode = candidate_profile.get("preferred_event_mode", "").lower()
    preferred_locations = [loc.lower() for loc in candidate_profile.get("preferred_locations", [])]
    career_stage = candidate_profile.get("career_stage", "").lower()
    
    # Only consider upcoming events
    upcoming_events = [event for event in events if event.get("is_upcoming", True)]
    
    for event in upcoming_events:
        score = 0
        
        # Match interests with event categories
        event_categories = event.get("categories_lower", [])
        for interest in interests:
            if any(interest in category for category in event_categories):
                score += 10
        
        # Match event mode preference (online/offline)
        event_mode = event.get("mode", "unknown")
        if preferred_mode and event_mode == preferred_mode:
            score += 5
        
        # Match location preference if it's an offline event
        if event_mode == "offline":
            location = event.get("location", "").lower()
            if any(loc in location for loc in preferred_locations):
                score += 5
        
        # Free events might be more appealing
        if event.get("is_free", False):
            score += 2
        
        # Featured events might be more relevant/important
        if event.get("featured", False):
            score += 3
        
        # Career stage specific events
        event_title = event.get("title", "").lower()
        if career_stage in event_title or any(career_stage in cat for cat in event_categories):
            score += 4
        
        scored_events.append((score, event))
    
    # Sort by score in descending order and take top recommendations
    scored_events.sort(reverse=True, key=lambda x: x[0])
    recommended_events = [event for score, event in scored_events[:num_recommendations]]
    
    return recommended_events


if __name__ == "__main__":
    # Example usage
    scraper = HerkeyEventScraper()
    events = scraper.scrape_events()
    
    # Save raw event data
    scraper.save_to_json(events)
    
    # Process data for recommendation engine
    processed_events = scraper.process_events_for_recommendation(events)
    
    # Example candidate profile
    example_candidate = {
        "name": "Jane Doe",
        "interests": ["Technology", "Career Development", "Leadership", "AI"],
        "preferred_event_mode": "online",
        "preferred_locations": ["Bangalore", "Mumbai", "Delhi"],
        "career_stage": "mid-level"
    }
    
    # Get recommendations
    recommendations = recommend_events(example_candidate, processed_events)
    
    print(f"\nRecommended events for {example_candidate['name']}:")
    for i, event in enumerate(recommendations, 1):
        print(f"{i}. {event['title']}")
        print(f"   Date: {event['date']} | Time: {event.get('time', 'N/A')}")
        print(f"   Type: {event.get('event_type', 'N/A')} | Price: {event.get('price', 'N/A')}")
        print(f"   Categories: {', '.join(event.get('categories', []))}")
        print()

2025-05-26 12:01:27,758 - __main__ - INFO - Scraping events from https://events.herkey.com/events
2025-05-26 12:01:29,640 - __main__ - INFO - Scraped 7 events
2025-05-26 12:01:29,643 - __main__ - INFO - Saved 7 events to herkey_events.json



Recommended events for Jane Doe:
1. Advance Your Career with a Premier Executive MBA from Great Lakes
   Date: 16th Apr, 2025 to 31st May, 2025 | Time: 10:00am to 6:00pm
   Type: Online | Price: Free
   Categories: Career Development, event

2. #STEMTheBias Scholarship - Avail Scholarships up to Rs. 80,000
   Date: 3rd Mar, 2025 to 25th Mar, 2025 | Time: 10:00am to 11:59pm
   Type: Online | Price: Free
   Categories: Career Development

3. SkillReBoot program: Restart Your Career Journey
   Date: 21st Apr, 2025 to 30th Jun, 2025 | Time: 6:00pm to 11:00pm
   Type: Online | Price: ₹ 1
₹ 30000
   Categories: Career Development

4. HerFreshStart: Scholarships of up to 75% for Mothers Returning to Work
   Date: 8th May, 2025 to 25th May, 2025 | Time: 3:43pm to 11:59pm
   Type: Offline | Price: Free
   Categories: Career Development, Diversity Drive

5. herShakti
   Date: 8th Mar, 2025 to 10th Apr, 2025 | Time: 12:00am to 11:00pm
   Type: Online | Price: Free
   Categories: JobsForHer found