### Imports

In [1]:
import requests
from bs4 import BeautifulSoup
import hashlib
import os
import time
import urllib.parse
import re
import json

### Define URLs

In [2]:
# Seed pages for the crawl. Each entry is used as a starting point; the
# crawler only follows links whose host equals the host of the seed URL.
urls = [
    'https://amiv.ethz.ch/en/events/signup/6501bc6e5ff1d3cb04531966',
    'https://rw.ethz.ch/cse-life/apero-2020.html',
    # Add more URLs as needed.
]

### Scrape the pages and save them

In [None]:
# Global list of result records, one dict per page on which "apero" was found.
# crawl() appends to it; the driver code at the end of this cell writes it to
# a JSON file.
found_apero = []

def load_visited(filename):
    """Load the set of already-visited URLs from a JSON state file.

    The file is expected to hold a JSON array of URL strings. Loading is
    best-effort: any problem is reported on stdout and an empty set is
    returned so that the crawl can start from scratch.

    Parameters:
      - filename: Path of the JSON state file.

    Returns:
      A set of URL strings (empty if the file is missing, unreadable, not
      valid JSON, or does not contain a JSON array).
    """
    try:
        with open(filename, "r", encoding="utf-8") as f:
            visited_list = json.load(f)
    except (OSError, ValueError, TypeError) as e:
        # ValueError covers json.JSONDecodeError (e.g. an empty file) and
        # UnicodeDecodeError; OSError covers a missing/unreadable file.
        print(f"Could not load visited URLs from {filename}: {e}")
        return set()

    # Guard against valid-but-wrong JSON: set("abc") would silently yield a
    # set of characters and set({...}) a set of keys.
    if not isinstance(visited_list, list):
        print(
            f"Could not load visited URLs from {filename}: "
            f"expected a JSON list, got {type(visited_list).__name__}"
        )
        return set()

    # Keep only string entries; anything else cannot be a URL (and nested
    # lists/dicts would not even be hashable).
    return {entry for entry in visited_list if isinstance(entry, str)}

def save_visited(filename, visited):
    """Save the visited URLs to a JSON state file.

    The URLs are written as a sorted JSON array so that the file content is
    deterministic between runs. Saving is best-effort: failures are reported
    on stdout and never raised.

    Parameters:
      - filename: Path of the JSON state file (overwritten on success).
      - visited: Iterable (normally a set) of URL strings.
    """
    try:
        # Serialise *before* opening the file: opening with "w" truncates it,
        # so a serialisation error must not be able to wipe an existing,
        # valid state file.
        payload = json.dumps(sorted(visited), indent=2)
        with open(filename, "w", encoding="utf-8") as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        print(f"Could not save visited URLs to {filename}: {e}")

def extract_event_details(soup):
    """
    Heuristically extract an event's date, start/end time and location from
    a parsed page.

    Steps (a later step overrides an earlier one when it succeeds):
      1. Date: the text of the first <time> element, if present.
      2. Location: the text of the first element with CSS class 'location'
         or 'venue'; otherwise the text following "Venue:" / "Location:"
         (or the same keywords followed by '-') in the page's plain text.
      3. AMIV-style markup: a <div> with a generated 'jss<digits>' class
         whose string contains a date such as 24.12.2023, 24/12/2023 or
         24-12-2023. Date, start time and an optional end time are parsed
         from that div, and the next sibling 'jss<digits>' <div> is taken
         as the location.

    Parameters:
      - soup: The BeautifulSoup object of the page.

    Returns:
      A 4-tuple (date, start_time, end_time, location) of strings. Each
      value that could not be determined is the string "Not found".
    """
    date = "Not found"
    start_time = "Not found"
    end_time = "Not found"
    location = "Not found"

    # Try to find a <time> element in the page.
    time_element = soup.find('time')
    if time_element:
        date = time_element.get_text(strip=True)

    # Try to find an element with class 'location' or 'venue'.
    location_element = soup.find(class_='location') or soup.find(class_='venue')
    if location_element:
        location = location_element.get_text(strip=True)
    else:
        # As a fallback, search for text patterns in the full page text.
        # \w is Unicode-aware for str patterns, so names with umlauts or
        # accents (e.g. "Zürich") are captured in full; the match still
        # stops at the end of the line because '\n' is not in the class.
        full_text = soup.get_text("\n", strip=True)
        loc_match = re.search(
            r'(?:Venue|Location)[:\-]\s*([\w ,.\-]+)', full_text, re.IGNORECASE
        )
        if loc_match:
            location = loc_match.group(1).strip()

    # For the AMIV database, we can also check for specific patterns:
    #   - The date is usually in the format dd-./mm-./yyyy, hh:mm or
    #     d-./m-./yyyy, hh:mm.
    #   - The location is often mentioned in the html line after the date,
    #     usually preceded by a '/'.
    #
    # TODO: Fix this class-search!!
    # NOTE(review): per the BeautifulSoup documentation, `string=` combined
    # with a tag name filters on the tag's `.string`, which is None when the
    # tag has more than one child -- presumably why this lookup misses on
    # real pages; confirm against the actual AMIV markup.
    event_div = soup.find(
        "div",
        class_=re.compile(r"^jss\d+$"),
        string=re.compile(r'\d{1,2}[-\.\/]\d{1,2}[-\.\/]\d{4}'),
    )
    if event_div:
        event_text = event_div.get_text(" ", strip=True)
        # Search for a date and a time (start or time range) in the event block.
        dt_match = re.search(
            r'(\d{1,2}[-\.\/]\d{1,2}[-\.\/]\d{4})(?:,\s*|\s+)'  # date part with comma or whitespace separator
            r'(\d{1,2}:\d{2}(?::\d{2})?)'                       # starting time
            r'(?:\s*-\s*(\d{1,2}:\d{2}(?::\d{2})?))?',          # optional end time (time range)
            event_text
        )
        if dt_match:
            date = dt_match.group(1).strip()
            start_time = dt_match.group(2).strip()
            # end_time keeps its "Not found" default when no range is given.
            if dt_match.group(3):
                end_time = dt_match.group(3).strip()
        # For the location, assume it is in the div immediately following event_div.
        location_div = event_div.find_next_sibling("div", class_=re.compile(r"^jss\d+$"))
        if location_div:
            # Here we assume the location is the full text of that div.
            location = location_div.get_text(strip=True)

    return date, start_time, end_time, location

def crawl(url, domain, visited, depth=0, max_depth=3):
    """
    Recursively crawl a URL (and its subpages) to find occurrences of 'apero'.

    When 'apero' is detected in the page's HTML, a snippet around the first
    occurrence, the page title and the event details (date, start/end time,
    location) are appended as a dict to the global ``found_apero`` list and
    printed.

    Parameters:
      - url: The page URL to crawl.
      - domain: The host to restrict the crawl to (e.g., "example.com");
        compared against the ``netloc`` of every discovered link.
      - visited: A set of URLs already crawled; updated in place.
      - depth: Current recursion level.
      - max_depth: Maximum recursion depth allowed.

    Returns:
      None. Fetch errors and non-200 responses are reported on stdout and
      the page is skipped.
    """
    if url in visited:
        return
    visited.add(url)

    try:
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print(f"Skipping {url} due to status code {response.status_code}")
            return
    except Exception as e:
        # Deliberately broad: one bad page must never abort the whole crawl.
        print(f"Error fetching {url}: {e}")
        return

    html = response.text

    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.title.string.strip() if soup.title and soup.title.string else "No title"

    # Check if the word "apero" is present (case-insensitive)
    if "apero" in html.lower():
        # Extract event details (date, times and location)
        event_date, event_start_time, event_end_time, location = extract_event_details(soup)
        # Extract up to 100 characters before and after the first occurrence
        # of "apero". The snippet uses the same keyword as the check above so
        # that it cannot centre on an unrelated word such as "paper".
        match = re.search(r'.{0,100}apero.{0,100}', html, re.IGNORECASE)
        snippet = match.group(0) if match else "Snippet not available"
        found_apero.append({
            "url": url,
            "title": title,
            "snippet": snippet,
            "date": event_date,
            "start time": event_start_time,
            "end time": event_end_time,
            "location": location
        })
        print(f"Found 'apero' in: {url}")
        print(f"  Date: {event_date}")
        print(f"  Time: {event_start_time} to {event_end_time}")
        print(f"  Location: {location}")

    # At the depth limit no link would be followed, so skip link processing.
    if depth >= max_depth:
        return

    # Common non-HTML file types that are not worth fetching.
    skipped_extensions = ('.pdf', '.jpg', '.jpeg', '.png', '.gif')

    # Find and process all <a> tags on the page
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            continue

        try:
            # Resolve relative URLs to absolute URLs and drop the #fragment:
            # "page#a" and "page#b" are the same document and must be
            # fetched only once.
            absolute_url, _fragment = urllib.parse.urldefrag(
                urllib.parse.urljoin(url, href)
            )
            parsed_url = urllib.parse.urlparse(absolute_url)
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); ignore the link.
            continue

        # Only follow web links within the same domain
        if parsed_url.scheme not in ('http', 'https'):
            continue
        if parsed_url.netloc != domain:
            continue

        # Test the extension on the path only, so that a query string
        # ("file.pdf?download=1") does not hide it.
        if parsed_url.path.lower().endswith(skipped_extensions):
            continue

        # Recurse if not visited yet.
        if absolute_url not in visited:
            time.sleep(1)  # Pause a bit to be polite to the server
            crawl(absolute_url, domain, visited, depth + 1, max_depth)
    
# File to save visited URLs
state_filename = "visited_urls.json"

# Load the visited state from previous runs once. The same in-memory set is
# shared by all seed URLs, so re-reading the file for every seed would only
# repeat the I/O (and any "could not load" message).
visited = load_visited(state_filename)

for url in urls:
    print(f"Starting crawl from: {url}")
    domain = urllib.parse.urlparse(url).netloc

    # Begin crawling
    crawl(url, domain, visited)

    # Save after every seed so that progress survives an interrupted run.
    save_visited(state_filename, visited)

# Save the found "apero" data to a JSON file.
output_filename = "apero_results.json"

# Pages recorded as visited in an earlier run are skipped by crawl(), so this
# run's found_apero only holds hits from newly crawled pages. Merge with the
# previous results instead of overwriting them; otherwise a second run would
# replace the results file with an empty list.
try:
    with open(output_filename, "r", encoding="utf-8") as f:
        previous_results = json.load(f)
except (OSError, ValueError):
    previous_results = []
if not isinstance(previous_results, list):
    previous_results = []

# A fresh hit replaces an older record for the same URL.
fresh_urls = {entry["url"] for entry in found_apero}
all_results = [
    entry
    for entry in previous_results
    if isinstance(entry, dict) and entry.get("url") not in fresh_urls
]
all_results.extend(found_apero)

with open(output_filename, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

print(f"Apero data saved to {output_filename}")


Starting crawl from: https://amiv.ethz.ch/en/events/signup/6501bc6e5ff1d3cb04531966
Could not load visited URLs from visited_urls.json: Expecting value: line 1 column 1 (char 0)
Found 'apero' in: https://amiv.ethz.ch/en/events/signup/6501bc6e5ff1d3cb04531966
  Date: Not found
  Time: Not found to Not found
  Location: Not found


In [None]:
    # Work-in-progress fallback for AMIV pages: scan the page text line by
    # line for a "date, time" pattern instead of relying on CSS classes.
    # NOTE(review): this fragment is indented like a function body and uses
    # `full_text`, which is not defined in this cell -- presumably it is meant
    # to be pasted into extract_event_details(); confirm before running.

    # Split the text into lines.
    lines = full_text.splitlines()

    # Iterate over lines to find a date pattern.
    for i, line in enumerate(lines):
        # Group 1: date (d.m.yyyy with '-', '.' or '/' as separator);
        # group 2: start time (hh:mm, optionally with seconds).
        date_match = re.search(r'(\d{1,2}[-\.\/]\d{1,2}[-\.\/]\d{4}),\s*(\d{1,2}:\d{2}(?::\d{2})?)', line)
        if date_match:

            print(f"DEBUG: Matched line {i}: {line}")
            date = date_match.group(1).strip()
            start_time = date_match.group(2).strip()

            # The event end time is assumed to be two text lines after the
            # start time (for AMIV). The whole line is taken unparsed.
            end_time = lines[i+2] if i + 2 < len(lines) else "Not found"
    
            # Check the next line for the location info
            if i + 1 < len(lines):

                # TODO: The author reports that this lookup does not work.
                # NOTE(review): `lines` comes from splitlines(), so
                # lines[i + 1] is the next list element (a whole line), not
                # the next character; verify what `full_text` contains.
                print(f"DEBUG: Location? {i+1}: {lines[i + 1]}")
                next_line = lines[i + 1]
                # The greedy ".*" consumes as much as possible, so the group
                # captures from the LAST '/' that is followed by at least one
                # whitespace character up to the end of the line. The captured
                # text still starts with that '/'; strip() removes only
                # surrounding whitespace.
                loc_match = re.search(r'.*(\/\s+.*)$', next_line)
                if loc_match:
                    location = loc_match.group(1).strip()
            # Only the first line with a date is used.
            break