In [426]:
import csv
import os
import re
import sys

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By

In [394]:
# Open the ETH room-information listing in a Safari-driven browser session
driver = webdriver.Safari()
driver.get('https://ethz.ch/staffnet/en/service/rooms-and-buildings/roominfo.html')

# Parsed snapshot of the page as first loaded, i.e. taken before 'display all' is clicked
soup = BeautifulSoup(driver.page_source, 'html.parser')

# The 'display all' control is an anchor element; clicking it makes the driver display all rooms.
# find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector, which Selenium 4.3 removed.
button_display_all = driver.find_element(By.CSS_SELECTOR, 'a[title="display all"]')
button_display_all.click()

In [336]:
def retrieve_room_details(soup : BeautifulSoup, header) -> list:
    '''
    Returns a `list` with one room detail per label in `header`.

    Parameters
    ----------
    soup : parsed overview table of a single room. Its `th` cells (after the
        first two, which are skipped exactly as in `get_header`) are the labels;
        the k-th remaining label is paired with the k-th `td` cell.
    header : labels to extract, in the order the result should have.

    Returns
    -------
    list of the same length as `header`: the stripped cell text for every label
    present in `soup`, `np.nan` for every label that is missing.
    '''
    # The set of overview labels differs from room to room, so values are looked
    # up by label rather than by position.
    labels = [cell.text.strip() for cell in soup.find_all('th')[2:]]
    values = [cell.text.strip() for cell in soup.find_all('td')]

    # zip stops at the shorter list, so a label without a matching cell is simply
    # absent; if a label is repeated, its first occurrence wins.
    value_by_label = {}
    for label, value in zip(labels, values):
        value_by_label.setdefault(label, value)

    return [value_by_label.get(label, np.nan) for label in header]

def get_header(soup : BeautifulSoup) -> list:
    '''
    Returns the stripped text of every `th` cell in `soup` except the first two.

    These are the overview names of a room page (Building, Floor/Room, etc.).
    '''
    labels = []
    for cell in soup.find_all('th'):
        labels.append(cell.text.strip())

    # The two leading `th` cells are not part of the overview names
    del labels[:2]
    return labels

In [410]:
def room_schedule_header() -> list:
    '''
    Builds the column labels of the weekly schedule grid.

    Labels have the form `<hour>_<quarter>_<weekday>` and are ordered by time
    slot first, weekday second. Slots run in 15-minute steps from 07:00 up to
    and including 22:00 (hour 22 contributes only its `00` quarter), each for
    the six weekdays mo..sa, giving 366 labels.
    '''
    day_codes = ['mo', 'tu', 'we', 'th', 'fr', 'sa']
    minute_marks = ['00', '15', '30', '45']
    hour_marks = ['07', '08', '09'] + [str(h) for h in range(10, 23)]

    # All (hour, quarter) pairs; the last hour keeps only its first quarter
    time_slots = [
        (hh, mm)
        for hh in hour_marks
        for position, mm in enumerate(minute_marks)
        if hh != '22' or position == 0
    ]

    return ['{}_{}_{}'.format(hh, mm, day) for hh, mm in time_slots for day in day_codes]

In [None]:
# Scrape the overview table of a slice of the listed rooms into room_information.csv

# Rows of the room listing to process (0-based, both bounds inclusive)
FIRST_ROW = 65
LAST_ROW = 70

# Find all the elements with class 'trSubtext' on the listing page
rows = driver.find_elements(By.CSS_SELECTOR, '.trSubtext')

# Read area and detail link of every selected row BEFORE navigating anywhere:
# once the driver loads another page, the WebElements of the listing go stale
# and can no longer be queried.
room_links = []
for row in rows[FIRST_ROW:LAST_ROW + 1]:
    # Create a BeautifulSoup object from the row's HTML string
    row_soup = BeautifulSoup(row.get_attribute('innerHTML'), 'html.parser')

    # The second cell of the row holds the area of the room
    area = row_soup.find_all('td')[1].text.strip()

    # Link of the room for further information retrieval
    room_details_link = 'https://ethz.ch' + row_soup.find('a', class_='eth-link').get('href')

    room_links.append((area, room_details_link))

# Written as UTF-8 explicitly so the file matches the default encoding of pd.read_csv
with open('room_information.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)

    # Column labels; taken from the first room visited and written once
    header = None

    for area, room_details_link in room_links:
        # Accesses room-link
        driver.get(room_details_link)

        # The first '.table-matrix' element contains the room overview
        room_detail_tables = driver.find_elements(By.CSS_SELECTOR, '.table-matrix')
        overview_table = room_detail_tables[0].get_attribute('innerHTML')
        overview_table_soup = BeautifulSoup(overview_table, 'html.parser')

        if header is None:
            # 'Area' comes from the listing page, the remaining labels from the overview table
            header = ['Area'] + get_header(overview_table_soup)
            writer.writerow(header)

        # Area first, then one value per overview label (nan where this room lacks it)
        room_information = [area] + retrieve_room_details(overview_table_soup, header[1:])
        writer.writerow(room_information)

        



In [335]:
# Load the scraped room overview back from the CSV file for inspection
df = pd.read_csv('./room_information.csv')

# Show at most the first 15 rows
df.head(15)

Unnamed: 0,Area,Build­ing,Floor / Room,Room type,Seats,Seat­ing,Floor area,Floor shape
0,Zürich Zentrum,ETZ,J 91,Seminars / Courses,32.0,variable,49.61,flat
1,Zürich Zentrum,ETZ,K 61,Seminars / Courses,23.0,variable,130.76,
2,Zürich Zentrum,ETZ,K 66,Technology laboratory,,variable,104.79,flat
3,Zürich Zentrum,ETZ,K 91,Seminars / Courses,32.0,variable,49.61,flat
4,Lindau Eschikon,FMG,B 17.1,Meeting room,16.0,variable,49.45,flat
5,Lindau Eschikon,FMG,B 17.2,Seminars / Courses,24.0,variable,49.29,flat


In [445]:
# Shut down the browser session held by `driver`.
# NOTE(review): the recorded output of this cell is an InvalidSessionIdException —
# presumably the session was already gone when the cell ran; confirm.
driver.quit()

InvalidSessionIdException: Message: 


In [329]:
# Navigate the browser one step back in its history.
# NOTE(review): this cell is placed after the driver.quit() cell but carries a lower
# execution count; presumably it was run while the session was still open — confirm
# whether it is still needed.
driver.back()

In [444]:
def _quarter_hours_in_time_stamp(time_stamp : str) -> int:
    '''
    Returns the number of whole 15-minute slots between the two clock times of a
    text ending in "HH:MM  o'clock to HH:MM  o'clock)".

    Returns 0 when the text does not end in such a time span or when the end
    time is not after the start time.
    '''
    pattern = r'(\d{2}:\d{2})  o\'clock to (\d{2}:\d{2})  o\'clock\)$'
    match = re.search(pattern, time_stamp)
    if not match:
        return 0

    def to_minutes(clock):
        hours, minutes = clock.split(':')
        return int(hours) * 60 + int(minutes)

    # Real minute arithmetic: subtracting the HHMM digits as plain integers
    # mis-measures every span that does not start and end on the same minute
    # (e.g. 10:15-12:00 is 105 minutes, not 60).
    duration = to_minutes(match.group(2)) - to_minutes(match.group(1))
    return max(0, duration // 15)


def retrieve_room_schedule(room_url : str, nbr_elements_schedule:int = 366) -> list:
    '''
    Returns the schedule of a room as a flat list.

    Loads `room_url` in the module-level `driver`, parses the first
    '.scrollarea-content' element and walks the `td` cells of its `tbody` row by
    row. A cell carrying an `aria-label` marks the start of an event: the text
    before the first comma of the label is stored at the cell's position and at
    the same column of the following rows for the duration of the event.

    Parameters
    ----------
    room_url : URL of the room-allocation page.
    nbr_elements_schedule : length of the returned list (the default, 366, is the
        number of labels produced by `room_schedule_header`).

    Returns
    -------
    list of length `nbr_elements_schedule`; event names where an event was found,
    `np.nan` elsewhere. Positions beyond the end of the list are dropped.
    '''
    # Cells per grid row: one per weekday label of room_schedule_header (mo..sa)
    days_per_row = 6

    schedule = [np.nan] * nbr_elements_schedule

    driver.get(room_url)

    schedule_html = driver.find_elements(By.CSS_SELECTOR, '.scrollarea-content')[0].get_attribute('innerHTML')
    room_soup = BeautifulSoup(schedule_html, 'html.parser')
    table = room_soup.find('tbody')

    # NOTE(review): `index` advances once per <td>, which assumes every row holds
    # exactly one <td> per weekday; rows shortened by rowspan cells would shift
    # all later positions — confirm against the page markup.
    index = 0

    for row in table.find_all('tr'):
        for day in row.find_all('td'):
            # Only the cell's own attribute counts; a nested element mentioning
            # 'aria-label' must not be mistaken for an event cell
            label = day.get('aria-label')

            if label:
                parts = label.split(',')

                # Name of the event
                module = parts[0]

                # Duration of the event — presumably the fourth comma-separated
                # piece of the label holds the time span; verify for event names
                # that themselves contain commas
                time_stamp = parts[3] if len(parts) > 3 else ''

                # The starting slot is always filled, even when no time span
                # could be parsed
                slot_count = max(1, _quarter_hours_in_time_stamp(time_stamp))

                # Same weekday column, one grid row further per quarter hour
                for offset in range(slot_count):
                    target = index + offset * days_per_row
                    if target >= len(schedule):
                        break
                    schedule[target] = module

            index += 1

    return schedule

# Smoke test: print the schedule list for room CHN D 42, queried from 2023-02-27 to 2023-04-01
print(retrieve_room_schedule('https://ethz.ch/staffnet/en/service/rooms-and-buildings/roominfo/detail/room-allocation.html?room=CHN+D+42&from=2023-02-27&to=2023-04-01'))

[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 'Environmental and Agricultural Regulation: Law and Governance', nan, nan, nan, nan, nan, 'Environmental and Agricultural Regulation: Law and Governance', nan, nan, nan, nan, nan, 'Environmental and Agricultural Regulation: Law and Governance', nan, nan, nan, nan, nan, 'Komplexe Analysis', nan, nan, nan, nan, nan, 'Komplexe Analysis', nan, nan, nan, nan, nan, 'Komplexe Analysis', nan, nan, nan, nan, 'Analysis II: mehrere Variablen', 'Plant Ecology', 'Umweltproblemlösen II', nan, 'Parallele Programmierung', nan, 'Analysis II: mehrere Variablen', 'Plant Ecology', 'Umweltproblemlösen II', nan, 'Parallele Programmierung', nan, 'Analysis II: mehrere Variablen', 'Plant Ecology', 'Umweltproblemlösen II', nan, 'Parallele Programmierung', nan, 'Analysis II: mehrere Variablen', 'Plant Ecology', 'Umweltproblemlösen II', nan, 'Principles of Management for Sustaina