In [1]:
import os, sys
from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import csv
import requests
import pandas as pd
import numpy as np
import re
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [2]:
driver = webdriver.Safari()
driver.get('https://ethz.ch/staffnet/en/service/rooms-and-buildings/roominfo.html')


soup = BeautifulSoup(driver.page_source, 'html.parser')


# Driver displays all rooms
# Button is an anchor element
#button_display_all = soup.find('a', {'title': 'display all'})
button_display_all = driver.find_element_by_css_selector('a[title="display all"]')


#Click anchor element
button_display_all.click()

# Find all the 'tr' elements with class 'trSubtext'
#rows = soup.find_all('tr', {'class':'trSubtext'})
rows = driver.find_elements_by_css_selector('.trSubtext')

In [3]:
def retrieve_room_details(room_details_link, header) -> list:
    '''
    Returns `list` containing room details
    '''

    for i in range(3): print()
    print(f'URL: {room_details_link}')
    # Accesses room-link
    driver.get(room_details_link)

    # Wait until the page is fully loaded
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'table-matrix')))

    # Searches for the first table-matrix, containing the room overview
    room_detail_tables = driver.find_elements_by_css_selector('.table-matrix')
    print(room_detail_tables)
    overview_table = room_detail_tables[0].get_attribute('innerHTML')
    overview_table_soup = BeautifulSoup(overview_table, 'html.parser')

    all_details = overview_table_soup.find_all('td')

    # Since the the features are not always the same, we keep track only of the needed features and enter nan values for the rest
    room_detail = [np.nan for i in range(len(header))]
    overview_points = [point.text.strip() for point in overview_table_soup.find_all('th')[2:]]

    for i, point in enumerate(overview_points):
        if '\xad' in point:
            overview_points[i] = point.replace('\xad', '')


    set_header = set(header)
    set_overview_points = set(overview_points)


    intersection = set_header.intersection(set_overview_points)
    intersection_indeces = [i for i, x in enumerate(overview_points) if x in intersection]


    index = 0

    for i in range(len(header)):
        if not header[i] in intersection:
            continue 

        room_detail[i] = all_details[intersection_indeces[index]].text.strip()
        index += 1

    return room_detail

In [4]:
def retrieve_room_schedule(room_url:str, nbr_elements_schedule:int = 366) -> list:
    '''
    Returns schedule of room
    '''
    
    schedule = [np.nan for i in range(nbr_elements_schedule)]

    # If there is no schedule!
    if room_url == '':
        return schedule

    driver.get(room_url)

    # Wait until the page is fully loaded
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'scrollarea-content')))

    schedule_html = driver.find_elements_by_css_selector('.scrollarea-content')[0].get_attribute('innerHTML')

    room_soup = BeautifulSoup(schedule_html, 'html.parser')


    table = room_soup.find('tbody')

    index = 0

    for row in table.find_all('tr'):

        for day in row.find_all('td'):
            if 'aria-label' in str(day):
                #schedule.append(schedule[-6])
                #continue
                label = day.get('aria-label')

                #print(f'day:{day}, label:{label}')

                # Filters out name of the event
                module = label.split(',')[0]
                schedule[index] = module


                # Filters out duration of event
                label_split = label.split(',')

                # Time_stamp not always in same slot
                time_stamp = None

                for label in label_split:
                    if 'o\'clock' in label:
                        time_stamp = label
                        break
                
                if time_stamp == None:
                    continue
                
                pattern = r'(\d{2}:\d{2})  o\'clock to (\d{2}:\d{2})  o\'clock\)$'
                match = re.search(pattern, time_stamp)
                
                if match:
                    start_time = match.group(1)
                    end_time = match.group(2)
                    
                event_duration = int(end_time.replace(':', ''))-int(start_time.replace(':',''))
                event_duration = event_duration//100 * 60
                
                # Fills in remaining duration of event to list
                for i in range(int(event_duration/15)):
                    schedule[index + i*6] = module

            index += 1

    driver.back()

    return schedule

In [5]:
def room_schedule_header() -> list:

    weekdays = ['mo', 'tu', 'we', 'th', 'fr', 'sa']
    quarters = ['00', '15', '30', '45']
    hours = ['07', '08', '09']+[str(i) for i in range(10, 23)]

    labels = []

    for hour in hours:
        for quarter in quarters:
            for weekday in weekdays:
                labels.append('{}_{}_{}'.format(hour, quarter, weekday))

            if hour == '22':
                break

    return labels

In [6]:
def room_details_header(soup : BeautifulSoup) -> list:
    '''
    Collects Overview names (Building, Floor/Room, etc.)
    '''
    header = [row.text.strip() for row in soup.find_all('th')]

    return header[2:]

In [7]:
# Open a CSV file for writing
with open('room_information.csv', 'w', newline='') as csvfile:
    # Create a CSV writer
    writer = csv.writer(csvfile)

    #Area is the first emelent to be extracted
    header = ['Area', 'Building', 'Floor / Room', 'Room type', 'Seats', 'Seating', 'Floor area', 'Floor shape']
    header.extend(room_schedule_header())

    # Write the header row
    writer.writerow(header)
    

    for i, row in enumerate(rows):
        
        # if i < 45:
        #     continue
            
        # if i > 50:
        #     break

        print(row)

        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'room-overview')))

        row_html = row.get_attribute('innerHTML')

        # Create a BeautifulSoup object from the HTML string
        row_soup = BeautifulSoup(row_html, 'html.parser')

        # Collects the area of the room
        room_information = [row_soup.find_all('td')[1].text.strip()]

        
        # Gets link of room for further information retrieval
        room_details_link = 'https://ethz.ch' + row_soup.find('a', class_='eth-link').get('href')
        
    
        room_details = retrieve_room_details(room_details_link, header[1:8])


        # Add final information to room details
        room_information.extend(room_details)


        # Get room allocation link
        room_schedule_links = driver.find_elements_by_css_selector('.detail-links')
        room_schedule_links_soup = BeautifulSoup(room_schedule_links[0].get_attribute('innerHTML'), 'html.parser')

        for link in room_schedule_links_soup.find_all('a', class_='eth-link'):
            if 'allocation' in link.get('href'):
                allocation_link = 'https://ethz.ch' + link.get('href')
            else:
                allocation_link = ''
            
            
        room_schedule = retrieve_room_schedule(allocation_link)
        room_information.extend(room_schedule)

        # Write the information to the CSV file
        writer.writerow(room_information)

        driver.back()

        


<selenium.webdriver.remote.webelement.WebElement (session="E89D5F76-E39B-445A-8BE3-B8916D16BB32", element="node-45C120AB-DB6D-42C3-A85A-0283B4649DD1")>



URL: https://ethz.ch/staffnet/en/service/rooms-and-buildings/roominfo/detail?building=CAB&floor=G&room=11
[<selenium.webdriver.remote.webelement.WebElement (session="E89D5F76-E39B-445A-8BE3-B8916D16BB32", element="node-D13E3555-0EFD-4BD8-96AF-C28B9EABE05D")>, <selenium.webdriver.remote.webelement.WebElement (session="E89D5F76-E39B-445A-8BE3-B8916D16BB32", element="node-DEFB3CA7-480C-4693-9FE1-2C65A937DB24")>, <selenium.webdriver.remote.webelement.WebElement (session="E89D5F76-E39B-445A-8BE3-B8916D16BB32", element="node-02AF2774-C46B-4121-84E5-C020229C2B5B")>]
<selenium.webdriver.remote.webelement.WebElement (session="E89D5F76-E39B-445A-8BE3-B8916D16BB32", element="node-B7806E66-DBA0-413B-BBE8-EACA174988D3")>



URL: https://ethz.ch/staffnet/en/service/rooms-and-buildings/roominfo/detail?building=CAB&floor=G&room=51
[<selenium.webdriv

In [26]:
df = pd.read_csv('./room_information.csv')

df.head(15)

Unnamed: 0,Area,Building,Floor / Room,Room type,Seats,Seating,Floor area,Floor shape,07_00_mo,07_00_tu,...,21_45_we,21_45_th,21_45_fr,21_45_sa,22_00_mo,22_00_tu,22_00_we,22_00_th,22_00_fr,22_00_sa
0,Zürich Zentrum,CAB,G 11,Lecture hall,190,fixed,212.95,graded,,,...,,,,,,,,,,
1,Zürich Zentrum,CAB,G 51,Lecture hall,90,fixed,101.06,graded,,,...,,,,,,,,,,
2,Zürich Zentrum,CAB,G 52,Seminars / Courses,40,variable,67.27,flat,,,...,,,,,,,,,,
3,Zürich Zentrum,CAB,G 56,Seminars / Courses,40,variable,67.32,flat,,,...,,,,,,,,,,
4,Zürich Zentrum,CAB,G 57,Seminars / Courses,24,variable,44.14,flat,,,...,,,,,,,,,,
5,Zürich Zentrum,CAB,G 59,Lecture hall,42,fixed,68.32,graded,,,...,,,,,,,,,,
6,Zürich Zentrum,CAB,G 61,Lecture hall,190,fixed,212.95,graded,,,...,,,,,,,,,,
7,Zürich Zentrum,CHN,C 14,Lecture hall,172,fixed,175.91,graded,,,...,,,,,,,,,,
8,Zürich Zentrum,CHN,D 29,Seminars / Courses,30,variable,78.87,,,,...,,,,,,,,,,
9,Zürich Zentrum,CHN,D 42,Seminars / Courses,30,variable,47.09,flat,,,...,,,,,,,,,,


In [27]:
driver.quit()