In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
def get_degree_courses_links(pages=range(1, 17),
                             chromedriver_path='/opt/homebrew/bin/chromedriver'):
    """Collect candidate degree-course URLs from the paginated corsi.unina.it listing.

    For every page number a fresh headless Chrome session loads
    ``https://www.corsi.unina.it/?page=<n>``, reads the ``href`` of every
    ``<a>`` element, and keeps the links that survive the filters below.

    Args:
        pages: Iterable of listing page numbers to visit. Defaults to pages
            1 through 16.
        chromedriver_path: Filesystem path of the chromedriver executable.

    Returns:
        list[str]: Kept URLs in page order, then document order. Duplicates
        are not removed.

    Raises:
        Any exception raised by Selenium while loading a page (for example a
        wait timeout) propagates to the caller; the browser session for that
        page is still shut down first.
    """
    exclude_keywords = ['sitemap', 'accessibility', 'privacy', 'dipartimenti', 'scuole']
    prefix = 'https://www.corsi.unina.it'
    all_links = []

    for page in pages:
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # no visible browser window
        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)

        try:
            driver.get(f"https://www.corsi.unina.it/?page={page}")

            # Wait until at least one anchor exists, then give client-side
            # rendering a moment to add the rest of the links.
            WebDriverWait(driver, 2).until(
                EC.presence_of_element_located((By.TAG_NAME, "a"))
            )
            time.sleep(1)

            # Read each href exactly once while the session is still alive.
            hrefs = [
                anchor.get_attribute('href')
                for anchor in driver.find_elements(By.TAG_NAME, "a")
            ]
        finally:
            # Always release the browser/chromedriver processes, even when the
            # page failed to load, so a failed run does not leave them behind.
            driver.quit()

        # Be polite to the server between page loads.
        time.sleep(1)

        # Drop anchors without an href and make relative hrefs absolute.
        links = [
            href if href.startswith('http') else prefix + href
            for href in hrefs
            if href
        ]

        # Drop JavaScript pseudo-links.
        links = [link for link in links if 'javascript:void(0)' not in link]

        # Position-based trim of the first and last five anchors.
        # NOTE(review): presumably site navigation/footer links; this depends
        # on the current page layout, so confirm if the site changes.
        links = links[5:-5]

        all_links.extend(
            link for link in links
            if not any(keyword in link for keyword in exclude_keywords)
        )

    return all_links

# Crawl the listing pages and keep the collected URLs in a module-level name.
# This starts one headless Chrome session per listing page and sleeps between
# requests, so the cell takes a while to run.
degree_courses_links = get_degree_courses_links()
print(degree_courses_links)


['https://www.corsi.unina.it/N70', 'https://www.corsi.unina.it/P14', 'https://www.corsi.unina.it/P53', 'https://www.corsi.unina.it/D06', 'https://www.corsi.unina.it/N14', 'https://www.corsi.unina.it/N17', 'https://www.corsi.unina.it/D07', 'https://www.corsi.unina.it/D18', 'https://www.corsi.unina.it/P16', 'https://www.corsi.unina.it/P30', 'https://www.corsi.unina.it/P58', 'https://www.corsi.unina.it/P54', 'https://www.corsi.unina.it/P07', 'https://www.corsi.unina.it/N79', 'https://www.corsi.unina.it/N80', 'https://www.corsi.unina.it/N76', 'https://www.corsi.unina.it/D09', 'https://www.corsi.unina.it/P64', 'https://www.corsi.unina.it/M15', 'https://www.corsi.unina.it/D08', 'https://www.corsi.unina.it/P56', 'https://www.corsi.unina.it/N75', 'https://www.corsi.unina.it/N83', 'https://www.corsi.unina.it/N84', 'https://www.corsi.unina.it/N31', 'https://www.corsi.unina.it/N86', 'https://www.corsi.unina.it/N37', 'https://www.corsi.unina.it/N39', 'https://www.corsi.unina.it/N42', 'https://www.

In [None]:
# Absolute XPath of the content wrapper inside <app-home-corso>; every element
# scraped below is addressed relative to it.
_COURSE_ROOT_XPATH = (
    "/html/body/app-root/div/div/div/app-course-details"
    "/div/div/div/app-home-corso/div"
)

# (field, line index) pairs read from the text of the header block
# (div[2]/div/div[1]) and of the details block (div[3]/div).
# NOTE(review): only odd line indexes are read, presumably because labels and
# values alternate line by line - confirm against the live page.
_HEADER_FIELDS = (
    ('Code', 1),
    ('Name', 3),
    ('English name', 5),
    ('Coordinator', 7),
)
_DETAIL_FIELDS = (
    ('Course type', 1),
    ('Duration', 3),
    ('Language', 5),
    ('Activation year', 7),
    ('Degree class', 9),
    ('School', 11),
    ('Department', 13),
)

# (field, XPath suffix) pairs whose value is every text line after the first.
_LIST_FIELDS = (
    ('Contacts', "/div[2]/div/div[1]/ul/li[5]"),
    ('Website', "/div[2]/div/div[1]/ul/li[6]"),
    ('Students opinions', "/div[2]/div/div[1]/ul/li[7]"),
    ('Locations', "/div[3]/div/div[8]/div/div"),
)


def _text_lines(driver, xpath):
    """Return the text of the element at `xpath` split on newlines, or None if the lookup fails."""
    try:
        return driver.find_element(By.XPATH, xpath).text.split("\n")
    except WebDriverException:
        # Best effort: a missing or stale element just leaves the field unset.
        return None


def _fill_course_info(driver, course_info, link):
    """Populate `course_info` in place from the course page already loaded in `driver`."""
    try:
        first_block_text = driver.find_element(
            By.XPATH, _COURSE_ROOT_XPATH + "/div[2]"
        ).text
    except WebDriverException:
        print("Error: Element not found using the provided XPath.")
        return

    # Pages showing the "phased out" notice in this block are skipped; all
    # fields keep their defaults.
    # NOTE(review): on such pages the data blocks appear to sit one position
    # later (div[3]/div[4]), which is why the XPaths below would not match.
    if "phased out degree program" in first_block_text.lower():
        print(f"Skipping phased out degree program: {link}")
        return

    blocks = (
        (_COURSE_ROOT_XPATH + "/div[2]/div/div[1]", _HEADER_FIELDS),
        (_COURSE_ROOT_XPATH + "/div[3]/div", _DETAIL_FIELDS),
    )
    for xpath, fields in blocks:
        lines = _text_lines(driver, xpath)
        if lines is None:
            continue
        for field, index in fields:
            # A block with fewer lines than expected leaves the field unset.
            if index < len(lines):
                course_info[field] = lines[index]

    for field, suffix in _LIST_FIELDS:
        lines = _text_lines(driver, _COURSE_ROOT_XPATH + suffix)
        if lines is not None:
            # Everything after the first line is kept as the list of values.
            course_info[field] = lines[1:]


def gather_course_info(link, chromedriver_path='/opt/homebrew/bin/chromedriver'):
    """Scrape one degree-course page into a flat dictionary.

    Args:
        link: URL of the degree-course page.
        chromedriver_path: Filesystem path of the chromedriver executable.

    Returns:
        dict: One entry per scraped field plus ``'Course Homepage'`` (the
        input URL). Fields that could not be read keep their defaults:
        ``None`` for scalar fields and ``'Locations'``, ``[]`` for
        ``'Contacts'``, ``'Website'`` and ``'Students opinions'``. Pages
        flagged as phased out, or whose first content block is missing, return
        the defaults only (``'Code'`` is ``None``).

    Raises:
        Any exception raised by Selenium while loading the page (for example
        a wait timeout) propagates; the browser session is still shut down.
    """
    course_info = {
        'Code': None,
        'Name': None,
        'English name': None,
        'Coordinator': None,
        'Contacts': [],
        'Website': [],
        'Students opinions': [],
        'Course type': None,
        'Duration': None,
        'Language': None,
        'Degree class': None,
        'Activation year': None,
        'School': None,
        'Department': None,
        'Locations': None,
        'Course Homepage': link
    }

    chrome_options = Options()
    chrome_options.add_argument("--headless")  # no visible browser window
    service = Service(executable_path=chromedriver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(link)

        # Wait for the page heading, then allow extra time for the rest of the
        # page to render.
        WebDriverWait(driver, 2.5).until(
            EC.presence_of_element_located((By.TAG_NAME, "h1"))
        )
        time.sleep(1)

        _fill_course_info(driver, course_info, link)
    finally:
        # Always release the browser/chromedriver processes, even on failure.
        driver.quit()

    # Be polite to the server between course pages.
    time.sleep(1)

    return course_info

def get_all_course_info(links):
    """Scrape every URL in `links`, one after another.

    Args:
        links: Iterable of degree-course page URLs.

    Returns:
        list: The value returned by ``gather_course_info`` for each URL, in
        input order.
    """
    return [gather_course_info(course_url) for course_url in links]


# Run the scraper on three hand-picked course pages first; each page opens its
# own headless browser session, so the full list is much slower.
sample_links = ["https://www.corsi.unina.it/N70", "https://www.corsi.unina.it/N25", "https://www.corsi.unina.it/M80"]
all_course_info = get_all_course_info(sample_links)

# Full run: uncomment to scrape every URL collected in `degree_courses_links`.
# all_course_info = get_all_course_info(degree_courses_links)

print(all_course_info)

In [None]:
# One row per scraped course dictionary; the dictionary keys become the columns.
course_info_df = pd.DataFrame(all_course_info)

In [None]:
# Drop rows whose 'Code' is missing (the scraper leaves it as None for pages it
# skipped or could not read), then renumber the index from 0.
course_info_df = course_info_df.dropna(subset=['Code']).reset_index(drop=True)

In [None]:
# Show the cleaned table as the cell output.
course_info_df