In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

In [2]:
def get_degree_courses_links():
    
    chromedriver_path = '/opt/homebrew/bin/chromedriver'
    exclude_keywords = ['sitemap', 'accessibility', 'privacy', 'dipartimenti', 'scuole']
    all_links = []

    for page in range(1, 17):
        # Set up the Chrome options
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        
        # Set up the WebDriver
        service = Service(executable_path=chromedriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        
        # Open the website
        url = f"https://www.corsi.unina.it/?page={page}"
        driver.get(url)
        
        # Wait for the page to load
        WebDriverWait(driver, 2).until(EC.presence_of_element_located((By.TAG_NAME, "a")))
        
        # Give some time for JS to load links
        time.sleep(1)
        
        # Extract all links
        links = driver.find_elements(By.TAG_NAME, "a")
        
        # Extract the href attribute
        links = [link.get_attribute('href') for link in links if link.get_attribute('href')]
        
        # Add the prefix to the links if they are relative
        prefix = 'https://www.corsi.unina.it'
        links = [link if link.startswith('http') else prefix + link for link in links]
        
        # Close the WebDriver
        driver.quit()
            
        # Give some time to the server
        time.sleep(1)
            
        # Filter out None links
        links = [link for link in links if link]
    
        # Filter out links containing javascript:void(0)
        links = [link for link in links if 'javascript:void(0)' not in link]
        
        # Remove links that are not useful
        links = links[5:-5]

        # Filter out None links and links containing keywords
        for link in links:
            if link and not any(keyword in link for keyword in exclude_keywords):
                all_links.append(link)

    return all_links

degree_courses_links = get_degree_courses_links()
print(degree_courses_links)


['https://www.corsi.unina.it/N70', 'https://www.corsi.unina.it/P14', 'https://www.corsi.unina.it/P53', 'https://www.corsi.unina.it/D06', 'https://www.corsi.unina.it/N14', 'https://www.corsi.unina.it/N17', 'https://www.corsi.unina.it/D07', 'https://www.corsi.unina.it/D18', 'https://www.corsi.unina.it/P16', 'https://www.corsi.unina.it/P30', 'https://www.corsi.unina.it/P58', 'https://www.corsi.unina.it/P54', 'https://www.corsi.unina.it/P07', 'https://www.corsi.unina.it/N79', 'https://www.corsi.unina.it/N80', 'https://www.corsi.unina.it/N76', 'https://www.corsi.unina.it/D09', 'https://www.corsi.unina.it/P64', 'https://www.corsi.unina.it/M15', 'https://www.corsi.unina.it/D08', 'https://www.corsi.unina.it/P56', 'https://www.corsi.unina.it/N75', 'https://www.corsi.unina.it/N83', 'https://www.corsi.unina.it/N84', 'https://www.corsi.unina.it/N31', 'https://www.corsi.unina.it/N86', 'https://www.corsi.unina.it/N37', 'https://www.corsi.unina.it/N39', 'https://www.corsi.unina.it/N42', 'https://www.

In [3]:
len(degree_courses_links)

187

In [36]:
def gather_course_info(link):
    
    # Set up the Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")

    # Set up the WebDriver
    service = Service(executable_path='/opt/homebrew/bin/chromedriver')
    driver = webdriver.Chrome(service=service, options=chrome_options)

    # Open the degree course page
    driver.get(link)
    
    # Wait for the page to load
    WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.TAG_NAME, "h1")))
    time.sleep(2)  # Allow extra time for the page to load fully
    
    course_info = {
        'Code': None,
        'Phased out': None,
        'Name': None,
        'English name': None,
        'Coordinator': None,
        'Contacts': [],
        'Website': [],
        "Students' opinions": [],
        'Course type': None,
        'Duration': None,
        'Language': None,
        'Degree class': None,
        'Activation year': None,
        'School': None,
        'Department': None,
        'Locations': None,
        'Course Homepage': link,
    }

    try:
      # Locate the element using XPath
      text_element = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]").text

      # Check if the text contains "Phased out degree program"
      if "Phased out degree program" in text_element:
        course_info['Phased out'] = "Yes"
        # print(f"Skipping phased out degree program: {link}")
        try:
            course_info['Code'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]/ul/li[1]").text.removeprefix('Code\n')
            #.text.split("\n")[1]
        except:
            pass

        try:
            course_info['Name'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]/ul/li[2]").text.removeprefix('Name\n')
        except:
            pass

        try:
            course_info['English name'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]/ul/li[3]").text.removeprefix('English name\n')
        except:
            pass

        try:
            course_info['Coordinator'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]/ul/li[4]").text.removeprefix('Coordinator\n')
        except:
            pass    

        try:
            course_info['Contacts'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]/ul/li[5]").text.removeprefix('Contacts\n')
        except:
            pass 

        try:
            course_info['Website'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]/ul/li[6]").removeprefix('Website\n')
        except:
            pass     

        try:
            course_info["Students' opinions"] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]/ul/li[7]").text.removeprefix("Students' opinions\n").split("\n")
        except:
            pass 

        try:
            course_info['Course type'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[1]").text.removeprefix('Course type\n')
        except:
            pass 

        try:
            course_info['Duration'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[2]").text.removeprefix('Duration\n')
        except:
            pass 

        try:
            course_info['Language'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[3]").text.removeprefix('Language\n')
        except:
            pass 

        try:
            course_info['Activation year'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[4]").text.removeprefix('Activation year\n')
        except:
            pass 

        try:
            course_info['Degree class'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[5]").text.removeprefix('Degree class\n')
        except:
            pass 

        try:
            course_info['School'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[6]").text.removeprefix('School\n')
        except:
            pass 

        try:
            course_info['Department'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[7]").text.removeprefix('Department\n')
        except:
            pass 

        try:
            course_info['Locations'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[4]/div/div[8]").text.removeprefix('Locations\n')
        except:
            pass     

      else:
        course_info['Phased out'] = "No"
        try:
            course_info['Code'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]/div/div[1]/ul/li[1]").text.removeprefix('Code\n')
        except:
            pass

        try:
            course_info['Name'] = driver.find_element(By.XPATH, '/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]/div/div[1]/ul/li[2]').text.removeprefix('Name\n')
        except:
            pass

        try:
            course_info['English name'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]/div/div[1]/ul/li[3]").text.removeprefix('English name\n')
        except:
            pass

        try:
            course_info['Coordinator'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]/div/div[1]/ul/li[4]").text.removeprefix('Coordinator\n')
        except:
            pass    

        try:
            course_info['Contacts'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]/div/div[1]/ul/li[5]").text.removeprefix('Contacts\n')
        except:
            pass 

        try:
            course_info['Website'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]/div/div[1]/ul/li[6]").text.removeprefix('Website\n')
        except:
            pass     

        try:
            course_info["Students' opinions"] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[2]/div/div[1]/ul/li[7]").text.removeprefix("Students' opinions\n").split("\n")
        except:
            pass 

        try:
            course_info['Course type'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[1]").text.removeprefix('Course type\n')
        except:
            pass 

        try:
            course_info['Duration'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[2]").text.removeprefix('Duration\n')
        except:
            pass 

        try:
            course_info['Language'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[3]").text.removeprefix('Language\n')
        except:
            pass 

        try:
            course_info['Activation year'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[4]").text.removeprefix('Activation year\n')
        except:
            pass 

        try:
            course_info['Degree class'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[5]").text.removeprefix('Degree class\n')
        except:
            pass 

        try:
            course_info['School'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[6]").text.removeprefix('School\n')
        except:
            pass 

        try:
            course_info['Department'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[7]").text.removeprefix('Department\n')
            #.text.split("\n")[13]
        except:
            pass 

        try:
            course_info['Locations'] = driver.find_element(By.XPATH, "/html/body/app-root/div/div/div/app-course-details/div/div/div/app-home-corso/div/div[3]/div/div[8]").text.removeprefix('Locations\n')
            #.text.split("\n")[1:]
        except:
            pass     
    except:
      print("Error: Element not found using the provided XPath.")
    
    driver.quit()
    time.sleep(1)

    return course_info

def get_all_course_info(links):
    
    all_course_info = []

    for link in links:
        course_info = gather_course_info(link)
        all_course_info.append(course_info)
        print(f"Processed: {link}")
    return all_course_info


# sample_links = ["https://www.corsi.unina.it/M80", "https://www.corsi.unina.it/P14", "https://www.corsi.unina.it/N14"]
# all_course_info = get_all_course_info(sample_links)

all_course_info = get_all_course_info(degree_courses_links)

print(all_course_info)

[{'Code': 'N70', 'Phased out': 'No', 'Name': "ARCHEOLOGIA E STORIA DELL'ARTE", 'English name': 'ARCHAEOLOGY AND HISTORY OF ART', 'Coordinator': 'DI MEGLIO ROSALBA', 'Contacts': 'rosalba.dimeglio@unina.it', 'Website': 'Not indicated', "Students' opinions": ['2022-', '2021-', '2020'], 'Course type': 'Laurea Magistrale', 'Duration': '2 anni', 'Language': 'Italian', 'Degree class': 'LM-2 - Classe delle lauree magistrali in Archeologia', 'Activation year': '2008', 'School': 'SCUOLA DELLE SCIENZE UMANE E SOCIALI', 'Department': 'DIPARTIMENTO DI STUDI UMANISTICI', 'Locations': 'NAPOLI', 'Course Homepage': 'https://www.corsi.unina.it/N70'}, {'Code': 'P14', 'Phased out': 'No', 'Name': 'ARCHEOLOGIA, STORIA DELLE ARTI E SCIENZE DEL PATRIMONIO CULTURALE', 'English name': 'ARCHEOLOGY, HISTORY OF ARTS AND SCIENCE OF ARTISTIC HERITAGE', 'Coordinator': 'CICALA LUIGI', 'Contacts': 'luigi.cicala@unina.it', 'Website': 'Not indicated', "Students' opinions": ['2022-', '2021-', '2020'], 'Course type': 'Laur

In [37]:
# Prettify the result and display the JSON
import json

output = json.dumps(all_course_info, indent=2)

line_list = output.split("\n")  # Sort of line replacing "\n" with a new line

for line in line_list:
    print(line)


[
  {
    "Code": "N70",
    "Phased out": "No",
    "Name": "ARCHEOLOGIA E STORIA DELL'ARTE",
    "English name": "ARCHAEOLOGY AND HISTORY OF ART",
    "Coordinator": "DI MEGLIO ROSALBA",
    "Contacts": "rosalba.dimeglio@unina.it",
    "Website": "Not indicated",
    "Students' opinions": [
      "2022-",
      "2021-",
      "2020"
    ],
    "Course type": "Laurea Magistrale",
    "Duration": "2 anni",
    "Language": "Italian",
    "Degree class": "LM-2 - Classe delle lauree magistrali in Archeologia",
    "Activation year": "2008",
    "School": "SCUOLA DELLE SCIENZE UMANE E SOCIALI",
    "Department": "DIPARTIMENTO DI STUDI UMANISTICI",
    "Locations": "NAPOLI",
    "Course Homepage": "https://www.corsi.unina.it/N70"
  },
  {
    "Code": "P14",
    "Phased out": "No",
    "Name": "ARCHEOLOGIA, STORIA DELLE ARTI E SCIENZE DEL PATRIMONIO CULTURALE",
    "English name": "ARCHEOLOGY, HISTORY OF ARTS AND SCIENCE OF ARTISTIC HERITAGE",
    "Coordinator": "CICALA LUIGI",
    "Contacts

In [38]:
# Save link output to file

file_name = 'courses_description_legacy.json'

# Open the file in write mode and write the output string to it
with open(file_name, 'w') as file:
    file.write(output)

print(f"The output has been saved to {file_name}")

The output has been saved to courses_description_legacy.json


In [39]:
course_info_df = pd.DataFrame(all_course_info)

In [None]:
course_info_df = course_info_df.dropna(subset=['Code']).reset_index(drop=True)

In [40]:
course_info_df

Unnamed: 0,Code,Phased out,Name,English name,Coordinator,Contacts,Website,Students' opinions,Course type,Duration,Language,Degree class,Activation year,School,Department,Locations,Course Homepage
0,N70,No,ARCHEOLOGIA E STORIA DELL'ARTE,ARCHAEOLOGY AND HISTORY OF ART,DI MEGLIO ROSALBA,rosalba.dimeglio@unina.it,Not indicated,"[2022-, 2021-, 2020]",Laurea Magistrale,2 anni,Italian,LM-2 - Classe delle lauree magistrali in Arche...,2008,SCUOLA DELLE SCIENZE UMANE E SOCIALI,DIPARTIMENTO DI STUDI UMANISTICI,NAPOLI,https://www.corsi.unina.it/N70
1,P14,No,"ARCHEOLOGIA, STORIA DELLE ARTI E SCIENZE DEL P...","ARCHEOLOGY, HISTORY OF ARTS AND SCIENCE OF ART...",CICALA LUIGI,luigi.cicala@unina.it,Not indicated,"[2022-, 2021-, 2020]",Laurea,3 anni,Italian,L-1 - Classe delle lauree in Beni culturali,2015,SCUOLA DELLE SCIENZE UMANE E SOCIALI,DIPARTIMENTO DI STUDI UMANISTICI,NAPOLI,https://www.corsi.unina.it/P14
2,P53,No,ARCHITECTURE AND HERITAGE,ARCHITECTURE AND HERITAGE,PANE ANDREA,andrea.pane@unina.it,Not indicated,"[2022-, 2021-, 2020]",Laurea Magistrale,2 anni,English,LM-4 - Classe delle lauree magistrali in Archi...,2021,SCUOLA POLITECNICA E DELLE SCIENZE DI BASE,DIPARTIMENTO DI ARCHITETTURA,NAPOLI,https://www.corsi.unina.it/P53
3,D06,No,ARCHITETTURA,Not indicated,CERRETA MARIA,maria.cerreta@unina.it,Not indicated,"[2022-, 2021-, 2020]",Laurea Magistrale Ciclo Unico 5 anni,5 anni,Italian,LM-4 c.u. - Classe delle lauree magistrali in ...,2023,SCUOLA POLITECNICA E DELLE SCIENZE DI BASE,DIPARTIMENTO DI ARCHITETTURA,NAPOLI,https://www.corsi.unina.it/D06
4,N14,Yes,ARCHITETTURA,ARCHITECTURE,CERRETA MARIA,maria.cerreta@unina.it,[],"[2022-, 2021-, 2020]",Laurea Magistrale Ciclo Unico 5 anni,5 anni,Italian,LM-4 c.u. - Classe delle lauree magistrali in ...,2008,SCUOLA POLITECNICA E DELLE SCIENZE DI BASE,DIPARTIMENTO DI ARCHITETTURA,NAPOLI,https://www.corsi.unina.it/N14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,N10,No,TECNOLOGIE ALIMENTARI,FOOD TECHNOLOGY,FERRANTI PASQUALE,pasquale.ferranti@unina.it,http://www.agraria.unina.it/didattica/corsi-di...,"[2022-, 2021-, 2020]",Laurea,3 anni,Italian,L-26 - Classe delle lauree in Scienze e tecnol...,2008,SCUOLA DI AGRARIA E MEDICINA VETERINARIA,DIPARTIMENTO DI AGRARIA,Federico II - Portici,https://www.corsi.unina.it/N10
183,N72,Yes,TECNOLOGIE DELLE PRODUZIONI ANIMALI,Technologies in animal productions,CALABRO' SERENA,serena.calabro@unina.it,[],"[2022-, 2021-, 2020]",Laurea,3 anni,Italian,L-38 - Classe delle lauree in Scienze zootecni...,2008,SCUOLA DI AGRARIA E MEDICINA VETERINARIA,DIPARTIMENTO DI MEDICINA VETERINARIA E PRODUZI...,Dipartimento Medicina Veterinaria e Produzioni...,https://www.corsi.unina.it/N72
184,P22,No,TOSSICOLOGIA CHIMICA E AMBIENTALE,CHEMISTRY AND ENVIROMENTAL TOXICOLOGY,SORRENTINO RAFFAELLA,raffaella.sorrentino@unina.it,http://www.farmacia.unina.it/didattica/corsi-d...,"[2022-, 2021-, 2020]",Laurea Magistrale,2 anni,Italian,LM-75 - Classe delle lauree magistrali in Scie...,2014,SCUOLA DI MEDICINA E CHIRURGIA,DIPARTIMENTO DI FARMACIA,NAPOLI,https://www.corsi.unina.it/P22
185,P55,No,TRANSPORTATION ENGINEERING AND MOBILITY,TRANSPORTATION ENGINEERING AND MOBILITY,BIFULCO GENNARO NICOLA,gennaronicola.bifulco@unina.it,Not indicated,"[2022-, 2021-, 2020]",Laurea Magistrale,2 anni,English,LM-23 - Classe delle lauree magistrali in Inge...,2021,SCUOLA POLITECNICA E DELLE SCIENZE DI BASE,"DIPARTIMENTO DI INGEGNERIA CIVILE, EDILE E AMB...",NAPOLI,https://www.corsi.unina.it/P55


In [44]:
# Save the DataFrame to a CSV file
file_name = 'courses_description_legacy.csv'
course_info_df.to_csv(file_name, index=False, encoding='utf-8', header=True)