# Imports

In [1]:
#import packages
import time
import pandas as pd
import numpy as np
import json
from collections import defaultdict

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

In [2]:
# Initialize the Chrome driver. Per the original author's note, a ChromeDriver
# executable matching the installed Chrome version should be available in the
# current folder.
driver = webdriver.Chrome()
driver.set_window_size(1120, 1000)

# URL of the Regione Lombardia architecture typologies page; load it once so
# the browser session is opened on the site.
url = 'https://www.lombardiabeniculturali.it/architetture/tipologie/'
driver.get(url)

# Extracting buildings data

In [3]:
# Example
# url = 'https://www.lombardiabeniculturali.it/architetture/schede/RL560-00089/?view=tipologie&offset=101&hid=1&sort=sort_int'

In [4]:
def get_building_data(url):
    """
    Load a building page in the shared `driver` and scrape its details.

    INPUT: Building url
    OUTPUT: Dictionary with the keys "building_id" (the record permalink),
            "name", "image", "address", "typology_gen", "typology_spe",
            "description" and "date". "image" and the five labelled fields
            are None when the page does not provide them.
    """

    # Labels as they appear on the page, mapped to the keys of the output dict
    label_to_key = {
        "Indirizzo": "address",
        "Tipologia generale": "typology_gen",
        "Tipologia specifica": "typology_spe",
        "Configurazione strutturale": "description",
        "Epoca di costruzione": "date",
    }

    # Load the building page in the shared browser session
    driver.get(url)

    # The permalink is used as a unique identifier for the building
    building_id = driver.find_element(By.XPATH, './/p[@class="record-permalink"]//a').get_attribute("href")

    # Retrieving the info
    name = driver.find_element(By.XPATH, './/div[@class="col-md-8"]//h2').text
    try:
        image = driver.find_element(By.XPATH, './/div[@class="popup-gallery"]//a').get_attribute("href")
    except NoSuchElementException:
        # Some buildings have no picture gallery; only that case is tolerated,
        # any other failure (or Ctrl-C) is allowed to propagate.
        image = None

    # Initialize container
    obs = {"building_id": building_id, "name": name, "image": image, "address": None,
           "typology_gen": None, "typology_spe": None, "description": None, "date": None}

    for el in driver.find_elements(By.XPATH, './/div[@class="col-md-7"]//p'):
        # Read the text once: every `.text` access asks the browser again.
        text = el.text

        # Split on the FIRST colon only, so values that contain a colon
        # themselves are kept whole instead of being cut at the second colon.
        label, separator, value = text.partition(":")
        key = label_to_key.get(label.strip())
        if separator and key is not None:
            obs[key] = value.strip()

    return obs

# Iterating over the pages

In [5]:
# Listing URL of typology 1 (path ".../tipologie/1/"); rebinds the global `url`.
url = "https://www.lombardiabeniculturali.it/architetture/tipologie/1/"

In [6]:
def _advance_to_next_page():
    """
    Click the pagination "Next" link of the page currently loaded in `driver`.

    OUTPUT: True if the link was clicked, False if this is the last page.
    Any Selenium error (e.g. the link not becoming clickable within 3 seconds)
    is left to propagate to the caller.
    """
    # Wait for the element to be clickable
    next_link = WebDriverWait(driver, 3).until(
        EC.element_to_be_clickable((By.XPATH, ".//a[@aria-label='Next']"))
    )

    # A non-empty class attribute on the link's parent is treated as "no
    # further page" (presumably the site marks it as disabled -- TODO confirm).
    if next_link.find_element(By.XPATH, "..").get_attribute("class"):
        return False

    # Click through JavaScript, then give the next page time to load
    driver.execute_script("arguments[0].click();", next_link)
    time.sleep(2)
    return True


def iter_pages(url, max_retries=3):
    """
    INPUT: Initial url of the category of buildings.
           max_retries: how many times moving to the next page is attempted
           before giving up on the category (default 3).
    OUTPUT: Returns a list of urls of all the buildings in the category. If
            paging keeps failing, the urls collected so far are returned.
    """

    driver.get(url)

    buildings_urls = []  # Store the data
    page = 0             # Page counter, reported when something goes wrong
    while True:
        page += 1
        # Collect the building URLs listed on the current page (exactly once,
        # even if moving on to the next page has to be retried)
        links = driver.find_elements(By.XPATH, './/div[@class="work-caption"]//a')
        buildings_urls += [link.get_attribute('href') for link in links]

        # Try to move on. `except Exception` keeps the best-effort behaviour
        # but, unlike a bare `except`, lets Ctrl-C interrupt the scrape.
        for _ in range(max_retries):
            try:
                advanced = _advance_to_next_page()
            except Exception:
                print(f"An error occured in page: {page}")
            else:
                break
        else:
            # Every attempt failed: stop here instead of re-scraping the same
            # page forever and piling up duplicate URLs.
            break

        if not advanced:
            print("this is the last page")
            break

    return buildings_urls

# Creating the dataset for each subcategory

In [7]:
# Scrape every typology (ids 1 to 6) and dump each one to its own JSON file.
# The id range is defined once so the loop, the status message and the
# "is this the last one?" check cannot drift apart.
typology_ids = range(1, 7)
last_typology = typology_ids[-1]

for i in typology_ids:
    url = f"https://www.lombardiabeniculturali.it/architetture/tipologie/{i}/"
    urls = iter_pages(url)  # all urls from that typology

    data = []               # container for that typology, one dict per building
    for u in urls:
        data.append(get_building_data(u))
        time.sleep(0.1)     # short pause between two building pages

    # Saving as a json file (data1.json ... data6.json)
    file_path = f"data{i}.json"
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(data, json_file)
    print(f"Status update:\t iteration {i}/{last_typology} finished")

    # Pause between typologies; there is nothing to wait for after the last one
    if i != last_typology:
        time.sleep(60)

this is the last page
Status update:	 iteration 1/6 finished
this is the last page
Status update:	 iteration 2/6 finished
this is the last page
Status update:	 iteration 3/6 finished
this is the last page
Status update:	 iteration 4/6 finished
this is the last page
Status update:	 iteration 5/6 finished
this is the last page
Status update:	 iteration 6/6 finished
