In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import base64
import re
import requests
import os

In [2]:
def get_element_by_tag(element, tag):
    """Retrieve an element by tag name, considering namespace."""
    return element.find(f".//{{*}}{tag}")

def decode_field_name(encoded_name):
    """Decode a base64 encoded string."""
    decoded_bytes = base64.b64decode(encoded_name)
    return decoded_bytes.decode('utf-8')

def extract_corrected_data(placemark, category):
    """Extract and return data from a Placemark element with corrected field names and data mapping."""
    data = {}
    extended_data = get_element_by_tag(placemark, 'ExtendedData')
    if extended_data:
        for data_field in extended_data.findall('.//{*}SimpleData'):
            decoded_field_name = decode_field_name(data_field.attrib['name'].split(':')[-1])
            data[decoded_field_name] = data_field.text
    point = get_element_by_tag(placemark, 'Point')
    if point:
        coords = get_element_by_tag(point, 'coordinates').text.split(',')
        data['LONGITUDINE'] = coords[0]
        data['LATITUDINE'] = coords[1]
    data['CATEGORIA'] = category

    # add the field URLIMAGE if is finding the tag Carousel
    carousel = get_element_by_tag(placemark, 'Carousel')
    if carousel:
        image_url = get_element_by_tag(carousel, 'ImageUrl')
        if image_url is not None:
            data['URLIMAGE'] = image_url.text

    return data

In [3]:
file_path = 'lavori_pubblici_trento.kml'
with open(file_path, 'r') as file:
    kml_content = file.read()
root = ET.fromstring(kml_content)

records = []
for folder in root.findall('.//{*}Folder'):
    category = get_element_by_tag(folder, 'name').text
    for placemark in folder.findall('.//{*}Placemark'):
        record = extract_corrected_data(placemark, category)
        records.append(record)

df = pd.DataFrame(records)

# Assicurati che la colonna URLIMAGE esista prima di procedere
if 'URLIMAGE' in df.columns:
    df['URLIMAGE'] = df['URLIMAGE'].apply(lambda url: url.split('&fife=s')[0] if pd.notna(url) else url)
    for index, url in enumerate(df['URLIMAGE'].dropna()):
        image_name = f'image_{index+1:04d}.jpg'
        response = requests.get(url)
        if response.status_code == 200:
            with open(os.path.join('images', image_name), 'wb') as file:
                file.write(response.content)
        df.loc[df['URLIMAGE'] == url, 'IMAGENAME'] = image_name

df.TIPOLOGIA = df.TIPOLOGIA.apply(lambda x: x[1:] if not x[0].isalpha() else x)
df.STATO = df.STATO.apply(lambda x: x[1:] if not x[0].isalpha() else x)
#df['IMPORTO'] = df.IMPORTO.apply(clean_and_sum_importo)
#df.to_csv('lavori_pubblici.csv', index=False)

In [4]:
df.rename(columns={'IMPORTO':'DESC_IMPORTO'},inplace=True)

In [11]:
def clean_and_sum(s):
    # Rimuovi i punti usati come separatori delle migliaia
    cleaned_string = re.sub(r'\.(?=\d{3})', '', s)
    # Trova tutti i gruppi di numeri
    numbers = re.findall(r'\d+', cleaned_string)
    print(numbers)
    break
    result = 0
    for sublist in numbers:
        if len(sublist) == 1:
            # Converti l'elemento singolo in intero e aggiungilo ai risultati
            result = (int(sublist[0]))
        else:
            # Inizializza la somma e il moltiplicatore
            sum_value = 0
            multiplier = 1
            for item in sublist:
                if len(item) == 1 and item != '0':
                    # Se l'elemento ha una sola cifra e non è zero, moltiplica
                    multiplier *= int(item)
                else:
                    # Altrimenti somma il valore
                    sum_value += int(item)
            # Aggiungi il risultato della somma e moltiplicatore al risultato finale
            result = (sum_value * multiplier if multiplier > 1 else sum_value)
    return result

In [12]:
ciapalo = list(df.DESC_IMPORTO.apply(lambda s:clean_and_sum(s)))

In [13]:
list(ciapalo)

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 5,
 5,
 5,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [8]:
df['IMPORTO'].unique()

KeyError: 'IMPORTO'