In [None]:
import pandas as pd
import numpy as np
import requests
import math
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def read_enhanced_dataset(path="/kaggle/input/enhanced/enhanced.csv"):
    """Load the enhanced listings CSV with an explicit dtype per column.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the path the notebook
        originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the columns listed in ``column_dtypes`` read
        using the given dtypes and the "Fecha" column parsed as dates.
    """
    # Identifier-like columns (ID, postal code, region names, CUDIS) are read as
    # objects so that values such as leading zeros are not lost to numeric parsing.
    column_dtypes = {
        'ID': 'object',
        'Caracteristicas': 'object',
        'Habitaciones': 'int32',
        'Aseos': 'int32',
        'Terraza': 'int32',
        'Piscina': 'int32',
        'Garaje': 'int32',
        'Precio': 'int32',
        'Metros': 'int32',
        'CodigoPostal': 'object',
        'Latitud': 'float64',
        'Longitud': 'float64',
        'Precision': 'float64',
        'NPRO': 'object',
        'NCA': 'object',
        'NMUN': 'object',
        'PrecioM2': 'float64',
        'CUDIS': 'object',
        'RentaBrutaHogar': 'float64',
        'RentaBrutaPersona': 'float64'
    }
    return pd.read_csv(path, dtype=column_dtypes, parse_dates=["Fecha"])

In [None]:
# Load the enhanced listings dataset with the reader defined in the first cell.
df = read_enhanced_dataset()

In [None]:
# Keep only the rows whose "NCA" value is "Cataluña" and renumber the index from 0.
df2 = df.loc[df["NCA"].eq("Cataluña")].reset_index(drop=True)


In [None]:
# Show (rows, columns) of the filtered frame.
df2.shape


In [None]:
# Compute the distance between two points given by latitude and longitude.
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Haversine distance in km, using an Earth radius of 6371 km.
    """
    R = 6371  # Earth radius in km
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    d_phi = math.radians(lat2 - lat1)
    d_lambda = math.radians(lon2 - lon1)
    a = math.sin(d_phi / 2) ** 2 + \
        math.cos(phi1) * math.cos(phi2) * math.sin(d_lambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

def query_overpass(lat, lon, tag_options, radius=10000, timeout=60):
    """Query the Overpass API for elements matching any tag set near a point.

    Parameters
    ----------
    lat, lon : float
        Centre of the search, interpolated into the ``around`` filter.
    tag_options : iterable of dict
        Each dict maps tag keys to values; all pairs of one dict must match
        (they are concatenated as filters), and the dicts are alternatives
        (their statements are placed in one union).
    radius : int, optional
        Search radius passed to ``around``.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without it a
        stalled server would block the calling thread indefinitely.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts, or an HTTP error status.
    ValueError
        If the response body is not valid JSON.
    """
    overpass_url = "https://overpass.private.coffee/api/interpreter"
    statements = []
    for tags in tag_options:
        tag_filter = "".join(f'["{k}"="{v}"]' for k, v in tags.items())
        # Search nodes, ways and relations with the same tag filter.
        for element_kind in ("node", "way", "relation"):
            statements.append(f"{element_kind}{tag_filter}(around:{radius},{lat},{lon});")
    filters = "".join(statements)
    # "out center" makes ways/relations carry a representative coordinate.
    query = f"""
            [out:json];
            (
              {filters}
            );
            out center;
            """
    response = requests.get(overpass_url, params={'data': query}, timeout=timeout)
    # Fail explicitly on rate limiting / server errors instead of trying to
    # decode an error page as JSON.
    response.raise_for_status()
    return response.json()

# Find the element closest to the given coordinates.
def find_nearest(lat, lon, tag_options, initial_radius=1000, max_radius=50000):
    """Search outward from a point and return the nearest matching element.

    The search starts at ``initial_radius`` and doubles the radius after every
    empty result. The last attempt is clamped to ``max_radius`` so that the
    maximum radius itself is actually searched.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the reference point.
    tag_options : iterable of dict
        Tag sets forwarded to ``query_overpass``.
    initial_radius : int, optional
        First search radius; must be positive.
    max_radius : int, optional
        Largest radius that will be searched.

    Returns
    -------
    tuple
        ``(distance, element)`` for the closest element that carries
        coordinates, where distance is the value returned by ``haversine``.
        ``(None, None)`` if nothing was found, if ``initial_radius`` exceeds
        ``max_radius``, or if the lookup failed.

    Raises
    ------
    ValueError
        If ``initial_radius`` is not positive (doubling it would never grow).
    """
    if initial_radius <= 0:
        raise ValueError("initial_radius must be positive")
    try:
        if initial_radius > max_radius:
            return None, None  # Nothing to search
        radius = initial_radius
        while True:
            data = query_overpass(lat, lon, tag_options, radius=radius)
            elements = data['elements'] if 'elements' in data else None
            if elements:
                break
            if radius >= max_radius:
                return None, None  # Nothing found
            # Double the radius if nothing was found, without overshooting the cap.
            radius = min(radius * 2, max_radius)

        min_distance = None
        nearest = None
        for element in elements:
            if element['type'] == 'node':
                coords = element
            elif 'center' in element:
                # Ways/relations only have coordinates under "center".
                coords = element['center']
            else:
                continue
            distance = haversine(lat, lon, coords['lat'], coords['lon'])
            if min_distance is None or distance < min_distance:
                min_distance = distance
                nearest = element
        return min_distance, nearest
    except Exception as exc:
        # Best-effort lookup: report the cause and let the caller carry on.
        # KeyboardInterrupt/SystemExit are deliberately not caught.
        print(f"Error: {exc!r}")
        return None, None

def process_dataframe(df, lat_col='Latitud', lon_col='Longitud', max_workers=30, request_delay=0.1):
    """Add one "<Category>_Distance" column per point-of-interest category.

    For every row, ``find_nearest`` is called once per category with the row's
    coordinates, and the returned distance is stored in the matching column.
    Rows are processed concurrently in a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the coordinates. It is modified in place (the new
        columns are added to it) and also returned. Results are written back
        by index label, so the index should be unique.
    lat_col, lon_col : str, optional
        Names of the latitude and longitude columns.
    max_workers : int, optional
        Number of worker threads, i.e. how many rows are looked up at once.
    request_delay : float, optional
        Seconds each worker sleeps before starting a row.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``. Distances that could not be determined
        stay NaN.
    """
    # Categories: display name -> alternative tag sets
    categories = [
        ("University", [{"amenity": "university"}]),
        ("School", [{"amenity": "school"}]),
        ("Kindergarten", [{"amenity": "kindergarten"}]),
        ("City Center", [{"place": "city"}]),
        ("Supermarket", [{"shop": "supermarket"}]),
        ("Bakery", [{"shop": "bakery"}]),
        ("Hospital", [{"amenity": "hospital"}]),
        ("Pharmacy", [{"amenity": "pharmacy"}]),
        ("Restaurant", [{"amenity": "restaurant"}]),
        ("Café", [{"amenity": "cafe"}]),
        ("Park", [{"leisure": "park"}]),
        ("Gym", [{"leisure": "fitness_centre"}, {"amenity": "gym"}]),
        ("Movie Theater", [{"amenity": "cinema"}]),
        ("Theater", [{"amenity": "theatre"}]),
        ("Shopping Mall", [{"shop": "mall"}, {"amenity": "shopping_mall"}]),
        ("Bus Stop", [{"highway": "bus_stop"}]),
        ("Metro Station", [
            {"railway": "station", "subway": "yes"},
            {"public_transport": "station", "subway": "yes"}
        ]),
    ]

    # Start the columns as NaN floats so they keep a numeric dtype rather than
    # becoming object columns filled with None.
    for name, _ in categories:
        df[f"{name}_Distance"] = float("nan")

    # Lookup for a single listing; runs inside a worker thread and does not
    # touch the DataFrame.
    def process_row(idx, lat, lon):
        if request_delay > 0:
            time.sleep(request_delay)
        row_result = {}
        for name, tag_options in categories:
            distance, _nearest = find_nearest(lat, lon, tag_options)
            row_result[f"{name}_Distance"] = distance
        return idx, row_result

    total = len(df)
    # ThreadPoolExecutor for parallel execution
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_row, idx, lat, lon)
            for idx, lat, lon in zip(df.index, df[lat_col], df[lon_col])
        ]
        # Results are written back from this thread only, as they complete.
        for done, future in enumerate(as_completed(futures), start=1):
            idx, row_result = future.result()
            for key, value in row_result.items():
                if value is not None:  # missing results stay NaN
                    df.at[idx, key] = value
            print(f"Processed row {done}/{total}")

    return df

In [None]:
# Compute the nearest-amenity distance columns for every filtered listing.
# process_dataframe adds the columns to the frame it receives and returns that
# same frame, so df3 and df2 refer to the same object afterwards.
df3 = process_dataframe(df2)

In [None]:
# Save the enriched frame to the Kaggle working directory. `index` is left at
# its default, so the row index is written as the first CSV column.
df3.to_csv("/kaggle/working/output.csv")