In [None]:
!pip install pyspark duckdb
!wget -O "duckdb.jar" "https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar"
import pyspark
from pyspark.sql import SparkSession
import duckdb
from pyspark.sql import functions as F
from pyspark.sql.window import Window

--2024-04-24 18:29:14--  https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 64009472 (61M) [application/java-archive]
Saving to: ‘duckdb.jar’


2024-04-24 18:29:15 (233 MB/s) - ‘duckdb.jar’ saved [64009472/64009472]



In [None]:
# Open and immediately close the database file with the Python client before
# Spark accesses it through JDBC.
conn = duckdb.connect("quality_database.duckdb")
conn.close()

# Build (or reuse) the Spark session with the DuckDB JDBC driver jar registered.
session_builder = SparkSession.builder.config("spark.jars", "duckdb.jar")
spark = session_builder.getOrCreate()

In [None]:
# Load the full `weather` table from DuckDB into a Spark DataFrame over JDBC.
weather_jdbc_options = {
    "url": "jdbc:duckdb:quality_database.duckdb",
    "driver": "org.duckdb.DuckDBDriver",
    "query": "SELECT * FROM weather",
}
DF_weather = spark.read.format("jdbc").options(**weather_jdbc_options).load()

# Print a preview of the loaded rows.
DF_weather.show()

+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+---------+
|      date|  city|         latitude|         longitude|avg_temperature_2m|avg_relative_humidity_2m|avg_precipitation_probability|total_precipitation|     avg_cloud_cover| avg_cloud_cover_low|avg_cloud_cover_mid|avg_cloud_cover_high|estat_cel|
+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+---------+
|2024-03-19|Athens|       33.8797677|-83.42271378947368| 7.222916675576319|                  31.625|                          0.0|                0.0|                 0.0|                 0.0|                0.0|                 0.0|Despejado|
|2024-03-19|Athens|     

In [None]:
# Load the full `airbnb` table from DuckDB into a Spark DataFrame over JDBC.
airbnb_jdbc_options = {
    "url": "jdbc:duckdb:quality_database.duckdb",
    "driver": "org.duckdb.DuckDBDriver",
    "query": "SELECT * FROM airbnb",
}
DF_airbnb = spark.read.format("jdbc").options(**airbnb_jdbc_options).load()

# Print a preview of the loaded rows.
DF_airbnb.show()

+------------------+---------------+-----------+------------+---------------+-----------------+-----+---+------------------+--------------------------+--------+------------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+---------+--------+--------------------+-------+
|           realSum|      room_type|room_shared|room_private|person_capacity|host_is_superhost|multi|biz|cleanliness_rating|guest_satisfaction_overall|bedrooms|              dist|        metro_dist|        attr_index|   attr_index_norm|        rest_index|   rest_index_norm|              lng|               lat|     city|day_type|cleanliness_category|cluster|
+------------------+---------------+-----------+------------+---------------+-----------------+-----+---+------------------+--------------------------+--------+------------------+------------------+------------------+------------------+------------------+------------------+------

In [None]:
# Load the full `flights` table from DuckDB into a Spark DataFrame over JDBC.
flights_jdbc_options = {
    "url": "jdbc:duckdb:quality_database.duckdb",
    "driver": "org.duckdb.DuckDBDriver",
    "query": "SELECT * FROM flights",
}
DF_flights = spark.read.format("jdbc").options(**flights_jdbc_options).load()

# Print a preview of the loaded rows.
DF_flights.show()

+---------------+---------+---------+---------+---------------------+--------------------+--------------------+--------------+-----------+---------------+----------------+---------+------------+--------------+-----------+
|airport_acronym|longitude| latitude|     city|flight_number_default|          owner_name|          owner_logo|origin_airport|origin_city|origin_latitude|origin_longitude|has_delay|arrival_date|departure_time|time_of_day|
+---------------+---------+---------+---------+---------------------+--------------------+--------------------+--------------+-----------+---------------+----------------+---------+------------+--------------+-----------+
|           EHAM| 4.763889|52.308609|Amsterdam|                OS373|   Austrian Airlines|https://images.fl...|           VIE|     Vienna|      48.110271|       16.569719|        0|  2024-03-21|      13:50:00|  afternoon|
|           EHAM| 4.763889|52.308609|Amsterdam|               KL1920|      KLM Cityhopper|https://images.fl...| 

In [None]:
# Map every airport to its nearest weather grid point.
# Inputs: DF_flights and DF_weather, loaded from DuckDB in the cells above.
# Output: final_df_airports with columns airport_acronym, latitude, longitude,
# where latitude/longitude are those of the closest weather point.

# Mean Earth radius in kilometres, used by the haversine formula below.
EARTH_RADIUS_KM = 6371.0088

# One row per airport. Which duplicate row survives is arbitrary; this assumes an
# airport's coordinates are the same on every flight row -- TODO confirm.
# The alias is applied last so "flights.<column>" references resolve reliably.
flights = (
    DF_flights
    .select("airport_acronym", "latitude", "longitude")
    .dropDuplicates(["airport_acronym"])
    .alias("flights")
)

# One row per distinct weather coordinate. Only the coordinates are kept, so the
# cross join below stays as narrow as possible.
weather = DF_weather.select("latitude", "longitude").distinct().alias("weather")

# Great-circle (haversine) distance in km between an airport and a weather point.
# A plain Euclidean distance on degrees would over-weight longitude differences,
# because one degree of longitude shrinks with cos(latitude).
flight_lat = F.radians(F.col("flights.latitude"))
weather_lat = F.radians(F.col("weather.latitude"))
delta_lat = weather_lat - flight_lat
delta_lng = F.radians(F.col("weather.longitude") - F.col("flights.longitude"))
haversine = (
    F.sin(delta_lat / 2) ** 2
    + F.cos(flight_lat) * F.cos(weather_lat) * F.sin(delta_lng / 2) ** 2
)
distance_km = 2 * EARTH_RADIUS_KM * F.asin(F.sqrt(haversine))

# Pair every airport with every weather point and compute the distance.
distance_df = flights.crossJoin(weather).withColumn("distance", distance_km)

# Rank the weather points of each airport by distance. The coordinates act as a
# tie-break so the selected point is deterministic when distances are equal.
nearest_first = Window.partitionBy("flights.airport_acronym").orderBy(
    "distance", "weather.latitude", "weather.longitude"
)

# Keep only the closest weather point for each airport.
min_distance_df = (
    distance_df
    .withColumn("row_num", F.row_number().over(nearest_first))
    .filter(F.col("row_num") == 1)
)

# Keep the airport code together with the coordinates of its nearest weather point.
final_df_airports = min_distance_df.select(
    F.col("flights.airport_acronym").alias("airport_acronym"),
    F.col("weather.latitude").alias("latitude"),
    F.col("weather.longitude").alias("longitude"),
)

# Print a preview of the result.
final_df_airports.show()

+---------------+------------------+--------------------+
|airport_acronym|          latitude|           longitude|
+---------------+------------------+--------------------+
|           CYXU|        34.0397677|          -83.296398|
|           EDDB|52.437036500000005|          13.4688599|
|           EGGW|51.587445599999995|-0.20776529999999999|
|           EGKB|        51.4274456|          -0.0477653|
|           EGKK|        51.4274456| -0.1909231947368421|
|           EGLC| 51.50323507368421|          -0.0477653|
|           EGLL| 51.46955086315789|-0.20776529999999999|
|           EGMC| 51.57060349473684|          -0.0477653|
|           EGSS|51.587445599999995|          -0.0477653|
|           EGTK|51.587445599999995|-0.20776529999999999|
|           EHAM| 52.30992170526316|           4.8124534|
|           FAEL|        41.8133203|          12.5629321|
|           KAHN| 33.95555717368421|  -83.32166115789474|
|           KGON|        34.0397677|          -83.296398|
|           KL

In [None]:
# Persist the airport -> nearest-weather-point mapping as the `airports` table.
# mode("overwrite") replaces an existing table, so re-running the notebook does not
# fail on the default "error if exists" save mode; it also matches how the `airbnb`
# table is written later in this notebook.
(
    final_df_airports.write
    .format("jdbc")
    .option("url", "jdbc:duckdb:quality_database.duckdb")
    .option("dbtable", "airports")
    .option("driver", "org.duckdb.DuckDBDriver")
    .mode("overwrite")
    .save()
)

In [None]:
# Attach to every Airbnb listing the coordinates of the nearest weather grid point
# located in the same city.
# Inputs: DF_airbnb and DF_weather, loaded from DuckDB in the cells above.
# Output: final_df with columns id, latitude_w, longitude_w (one row per listing
# whose city also has weather points).

# Mean Earth radius in kilometres, used by the haversine formula below.
EARTH_RADIUS_KM = 6371.0088

# Surrogate key per listing. monotonically_increasing_id() produces 64-bit values
# that depend on how the data is partitioned, so the frame is cached to keep the
# ids stable between this cell and the later join back on "id".
DF_airbnb = DF_airbnb.withColumn("id", F.monotonically_increasing_id()).cache()

airbnb = DF_airbnb.alias("airbnb")

# Distinct weather grid points per city. Dropping the other columns removes
# repeated rows of the same point, which would only inflate the cross join.
weather = DF_weather.select("city", "latitude", "longitude").distinct().alias("weather")

# Every city that appears in either DataFrame.
cities = airbnb.select("city").union(weather.select("city")).distinct().collect()

# Great-circle (haversine) distance in km between a listing and a weather point.
# A plain Euclidean distance on degrees would over-weight longitude differences,
# because one degree of longitude shrinks with cos(latitude).
# These column expressions do not depend on the city, so they are built once.
listing_lat = F.radians(F.col("airbnb.lat"))
weather_lat = F.radians(F.col("weather.latitude"))
delta_lat = weather_lat - listing_lat
delta_lng = F.radians(F.col("weather.longitude") - F.col("airbnb.lng"))
haversine = (
    F.sin(delta_lat / 2) ** 2
    + F.cos(listing_lat) * F.cos(weather_lat) * F.sin(delta_lng / 2) ** 2
)
distance_km = 2 * EARTH_RADIUS_KM * F.asin(F.sqrt(haversine))

# Rank the candidate weather points of each listing by distance. The coordinates
# act as a tie-break so the selected point is deterministic.
window_spec_airbnb = Window.partitionBy("airbnb.id").orderBy(
    "distance", "weather.latitude", "weather.longitude"
)

# One result DataFrame per city.
results_per_city = []

for city_row in cities:
    city = city_row.city
    print(f"City: {city}")

    # Restrict both sides to the current city.
    airbnb_city = airbnb.filter(F.col("city") == city)
    weather_city = weather.filter(F.col("city") == city)

    # Pair every listing with every weather point of the city.
    distance_df = airbnb_city.crossJoin(weather_city).withColumn("distance", distance_km)

    # Keep only the closest weather point for each listing.
    closest_weather = (
        distance_df
        .withColumn("row_num", F.row_number().over(window_spec_airbnb))
        .filter(F.col("row_num") == 1)
        .select(
            F.col("airbnb.id").alias("id"),
            F.col("weather.latitude").alias("latitude_w"),
            F.col("weather.longitude").alias("longitude_w"),
        )
    )

    results_per_city.append(closest_weather)

# Stack the per-city results into a single DataFrame.
final_df = results_per_city[0]
for df in results_per_city[1:]:
    final_df = final_df.union(df)

City: Lisbon
City: Berlin
City: London
City: Vienna
City: Paris
City: Athens
City: Barcelona
City: Amsterdam
City: Rome
City: Budapest


In [None]:
# Left-join the nearest-weather coordinates back onto the listings by "id", so
# every listing is kept and gains the latitude_w / longitude_w columns.
result_df = DF_airbnb.join(final_df, on="id", how="left")

In [None]:
# Column name -> SQL type used when Spark creates the `airbnb` table.
# "id" comes from monotonically_increasing_id(), which returns 64-bit values, so it
# is declared BIGINT rather than INT.
column_types = {
    "id": "BIGINT",
    "realSum": "DOUBLE",
    "room_type": "STRING",
    "room_shared": "BOOLEAN",
    "room_private": "BOOLEAN",
    "person_capacity": "DOUBLE",
    "host_is_superhost": "BOOLEAN",
    "multi": "STRING",
    "biz": "STRING",
    "cleanliness_rating": "DOUBLE",
    "guest_satisfaction_overall": "DOUBLE",
    "bedrooms": "INT",
    "dist": "DOUBLE",
    "metro_dist": "DOUBLE",
    "attr_index": "DOUBLE",
    "attr_index_norm": "DOUBLE",
    "rest_index": "DOUBLE",
    "rest_index_norm": "DOUBLE",
    "lng": "DOUBLE",
    "lat": "DOUBLE",
    "city": "STRING",
    "day_type": "STRING",
    "cleanliness_category": "STRING",
    "cluster": "INT",
    "latitude_w": "DOUBLE",
    "longitude_w": "DOUBLE"
}

# Render the mapping as the "name TYPE, name TYPE, ..." string that the
# createTableColumnTypes option expects.
column_types_str = ", ".join(f"{name} {sql_type}" for name, sql_type in column_types.items())

# result_df is computed lazily from the very `airbnb` table that is about to be
# overwritten. Materialise it first: otherwise the overwrite replaces the table
# before Spark has read the source rows, and the rewritten table can end up empty.
result_df.persist()
result_df.count()  # action that forces the persisted data to be computed now

# Replace the `airbnb` table in DuckDB with the enriched data, using the declared types.
(
    result_df.write
    .format("jdbc")
    .option("url", "jdbc:duckdb:quality_database.duckdb")
    .option("dbtable", "airbnb")
    .option("driver", "org.duckdb.DuckDBDriver")
    .option("createTableColumnTypes", column_types_str)
    .mode("overwrite")
    .save()
)

# The materialised copy is no longer needed once the table has been written.
result_df.unpersist()