In [1]:
!pip install pyspark duckdb
!wget -O "duckdb.jar" "https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar"
import pyspark
from pyspark.sql import SparkSession
import duckdb

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=c0a59c32623c251db3b751b94768e520692a21b224e9ce91cc0b1a4c72d5b7cf
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
--2024-04-22 17:24:17--  https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.20

In [2]:
#!wget -O "database.duckdb" "https://www.icloud.com/iclouddrive/04aKe9BxKcOEpw52hgQMx6IBw#database"

In [3]:
# Open the DuckDB file and close the connection again straight away.
# NOTE(review): presumably a sanity check that "database.duckdb" can be opened — confirm.
conn = duckdb.connect("database.duckdb")
conn.close()

# Get (or create) the Spark session, with the downloaded DuckDB JDBC jar listed in spark.jars.
spark = (
    SparkSession.builder
    .config("spark.jars", "duckdb.jar")
    .getOrCreate()
)

In [4]:
from pyspark.sql import functions as F

In [5]:
# Read the whole `weather` table of the DuckDB file through Spark's JDBC data source.
jdbc_reader = spark.read.format("jdbc")
DF = (
    jdbc_reader
    .option("url", "jdbc:duckdb:database.duckdb")
    .option("driver", "org.duckdb.DuckDBDriver")
    .option("query", "SELECT * FROM weather")
    .load()
)
del jdbc_reader  # helper name only; keep the notebook namespace as before

DF.show()

+-------------------+--------------+--------------------+-------------------------+-------------+------------+-----------+---------------+---------------+----------------+----------+---------+---------+
|               date|temperature_2m|relative_humidity_2m|precipitation_probability|precipitation|weather_code|cloud_cover|cloud_cover_low|cloud_cover_mid|cloud_cover_high|  latitude|longitude|     city|
+-------------------+--------------+--------------------+-------------------------+-------------+------------+-----------+---------------+---------------+----------------+----------+---------+---------+
|2024-03-19 00:00:00|          8.05|                96.0|                      0.0|          0.0|         3.0|      100.0|           91.0|           98.0|            37.0|52.2930796|4.8124534|Amsterdam|
|2024-03-19 01:00:00|          8.45|                94.0|                      0.0|          0.0|         3.0|       98.0|           87.0|           91.0|            20.0|52.2930796|4.8124

In [6]:
#DF.write.csv("weather.csv")

# MISSINGS

In [7]:
# Helper that reports how many missing values (NULLs) each column of a DataFrame has.

def mirar_missings(data):
  """Print, for every column of ``data``, how many of its values are NULL.

  All columns are counted in one aggregation query instead of running a
  separate filter + count job per column, so the data is evaluated once.

  Args:
    data: Spark DataFrame to inspect.

  Returns:
    None. One line per column is printed, in the order of ``data.columns``.
  """
  # Names of all the columns of the DataFrame.
  columns = data.columns
  if not columns:
    # No columns, nothing to report.
    return

  # F.when(...) without otherwise() yields NULL for non-matching rows and
  # F.count() ignores NULLs, so each expression counts exactly the NULL rows
  # (and gives 0, not NULL, when there are none). Positional aliases are used
  # so that the result does not depend on the original column names.
  null_counters = [
      F.count(F.when(F.col(c).isNull(), 1)).alias(f"missing_{i}")
      for i, c in enumerate(columns)
  ]
  na_counts = data.select(null_counters).first()

  for c, v in zip(columns, na_counts):
    print(f"Column '{c}' has {v} missings.")

mirar_missings(DF)

Column 'date' has 0 missings.
Column 'temperature_2m' has 0 missings.
Column 'relative_humidity_2m' has 0 missings.
Column 'precipitation_probability' has 552000 missings.
Column 'precipitation' has 0 missings.
Column 'weather_code' has 0 missings.
Column 'cloud_cover' has 0 missings.
Column 'cloud_cover_low' has 0 missings.
Column 'cloud_cover_mid' has 0 missings.
Column 'cloud_cover_high' has 0 missings.
Column 'latitude' has 0 missings.
Column 'longitude' has 0 missings.
Column 'city' has 0 missings.


In [8]:
# When "precipitation_probability" is NULL but the measured precipitation is 0,
# set the precipitation probability to 0; every other row keeps its value.
DF = DF.withColumn(
    "precipitation_probability",
    F.when(
        (F.col("precipitation") == 0.0) & F.col("precipitation_probability").isNull(),
        F.lit(0.0),
    ).otherwise(F.col("precipitation_probability")),
)

In [9]:
mirar_missings(DF)

Column 'date' has 0 missings.
Column 'temperature_2m' has 0 missings.
Column 'relative_humidity_2m' has 0 missings.
Column 'precipitation_probability' has 78714 missings.
Column 'precipitation' has 0 missings.
Column 'weather_code' has 0 missings.
Column 'cloud_cover' has 0 missings.
Column 'cloud_cover_low' has 0 missings.
Column 'cloud_cover_mid' has 0 missings.
Column 'cloud_cover_high' has 0 missings.
Column 'latitude' has 0 missings.
Column 'longitude' has 0 missings.
Column 'city' has 0 missings.


In [10]:
# Add "date_only": the "date" column converted with F.to_date, used below as a grouping key.
DF = DF.withColumn("date_only", F.to_date("date"))

In [11]:
# Mean of every measurement column per (date_only, latitude, longitude) group.
# Each result column is named "mean_<original column>".
mean_values = DF.groupBy("date_only", "latitude", "longitude").agg(
    *(
        F.mean(name).alias(f"mean_{name}")
        for name in (
            "temperature_2m",
            "relative_humidity_2m",
            "precipitation_probability",
            "precipitation",
            "weather_code",
            "cloud_cover",
            "cloud_cover_low",
            "cloud_cover_mid",
            "cloud_cover_high",
        )
    )
)
mean_values.show()

+----------+------------------+------------------+-------------------+-------------------------+------------------------------+--------------------+------------------+------------------+--------------------+--------------------+---------------------+
| date_only|          latitude|         longitude|mean_temperature_2m|mean_relative_humidity_2m|mean_precipitation_probability|  mean_precipitation| mean_weather_code|  mean_cloud_cover|mean_cloud_cover_low|mean_cloud_cover_mid|mean_cloud_cover_high|
+----------+------------------+------------------+-------------------+-------------------------+------------------------------+--------------------+------------------+------------------+--------------------+--------------------+---------------------+
|2024-03-27|        52.2930796| 4.837716557894737| 10.151583671569824|        80.70833333333333|             7.791666666666667| 0.03750000149011612| 8.833333333333334| 94.83333333333333|  61.333333333333336|   68.08333333333333|               77.3

In [12]:
# Left-join the per-group means onto every row, matching on the three grouping keys.
DF = DF.join(mean_values, ["date_only", "latitude", "longitude"], "left")

In [13]:
# Measurement columns whose NULLs are replaced by the joined "mean_<column>" value.
columns_to_fill = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation_probability",
    "precipitation",
    "weather_code",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
]

# Keep the original value where present, otherwise fall back to the group mean.
DF = DF.withColumns(
    {name: F.coalesce(DF[name], DF[f"mean_{name}"]) for name in columns_to_fill}
)

# Clean up: remove the helper mean columns and the "date_only" join key.
columns_to_drop = [f"mean_{name}" for name in columns_to_fill] + ["date_only"]
DF = DF.drop(*columns_to_drop)

In [14]:
mirar_missings(DF)

Column 'latitude' has 0 missings.
Column 'longitude' has 0 missings.
Column 'date' has 0 missings.
Column 'temperature_2m' has 0 missings.
Column 'relative_humidity_2m' has 0 missings.
Column 'precipitation_probability' has 0 missings.
Column 'precipitation' has 0 missings.
Column 'weather_code' has 0 missings.
Column 'cloud_cover' has 0 missings.
Column 'cloud_cover_low' has 0 missings.
Column 'cloud_cover_mid' has 0 missings.
Column 'cloud_cover_high' has 0 missings.
Column 'city' has 0 missings.


In [15]:
DF.show(20)

+----------+---------+-------------------+------------------+--------------------+-------------------------+-------------+------------+-----------+---------------+---------------+----------------+---------+
|  latitude|longitude|               date|    temperature_2m|relative_humidity_2m|precipitation_probability|precipitation|weather_code|cloud_cover|cloud_cover_low|cloud_cover_mid|cloud_cover_high|     city|
+----------+---------+-------------------+------------------+--------------------+-------------------------+-------------+------------+-----------+---------------+---------------+----------------+---------+
|52.2930796|4.8124534|2024-03-19 00:00:00| 8.050000190734863|                96.0|                      0.0|          0.0|         3.0|      100.0|           91.0|           98.0|            37.0|Amsterdam|
|52.2930796|4.8124534|2024-03-19 01:00:00| 8.449999809265137|                94.0|                      0.0|          0.0|         3.0|       98.0|           87.0|         

# CANVI DE VARIABLES: weather_code

La variable *weather_code* és numèrica tot i que representa un estat del cel; per tant, la substituirem per *weather_description*, una variable categòrica que descriu millor l'estat del cel.

> Informació utilitzada: [https://open-meteo.com/en/docs/](https://open-meteo.com/en/docs/)





In [16]:
# Inspect weather_code: number of rows per code, sorted by code.
weather_code_counts = DF.groupBy("weather_code").count().orderBy("weather_code")
weather_code_counts.show()

+------------+------+
|weather_code| count|
+------------+------+
|         0.0|225354|
|         1.0|166518|
|         2.0|244533|
|         3.0|719045|
|        45.0|  5447|
|        51.0|110978|
|        53.0| 15782|
|        55.0|  4103|
|        61.0| 28331|
|        63.0|   610|
|        80.0| 14756|
|        95.0|   543|
+------------+------+



In [17]:
# Replace the numeric weather_code by a textual weather_description column.
# Codes 53, 55, 61 and 63 share one label; any code not listed becomes "Desconocido".
DF = DF.withColumn(
    "weather_description",
    F.when(DF["weather_code"] == 0.0, "Sense Nuvols")
    .when(DF["weather_code"] == 1.0, "Nuvols Low")
    .when(DF["weather_code"] == 2.0, "Nuvols Medium")
    .when(DF["weather_code"] == 3.0, "Nuvols High")
    .when(DF["weather_code"] == 45.0, "Boira")
    .when(DF["weather_code"] == 51.0, "Pluja Low")
    .when(DF["weather_code"].isin(53.0, 55.0, 61.0, 63.0), "Pluja Medium")
    .when(DF["weather_code"] == 80.0, "Pluja High")
    .when(DF["weather_code"] == 95.0, "Tempesta")
    .otherwise("Desconocido"),
).drop("weather_code")

In [18]:
# Inspect weather_description: number of rows per label, sorted alphabetically.
# (The result is stored under the existing name weather_code_counts.)
weather_code_counts = (
    DF.groupBy("weather_description")
    .count()
    .orderBy("weather_description")
)
weather_code_counts.show()

+-------------------+------+
|weather_description| count|
+-------------------+------+
|              Boira|  5447|
|        Nuvols High|719045|
|         Nuvols Low|166518|
|      Nuvols Medium|244533|
|         Pluja High| 14756|
|          Pluja Low|110978|
|       Pluja Medium| 48826|
|       Sense Nuvols|225354|
|           Tempesta|   543|
+-------------------+------+



# AGRUPACIÓ DE FILES

Per tal de simplificar el problema, agruparem els valors per dia (dia/mes/any) en lloc de per hora (hora/dia/mes/any). Per fer-ho, algunes columnes s'agregaran amb la mitjana i d'altres amb la suma.

In [19]:
from pyspark.sql.types import DateType, StringType, MapType, IntegerType

In [20]:
# Collapse the rows to one per (date, latitude, longitude):
#   1. 'date' is first reduced to a calendar date with F.to_date;
#   2. measurements are averaged, except 'precipitation', which is summed;
#   3. all weather descriptions of the group are gathered into a list.
DF = (
    DF.withColumn('date', F.to_date('date'))
    .groupBy('date', 'latitude', 'longitude')
    .agg(
        # Copies of the grouping keys (constant inside each group).
        F.first('latitude').alias('avg_latitude'),
        F.first('longitude').alias('avg_longitude'),
        F.mean('temperature_2m').alias('avg_temperature_2m'),
        F.mean('relative_humidity_2m').alias('avg_relative_humidity_2m'),
        F.mean('precipitation_probability').alias('avg_precipitation_probability'),
        F.sum('precipitation').alias('total_precipitation'),
        F.mean('cloud_cover').alias('avg_cloud_cover'),
        F.mean('cloud_cover_low').alias('avg_cloud_cover_low'),
        F.mean('cloud_cover_mid').alias('avg_cloud_cover_mid'),
        F.mean('cloud_cover_high').alias('avg_cloud_cover_high'),
        # 'city' is NOT one of the groupBy keys: the first value seen in the group is kept.
        F.first('city').alias('city'),
        F.collect_list('weather_description').alias('weather_descriptions'),
    )
)

In [21]:
def count_descriptions(descriptions):
    """Count how many times each value occurs in ``descriptions``.

    Args:
        descriptions: iterable of hashable values (the list of weather
            descriptions of one group), or None.

    Returns:
        dict mapping every distinct value to its number of occurrences, with
        keys in first-seen order. An empty or None input gives an empty dict.
    """
    # A None input (e.g. a NULL array reaching the UDF) cannot be iterated;
    # treat it as "no descriptions" instead of failing.
    if descriptions is None:
        return {}

    counts = {}
    for desc in descriptions:
        counts[desc] = counts.get(desc, 0) + 1
    return counts

# Wrap count_descriptions as a Spark UDF that returns a map<string, int>.
count_descriptions_udf = F.udf(
    count_descriptions,
    returnType=MapType(StringType(), IntegerType()),
)

# Apply the UDF to the collected descriptions and store the per-label counts.
DF = DF.withColumn(
    'weather_description_count',
    count_descriptions_udf(F.col('weather_descriptions')),
)

In [22]:
print(DF.count())

64000


In [23]:
# Keep only the columns used from here on, in this order.
DF = DF.select([
    'date',
    'city',
    'latitude',
    'longitude',
    'avg_temperature_2m',
    'avg_relative_humidity_2m',
    'avg_precipitation_probability',
    'total_precipitation',
    'avg_cloud_cover',
    'avg_cloud_cover_low',
    'avg_cloud_cover_mid',
    'avg_cloud_cover_high',
    'weather_description_count',
])

In [24]:
DF.show()

+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+-------------------------+
|      date|  city|         latitude|         longitude|avg_temperature_2m|avg_relative_humidity_2m|avg_precipitation_probability|total_precipitation|     avg_cloud_cover| avg_cloud_cover_low|avg_cloud_cover_mid|avg_cloud_cover_high|weather_description_count|
+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+-------------------------+
|2024-03-19|Athens|       33.8797677|        -83.456398| 7.064750048021476|                  32.375|                          0.0|                0.0|                 0.0|                 0.0|                0.0|        

Un cop tenim les files agrupades per dia/mes/any, ens hem d'assegurar de mantenir correctament l'estat del cel a la variable *weather_description_count*.

Per fer-ho, aplicarem una funció per tal de crear un *string* que defineixi l'estat del cel.

In [25]:
def estat_cel(description_map):
    """Summarise a map of weather-description counts as one sky-state string.

    Every known description whose count is strictly greater than its threshold
    contributes its label; the labels are joined with ", " in the order of the
    ``conditions`` table. If no description exceeds its threshold, the label of
    the description with the highest count is used (the raw key when the
    description is not in the table).

    Args:
        description_map: dict mapping a weather description to its count, or
            None.

    Returns:
        The resulting string. "Condición no específica" is returned when the
        map is None or empty, or when its highest count is not positive.
    """
    # description -> (label to output, count that must be exceeded)
    conditions = {
        "Sense Nuvols": ("Despejado", 10),
        "Nuvols Low": ("Poco nublado", 6),
        "Nuvols Medium": ("Parcialmente nublado", 6),
        "Nuvols High": ("Nublado", 6),
        "Pluja Low": ("Lloviznas", 4),
        "Pluja Medium": ("Lluvias moderadas", 4),
        "Pluja High": ("Lluvias intensas", 4),
        "Boira": ("Niebla", 3),
        "Tempesta": ("Tormenta", 1)
    }

    # Without this guard a None map fails on .get() and an empty map makes
    # max() raise ValueError, so the generic fallback label was unreachable.
    if not description_map:
        return "Condición no específica"

    descriptions = [
        label
        for key, (label, threshold) in conditions.items()
        if description_map.get(key, 0) > threshold
    ]

    if not descriptions:
        # No threshold exceeded: fall back to the most frequent description
        # (on ties, max() keeps the first one in the map's iteration order).
        top_key, top_count = max(description_map.items(), key=lambda x: x[1])
        if top_count > 0:
            descriptions.append(conditions.get(top_key, (top_key, 0))[0])
        else:
            descriptions.append("Condición no específica")

    return ', '.join(descriptions)

# Wrap estat_cel as a Spark UDF that returns a string.
estat_cel_udf = F.udf(estat_cel, returnType=StringType())

# Build the "estat_cel" column from the per-label counts.
DF = DF.withColumn(
    "estat_cel",
    estat_cel_udf(F.col("weather_description_count")),
)

In [26]:
# Inspect estat_cel: number of rows per value, sorted alphabetically (up to 50 rows, untruncated).
estat_cel_counts = DF.groupBy("estat_cel").count().orderBy("estat_cel")
estat_cel_counts.show(n=50, truncate=False)

+-----------------------------------------------+-----+
|estat_cel                                      |count|
+-----------------------------------------------+-----+
|Despejado                                      |5156 |
|Despejado, Nublado                             |1533 |
|Despejado, Poco nublado                        |584  |
|Lloviznas                                      |3013 |
|Lloviznas, Lluvias moderadas                   |617  |
|Lloviznas, Niebla                              |5    |
|Lluvias intensas                               |342  |
|Lluvias moderadas                              |160  |
|Nublado                                        |28028|
|Nublado, Lloviznas                             |4485 |
|Nublado, Lloviznas, Lluvias intensas           |2    |
|Nublado, Lloviznas, Lluvias moderadas          |539  |
|Nublado, Lluvias intensas                      |326  |
|Nublado, Lluvias moderadas                     |2684 |
|Nublado, Niebla                                

Tot i això, veiem que hi ha moltes categories que es repeteixen o són inconsistents, per exemple:

*   *Parcialmente nublado, Nublado, Lluvias intensas*

En aquests casos, decidirem si el cel està parcialment ennuvolat o ennuvolat.

In [27]:
# Exact-match replacements: combined estat_cel value -> simplified value.
replacements = {
    "Despejado, Nublado": "Parcialmente nublado",
    "Despejado, Poco nublado": "Parcialmente nublado",
    "Lloviznas, Lluvias moderadas": "Lluvias ligeras",
    "Nublado, Lloviznas, Lluvias intensas": "Nublado, Lluvias moderadas",
    "Nublado, Lloviznas, Lluvias moderadas": "Nublado, Lluvias moderadas",
    "Poco nublado": "Parcialmente nublado",
    "Poco nublado, Lloviznas": "Parcialmente nublado, Lloviznas",
    "Poco nublado, Lluvias moderadas": "Parcialmente nublado, Lluvias moderadas",
    "Poco nublado, Niebla": "Parcialmente nublado, Niebla",
    "Poco nublado, Nublado": "Parcialmente nublado",
    "Poco nublado, Nublado, Lluvias moderadas": "Nublado, Lluvias moderadas",
    "Poco nublado, Parcialmente nublado": "Parcialmente nublado",
    "Poco nublado, Parcialmente nublado, Nublado": "Parcialmente nublado",
    "Parcialmente nublado, Nublado": "Parcialmente nublado",
    "Parcialmente nublado, Nublado, Lloviznas": "Parcialmente nublado, Lloviznas",
    "Parcialmente nublado, Nublado, Lluvias intensas": "Nublado, Lluvias intensas",
    "Parcialmente nublado, Nublado, Niebla": "Nublado, Niebla",
}

# Build one nested when/otherwise expression. It starts from the unchanged
# column, so values without a replacement are kept as they are.
replacement_expr = F.col("estat_cel")
for key, value in replacements.items():
    matches_key = F.col("estat_cel") == key
    replacement_expr = F.when(matches_key, F.lit(value)).otherwise(replacement_expr)
del matches_key  # helper name only; keep the notebook namespace as before

# Apply the replacements and drop the count map.
DF = (
    DF.withColumn("estat_cel", replacement_expr)
    .drop("weather_description_count")
)

In [28]:
# Inspect estat_cel again after the replacements (up to 30 rows, untruncated).
estat_cel_counts = DF.groupBy("estat_cel").count().orderBy("estat_cel")
estat_cel_counts.show(n=30, truncate=False)

+---------------------------------------+-----+
|estat_cel                              |count|
+---------------------------------------+-----+
|Despejado                              |5156 |
|Lloviznas                              |3013 |
|Lloviznas, Niebla                      |5    |
|Lluvias intensas                       |342  |
|Lluvias ligeras                        |617  |
|Lluvias moderadas                      |160  |
|Nublado                                |28028|
|Nublado, Lloviznas                     |4485 |
|Nublado, Lluvias intensas              |347  |
|Nublado, Lluvias moderadas             |3365 |
|Nublado, Niebla                        |367  |
|Nublado, Tormenta                      |181  |
|Parcialmente nublado                   |16779|
|Parcialmente nublado, Lloviznas        |294  |
|Parcialmente nublado, Lluvias intensas |418  |
|Parcialmente nublado, Lluvias moderadas|204  |
|Parcialmente nublado, Niebla           |239  |
+---------------------------------------

In [29]:
DF.show()

+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+---------+
|      date|  city|         latitude|         longitude|avg_temperature_2m|avg_relative_humidity_2m|avg_precipitation_probability|total_precipitation|     avg_cloud_cover| avg_cloud_cover_low|avg_cloud_cover_mid|avg_cloud_cover_high|estat_cel|
+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+---------+
|2024-03-19|Athens|       33.8797677|        -83.456398| 7.064750048021476|                  32.375|                          0.0|                0.0|                 0.0|                 0.0|                0.0|                 0.0|Despejado|
|2024-03-19|Athens|     