In [None]:
# Install PySpark and the DuckDB Python package into the notebook environment.
# NOTE(review): neither package is version-pinned here, so a later re-run may install different releases — consider pinning.
!pip install pyspark duckdb
# Download the DuckDB JDBC driver (version 0.10.1) and store it as "duckdb.jar".
!wget -O "duckdb.jar" "https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar"
import pyspark
from pyspark.sql import SparkSession
import duckdb

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=f6d9efaace1833fc92c82e4e69eedc9c71c2b3bf2f135a8c2fbeaa89e3d601b2
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1
--2024-04-22 18:52:54--  https://repo1.maven.org/maven2/org/duckdb/duckdb_jdbc/0.10.1/duckdb_jdbc-0.10.1.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.20

In [None]:
# Open and immediately close a DuckDB connection to "database.duckdb"
# (presumably only to make sure the database file exists — verify).
conn = duckdb.connect("database.duckdb")
conn.close()

# Start (or reuse) a Spark session with the downloaded DuckDB JDBC driver on its classpath.
spark = (
    SparkSession.builder
    .config("spark.jars", "duckdb.jar")
    .getOrCreate()
)

In [None]:
from pyspark.sql import functions as F

We read the `weather` table from the DuckDB database.

In [None]:
# Load the full `weather` table from the DuckDB file through Spark's JDBC data source.
DF = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:duckdb:database.duckdb")
    .option("driver", "org.duckdb.DuckDBDriver")
    .option("query", "SELECT * FROM weather")
    .load()
)

# Preview the first rows.
DF.show()

+-------------------+--------------+--------------------+-------------------------+-------------+------------+-----------+---------------+---------------+----------------+----------+---------+---------+
|               date|temperature_2m|relative_humidity_2m|precipitation_probability|precipitation|weather_code|cloud_cover|cloud_cover_low|cloud_cover_mid|cloud_cover_high|  latitude|longitude|     city|
+-------------------+--------------+--------------------+-------------------------+-------------+------------+-----------+---------------+---------------+----------------+----------+---------+---------+
|2024-03-19 00:00:00|          8.05|                96.0|                      0.0|          0.0|         3.0|      100.0|           91.0|           98.0|            37.0|52.2930796|4.8124534|Amsterdam|
|2024-03-19 01:00:00|          8.45|                94.0|                      0.0|          0.0|         3.0|       98.0|           87.0|           91.0|            20.0|52.2930796|4.8124

# MISSINGS

In [None]:
def mirar_missings(data):
    """Print the number of null values in every column of a Spark DataFrame.

    All columns are counted in a single aggregation, so the data is scanned
    once instead of running a separate filter-and-count job per column.

    Args:
        data: Spark DataFrame to inspect.

    Returns:
        None. One line per column is printed, in column order.
    """
    columns = data.columns
    if not columns:
        # Nothing to report (and agg() needs at least one expression).
        return

    # F.count ignores nulls, so counting a marker that is only set where the
    # column is null yields the number of missing values (0 for an empty frame).
    null_counters = [
        F.count(F.when(F.col(c).isNull(), F.lit(1))).alias(f"missing_{i}")
        for i, c in enumerate(columns)
    ]
    na_counts = data.agg(*null_counters).first()

    for c, v in zip(columns, na_counts):
        print(f"Column '{c}' has {v} missings.")

# Report the number of missing values per column of the freshly loaded data
mirar_missings(DF)

Column 'date' has 0 missings.
Column 'temperature_2m' has 0 missings.
Column 'relative_humidity_2m' has 0 missings.
Column 'precipitation_probability' has 552000 missings.
Column 'precipitation' has 0 missings.
Column 'weather_code' has 0 missings.
Column 'cloud_cover' has 0 missings.
Column 'cloud_cover_low' has 0 missings.
Column 'cloud_cover_mid' has 0 missings.
Column 'cloud_cover_high' has 0 missings.
Column 'latitude' has 0 missings.
Column 'longitude' has 0 missings.
Column 'city' has 0 missings.


We see that there are no missing values in the dataset except for the column `precipitation_probability`, which has 552000 missing values. We will first fill these missing values with a simple rule and, if some are still missing afterwards, with the mean of that day.

In [None]:
# A missing precipitation probability on a row whose precipitation is 0 is set to 0;
# every other row keeps its current probability (which may still be null).
probability_is_missing = F.col("precipitation_probability").isNull()
no_precipitation = F.col("precipitation") == 0.0

DF = DF.withColumn(
    "precipitation_probability",
    F.when(probability_is_missing & no_precipitation, 0.0)
     .otherwise(F.col("precipitation_probability")),
)

In [None]:
# Re-count the missing values after the zero-precipitation fill
mirar_missings(DF)

Column 'date' has 0 missings.
Column 'temperature_2m' has 0 missings.
Column 'relative_humidity_2m' has 0 missings.
Column 'precipitation_probability' has 78714 missings.
Column 'precipitation' has 0 missings.
Column 'weather_code' has 0 missings.
Column 'cloud_cover' has 0 missings.
Column 'cloud_cover_low' has 0 missings.
Column 'cloud_cover_mid' has 0 missings.
Column 'cloud_cover_high' has 0 missings.
Column 'latitude' has 0 missings.
Column 'longitude' has 0 missings.
Column 'city' has 0 missings.


We see that there are still missing values in the column `precipitation_probability` after setting the probability to 0 wherever no precipitation was recorded. We will fill the remaining missing values with the mean of the same day at the same location.

In [None]:
# Add the calendar date of each timestamp; it is the key for the per-day means.
DF = DF.withColumn("date_only", F.to_date(F.col("date")))

In [None]:
# Measurement columns whose per-day, per-location mean is computed;
# each result column is named "mean_<column>".
daily_mean_inputs = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation_probability",
    "precipitation",
    "weather_code",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
]

mean_values = DF.groupBy("date_only", "latitude", "longitude").agg(
    *[F.mean(name).alias(f"mean_{name}") for name in daily_mean_inputs]
)
mean_values.show()

+----------+------------------+------------------+-------------------+-------------------------+------------------------------+--------------------+------------------+------------------+--------------------+--------------------+---------------------+
| date_only|          latitude|         longitude|mean_temperature_2m|mean_relative_humidity_2m|mean_precipitation_probability|  mean_precipitation| mean_weather_code|  mean_cloud_cover|mean_cloud_cover_low|mean_cloud_cover_mid|mean_cloud_cover_high|
+----------+------------------+------------------+-------------------+-------------------------+------------------------------+--------------------+------------------+------------------+--------------------+--------------------+---------------------+
|2024-03-27|        52.2930796| 4.837716557894737| 10.151583671569824|        80.70833333333333|             7.791666666666667| 0.03750000149011612| 8.833333333333334| 94.83333333333333|  61.333333333333336|   68.08333333333333|               77.3

In [None]:
# Attach the per-day, per-location means to every hourly row (left join keeps all rows of DF).
join_keys = ["date_only", "latitude", "longitude"]
DF = DF.join(mean_values, on=join_keys, how="left")

In [None]:
columns_to_fill = [
    "temperature_2m",
    "relative_humidity_2m",
    "precipitation_probability",
    "precipitation",
    "weather_code",
    "cloud_cover",
    "cloud_cover_low",
    "cloud_cover_mid",
    "cloud_cover_high",
]

# For each measurement keep the original value and fall back to the
# joined "mean_<column>" value only where the original is null.
for name in columns_to_fill:
    DF = DF.withColumn(name, F.coalesce(F.col(name), F.col(f"mean_{name}")))

# The helper columns (all means plus date_only) are no longer needed.
columns_to_drop = [f"mean_{name}" for name in columns_to_fill] + ["date_only"]
DF = DF.drop(*columns_to_drop)

In [None]:
# Re-count the missing values after filling from the daily means
mirar_missings(DF)

Column 'latitude' has 0 missings.
Column 'longitude' has 0 missings.
Column 'date' has 0 missings.
Column 'temperature_2m' has 0 missings.
Column 'relative_humidity_2m' has 0 missings.
Column 'precipitation_probability' has 0 missings.
Column 'precipitation' has 0 missings.
Column 'weather_code' has 0 missings.
Column 'cloud_cover' has 0 missings.
Column 'cloud_cover_low' has 0 missings.
Column 'cloud_cover_mid' has 0 missings.
Column 'cloud_cover_high' has 0 missings.
Column 'city' has 0 missings.


We see that there are no more missings in the dataset.

# CANVI DE VARIABLES: weather_code

Aquesta variable és numèrica, tot i que representa un estat del cel, per tant, canviarem la variable *weather_code* per *weather_description*, una variable que explicarà millor l'estat del cel.

> Informació utilitzada: [https://open-meteo.com/en/docs/](https://open-meteo.com/en/docs/)





In [None]:
# Number of rows per weather code, listed in ascending code order
weather_code_counts = (
    DF.groupBy("weather_code")
    .count()
    .orderBy("weather_code")
)
weather_code_counts.show()

+------------+------+
|weather_code| count|
+------------+------+
|         0.0|225354|
|         1.0|166518|
|         2.0|244533|
|         3.0|719045|
|        45.0|  5447|
|        51.0|110978|
|        53.0| 15782|
|        55.0|  4103|
|        61.0| 28331|
|        63.0|   610|
|        80.0| 14756|
|        95.0|   543|
+------------+------+



In [None]:
# Translate the numeric weather code into a textual description.
# Codes 53, 55, 61 and 63 share one label; any code not listed becomes "Desconocido".
weather_code_col = F.col("weather_code")

weather_description_expr = (
    F.when(weather_code_col == 0.0, "Sense Nuvols")
    .when(weather_code_col == 1.0, "Nuvols Low")
    .when(weather_code_col == 2.0, "Nuvols Medium")
    .when(weather_code_col == 3.0, "Nuvols High")
    .when(weather_code_col == 45.0, "Boira")
    .when(weather_code_col == 51.0, "Pluja Low")
    .when(weather_code_col.isin(53.0, 55.0, 61.0, 63.0), "Pluja Medium")
    .when(weather_code_col == 80.0, "Pluja High")
    .when(weather_code_col == 95.0, "Tempesta")
    .otherwise("Desconocido")
)

# Replace the numeric column by the descriptive one.
DF = DF.withColumn("weather_description", weather_description_expr).drop("weather_code")

In [None]:
# Number of rows per weather description, in alphabetical order
weather_code_counts = (
    DF.groupBy("weather_description")
    .count()
    .orderBy("weather_description")
)
weather_code_counts.show()

+-------------------+------+
|weather_description| count|
+-------------------+------+
|              Boira|  5447|
|        Nuvols High|719045|
|         Nuvols Low|166518|
|      Nuvols Medium|244533|
|         Pluja High| 14756|
|          Pluja Low|110978|
|       Pluja Medium| 48826|
|       Sense Nuvols|225354|
|           Tempesta|   543|
+-------------------+------+



# AGRUPACIÓ DE FILES

In order to simplify the problem, we will group the values by day/month/year and not by hour/day/month/year. To do this, some columns will be grouped by the mean and others by the sum.

In [None]:
from pyspark.sql.types import DateType, StringType, MapType, IntegerType

In [None]:
# Reduce the timestamp to its calendar date so all hourly rows of one day share a key.
DF = DF.withColumn('date', F.to_date('date'))

# One output row per (date, latitude, longitude): measurements are averaged,
# precipitation is summed, the city is taken from the first row of the group and
# every hourly weather description is collected into a list.
daily_aggregations = [
    F.first('latitude').alias('avg_latitude'),
    F.first('longitude').alias('avg_longitude'),
    F.mean('temperature_2m').alias('avg_temperature_2m'),
    F.mean('relative_humidity_2m').alias('avg_relative_humidity_2m'),
    F.mean('precipitation_probability').alias('avg_precipitation_probability'),
    F.sum('precipitation').alias('total_precipitation'),
    F.mean('cloud_cover').alias('avg_cloud_cover'),
    F.mean('cloud_cover_low').alias('avg_cloud_cover_low'),
    F.mean('cloud_cover_mid').alias('avg_cloud_cover_mid'),
    F.mean('cloud_cover_high').alias('avg_cloud_cover_high'),
    F.first('city').alias('city'),
    F.collect_list('weather_description').alias('weather_descriptions'),
]
DF = DF.groupBy('date', 'latitude', 'longitude').agg(*daily_aggregations)

In [None]:
def count_descriptions(descriptions):
    """Count how often each description occurs in a list.

    Args:
        descriptions: iterable of hashable description values.

    Returns:
        dict mapping each distinct description to its number of occurrences,
        with keys in order of first appearance.
    """
    # Seed every distinct description with zero, then tally the occurrences.
    tally = dict.fromkeys(descriptions, 0)
    for label in descriptions:
        tally[label] += 1
    return tally

# Wrap count_descriptions as a Spark UDF returning a map of description -> integer count
count_descriptions_udf = F.udf(
    count_descriptions,
    returnType=MapType(StringType(), IntegerType()),
)

# Turn each day's list of hourly descriptions into its per-description counts
DF = DF.withColumn(
    'weather_description_count',
    count_descriptions_udf(F.col('weather_descriptions')),
)

In [None]:
# Number of rows after grouping the hourly data by date and location
print(DF.count())

64000


Now, we have 64.000 rows in the dataset.

In [None]:
# Keep only the columns used from here on (this drops avg_latitude, avg_longitude
# and the raw weather_descriptions list).
columns_to_keep = [
    'date', 'city', 'latitude', 'longitude',
    'avg_temperature_2m', 'avg_relative_humidity_2m',
    'avg_precipitation_probability', 'total_precipitation',
    'avg_cloud_cover', 'avg_cloud_cover_low',
    'avg_cloud_cover_mid', 'avg_cloud_cover_high',
    'weather_description_count',
]
DF = DF.select(*columns_to_keep)

In [None]:
# Preview the daily aggregated rows
DF.show()

+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+-------------------------+
|      date|  city|         latitude|         longitude|avg_temperature_2m|avg_relative_humidity_2m|avg_precipitation_probability|total_precipitation|     avg_cloud_cover| avg_cloud_cover_low|avg_cloud_cover_mid|avg_cloud_cover_high|weather_description_count|
+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+-------------------------+
|2024-03-19|Athens|       33.8797677|        -83.456398| 7.064750048021476|                  32.375|                          0.0|                0.0|                 0.0|                 0.0|                0.0|        

Once we have the rows grouped by day/month/year, we have to make sure that the weather state is correctly maintained in the variable *weather_description_count*.

In order to do so, we will create a function that returns a *string* describing the weather state.

In [None]:
def estat_cel(description_map):
    """Summarise a day's weather as a comma-separated string of labels.

    Every description whose count is strictly greater than its threshold
    contributes its label, in the fixed order of the `conditions` table.
    When no description exceeds its threshold, the description with the
    highest count is used instead (its label if it is a known description,
    otherwise the raw key).

    Args:
        description_map: mapping of weather description -> number of
            occurrences for the day. May be empty or None.

    Returns:
        str: the labels joined by ", ", or "Condición no específica" when the
        map is empty/None or its highest count is not positive.
    """
    # description -> (label, minimum count that must be exceeded)
    conditions = {
        "Sense Nuvols": ("Despejado", 10),
        "Nuvols Low": ("Poco nublado", 6),
        "Nuvols Medium": ("Parcialmente nublado", 6),
        "Nuvols High": ("Nublado", 6),
        "Pluja Low": ("Lloviznas", 4),
        "Pluja Medium": ("Lluvias moderadas", 4),
        "Pluja High": ("Lluvias intensas", 4),
        "Boira": ("Niebla", 3),
        "Tempesta": ("Tormenta", 1)
    }
    fallback = "Condición no específica"

    # Without this guard an empty map makes max() raise ValueError and a None
    # map raises AttributeError, so the fallback label was never reached.
    if not description_map:
        return fallback

    # Collect the labels of all descriptions that exceed their threshold
    descriptions = [
        label
        for key, (label, threshold) in conditions.items()
        if description_map.get(key, 0) > threshold
    ]

    if not descriptions:
        # No threshold was exceeded: fall back to the most frequent description
        top_key, top_count = max(description_map.items(), key=lambda item: item[1])
        if top_count > 0:
            descriptions.append(conditions.get(top_key, (top_key, 0))[0])
        else:
            descriptions.append(fallback)

    return ', '.join(descriptions)

# Wrap estat_cel as a Spark UDF that returns a string
estat_cel_udf = F.udf(estat_cel, returnType=StringType())

# Derive the daily sky state from the per-day description counts
DF = DF.withColumn(
    "estat_cel",
    estat_cel_udf(F.col("weather_description_count")),
)

In [None]:
# Number of days per weather state, in alphabetical order (up to 50 rows, untruncated)
estat_cel_counts = (
    DF.groupBy("estat_cel")
    .count()
    .orderBy("estat_cel")
)
estat_cel_counts.show(n=50, truncate=False)

+-----------------------------------------------+-----+
|estat_cel                                      |count|
+-----------------------------------------------+-----+
|Despejado                                      |5156 |
|Despejado, Nublado                             |1533 |
|Despejado, Poco nublado                        |584  |
|Lloviznas                                      |3013 |
|Lloviznas, Lluvias moderadas                   |617  |
|Lloviznas, Niebla                              |5    |
|Lluvias intensas                               |342  |
|Lluvias moderadas                              |160  |
|Nublado                                        |28028|
|Nublado, Lloviznas                             |4485 |
|Nublado, Lloviznas, Lluvias intensas           |2    |
|Nublado, Lloviznas, Lluvias moderadas          |539  |
|Nublado, Lluvias intensas                      |326  |
|Nublado, Lluvias moderadas                     |2684 |
|Nublado, Niebla                                

Nevertheless, we see that many of the resulting categories are redundant or inconsistent, for example:

*   *Parcialmente nublado, Nublado, Lluvias intensas*

In those cases, we will decide whether the sky is partially cloudy or cloudy.

In [None]:
# Replacement dictionary: weather state produced by estat_cel -> simplified weather state.
# States that are not listed keep their value.
replacements = {
    # Mixed sky states collapse into a single cloudiness level
    "Despejado, Nublado": "Parcialmente nublado",
    "Despejado, Poco nublado": "Parcialmente nublado",
    # Two rain intensities collapse into one
    "Lloviznas, Lluvias moderadas": "Lluvias ligeras",
    "Nublado, Lloviznas, Lluvias intensas": "Nublado, Lluvias moderadas",
    "Nublado, Lloviznas, Lluvias moderadas": "Nublado, Lluvias moderadas",
    # "Poco nublado" is folded into "Parcialmente nublado"
    "Poco nublado": "Parcialmente nublado",
    "Poco nublado, Lloviznas": "Parcialmente nublado, Lloviznas",
    "Poco nublado, Lluvias moderadas": "Parcialmente nublado, Lluvias moderadas",
    "Poco nublado, Niebla": "Parcialmente nublado, Niebla",
    "Poco nublado, Nublado": "Parcialmente nublado",
    # With moderate rain the day is treated as cloudy
    "Poco nublado, Nublado, Lluvias moderadas": "Nublado, Lluvias moderadas",
    "Poco nublado, Parcialmente nublado": "Parcialmente nublado",
    "Poco nublado, Parcialmente nublado, Nublado": "Parcialmente nublado",
    # Partly cloudy + cloudy: keep the lower cloudiness level ...
    "Parcialmente nublado, Nublado": "Parcialmente nublado",
    "Parcialmente nublado, Nublado, Lloviznas": "Parcialmente nublado, Lloviznas",
    # ... except with intense rain or fog, where the day is treated as cloudy
    "Parcialmente nublado, Nublado, Lluvias intensas": "Nublado, Lluvias intensas",
    "Parcialmente nublado, Nublado, Niebla": "Nublado, Niebla"
}

# Build one flat CASE WHEN expression: the first matching key yields its
# replacement, and rows without a match keep their current value.
current_state = F.col("estat_cel")
pending_pairs = iter(replacements.items())
old_state, new_state = next(pending_pairs)
replacement_expr = F.when(current_state == old_state, new_state)
for old_state, new_state in pending_pairs:
    replacement_expr = replacement_expr.when(current_state == old_state, new_state)
replacement_expr = replacement_expr.otherwise(current_state)

# Apply the replacements and discard the per-description counts
DF = DF.withColumn("estat_cel", replacement_expr)
DF = DF.drop("weather_description_count")

In [None]:
# Number of days per weather state after the replacements (up to 30 rows, untruncated)
estat_cel_counts = (
    DF.groupBy("estat_cel")
    .count()
    .orderBy("estat_cel")
)
estat_cel_counts.show(n=30, truncate=False)

+---------------------------------------+-----+
|estat_cel                              |count|
+---------------------------------------+-----+
|Despejado                              |5156 |
|Lloviznas                              |3013 |
|Lloviznas, Niebla                      |5    |
|Lluvias intensas                       |342  |
|Lluvias ligeras                        |617  |
|Lluvias moderadas                      |160  |
|Nublado                                |28028|
|Nublado, Lloviznas                     |4485 |
|Nublado, Lluvias intensas              |347  |
|Nublado, Lluvias moderadas             |3365 |
|Nublado, Niebla                        |367  |
|Nublado, Tormenta                      |181  |
|Parcialmente nublado                   |16779|
|Parcialmente nublado, Lloviznas        |294  |
|Parcialmente nublado, Lluvias intensas |418  |
|Parcialmente nublado, Lluvias moderadas|204  |
|Parcialmente nublado, Niebla           |239  |
+---------------------------------------

In [None]:
# Preview the final dataset
DF.show()

+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+---------+
|      date|  city|         latitude|         longitude|avg_temperature_2m|avg_relative_humidity_2m|avg_precipitation_probability|total_precipitation|     avg_cloud_cover| avg_cloud_cover_low|avg_cloud_cover_mid|avg_cloud_cover_high|estat_cel|
+----------+------+-----------------+------------------+------------------+------------------------+-----------------------------+-------------------+--------------------+--------------------+-------------------+--------------------+---------+
|2024-03-19|Athens|       33.8797677|        -83.456398| 7.064750048021476|                  32.375|                          0.0|                0.0|                 0.0|                 0.0|                0.0|                 0.0|Despejado|
|2024-03-19|Athens|     

We save the dataset

In [None]:
# Open and close a DuckDB connection to the output database before Spark
# connects to it through JDBC (presumably to make sure the file exists).
conn = duckdb.connect("quality_database.duckdb")
conn.close()

# Write the cleaned dataset to the `weather` table.
# mode("overwrite") replaces a table left over from a previous run; with the
# default save mode the write raises an error when the table already exists,
# so the notebook could not be re-run from a fresh kernel.
DF.write \
    .format("jdbc") \
    .option("url", "jdbc:duckdb:quality_database.duckdb") \
    .option("dbtable", "weather") \
    .option("driver", "org.duckdb.DuckDBDriver") \
    .mode("overwrite") \
    .save()