In [1]:
from pyspark.sql import SparkSession, functions as func
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [2]:
# Obtain the SparkSession for this notebook: reuses an active session if one
# exists, otherwise creates one registered under the app name below.
spark = (
    SparkSession.builder
    .appName("MinMaxTemperature")
    .getOrCreate()
)

24/11/25 20:02:27 WARN Utils: Your hostname, Ngas-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.115 instead (on interface en0)
24/11/25 20:02:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/25 20:02:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Column layout of the CSV, declared explicitly instead of being inferred.
# Every column is nullable.
schema = (
    StructType()
    .add("stationID", StringType(), True)
    .add("date", IntegerType(), True)
    .add("measure_type", StringType(), True)
    .add("temperature", IntegerType(), True)
)

# Load the file using that schema and preview the first rows.
df = spark.read.csv("../data/1800.csv", schema=schema)
df.show()

+-----------+--------+------------+-----------+
|  stationID|    date|measure_type|temperature|
+-----------+--------+------------+-----------+
|ITE00100554|18000101|        TMAX|        -75|
|ITE00100554|18000101|        TMIN|       -148|
|GM000010962|18000101|        PRCP|          0|
|EZE00100082|18000101|        TMAX|        -86|
|EZE00100082|18000101|        TMIN|       -135|
|ITE00100554|18000102|        TMAX|        -60|
|ITE00100554|18000102|        TMIN|       -125|
|GM000010962|18000102|        PRCP|          0|
|EZE00100082|18000102|        TMAX|        -44|
|EZE00100082|18000102|        TMIN|       -130|
|ITE00100554|18000103|        TMAX|        -23|
|ITE00100554|18000103|        TMIN|        -46|
|GM000010962|18000103|        PRCP|          4|
|EZE00100082|18000103|        TMAX|        -10|
|EZE00100082|18000103|        TMIN|        -73|
|ITE00100554|18000104|        TMAX|          0|
|ITE00100554|18000104|        TMIN|        -13|
|GM000010962|18000104|        PRCP|     

In [11]:
# Keep only the TMIN rows, narrowed to the two columns the aggregation needs.
station_temps = (
    df.where(func.col("measure_type") == "TMIN")
    .select("stationID", "temperature")
)
# Lowest recorded value per station.
min_temps = (
    station_temps
    .groupBy("stationID")
    .agg(func.min("temperature").alias("min_temperature"))
)
min_temps.show()

+-----------+---------------+
|  stationID|min_temperature|
+-----------+---------------+
|ITE00100554|           -148|
|EZE00100082|           -135|
+-----------+---------------+



In [12]:
# Filter out all but TMAX entries, then select only stationID and temperature.
# (The filtered rows are no longer bound to `max_temps` first, so that name
# only ever refers to the per-station aggregate.)
station_temps = df.filter(df.measure_type == "TMAX").select("stationID", "temperature")
# Aggregation: highest recorded value per station
max_temps = station_temps.groupBy("stationID").agg(func.max("temperature").alias("max_temperature"))
max_temps.show()

+-----------+---------------+
|  stationID|max_temperature|
+-----------+---------------+
|ITE00100554|            323|
|EZE00100082|            323|
+-----------+---------------+



In [13]:
# Append a Fahrenheit column to the per-station minimums.
# The stored value is divided by 10 before the Celsius -> Fahrenheit formula
# (c * 1.8 + 32) is applied, i.e. it is treated as tenths of a degree Celsius.
# The result is rounded to 2 decimal places.
min_temps_by_station_f = min_temps.select(
    "*",
    func.round(
        min_temps["min_temperature"] / 10 * 1.8 + 32,
        2,
    ).alias("temperature_f"),
)
min_temps_by_station_f.show()

+-----------+---------------+-------------+
|  stationID|min_temperature|temperature_f|
+-----------+---------------+-------------+
|ITE00100554|           -148|         5.36|
|EZE00100082|           -135|          7.7|
+-----------+---------------+-------------+



In [14]:
# Stop the Spark session now that the analysis is finished.
spark.stop()