In [25]:
import csv
from tabulate import tabulate

# Define the file path
file_path = "temp1800.csv"

# Dictionary to hold maximum temperatures for each month
max_temperatures = {"January": {"Temperature": float("-inf"), "Date": "", "Station Name": ""},
                    "April": {"Temperature": float("-inf"), "Date": "", "Station Name": ""},
                    "July": {"Temperature": float("-inf"), "Date": "", "Station Name": ""},
                    "October": {"Temperature": float("-inf"), "Date": "", "Station Name": ""}}

# Read the CSV file and process the data
with open(file_path, mode='r') as file:
    reader = csv.reader(file)
    for row in reader:
        # Extract relevant information
        station_id, date, element, temperature, *_ = row  # Taking only the first four columns
        
        # Convert temperature to Celsius
        temperature_celsius = int(temperature) / 10
        
        # Extract month from the date
        month = int(date[4:6])
        
        # Only process if the month is January, April, July, or October
        if month in [1, 4, 7, 10]:
            month_name = {1: "January", 4: "April", 7: "July", 10: "October"}[month]
            # Update maximum temperature for each month
            if element == "TMAX" and temperature_celsius > max_temperatures[month_name]["Temperature"]:
                max_temperatures[month_name]["Temperature"] = temperature_celsius
                max_temperatures[month_name]["Date"] = date
                max_temperatures[month_name]["Station Name"] = station_id

# Prepare data for tabulate
data = [["Month"] + list(max_temperatures.keys())]
for param in ["Temperature", "Date", "Station Name"]:
    row = [param]
    for month in max_temperatures:
        row.append(max_temperatures[month][param])
    data.append(row)

# Print the result using tabulate
print(tabulate(data, tablefmt="grid"))


+--------------+-------------+-------------+-------------+-------------+
| Month        | January     | April       | July        | October     |
+--------------+-------------+-------------+-------------+-------------+
| Temperature  | 9.8         | 28.6        | 31.9        | 20.0        |
+--------------+-------------+-------------+-------------+-------------+
| Date         | 18000125    | 18000429    | 18000710    | 18001003    |
+--------------+-------------+-------------+-------------+-------------+
| Station Name | ITE00100554 | EZE00100082 | ITE00100554 | ITE00100554 |
+--------------+-------------+-------------+-------------+-------------+


In [30]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("MaxTemperatures") \
    .getOrCreate()

# Define the file path
file_path = "temp1800.csv"

# Read CSV into a DataFrame, considering only the first four columns
df = spark.read.csv(file_path, header=False).select("_c0", "_c1", "_c2", "_c3")

# Define schema
schema = ["station_id", "date", "element", "temperature"]

# Assign schema to DataFrame
df = df.toDF(*schema)

# Convert temperature to Celsius
df = df.withColumn("temperature_celsius", col("temperature").cast("float") / 10)

# Extract month from date
df = df.withColumn("month", col("date").substr(5, 2).cast("int"))

# Filter for the required months
df = df.filter(col("month").isin([1, 4, 7, 10]))

# Filter for TMAX records
df = df.filter(col("element") == "TMAX")

# Group by month and find maximum temperature
max_temperatures = df.groupBy("month") \
    .agg(spark_max("temperature_celsius").alias("max_temperature")) \
    .orderBy("month")

# Show the result
max_temperatures.show()

# Stop SparkSession
spark.stop()


                                                                                

+-----+---------------+
|month|max_temperature|
+-----+---------------+
|    1|            9.8|
|    4|           28.6|
|    7|           31.9|
|   10|           20.0|
+-----+---------------+



In [33]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max, when, lit

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("MaxTemperatures") \
    .getOrCreate()

# Define the file path
file_path = "temp1800.csv"

# Read CSV into a DataFrame, considering only the first four columns
df = spark.read.csv(file_path, header=False).select("_c0", "_c1", "_c2", "_c3")

# Define schema
schema = ["station_id", "date", "element", "temperature"]

# Assign schema to DataFrame
df = df.toDF(*schema)

# Convert temperature to Celsius
df = df.withColumn("temperature_celsius", col("temperature").cast("float") / 10)

# Extract month from date
df = df.withColumn("month", col("date").substr(5, 2).cast("int"))

# Filter for TMAX records
df = df.filter(col("element") == "TMAX")

# Create DataFrame to hold maximum temperatures for each month
max_temperatures = spark.createDataFrame([
    ("January", float("-inf"), "", ""),
    ("April", float("-inf"), "", ""),
    ("July", float("-inf"), "", ""),
    ("October", float("-inf"), "", "")
], ["Month", "Temperature", "Date", "Station Name"])

# Iterate over months to find maximum temperatures
for month in ["January", "April", "July", "October"]:
    month_num = {"January": 1, "April": 4, "July": 7, "October": 10}[month]
    temp_df = df.filter(col("month") == month_num)
    max_temp = temp_df.agg(spark_max("temperature_celsius")).collect()[0][0]
    max_temp_row = temp_df.filter(col("temperature_celsius") == max_temp).first()
    max_temperatures = max_temperatures.withColumn(month, lit(max_temp)) \
        .withColumn("Date", when(col("Month") == month, max_temp_row["date"])) \
        .withColumn("Station Name", when(col("Month") == month, max_temp_row["station_id"]))

# Show the result
max_temperatures.show(truncate=False)

# Stop SparkSession
spark.stop()



                                                                                

+-------+-----------+--------+------------+-------+-----+----+-------+
|Month  |Temperature|Date    |Station Name|January|April|July|October|
+-------+-----------+--------+------------+-------+-----+----+-------+
|January|-Infinity  |null    |null        |9.8    |28.6 |31.9|20.0   |
|April  |-Infinity  |null    |null        |9.8    |28.6 |31.9|20.0   |
|July   |-Infinity  |null    |null        |9.8    |28.6 |31.9|20.0   |
|October|-Infinity  |18001003|ITE00100554 |9.8    |28.6 |31.9|20.0   |
+-------+-----------+--------+------------+-------+-----+----+-------+



In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max, when, lit

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("MaxTemperatures") \
    .getOrCreate()

# Define the file path
file_path = "temp1800.csv"

# Read CSV into a DataFrame, considering only the first four columns
df = spark.read.csv(file_path, header=False).select("_c0", "_c1", "_c2", "_c3")

# Define schema
schema = ["station_id", "date", "element", "temperature"]

# Assign schema to DataFrame
df = df.toDF(*schema)

# Convert temperature to Celsius
df = df.withColumn("temperature_celsius", col("temperature").cast("float") / 10)

# Extract month from date
df = df.withColumn("month", col("date").substr(5, 2).cast("int"))

# Filter for TMAX records
df = df.filter(col("element") == "TMAX")

# Create DataFrame to hold maximum temperatures for each month
max_temperatures = spark.createDataFrame([
    ("Temperature", "", "", "", ""),
    ("Date", "", "", "", ""),
    ("Station Name", "", "", "", "")
], ["Month", "January", "April", "July", "October"])

# Iterate over months to find maximum temperatures
for month in ["January", "April", "July", "October"]:
    month_num = {"January": 1, "April": 4, "July": 7, "October": 10}[month]
    temp_df = df.filter(col("month") == month_num)
    max_temp = temp_df.agg(spark_max("temperature_celsius")).collect()[0][0]
    max_temp_row = temp_df.filter(col("temperature_celsius") == max_temp).first()
    max_temperatures = max_temperatures.withColumn(month, lit(max_temp)) \
        .withColumn("Date", when(col("Month") == month, max_temp_row["date"])) \
        .withColumn("Station Name", when(col("Month") == month, max_temp_row["station_id"]))

# Show the result
max_temperatures.show(truncate=False)

# Stop SparkSession
spark.stop()


                                                                                

+------------+-------+-----+----+-------+----+------------+
|Month       |January|April|July|October|Date|Station Name|
+------------+-------+-----+----+-------+----+------------+
|Temperature |9.8    |28.6 |31.9|20.0   |null|null        |
|Date        |9.8    |28.6 |31.9|20.0   |null|null        |
|Station Name|9.8    |28.6 |31.9|20.0   |null|null        |
+------------+-------+-----+----+-------+----+------------+



In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max as spark_max, when, lit
from tabulate import tabulate

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("MaxTemperatures") \
    .getOrCreate()

# Define the file path
file_path = "temp1800.csv"

# Read CSV into a DataFrame, considering only the first four columns
df = spark.read.csv(file_path, header=False).select("_c0", "_c1", "_c2", "_c3")

# Define schema
schema = ["station_id", "date", "element", "temperature"]

# Assign schema to DataFrame
df = df.toDF(*schema)

# Convert temperature to Celsius
df = df.withColumn("temperature_celsius", col("temperature").cast("float") / 10)

# Extract month from date
df = df.withColumn("month", col("date").substr(5, 2).cast("int"))

# Filter for TMAX records
df = df.filter(col("element") == "TMAX")

# Dictionary to hold maximum temperatures for each month
max_temperatures = {"Jan": {"Temperature": float("-inf"), "Date": "", "Station Name": ""},
                    "Apr": {"Temperature": float("-inf"), "Date": "", "Station Name": ""},
                    "Jul": {"Temperature": float("-inf"), "Date": "", "Station Name": ""},
                    "Oct": {"Temperature": float("-inf"), "Date": "", "Station Name": ""}}

# Iterate over months to find maximum temperatures
for month in ["Jan", "Apr", "Jul", "Oct"]:
    month_num = {"Jan": 1, "Apr": 4, "Jul": 7, "Oct": 10}[month]
    temp_df = df.filter(col("month") == month_num)
    max_temp = temp_df.agg(spark_max("temperature_celsius")).collect()[0][0]
    max_temp_row = temp_df.filter(col("temperature_celsius") == max_temp).first()

    # Update maximum temperature for each month
    max_temperatures[month]["Temperature"] = max_temp
    max_temperatures[month]["Date"] = max_temp_row["date"]
    max_temperatures[month]["Station Name"] = max_temp_row["station_id"]

# Prepare data for printing
data = [["Month"] + list(max_temperatures.keys())]
for param in ["Temperature", "Date", "Station Name"]:
    row = [param]
    for month in max_temperatures:
        row.append(max_temperatures[month][param])
    data.append(row)

# Print the result using tabulate
print(tabulate(data, tablefmt="grid"))

# Stop SparkSession
spark.stop()


+--------------+-------------+-------------+-------------+-------------+
| Month        | Jan         | Apr         | Jul         | Oct         |
+--------------+-------------+-------------+-------------+-------------+
| Temperature  | 9.8         | 28.6        | 31.9        | 20.0        |
+--------------+-------------+-------------+-------------+-------------+
| Date         | 18000125    | 18000429    | 18000710    | 18001003    |
+--------------+-------------+-------------+-------------+-------------+
| Station Name | ITE00100554 | EZE00100082 | ITE00100554 | ITE00100554 |
+--------------+-------------+-------------+-------------+-------------+
