In [24]:
from pyspark.sql.functions import radians, sin, cos, sqrt, atan2, col, lit
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import sum as sql_sum
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
from pyspark.sql.functions import date_format



# Create SparkSession
spark = SparkSession.builder.appName("ChallengerTemperatureAnalysis").getOrCreate()

# Define haversine distance UDF
def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # Earth radius in kilometers
    dlon = radians(lon2) - radians(lon1)
    dlat = radians(lat2) - radians(lat1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance

# Define inverse distance weighting UDF
def idw(distances, temperatures):
    weights = 1 / distances
    weighted_temps = temperatures * weights
    return sql_sum(weighted_temps) / sql_sum(weights)


# Read station and temperature data
stations = spark.read.csv("../data/stations.csv", header=True, inferSchema=True)
temperatures = spark.read.csv("../data/1986.csv", header=False, inferSchema=True) \
                     .toDF("station_id", "wban_id", "month", "day", "temperature")

# Filter and clean up the data
stations = stations.filter((stations["latitude"].isNotNull()) & (stations["longitude"].isNotNull()))
temperatures = temperatures.filter((temperatures["temperature"].isNotNull()) & (temperatures["month"] == 1) & (temperatures["day"] == 28))

# Join stations and temperatures data
joined = temperatures.join(stations, ["station_id", "wban_id"])

# Compute distances from each station to Cape Canaveral (28.3922° N, 80.6077° W)
lat_cc = lit(28.3922)
lon_cc = lit(-80.6077)
joined = joined.withColumn("distance", haversine(lat_cc, lon_cc, joined["latitude"], joined["longitude"]).cast(DoubleType()))

# Filter for stations within 100 km of Cape Canaveral
joined = joined.filter(joined["distance"] <= 100)

# Compute IDW temperature at Cape Canaveral on January 28, 1986
idw_temp = joined.groupby().agg(idw(col("distance"), col("temperature")).alias("IDW_Temperature")).collect()[0]["IDW_Temperature"]

# Print result
print("The estimated temperature at Cape Canaveral on January 28, 1986, using inverse distance weighting, is {:.2f} degrees F.".format(idw_temp))




The estimated temperature at Cape Canaveral on January 28, 1986, using inverse distance weighting, is 34.46 degrees F.


                                                                                

In [27]:
# Aggregate temperature data by date

daily_temps = joined.filter(joined["temperature"].isNotNull()) \
                   .groupBy(date_format("date", "yyyy-MM-dd").alias("date")) \
                   .agg(sql_sum(col("temperature")).alias("total_temperature"), sql_sum(col("distance")).alias("total_distance")) \
                   .withColumn("average_temperature", col("total_temperature") / col("total_distance")) \
                   .orderBy("date")

# Extract the temperatures and days from the daily_temps DataFrame
temps = daily_temps.select("average_temperature").collect()
days = daily_temps.select(date_format("day", "d").alias("day")).collect()

# Plot the temperatures for each day in January 1986
plt.plot(days, temps)
plt.title("Temperature in January 1986 at Cape Canaveral")
plt.xlabel("Day of the Month")
plt.ylabel("Temperature (F)")
plt.show()




AnalysisException: Column 'date' does not exist. Did you mean one of the following? [day, month, distance, latitude, wban_id, longitude, station_id, temperature];
'Aggregate [date_format('date, yyyy-MM-dd, Some(America/New_York))], [date_format('date, yyyy-MM-dd, Some(America/New_York)) AS date#684, sum(cast(temperature#589 as double)) AS total_temperature#694, sum(distance#604) AS total_distance#696]
+- Filter isnotnull(temperature#589)
   +- Filter (distance#604 <= cast(100 as double))
      +- Project [station_id#585, wban_id#586, month#587, day#588, temperature#589, latitude#552, longitude#553, cast(((ATAN2(SQRT((POWER(SIN(((RADIANS(latitude#552) - RADIANS(28.3922)) / cast(2 as double))), cast(2 as double)) + ((COS(RADIANS(28.3922)) * COS(RADIANS(latitude#552))) * POWER(SIN(((RADIANS(longitude#553) - RADIANS(-80.6077)) / cast(2 as double))), cast(2 as double))))), SQRT((cast(1 as double) - (POWER(SIN(((RADIANS(latitude#552) - RADIANS(28.3922)) / cast(2 as double))), cast(2 as double)) + ((COS(RADIANS(28.3922)) * COS(RADIANS(latitude#552))) * POWER(SIN(((RADIANS(longitude#553) - RADIANS(-80.6077)) / cast(2 as double))), cast(2 as double))))))) * cast(2 as double)) * 6371.0) as double) AS distance#604]
         +- Project [station_id#585, wban_id#586, month#587, day#588, temperature#589, latitude#552, longitude#553]
            +- Join Inner, ((cast(station_id#585 as int) = station_id#550) AND (cast(wban_id#586 as int) = wban_id#551))
               :- Filter ((isnotnull(temperature#589) AND (cast(month#587 as int) = 1)) AND (cast(day#588 as int) = 28))
               :  +- Project [_c0#575 AS station_id#585, _c1#576 AS wban_id#586, _c2#577 AS month#587, _c3#578 AS day#588, _c4#579 AS temperature#589]
               :     +- Relation [_c0#575,_c1#576,_c2#577,_c3#578,_c4#579] csv
               +- Filter (isnotnull(latitude#552) AND isnotnull(longitude#553))
                  +- Relation [station_id#550,wban_id#551,latitude#552,longitude#553] csv


In [22]:
print(daily_temps.count())




1


                                                                                