# 1. Sample Data

In [11]:
from datetime import datetime, timezone

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType

## Create schema

In [1]:
# Build the Spark schema from ../features.txt, which holds one column name per
# line. The order of the lines becomes the column order of the schema.
feats = []
with open('../features.txt') as f:  # context manager closes the file handle when done
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            feats.append(StructField(line.strip(), LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(line.strip(), StringType(), True))
        else:
            # Other features
            feats.append(StructField(line.strip(), FloatType(), True))

schema = StructType(feats)

print(schema)

StructType(List(StructField(Timestamp,LongType,true),StructField(Geohash,StringType,true),StructField(geopotential_height_lltw,FloatType,true),StructField(water_equiv_of_accum_snow_depth_surface,FloatType,true),StructField(drag_coefficient_surface,FloatType,true),StructField(sensible_heat_net_flux_surface,FloatType,true),StructField(categorical_ice_pellets_yes1_no0_surface,FloatType,true),StructField(visibility_surface,FloatType,true),StructField(number_of_soil_layers_in_root_zone_surface,FloatType,true),StructField(categorical_freezing_rain_yes1_no0_surface,FloatType,true),StructField(pressure_reduced_to_msl_msl,FloatType,true),StructField(upward_short_wave_rad_flux_surface,FloatType,true),StructField(relative_humidity_zerodegc_isotherm,FloatType,true),StructField(categorical_snow_yes1_no0_surface,FloatType,true),StructField(u-component_of_wind_tropopause,FloatType,true),StructField(surface_wind_gust_surface,FloatType,true),StructField(total_cloud_cover_entire_atmosphere,FloatType,tru

## Load sample data

In [2]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:13030/nam_tiny.tdv')
df.take(1)

CPU times: user 13.8 ms, sys: 1.12 ms, total: 14.9 ms
Wall time: 8.35 s


## Highest temperature

In [7]:
# Reduce the frame to a single row holding the maximum of temperature_surface.
max_temperature_df = df.agg({'temperature_surface': 'max'})
max_row = max_temperature_df.collect()[0]

# The dict-style aggregation names its result column 'max(<column>)'.
hotest = max_row['max(temperature_surface)']
hotest

306.4980163574219

## Hottest day

In [13]:
# Register the DataFrame as a temporary view so it can be queried with SQL
df.createOrReplaceTempView("TINY_NAM")

# Fetch the timestamp of every row whose surface temperature equals the maximum
hotest_time = spark.sql(f'SELECT Timestamp FROM TINY_NAM WHERE temperature_surface = {hotest}').collect()
print(f'Hostest day count = {len(hotest_time)}')

# The stored timestamp is in milliseconds; the datetime conversion needs seconds
unix_timestamp = hotest_time[0]['Timestamp'] / 1000
print(f'Unix Timestamp = {unix_timestamp}')

# datetime.utcfromtimestamp() is deprecated since Python 3.12; the timezone-aware
# call below produces the same UTC date and time.
hotest_date = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
print(f'Hostest date = {hotest_date}')

Hostest day count = 1
Unix Timestamp = 1426377600.0
Hostest date = 2015-03-15 00:00:00


* We can observe that the highest temperature in the sample dataset is **306.5** degrees Fahrenheit.
* That happened on **March 15th, 2015**.
* The result is **truly an anomaly** since there is no way the temperature would get that high.
* I divided the Unix timestamp by 1000 (dropping its three trailing zeros) because it is stored in milliseconds, while Python's datetime conversion expects seconds.