# 1. Sample Dataset

In [1]:
from datetime import datetime, timezone

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType

## Create schema

In [2]:
# Build the Spark schema from the feature list file, which holds one column
# name per line. Line 0 is the timestamp (long), line 1 the geohash (string),
# and every remaining line becomes a nullable float column.
feats = []
# Read inside a context manager so the file handle is closed once all the
# column names have been consumed (the original left it open).
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            feats.append(StructField(line.strip(), LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(line.strip(), StringType(), True))
        else:
            # Other features
            feats.append(StructField(line.strip(), FloatType(), True))

schema = StructType(feats)

print(schema)

StructType(List(StructField(Timestamp,LongType,true),StructField(Geohash,StringType,true),StructField(geopotential_height_lltw,FloatType,true),StructField(water_equiv_of_accum_snow_depth_surface,FloatType,true),StructField(drag_coefficient_surface,FloatType,true),StructField(sensible_heat_net_flux_surface,FloatType,true),StructField(categorical_ice_pellets_yes1_no0_surface,FloatType,true),StructField(visibility_surface,FloatType,true),StructField(number_of_soil_layers_in_root_zone_surface,FloatType,true),StructField(categorical_freezing_rain_yes1_no0_surface,FloatType,true),StructField(pressure_reduced_to_msl_msl,FloatType,true),StructField(upward_short_wave_rad_flux_surface,FloatType,true),StructField(relative_humidity_zerodegc_isotherm,FloatType,true),StructField(categorical_snow_yes1_no0_surface,FloatType,true),StructField(u-component_of_wind_tropopause,FloatType,true),StructField(surface_wind_gust_surface,FloatType,true),StructField(total_cloud_cover_entire_atmosphere,FloatType,tru

## Load sample data

In [3]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:13030/nam_tiny.tdv')
df.take(1)

CPU times: user 8.25 ms, sys: 4.69 ms, total: 12.9 ms
Wall time: 7.17 s


## Highest temperature

In [4]:
# An ungrouped aggregation yields exactly one row, so take it directly
# and read the maximum out of its only column.
max_row = df.agg({'temperature_surface': 'max'}).first()
hotest = max_row['max(temperature_surface)']
hotest

306.4980163574219

## Hottest day

In [5]:
# Creating an SQL 'table'
df.createOrReplaceTempView("TINY_NAM")

# Getting the timestamp value of the highest temperature_surface row.
hotest_time = spark.sql(f'SELECT Timestamp FROM TINY_NAM WHERE temperature_surface = {hotest}').collect()
print(f'Hostest day count = {len(hotest_time)}')

# The Timestamp column is in milliseconds; datetime works in seconds.
unix_timestamp = hotest_time[0]['Timestamp'] / 1000
print(f'Unix Timestamp = {unix_timestamp}')

# Convert via a timezone-aware UTC datetime: datetime.utcfromtimestamp() is
# deprecated since Python 3.12. The formatted string is unchanged.
hotest_date = datetime.fromtimestamp(unix_timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
print(f'Hostest date = {hotest_date}')

Hostest day count = 1
Unix Timestamp = 1426377600.0
Hostest date = 2015-03-15 00:00:00


## Describe dataset

In [6]:
# Summary statistics (count, mean, stddev, min, max) for the surface temperature column.
temperature_summary = df.describe(['temperature_surface'])
temperature_summary.show()

+-------+-------------------+
|summary|temperature_surface|
+-------+-------------------+
|  count|                100|
|   mean|  284.9017663574219|
| stddev| 13.002025568205239|
|    min|          247.49802|
|    max|          306.49802|
+-------+-------------------+



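* We can observe that the highest temperature in the sample dataset is **306.5**. The magnitudes are consistent with Kelvin rather than degrees Fahrenheit (306.5 K is roughly 33 °C / 92 °F), though the unit should be confirmed against the dataset documentation.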
* That happened on **March 15th, 2015**.
* At first glance the result looks like **truly an anomaly**, since there is no way the temperature would be that high if the values were in degrees Fahrenheit. But it's even weirder that the mean of temperature_surface in the sample dataset is 284.9, which would be just as impossible on that scale! So, relative to the rest of the data, the 306.5 is **not anomalous** at all (the values are presumably in Kelvin).
* I divided the Unix timestamp by 1000 (dropping its three trailing zeros) because it was stored in milliseconds, and Python's datetime conversion expects seconds.

# 2. Entire Dataset

In [19]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:13030/nam/*')
print(f'Loaded {df.count()} rows of data')

Loaded 323759744 rows of data
CPU times: user 65.4 ms, sys: 29.7 ms, total: 95.1 ms
Wall time: 5min 40s


In [21]:
%%time

# Repeat the approach above
max_row = df.agg({'temperature_surface': 'max'}).collect()[0]
hotest = max_row['max(temperature_surface)']
print(f'Highest temperature = {hotest}')

df.createOrReplaceTempView("NAM")
hotest_time = spark.sql(f'SELECT Timestamp FROM NAM WHERE temperature_surface = {hotest}').collect()
print(f'Hostest day count = {len(hotest_time)}')

unix_timestamp = hotest_time[0]['Timestamp'] / 1000
print(f'Unix Timestamp = {unix_timestamp}')

hotest_date = datetime.utcfromtimestamp(unix_timestamp).strftime('%Y-%m-%d %H:%M:%S')
print(f'Hostest date = {hotest_date}')

df.describe(['temperature_surface']).show()

Highest temperature = 331.390625
Hostest day count = 1
Unix Timestamp = 1440266400.0
Hostest date = 2015-08-22 18:00:00
+-------+-------------------+
|summary|temperature_surface|
+-------+-------------------+
|  count|          323759744|
|   mean|  287.8572105962259|
| stddev| 13.716834080260094|
|    min|          218.99284|
|    max|          331.39062|
+-------+-------------------+

CPU times: user 161 ms, sys: 106 ms, total: 267 ms
Wall time: 17min 36s


* The highest temperature of the entire dataset is **331.4**.
* The hottest date is **August 22nd, 2015**.
* In my opinion, that value is **an anomaly** in absolute terms, but I guess it is not one relative to the rest of the dataset.