# 1. Sample Dataset 2015-12

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import geohash2 as gh
import gmaps
import gmaps.datasets
import os

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime

# Read the Google Maps API key from the GOOGLE_API_KEY environment variable
# instead of hard-coding it in the notebook. os.environ[...] raises KeyError
# if the variable is not set.
gmaps.configure(api_key=os.environ["GOOGLE_API_KEY"])

## Create schema

In [2]:
# Build the Spark schema from ../features.txt, reading one column name per
# line: line 0 becomes a long (timestamp), line 1 a string (geohash), and every
# remaining line a float feature. All fields are declared nullable.
feats = []
# The context manager closes the file handle once the loop finishes (or if it
# raises); the previous bare open() call never closed it.
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            feats.append(StructField(line.strip(), LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(line.strip(), StringType(), True))
        else:
            # Other features
            feats.append(StructField(line.strip(), FloatType(), True))

schema = StructType(feats)

print(schema)

StructType(List(StructField(Timestamp,LongType,true),StructField(Geohash,StringType,true),StructField(geopotential_height_lltw,FloatType,true),StructField(water_equiv_of_accum_snow_depth_surface,FloatType,true),StructField(drag_coefficient_surface,FloatType,true),StructField(sensible_heat_net_flux_surface,FloatType,true),StructField(categorical_ice_pellets_yes1_no0_surface,FloatType,true),StructField(visibility_surface,FloatType,true),StructField(number_of_soil_layers_in_root_zone_surface,FloatType,true),StructField(categorical_freezing_rain_yes1_no0_surface,FloatType,true),StructField(pressure_reduced_to_msl_msl,FloatType,true),StructField(upward_short_wave_rad_flux_surface,FloatType,true),StructField(relative_humidity_zerodegc_isotherm,FloatType,true),StructField(categorical_snow_yes1_no0_surface,FloatType,true),StructField(u-component_of_wind_tropopause,FloatType,true),StructField(surface_wind_gust_surface,FloatType,true),StructField(total_cloud_cover_entire_atmosphere,FloatType,tru

## Load sample data

In [3]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:13030/nam_tiny.tdv')
df.take(1)

CPU times: user 12.7 ms, sys: 5.24 ms, total: 17.9 ms
Wall time: 11.3 s


## EDA about "snow"
* Fields whose names include the keyword "snow": water_equiv_of_accum_snow_depth_surface, categorical_snow_yes1_no0_surface, snow_cover_surface, snow_depth_surface

In [4]:
# Show summary statistics for the four snow-related columns.
df.describe(
    'water_equiv_of_accum_snow_depth_surface',
    'categorical_snow_yes1_no0_surface',
    'snow_cover_surface',
    'snow_depth_surface',
).show()

+-------+---------------------------------------+---------------------------------+------------------+-------------------+
|summary|water_equiv_of_accum_snow_depth_surface|categorical_snow_yes1_no0_surface|snow_cover_surface| snow_depth_surface|
+-------+---------------------------------------+---------------------------------+------------------+-------------------+
|  count|                                    100|                              100|               100|                100|
|   mean|                                  29.28|                             0.06|              25.0|0.09450799790618475|
| stddev|                      137.3906970517423|              0.23868325657594203|43.519413988924455| 0.2866969536458589|
|    min|                                    0.0|                              0.0|               0.0|                0.0|
|    max|                                 1321.0|                              1.0|             100.0|              2.128|
+-------+-------

* categorical_snow_yes1_no0_surface is the field that determines whether it is snowy or not, since the other fields could be the result of remaining snow.

In [5]:
# Find the geohashes that never report "no snow": a location is kept unless it
# appears in the set of geohashes having at least one row with
# categorical_snow_yes1_no0_surface = 0.
df.createOrReplaceTempView("TINY_NAM")
snowy_location = spark.sql(
    # DISTINCT yields one row per location. Without it the outer SELECT returns
    # one row per matching reading, so the count printed below would count
    # readings rather than locations.
    # NOTE(review): NOT IN matches nothing if the subquery ever yields a NULL
    # Geohash -- assumes Geohash is never NULL in this dataset; confirm.
    '''SELECT DISTINCT Geohash FROM TINY_NAM WHERE Geohash NOT IN (
    SELECT Geohash FROM TINY_NAM WHERE categorical_snow_yes1_no0_surface = 0 GROUP BY Geohash
    )'''
).collect()
print(f'Sonwy all year location count = {len(snowy_location)}')

Sonwy all year location count = 6


In [6]:
# Unwrap each collected row into its plain 'Geohash' value.
snowy_location = list(map(lambda row: row['Geohash'], snowy_location))
snowy_location

['f2w29r4werxb',
 'fccz22w4fytb',
 'c1nuq5290jup',
 'f2d5v1jeyp7z',
 'c6s64488ws80',
 'f2fh6jpdgv5b']

* Here are all locations that were snowy all year.
* I found some useful websites for checking a geohash:
    * [http://geohash.gofreerange.com](http://geohash.gofreerange.com)
    * [http://www.movable-type.co.uk/scripts/geohash.html](http://www.movable-type.co.uk/scripts/geohash.html)
* The Geohash 'c1nuq5290jup' is the closest one to San Francisco; it is located in Comox-Strathcona J, British Columbia, Canada. Its latitude is 51.39012686 and its longitude is -125.24344845.
* If the Geohash starts with 'f', it is located on the east coast.
* Geohash 'c6s' is more northern than 'c1n'.

In [7]:
# Decode geohash into lat and long
lat_long_list = [gh.decode(row) for row in snowy_location]
lat_long_list = [(float(row[0]), float(row[1])) for row in lat_long_list]
lat_long_list

[(47.939999, -69.902423),
 (56.121418, -53.769711),
 (51.390127, -125.243448),
 (48.477857, -75.709958),
 (59.427905, -117.684581),
 (49.993785, -75.839144)]

In [15]:
# Draw the decoded locations as red symbols on a Google map; the geohash
# strings are passed as the hover text for the symbols.
fig = gmaps.figure()
snowy_layer = gmaps.symbol_layer(
    lat_long_list, fill_color='red', stroke_color='red', hover_text=snowy_location
)
fig.add_layer(snowy_layer)
fig

Figure(layout=FigureLayout(height='420px'))

![Figure](img/map_tiny.png)

## S dataset

In [3]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:13030/nam_s/*')
df.show(3)

+-------------+------------+------------------------+---------------------------------------+------------------------+------------------------------+----------------------------------------+------------------+------------------------------------------+------------------------------------------+---------------------------+----------------------------------+-----------------------------------+---------------------------------+------------------------------+-------------------------+-----------------------------------+---------------------------------+-----------------------------+---------------------------------+-------------------------+--------------+-----------------+------------------------------+-----------------------------+----------------+------------------------------------------------+---------------------+------------------+---------------------------------+-----------------------------------+---------------------------------------+------------------------------+-------------

In [4]:
%%time

df.createOrReplaceTempView('NAM')
non_snowy_location = spark.sql(
    'SELECT Geohash FROM NAM WHERE categorical_snow_yes1_no0_surface = 0 GROUP BY Geohash'
)
non_snowy_location.show(10)

+------------+
|     Geohash|
+------------+
|c41kxf2pgq00|
|9mf6wz2s507z|
|dmejh7g1xshp|
|dptu4jbqrvgz|
|c2xm7h3qt5rz|
|9q218mtp4500|
|dhp2heqfhvs0|
|dxwr6qw3q1fp|
|f8w3y0dukx6p|
|9tny7zv632rb|
+------------+
only showing top 10 rows

CPU times: user 17.6 ms, sys: 3.36 ms, total: 20.9 ms
Wall time: 2min 55s


In [None]:
%%time

non_snowy_location.createOrReplaceTempView('NON_SNOWY')
snowy_location = spark.sql(
    'SELECT Geohash FROM NAM WHERE Geohash NOT IN (SELECT Geohash FROM NON_SNOWY)'
)
snowy_location.show(10)