In [2]:
import matplotlib.pyplot as plt
import numpy as np
import geohash2 as gh
import gmaps
import gmaps.datasets

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime

In [3]:
# Build the Spark schema from ../features.txt, where each line holds one column name.
# The first line is the timestamp (long), the second the geohash (string), and every
# remaining line is treated as a float feature. All fields are nullable.
feats = []
# The context manager closes the file even if a line fails to parse.
with open('../features.txt') as features_file:
    for line_num, line in enumerate(features_file):
        if line_num == 0:
            # Timestamp
            field_type = LongType()
        elif line_num == 1:
            # Geohash
            field_type = StringType()
        else:
            # Other features
            field_type = FloatType()
        feats.append(StructField(line.strip(), field_type, True))

schema = StructType(feats)


In [4]:
%%time

# Read the tab-separated dataset from HDFS using the explicit `schema`.
df = (
    spark.read
    .format('csv')
    .option('sep', '\t')
    .schema(schema)
    .load('hdfs://orion11:13030/nam_s')
)
# Fetch the first row; as the cell's last expression its value is displayed.
df.take(1)

CPU times: user 43.4 ms, sys: 11.2 ms, total: 54.6 ms
Wall time: 2min 34s


## Oddly Snowy

What we are looking for here is snow cover in places where there is otherwise very little snow. For this problem we can limit ourselves to looking for areas with a snow-covered surface whose surrounding areas are not snow-covered.

In [6]:
# Summary statistics for the snow-cover column, printed as a text table.
df.describe('snow_cover_surface').show()

+-------+------------------+
|summary|snow_cover_surface|
+-------+------------------+
|  count|         108000000|
|   mean|16.332520101851852|
| stddev| 36.90268917507707|
|    min|               0.0|
|    max|             100.0|
+-------+------------------+



## Method

We will take a broad area and find regions in which there is little snow. Regions with only a few snowy observations can be considered strangely snowy. In this case we group only by location (the geohash) and do not group by time. After we find the general area where there is a strangely snowy location, we will narrow it down to a more specific geohash in a separate query.

In [5]:
%%time
# Expose the DataFrame to Spark SQL under the name used in the query below.
df.createOrReplaceTempView("nam_small")

# Count the rows with snow cover above zero for each 4-character geohash prefix
# and keep the 10 prefixes with the smallest counts.
least_snowy_query = '''SELECT substr(Geohash,1,4) as geoloc,count(substr(Geohash,1,4)) as count
        FROM nam_small 
        WHERE snow_cover_surface > 0
        GROUP BY substr(Geohash,1,4)
        ORDER BY count 
        limit 10
    '''
snowy_location = spark.sql(least_snowy_query).collect()
print(f'Number of Locations = {len(snowy_location)}')

Number of Locations = 10
CPU times: user 43.9 ms, sys: 13.7 ms, total: 57.6 ms
Wall time: 2min 39s


These are the geohashes that contain a strangely snowy area/location.

In [6]:
# Print each collected row on its own line.
for location in snowy_location:
    print(location)

Row(geoloc='9v11', count=1)
Row(geoloc='9udw', count=1)
Row(geoloc='9qc0', count=1)
Row(geoloc='9ubn', count=1)
Row(geoloc='9sqc', count=1)
Row(geoloc='9sxh', count=1)
Row(geoloc='9sx0', count=1)
Row(geoloc='9ucq', count=1)
Row(geoloc='9ubr', count=1)
Row(geoloc='9q96', count=1)


In [7]:
# Decode the geohash in the first field of every row and convert the two
# decoded components to floats (latitude/longitude, going by the variable name).
lat_long_list = []
for row in snowy_location:
    decoded = gh.decode(row[0])
    lat_long_list.append((float(decoded[0]), float(decoded[1])))

# Draw one red symbol per decoded location on a gmaps figure.
snow_markers = gmaps.symbol_layer(lat_long_list, fill_color='red', stroke_color='red')
fig = gmaps.figure()
fig.add_layer(snow_markers)
# Last expression of the cell, so the figure widget is rendered.
fig

Figure(layout=FigureLayout(height='420px'))

This code finds the specific geohash of the strangely snowy location.


In [8]:
%%time
# (Re)register the temp view queried below.
df.createOrReplaceTempView("nam_small")

# Drill into the '9sxh' prefix and fetch the full geohash of its snowy row(s).
# The filter is `snow_cover_surface > 0`, the same predicate the prefix-level
# search uses, so a partially covered row is not missed as it would be with
# an exact match on 100.0.
snowy_location = spark.sql(
    '''SELECT Geohash, snow_cover_surface
        FROM nam_small
        WHERE substr(Geohash,1,4) = '9sxh' AND snow_cover_surface > 0
    ''').collect()
print(f'Number of Locations = {len(snowy_location)}')


Number of Locations = 1
CPU times: user 69.2 ms, sys: 25 ms, total: 94.2 ms
Wall time: 4min 40s


In [10]:
# Print the list of rows currently held in `snowy_location`.
print(snowy_location)

[Row(Geohash='9sxhw0ef52zz', snow_cover_surface=100.0)]


In [9]:
# Turn the geohash stored in the first field of each row into a pair of floats
# (latitude/longitude, going by the variable name).
lat_long_list = []
for row in snowy_location:
    decoded = gh.decode(row[0])
    lat_long_list.append((float(decoded[0]), float(decoded[1])))

# Plot the decoded point(s) as red symbols on a gmaps figure.
snow_markers = gmaps.symbol_layer(lat_long_list, fill_color='red', stroke_color='red')
fig = gmaps.figure()
fig.add_layer(snow_markers)
# Last expression of the cell, so the figure widget is rendered.
fig

Figure(layout=FigureLayout(height='420px'))

In [11]:
%%time
# (Re)register the temp view queried below.
df.createOrReplaceTempView("nam_small")

# Drill into the '9sqc' prefix and fetch the full geohash of its snowy row(s).
# The filter is `snow_cover_surface > 0`, the same predicate the prefix-level
# search uses, so a partially covered row is not missed as it would be with
# an exact match on 100.0.
snowy_location = spark.sql(
    '''SELECT Geohash, snow_cover_surface
        FROM nam_small
        WHERE substr(Geohash,1,4) = '9sqc' AND snow_cover_surface > 0
    ''').collect()
print(f'Number of Locations = {len(snowy_location)}')

Number of Locations = 1
CPU times: user 43.1 ms, sys: 14.3 ms, total: 57.4 ms
Wall time: 2min 46s
