In [1]:
import matplotlib.pyplot as plt
import numpy as np
import geohash2 as gh
import gmaps
import gmaps.datasets

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime

In [2]:
# Build the Spark schema from ../features.txt, which lists one column name per line.
# The first line is read as the timestamp column (long), the second as the geohash
# column (string); every remaining line is treated as a float feature.
feats = []
# Use a context manager so the file handle is closed even if parsing fails.
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            feats.append(StructField(line.strip(), LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(line.strip(), StringType(), True))
        else:
            # Other features
            feats.append(StructField(line.strip(), FloatType(), True))

schema = StructType(feats)

print(schema)

StructType(List(StructField(Timestamp,LongType,true),StructField(Geohash,StringType,true),StructField(geopotential_height_lltw,FloatType,true),StructField(water_equiv_of_accum_snow_depth_surface,FloatType,true),StructField(drag_coefficient_surface,FloatType,true),StructField(sensible_heat_net_flux_surface,FloatType,true),StructField(categorical_ice_pellets_yes1_no0_surface,FloatType,true),StructField(visibility_surface,FloatType,true),StructField(number_of_soil_layers_in_root_zone_surface,FloatType,true),StructField(categorical_freezing_rain_yes1_no0_surface,FloatType,true),StructField(pressure_reduced_to_msl_msl,FloatType,true),StructField(upward_short_wave_rad_flux_surface,FloatType,true),StructField(relative_humidity_zerodegc_isotherm,FloatType,true),StructField(categorical_snow_yes1_no0_surface,FloatType,true),StructField(u-component_of_wind_tropopause,FloatType,true),StructField(surface_wind_gust_surface,FloatType,true),StructField(total_cloud_cover_entire_atmosphere,FloatType,tru

In [3]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:30999/nam/data/nam_s')
df.take(1)

CPU times: user 39 ms, sys: 12.9 ms, total: 51.9 ms
Wall time: 2min 20s


In [4]:
# Summary statistics (count, mean, stddev, min, max) for the two columns
# used by the lightning analysis.
summary_columns = ['lightning_surface', 'categorical_rain_yes1_no0_surface']
df.describe(summary_columns).show()

+-------+-------------------+---------------------------------+
|summary|  lightning_surface|categorical_rain_yes1_no0_surface|
+-------+-------------------+---------------------------------+
|  count|          108000000|                        108000000|
|   mean|0.03493687962962963|              0.08852023148148148|
| stddev|0.18361997272508837|               0.2840499970903469|
|    min|                0.0|                              0.0|
|    max|                1.0|                              1.0|
+-------+-------------------+---------------------------------+



In [5]:
%%time
df.createOrReplaceTempView("nam_small")
lighting_location = spark.sql(
    f'''SELECT * FROM(SELECT substr(Geohash,1,4) as geoloc, avg(lightning_surface) as lightavg, avg(categorical_rain_yes1_no0_surface) as avg_rain
        FROM nam_small 
        GROUP BY substr(Geohash,1,4) 
        Having avg(lightning_surface) > .25) as t2
    ORDER BY lightavg DESC
    Limit 4
    ''').collect()
print(f'Locations to get hit by lightning = {len(lighting_location)}')

Locations to get hit by lightning = 4
CPU times: user 38.1 ms, sys: 14.4 ms, total: 52.5 ms
Wall time: 2min 32s


In [6]:
# Show each top-ranked geohash prefix with its average lightning and rain values.
for location_row in lighting_location:
    print(location_row)

Row(geoloc='9g3v', lightavg=0.3184, avg_rain=0.392)
Row(geoloc='9g3h', lightavg=0.29589905362776026, avg_rain=0.39621451104100947)
Row(geoloc='9g3m', lightavg=0.29389942291838417, avg_rain=0.39159109645507006)
Row(geoloc='9err', lightavg=0.275338530980714, avg_rain=0.34673779236766517)


In [10]:
# Derive the marker positions from the query result instead of hand-copied
# coordinates, so the map always matches the rows in `lighting_location`.
# gh.decode_exact returns (lat, lon, lat_err, lon_err) as floats; only the cell
# centre is needed. NOTE(review): gh.decode rounds to the precision implied by the
# hash length, which looks too coarse for 4-character prefixes - confirm.
lat_long_list = [gh.decode_exact(row['geoloc'])[:2] for row in lighting_location]

# Draw one red marker per high-lightning geohash cell.
lighting = gmaps.symbol_layer(
    lat_long_list,
    fill_color='red',
    stroke_color='red',
)
fig = gmaps.figure()
fig.add_layer(lighting)
fig


Figure(layout=FigureLayout(height='420px'))

## Analysis: Lightning

From the map above you can see that the locations where you are most likely to be struck by lightning are all in Mexico. However, a quick Google search shows that the places with the most lightning activity are in South America. This is likely because we are averaging all the lightning strikes within a 4-character geohash prefix. I expect that if we allowed more than 4 geohash characters, the results would be significantly different.


In [11]:
%%time
df.createOrReplaceTempView("nam_small")
lighting_location = spark.sql(
    f'''SELECT * FROM(SELECT Geohash, avg(lightning_surface) as lightavg, avg(categorical_rain_yes1_no0_surface) as avg_rain
        FROM nam_small 
        GROUP BY Geohash 
        Having avg(lightning_surface) > .25) as t2
    ORDER BY lightavg DESC
    Limit 4
    ''').collect()
print(f'Locations to get hit by lightning = {len(lighting_location)}')

Locations to get hit by lightning = 4
CPU times: user 82.6 ms, sys: 26 ms, total: 109 ms
Wall time: 5min 30s


In [12]:
# Show each top-ranked full geohash with its average lightning and rain values.
for location_row in lighting_location:
    print(location_row)
    

Row(Geohash='9g3h968ygj7z', lightavg=0.3969465648854962, avg_rain=0.5216284987277354)
Row(Geohash='9g3m79nrf3zb', lightavg=0.36623376623376624, avg_rain=0.4675324675324675)
Row(Geohash='9g3ug8ckk4hp', lightavg=0.3592964824120603, avg_rain=0.44221105527638194)
Row(Geohash='9g3mq3f7y6eb', lightavg=0.3559718969555035, avg_rain=0.45433255269320844)


In [14]:
# Decode the geohash in the first column of every result row and convert the
# decoded latitude/longitude pair to floats for plotting.
lat_long_list = [
    (float(lat), float(lon))
    for lat, lon in (gh.decode(row[0]) for row in lighting_location)
]
lat_long_list

[(19.087187, -99.787654),
 (19.210935, -99.329689),
 (19.121072, -98.63336),
 (19.214372, -99.214166)]

In [15]:
# Plot the decoded locations as red markers on a Google Maps figure.
fig = gmaps.figure()
lighting = gmaps.symbol_layer(lat_long_list, fill_color='red', stroke_color='red')
fig.add_layer(lighting)
fig

Figure(layout=FigureLayout(height='420px'))

After testing, allowing more specific geohashes did not change the locations, contrary to my hypothesis. This may be due to the test data itself (maybe it does not include South America) or to the sample size.