In [1]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime

In [2]:
# Build the Spark schema from ../features.txt, which lists one column name per
# line in column order.
feats = []
# Use a context manager so the file handle is always closed (the handle was
# previously left open for the lifetime of the kernel).
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            col_type = LongType()
        elif line_num == 1:
            # Geohash
            col_type = StringType()
        else:
            # Other features
            col_type = FloatType()
        # Third argument marks the column as nullable.
        feats.append(StructField(line.strip(), col_type, True))

schema = StructType(feats)

print(schema)

StructType(List(StructField(Timestamp,LongType,true),StructField(Geohash,StringType,true),StructField(geopotential_height_lltw,FloatType,true),StructField(water_equiv_of_accum_snow_depth_surface,FloatType,true),StructField(drag_coefficient_surface,FloatType,true),StructField(sensible_heat_net_flux_surface,FloatType,true),StructField(categorical_ice_pellets_yes1_no0_surface,FloatType,true),StructField(visibility_surface,FloatType,true),StructField(number_of_soil_layers_in_root_zone_surface,FloatType,true),StructField(categorical_freezing_rain_yes1_no0_surface,FloatType,true),StructField(pressure_reduced_to_msl_msl,FloatType,true),StructField(upward_short_wave_rad_flux_surface,FloatType,true),StructField(relative_humidity_zerodegc_isotherm,FloatType,true),StructField(categorical_snow_yes1_no0_surface,FloatType,true),StructField(u-component_of_wind_tropopause,FloatType,true),StructField(surface_wind_gust_surface,FloatType,true),StructField(total_cloud_cover_entire_atmosphere,FloatType,tru

In [3]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:30999/nam/data/nam_s')
df.take(1)

CPU times: user 67.1 ms, sys: 19.8 ms, total: 86.9 ms
Wall time: 3min 45s


In [5]:
# Summary statistics (count, mean, stddev, min, max) for the lightning column.
df.describe('lightning_surface').show()

+-------+-------------------+
|summary|  lightning_surface|
+-------+-------------------+
|  count|          108000000|
|   mean|0.03493687962962963|
| stddev| 0.1836199727250884|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [6]:
# Query parameters, previously hard-coded literals inside the SQL text.
GEOHASH_PREFIX_LEN = 4    # leading Geohash characters that define a location
MIN_LIGHTNING_AVG = 0.25  # keep locations whose average lightning exceeds this
TOP_N = 4                 # maximum number of locations returned

# Expose the DataFrame to Spark SQL under the name used in the query below.
df.createOrReplaceTempView("nam_small")

# Average lightning_surface per Geohash prefix, keep prefixes above the
# threshold, and return the TOP_N highest averages in descending order.
lighting_location = spark.sql(
    f'''SELECT * FROM (
            SELECT substr(Geohash, 1, {GEOHASH_PREFIX_LEN}) AS geoloc,
                   avg(lightning_surface) AS lightavg
            FROM nam_small
            GROUP BY substr(Geohash, 1, {GEOHASH_PREFIX_LEN})
            HAVING avg(lightning_surface) > {MIN_LIGHTNING_AVG}
        ) AS t2
        ORDER BY lightavg DESC
        LIMIT {TOP_N}
    ''').collect()

# The count printed here can never exceed TOP_N because of the LIMIT clause.
print(f'Locations to get hit by lightning = {len(lighting_location)}')

Locations to get hit by lightning = 4


In [7]:
# Print each collected Row (geohash prefix and its average lightning value).
for row in lighting_location:
    print(row)

Row(geoloc='9g3v', lightavg=0.3184)
Row(geoloc='9g3h', lightavg=0.29589905362776026)
Row(geoloc='9g3m', lightavg=0.29389942291838417)
Row(geoloc='9err', lightavg=0.275338530980714)
