In [16]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime
from pyspark.sql import functions as F

In [17]:
# HDFS URI prefix (scheme, host and port); concatenated with data_path when the
# dataset is loaded.
hdfs_port = "hdfs://orion11:26990"
# Glob pattern, appended to hdfs_port, selecting the input files to read.
# NOTE(review): the path suggests NAM data for 2015-01 — confirm.
data_path = "/nam_s/nam_201501_s*"
# Alternative input (presumably a small sample for quick runs — confirm);
# uncomment to use it instead of the path above.
# data_path = "/sample/nam_tiny*"

In [18]:
# Build the Spark schema from ../features.txt, which is read as one column
# name per line, in column order:
#   line 0      -> timestamp, stored as LongType
#   line 1      -> geohash, stored as StringType
#   every other -> FloatType feature
# All fields are declared nullable.
feats = []
# The context manager closes the file even if a line fails to parse
# (the previous bare open() never closed it).
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            col_type = LongType()
        elif line_num == 1:
            # Geohash
            col_type = StringType()
        else:
            # Other features
            col_type = FloatType()
        feats.append(StructField(line.strip(), col_type, True))

schema = StructType(feats)

In [19]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load(f'{hdfs_port}{data_path}')
# df.take(1)
# print(df.head())

CPU times: user 1.79 ms, sys: 73 µs, total: 1.87 ms
Wall time: 25.8 ms


In [20]:
# Disabled summary-statistics check:
# df.describe(['visibility_surface']).show()

# 3-character geohash prefixes used to pick out the Bay Area rows.
bay_area_hashes = {
    "9q8",
    "9q9",
    "9qb",
    "9qc",
}

In [27]:
%%time

# t2 = df.select("Geohash", "visibility_surface")
# t2.describe()



dg = df.select("Geohash", "visibility_surface", df.Geohash.substr(1, 3).alias("front_hash"))

dg = dg[dg.front_hash.isin("9q8", "9q9", "9qc", "9qb")]

CPU times: user 15.7 ms, sys: 6.35 ms, total: 22.1 ms
Wall time: 1min 38s


In [28]:
# Expose the filtered DataFrame to Spark SQL under the name "nam_small".
dg.createOrReplaceTempView("nam_small")

# Average visibility_surface per full geohash, highest averages first,
# limited to the top 10. Plain string: the query has no placeholders.
top_visibility_query = '''SELECT * FROM(SELECT Geohash, avg(visibility_surface) as vis_surf_avg
        FROM nam_small 
        GROUP BY Geohash) as t2
    ORDER BY vis_surf_avg DESC
    Limit 10
    '''

# collect() brings the (at most 10) result rows back to the driver as a list.
visibilities = spark.sql(top_visibility_query).collect()

In [29]:
# Show the collected rows: (Geohash, vis_surf_avg) pairs, highest average first.
visibilities

[Row(Geohash='9q8dns19g1eb', vis_surf_avg=24225.777402935608),
 Row(Geohash='9q8xpsdk0m5b', vis_surf_avg=24225.681692023027),
 Row(Geohash='9q87p661n4gz', vis_surf_avg=24225.650301846592),
 Row(Geohash='9q93zsym9gs0', vis_surf_avg=24225.643136160714),
 Row(Geohash='9qcy198s8zkp', vis_surf_avg=24225.55669642857),
 Row(Geohash='9q97ne0ndks0', vis_surf_avg=24225.554575892857),
 Row(Geohash='9qb05tdmp5h0', vis_surf_avg=24225.536458333332),
 Row(Geohash='9q8g2q1xkx2p', vis_surf_avg=24225.533336292614),
 Row(Geohash='9q8930yqggs0', vis_surf_avg=24225.522690716913),
 Row(Geohash='9qcyr9h1t07z', vis_surf_avg=24225.517578125)]