In [1]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime
from pyspark.sql import functions as F

In [27]:
# HDFS URI prefix (scheme, host, port); concatenated in front of data_path
# when the dataset is loaded.
hdfs_port = 'hdfs://orion11:26990'

# Glob pattern of the input files to load. Other patterns that have been used:
#   "/nam_s/*"
#   "/sample/nam_tiny*"
data_path = '/nam_s/nam_201501_s*'

In [28]:
# Build the Spark schema from ../features.txt, reading one column name per line.
# Line 0 is typed as a long (timestamp), line 1 as a string (geohash), and
# every remaining line as a nullable float feature.
feats = []
# The context manager closes the file handle as soon as the names are read,
# even if building a field raises part-way through.
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            feats.append(StructField(line.strip(), LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(line.strip(), StringType(), True))
        else:
            # Other features
            feats.append(StructField(line.strip(), FloatType(), True))

schema = StructType(feats)

In [29]:
%%time

# Define the tab-separated CSV read over the configured HDFS glob, using the
# explicit schema built above rather than schema inference.
df = (
    spark.read
    .format('csv')
    .option('sep', '\t')
    .schema(schema)
    .load(f'{hdfs_port}{data_path}')
)
# Optional sanity checks (uncomment to inspect a row):
# df.take(1)
# print(df.head())

CPU times: user 7.68 ms, sys: 1.31 ms, total: 8.99 ms
Wall time: 54 ms


In [30]:
# Geohash prefix used by the SQL LIKE filter to select rows.
hash_prefix = 'c6s64'

In [37]:
%%time

# `dg` is an alias of the loaded DataFrame; register it as the temp view
# "nam_small" so it can be queried with Spark SQL.
dg = df
dg.createOrReplaceTempView("nam_small")

# Aggregate query: mean/min/max surface temperature and mean rain flag over
# all rows whose Geohash starts with `hash_prefix`. Built from adjacent string
# literals; the resulting text begins and ends with a newline.
query_str = (
    '\n'
    'SELECT AVG(temperature_surface) AS tmp_avg,\n'
    '    MIN(temperature_surface) AS tmp_min,\n'
    '    MAX(temperature_surface) AS tmp_max,\n'
    '    AVG(categorical_rain_yes1_no0_surface) AS rain_avg\n'
    'FROM nam_small\n'
    f'WHERE Geohash LIKE "{hash_prefix}%"\n'
)

# Echo the final SQL so the executed query is visible in the notebook output.
print(query_str)


SELECT AVG(temperature_surface) AS tmp_avg,
    MIN(temperature_surface) AS tmp_min,
    MAX(temperature_surface) AS tmp_max,
    AVG(categorical_rain_yes1_no0_surface) AS rain_avg
FROM nam_small
WHERE Geohash LIKE "c6s64%"

CPU times: user 1.15 ms, sys: 11 µs, total: 1.16 ms
Wall time: 4.87 ms


In [38]:
%%time

# Execute the aggregate query and bring the result rows back to the driver.
# NOTE(review): despite its name, `visibilities` holds the temperature/rain
# aggregates selected by query_str; the name is kept because later cells use it.
visibilities = (
    spark
    .sql(query_str)
    .collect()
)

CPU times: user 12.1 ms, sys: 1.55 ms, total: 13.6 ms
Wall time: 1min 25s


In [39]:
# Display the collected result: a list with one Row holding tmp_avg, tmp_min,
# tmp_max and rain_avg for the selected geohash prefix.
visibilities

[Row(tmp_avg=254.01363597196692, tmp_min=237.46861267089844, tmp_max=272.9434509277344, rain_avg=0.0)]