In [1]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime
from pyspark.sql import functions as F

In [3]:
# Glob of the input files, appended to `hdfs_port` when the data is loaded.
# Other dataset globs, kept for switching inputs:
#   data_path = "/nam_s/nam_201501_s*"
#   data_path = "/nam_s/*"
data_path = "/sample/nam_tiny*"

# Despite the name, this holds the full HDFS URI prefix (scheme, host and
# port), not just a port number.
hdfs_port = "hdfs://orion11:26990"

In [4]:
# Build the Spark schema from ../features.txt, which supplies one column name
# per line (each stripped line becomes a StructField name, in file order).
feats = []
# `with` guarantees the file handle is closed once the schema has been read,
# instead of leaving it open for the lifetime of the kernel.
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            field_type = LongType()
        elif line_num == 1:
            # Geohash
            field_type = StringType()
        else:
            # Other features
            field_type = FloatType()
        # Every column is declared nullable.
        feats.append(StructField(line.strip(), field_type, True))

schema = StructType(feats)

In [5]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load(f'{hdfs_port}{data_path}')
# df.take(1)
# print(df.head())

CPU times: user 2.3 ms, sys: 2.57 ms, total: 4.87 ms
Wall time: 11.5 s


In [6]:
# Three-character geohash prefixes used to restrict the data set.
# presumably the cells covering the San Francisco Bay Area (going by the
# variable name) — confirm
bay_area_hashes = set(("9q8", "9q9", "9qb", "9qc"))

In [7]:
%%time

# t2 = df.select("Geohash", "visibility_surface")
# t2.describe()

# dg = df

dg = df.select("Geohash", "visibility_surface", df.Geohash.substr(1, 3).alias("front_hash"), "geopotential_height_cloud_base", "geopotential_height_surface")

dg = dg[dg.front_hash.isin(*bay_area_hashes)]

CPU times: user 5.11 ms, sys: 2.21 ms, total: 7.33 ms
Wall time: 1.09 s


In [None]:
%%time

dg.createOrReplaceTempView("nam_small")
visibilities = spark.sql(
    f'''
    SELECT 
        *, 
        foggy_days/counts as foggy_ratio 
        FROM(
            SELECT substr(Geohash,1,5) as geoloc,
                avg(visibility_surface) as vis_surf_avg,
                sum(case when geopotential_height_cloud_base <= geopotential_height_surface then 1 else 0 end) as foggy_days,
                sum(1) as counts
            FROM nam_small 
            GROUP BY substr(Geohash,1,5)) as t2
    ORDER BY foggy_ratio ASC, vis_surf_avg DESC, foggy_days ASC, counts ASC
    ''').collect()