In [8]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime
from pyspark.sql import functions as F

In [9]:
# HDFS URI prefix (scheme, host and port -- not just a port, despite the name).
# It is concatenated with data_path to form the location given to the reader.
hdfs_port = "hdfs://orion11:26990"
# Other dataset globs, currently disabled:
# data_path = "/nam_s/nam_201501_s*"
# data_path = "/nam_s/*"
# Active glob of input files, relative to the HDFS prefix above.
data_path = "/sample/nam_tiny*"

In [10]:
# Build the Spark schema from ../features.txt, which lists one column name per
# line: line 0 is typed as a long (timestamp), line 1 as a string (geohash) and
# every remaining line as a float feature. All fields are declared nullable.
feats = []
# The context manager guarantees the file handle is closed once the field list
# has been built, even if a line fails to parse.
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        name = line.strip()
        if line_num == 0:
            # Timestamp
            feats.append(StructField(name, LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(name, StringType(), True))
        else:
            # Other features
            feats.append(StructField(name, FloatType(), True))

schema = StructType(feats)

In [11]:
%%time

# Read the tab-separated files as CSV, applying the explicit schema instead of
# letting Spark infer column names and types.
nam_location = f'{hdfs_port}{data_path}'
df = (
    spark.read
    .format('csv')
    .option('sep', '\t')
    .schema(schema)
    .load(nam_location)
)

CPU times: user 1.38 ms, sys: 1.4 ms, total: 2.77 ms
Wall time: 111 ms


In [51]:
# Geohash prefix interpolated into the SQL `LIKE "<prefix>%"` filter of the
# visibility query, so only rows whose Geohash starts with it are aggregated.
hash_prefix = "c6s"

In [53]:
%%time

# Expose the loaded frame to Spark SQL under the name used in the query.
dg = df
dg.createOrReplaceTempView("nam_small")

# Per 5-character geohash cell (restricted to the configured prefix and to rows
# with visibility_surface < 24221), compute:
#   vis_surf_avg - mean surface visibility
#   foggy_days   - number of rows whose cloud-base height is at or below the
#                  surface height
#   counts       - number of rows in the cell
#   foggy_ratio  - foggy_days / counts
# Rows come back ordered by foggy_ratio ascending, then vis_surf_avg descending.
visibility_query = f'''
    SELECT 
        *, 
        foggy_days/counts as foggy_ratio 
        FROM(
            SELECT substr(Geohash,1,5) as geoloc,
                avg(visibility_surface) as vis_surf_avg,
                sum(case when geopotential_height_cloud_base <= geopotential_height_surface then 1 else 0 end) as foggy_days,
                sum(1) as counts
            FROM nam_small 
            WHERE Geohash LIKE "{hash_prefix}%"
            AND visibility_surface < 24221
            GROUP BY substr(Geohash,1,5)) as t2
    ORDER BY foggy_ratio ASC, vis_surf_avg DESC, foggy_days ASC, counts ASC
    '''

# collect() materialises the aggregated result on the driver as a list of Rows.
visibilities = spark.sql(visibility_query).collect()

CPU times: user 4.67 ms, sys: 1.46 ms, total: 6.13 ms
Wall time: 1.86 s


In [54]:
# Show the collected per-geohash rows (a list of Row objects) as the cell output.
visibilities

[Row(geoloc='c6s64', vis_surf_avg=2221.5888671875, foggy_days=0, counts=1, foggy_ratio=0.0)]