In [14]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime
from pyspark.sql import functions as F

In [46]:
# Glob of input files to read, relative to the HDFS root below.
# Alternatives that were used while developing:
#   "/nam_s/nam_201501_s*"
#   "/sample/nam_tiny*"
data_path = "/nam_s/*"

# NOTE: despite the name, this holds the full HDFS URI prefix (scheme, host
# and port), not just a port number. It is prepended to `data_path` on load.
hdfs_port = "hdfs://orion11:26990"

In [47]:
# Build the Spark schema from ../features.txt, which lists one column name
# per line. The first two columns have fixed types; every remaining column
# is read as a nullable float.
leading_types = {
    0: LongType,    # Timestamp
    1: StringType,  # Geohash
}

feats = []
# Use a context manager so the file handle is closed once the schema is built
# (the previous bare open() left it open for the life of the kernel).
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        col_type = leading_types.get(line_num, FloatType)
        feats.append(StructField(line.strip(), col_type(), True))

schema = StructType(feats)

In [48]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load(f'{hdfs_port}{data_path}')
# df.take(1)
# print(df.head())

CPU times: user 3.36 ms, sys: 0 ns, total: 3.36 ms
Wall time: 143 ms


In [49]:
# Three-character geohash prefixes used below to restrict the dataset
# (presumably the cells covering the San Francisco Bay Area -- confirm).
# Optional sanity check on the raw data:
#   df.describe(['visibility_surface']).show()
bay_area_hashes = {
    "9q8",
    "9q9",
    "9qc",
    "9qb",
}

In [50]:
%%time

# t2 = df.select("Geohash", "visibility_surface")
# t2.describe()

# dg = df

dg = df.select("Geohash", "visibility_surface", df.Geohash.substr(1, 3).alias("front_hash"), "geopotential_height_cloud_base", "geopotential_height_surface")

dg = dg[dg.front_hash.isin(*bay_area_hashes)]

CPU times: user 8.59 ms, sys: 3.04 ms, total: 11.6 ms
Wall time: 51.3 ms


In [51]:
%%time

# Expose the filtered DataFrame to Spark SQL under the name `nam_small`.
dg.createOrReplaceTempView("nam_small")

# Aggregate per 5-character geohash prefix (`geoloc`):
#   vis_surf_avg - mean of visibility_surface
#   foggy_days   - number of rows where the cloud-base height is at or below
#                  the surface height (NOTE(review): this counts rows, so it
#                  equals days only if there is one row per day -- confirm)
#   counts       - total number of rows in the group
#   foggy_ratio  - foggy_days / counts
# Results are sorted by foggy_ratio ascending (least foggy first), with the
# remaining ORDER BY columns acting as tie-breakers, then collected into a
# Python list of Row objects on the driver.
# NOTE(review): the f-string prefix is unused; the query has no placeholders.
visibilities = spark.sql(
    f'''
    SELECT 
        *, 
        foggy_days/counts as foggy_ratio 
        FROM(
            SELECT substr(Geohash,1,5) as geoloc,
                avg(visibility_surface) as vis_surf_avg,
                sum(case when geopotential_height_cloud_base <= geopotential_height_surface then 1 else 0 end) as foggy_days,
                sum(1) as counts
            FROM nam_small 
            GROUP BY substr(Geohash,1,5)) as t2
    ORDER BY foggy_ratio ASC, vis_surf_avg DESC, foggy_days ASC, counts ASC
    ''').collect()

CPU times: user 20.7 ms, sys: 6.42 ms, total: 27.1 ms
Wall time: 3min 2s


In [52]:
# Display every collected Row, in the query's order (lowest foggy_ratio first).
visibilities

[Row(geoloc='9q80j', vis_surf_avg=23615.99707769456, foggy_days=131, counts=401, foggy_ratio=0.3266832917705736),
 Row(geoloc='9q835', vis_surf_avg=23626.317845540503, foggy_days=150, counts=438, foggy_ratio=0.3424657534246575),
 Row(geoloc='9q822', vis_surf_avg=23421.318650966736, foggy_days=142, counts=410, foggy_ratio=0.3463414634146341),
 Row(geoloc='9q80v', vis_surf_avg=23400.30476604206, foggy_days=141, counts=403, foggy_ratio=0.34987593052109184),
 Row(geoloc='9q804', vis_surf_avg=23147.953163789898, foggy_days=157, counts=445, foggy_ratio=0.35280898876404493),
 Row(geoloc='9q811', vis_surf_avg=23395.94564152742, foggy_days=139, counts=390, foggy_ratio=0.3564102564102564),
 Row(geoloc='9q80d', vis_surf_avg=23592.60039320668, foggy_days=148, counts=409, foggy_ratio=0.36185819070904646),
 Row(geoloc='9q81k', vis_surf_avg=23305.059346217466, foggy_days=151, counts=414, foggy_ratio=0.3647342995169082),
 Row(geoloc='9q889', vis_surf_avg=23510.57788722036, foggy_days=147, counts=403, 