In [1]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime

In [None]:
# Build the Spark schema from ../features.txt, which lists one column name per
# line in column order: line 0 is the timestamp, line 1 the geohash, and every
# remaining line a float-valued feature.
feats = []
# Open via a context manager so the file handle is closed as soon as the
# column names have been read (the original left it open).
with open('../features.txt') as f:
    for line_num, line in enumerate(f):
        if line_num == 0:
            # Timestamp
            feats.append(StructField(line.strip(), LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(line.strip(), StringType(), True))
        else:
            # Other features
            feats.append(StructField(line.strip(), FloatType(), True))
schema = StructType(feats)

In [3]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:30999/nam/data/nam_s')
df.take(1)

CPU times: user 53.7 ms, sys: 12.6 ms, total: 66.2 ms
Wall time: 2min 35s


### Solar Features
total_cloud_cover_entire_atmosphere<br>
visibility_surface<br>
categorical_rain_yes1_no0_surface<br>

We want to find locations that have low cloud coverage and high surface visibility (I am not 100% sure what surface visibility measures, but I will assume that higher visibility allows more sunlight through). Of course, the best places for solar farms would also be places with low rainfall, since we want as few clouds as possible.

### Wind Features
surface_wind_gust_surface<br>
v-component_of_wind_maximum_wind<br>
pressure_surface<br>
pressure_maximum_wind<br>




## Solar Features Description

In [8]:
# Summary statistics (count, mean, stddev, min, max) for the solar-related columns.
# NOTE(review): the recorded minimum for total_cloud_cover_entire_atmosphere is
# about -1e20, which looks like a missing-value sentinel rather than a real
# reading - confirm before trusting the mean/stddev of that column.
solar_columns = [
    'total_cloud_cover_entire_atmosphere',
    'visibility_surface',
    'categorical_rain_yes1_no0_surface',
]
solar_summary = df.describe(solar_columns)
solar_summary.show()

+-------+-----------------------------------+------------------+---------------------------------+
|summary|total_cloud_cover_entire_atmosphere|visibility_surface|categorical_rain_yes1_no0_surface|
+-------+-----------------------------------+------------------+---------------------------------+
|  count|                          108000000|         108000000|                        108000000|
|   mean|               -2.42109131147400...|21717.428957248074|              0.08852023148148148|
| stddev|               4.914506718751981...| 6141.102072809382|              0.28404999709034684|
|    min|                      -1.0000036E20|         17.292929|                              0.0|
|    max|                              100.0|         24307.436|                              1.0|
+-------+-----------------------------------+------------------+---------------------------------+



## Wind Features Description

In [7]:
# Summary statistics (count, mean, stddev, min, max) for the wind-related columns.
wind_columns = [
    'surface_wind_gust_surface',
    'v-component_of_wind_maximum_wind',
    'pressure_surface',
    'pressure_maximum_wind',
]
wind_summary = df.describe(wind_columns)
wind_summary.show()

+-------+-------------------------+--------------------------------+-----------------+---------------------+
|summary|surface_wind_gust_surface|v-component_of_wind_maximum_wind| pressure_surface|pressure_maximum_wind|
+-------+-------------------------+--------------------------------+-----------------+---------------------+
|  count|                108000000|                       108000000|        108000000|            108000000|
|   mean|        6.847670909482179|             -1.1452036799420957|97924.25023205555|   22274.680711306133|
| stddev|        4.462058868522337|              18.983894779918007|6041.825119192631|    7190.963967770848|
|    min|             7.1111065E-4|                        -92.0031|          63860.0|            11120.641|
|    max|                 55.79795|                       106.34735|         105154.0|            50128.137|
+-------+-------------------------+--------------------------------+-----------------+---------------------+



In [21]:
%%time
df.createOrReplaceTempView("nam_small")
solar_location = spark.sql(
    f'''SELECT * FROM(SELECT Geohash, avg(total_cloud_cover_entire_atmosphere) as avg_cloud, avg(visibility_surface) as avg_vis, avg(categorical_rain_yes1_no0_surface) as avg_rain
        FROM nam_small 
        GROUP BY Geohash 
        Having avg_cloud < -.2 and avg_vis > 21700 and avg_rain < .08) as t2
    ORDER BY avg_cloud, avg_vis DESC, avg_rain
    Limit 3
    ''').collect()

for row in solar_location:
    print (row)

Row(Geohash='dh6zwfesgggz', avg_cloud=-8.333360621627462e+17, avg_vis=23788.02738308377, avg_rain=0.03888888888888889)
Row(Geohash='9h7zm3u5enrb', avg_cloud=-8.310276520182511e+17, avg_vis=24178.492847558864, avg_rain=0.019390581717451522)
Row(Geohash='8uxzm5c75j80', avg_cloud=-8.287319955209632e+17, avg_vis=23920.80753355922, avg_rain=0.0718232044198895)
CPU times: user 24.7 ms, sys: 15.5 ms, total: 40.2 ms
Wall time: 2min 29s


In [20]:
# Re-register the view here so this cell does not depend on another cell having
# been run first; createOrReplaceTempView simply replaces an existing view.
df.createOrReplaceTempView("nam_small")

# Rank geohashes for wind suitability. Keep only locations whose average gust,
# surface pressure and pressure at the maximum-wind level exceed the thresholds
# below, then take the top three.
# NOTE(review): the thresholds appear to be roughly the dataset-wide means from
# the describe() summary - confirm that is the intended cut-off.
wind_location = spark.sql(
    '''SELECT * FROM(SELECT Geohash, avg(surface_wind_gust_surface) as avg_gust, avg(pressure_surface) as avg_pressure, avg(pressure_maximum_wind) as avg_max_pressure
        FROM nam_small 
        GROUP BY Geohash 
        Having avg_gust > 6.8 and avg_pressure > 98000 and avg_max_pressure > 22000) as t2
    ORDER BY avg_max_pressure DESC, avg_gust DESC, avg_pressure DESC
    Limit 3
    ''').collect()


for row in wind_location:
    print(row)

Row(Geohash='b9k9ftmc7npb', avg_gust=11.904280211305277, avg_pressure=100729.32696897375, avg_max_pressure=28154.51828432652)
Row(Geohash='b9pp0nkjhepb', avg_gust=12.28862724831139, avg_pressure=100892.28743961353, avg_max_pressure=28060.23099949049)
Row(Geohash='b9kcfeq0zv00', avg_gust=12.433918561014679, avg_pressure=100643.69565217392, avg_max_pressure=27887.859090273338)
Row(Geohash='b9huphe79ubp', avg_gust=12.240076919446638, avg_pressure=100970.36432160804, avg_max_pressure=27881.79797915358)
