In [9]:
import matplotlib.pyplot as plt
import numpy as np
import geohash2 as gh
import gmaps
import gmaps.datasets
from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType
from datetime import datetime

In [3]:
# Build the Spark schema from ../features.txt, which lists one column name per
# line. The first name is treated as the timestamp (long), the second as the
# geohash (string), and every remaining name as a float-valued feature.
with open('../features.txt') as features_file:
    # The context manager closes the file; blank lines are skipped so they
    # cannot produce a column with an empty name.
    feature_names = [line.strip() for line in features_file if line.strip()]

feats = []
for position, name in enumerate(feature_names):
    if position == 0:
        # Timestamp
        column_type = LongType()
    elif position == 1:
        # Geohash
        column_type = StringType()
    else:
        # Other features
        column_type = FloatType()
    feats.append(StructField(name, column_type, True))

schema = StructType(feats)

In [4]:
%%time

df = spark.read.format('csv').option('sep', '\t').schema(schema).load('hdfs://orion11:30999/nam/data/nam_s')
df.take(1)

CPU times: user 43 ms, sys: 9.88 ms, total: 52.8 ms
Wall time: 2min 30s


### Solar Features
total_cloud_cover_entire_atmosphere<br>
visibility_surface<br>
categorical_rain_yes1_no0_surface<br>

We want to find locations that have low cloud coverage and high surface visibility (I am not 100% sure what surface visibility measures, but I will assume that higher visibility allows for more sunlight). Of course, the best places for solar farms would also be places with low rainfall, since we want as few clouds as possible.

### Wind Features
surface_wind_gust_surface<br>
v-component_of_wind_maximum_wind<br>
pressure_surface<br>
pressure_maximum_wind<br>

For wind features, we look at wind-related attributes and choose areas in which the average value of each attribute is higher than the overall average. It is important to have multiple attributes to measure whether or not the area is suitable for a wind farm.


### General Features

It is important for the farm to be at a location that is actually habitable. For now we will only filter for land (we don't want a solar farm in the middle of the ocean).

## Solar Features Description

In [8]:
# Summary statistics (count, mean, stddev, min, max) for the solar columns.
# NOTE(review): the recorded output shows a minimum of about -1e20 for total
# cloud cover, which looks like a missing-value fill rather than a real
# reading -- confirm before trusting that column's mean/stddev.
solar_columns = [
    'total_cloud_cover_entire_atmosphere',
    'visibility_surface',
    'categorical_rain_yes1_no0_surface',
]
df.describe(solar_columns).show()

+-------+-----------------------------------+------------------+---------------------------------+
|summary|total_cloud_cover_entire_atmosphere|visibility_surface|categorical_rain_yes1_no0_surface|
+-------+-----------------------------------+------------------+---------------------------------+
|  count|                          108000000|         108000000|                        108000000|
|   mean|               -2.42109131147400...|21717.428957248074|              0.08852023148148148|
| stddev|               4.914506718751981...| 6141.102072809382|              0.28404999709034684|
|    min|                      -1.0000036E20|         17.292929|                              0.0|
|    max|                              100.0|         24307.436|                              1.0|
+-------+-----------------------------------+------------------+---------------------------------+



## Wind Features Description

In [7]:
# Summary statistics (count, mean, stddev, min, max) for the wind columns.
wind_columns = [
    'surface_wind_gust_surface',
    'v-component_of_wind_maximum_wind',
    'pressure_surface',
    'pressure_maximum_wind',
]
df.describe(wind_columns).show()

+-------+-------------------------+--------------------------------+-----------------+---------------------+
|summary|surface_wind_gust_surface|v-component_of_wind_maximum_wind| pressure_surface|pressure_maximum_wind|
+-------+-------------------------+--------------------------------+-----------------+---------------------+
|  count|                108000000|                       108000000|        108000000|            108000000|
|   mean|        6.847670909482179|             -1.1452036799420957|97924.25023205555|   22274.680711306133|
| stddev|        4.462058868522337|              18.983894779918007|6041.825119192631|    7190.963967770848|
|    min|             7.1111065E-4|                        -92.0031|          63860.0|            11120.641|
|    max|                 55.79795|                       106.34735|         105154.0|            50128.137|
+-------+-------------------------+--------------------------------+-----------------+---------------------+



In [11]:
%%time
df.createOrReplaceTempView("nam_small")
solar_location = spark.sql(
    f'''SELECT * FROM(SELECT Geohash, avg(total_cloud_cover_entire_atmosphere) as avg_cloud, avg(visibility_surface) as avg_vis, avg(categorical_rain_yes1_no0_surface) as avg_rain
        FROM nam_small 
        GROUP BY Geohash 
        Having avg_cloud < -.2 and avg_vis > 21700 and avg_rain < .08 and avg(land_cover_land1_sea0_surface) = 1) as t2
    ORDER BY avg_cloud, avg_vis DESC, avg_rain
    Limit 3
    ''').collect()

for row in solar_location:
    print (row)

Row(Geohash='9muc4swndp2p', avg_cloud=-8.241785230181007e+17, avg_vis=23113.471296037947, avg_rain=0.01098901098901099)
Row(Geohash='9sqscbng8qhp', avg_cloud=-8.196748152420456e+17, avg_vis=23235.54689767973, avg_rain=0.06830601092896176)
Row(Geohash='9qvg43dhb5eb', avg_cloud=-8.152200608113823e+17, avg_vis=23374.304511028786, avg_rain=0.0625)
CPU times: user 47.7 ms, sys: 16.6 ms, total: 64.3 ms
Wall time: 2min 45s


In [None]:
# Decode the geohash (first field of each result row) and convert the first
# two decoded components to floats for plotting.
# NOTE(review): presumably these are (latitude, longitude) -- confirm against geohash2.
decoded_points = (gh.decode(winner[0]) for winner in solar_location)
lat_long_list = [(float(point[0]), float(point[1])) for point in decoded_points]

# Draw the candidate solar sites as red markers on a map.
marker_style = {'fill_color': 'red', 'stroke_color': 'red'}
solar = gmaps.symbol_layer(lat_long_list, **marker_style)

fig = gmaps.figure()
fig.add_layer(solar)
fig

![Solar](img/SolarFarm.png)

In [14]:
# Top 3 land-only geohashes for wind, ranked by average max-wind pressure,
# then average gust, then average surface pressure (all descending).
# Relies on the "nam_small" temp view registered by an earlier cell.
# The cut-offs (6.8, 98000, 22000) sit close to the dataset-wide means shown
# by describe().
wind_query = '''SELECT * FROM(SELECT Geohash, avg(surface_wind_gust_surface) as avg_gust, avg(pressure_surface) as avg_pressure, avg(pressure_maximum_wind) as avg_max_pressure
        FROM nam_small 
        GROUP BY Geohash 
        Having avg_gust > 6.8 and avg_pressure > 98000 and avg_max_pressure > 22000 and avg(land_cover_land1_sea0_surface) = 1) as t2
    ORDER BY avg_max_pressure DESC, avg_gust DESC, avg_pressure DESC
    Limit 3
    '''
wind_location = spark.sql(wind_query).collect()

for winner in wind_location:
    print(winner)

Row(Geohash='f6c0dnrzupkp', avg_gust=7.301884487974194, avg_pressure=100876.15330188679, avg_max_pressure=27631.67333984375)
Row(Geohash='f6dquxqseheb', avg_gust=7.487838643878033, avg_pressure=99143.55831265509, avg_max_pressure=27631.04482736895)
Row(Geohash='f63jjcx9hhkp', avg_gust=7.6772719182737985, avg_pressure=100049.68069306931, avg_max_pressure=27557.737307104733)


In [None]:
# Decode the geohash (first field of each result row) and convert the first
# two decoded components to floats for plotting.
# NOTE(review): presumably these are (latitude, longitude) -- confirm against geohash2.
decoded_points = (gh.decode(winner[0]) for winner in wind_location)
lat_long_list = [(float(point[0]), float(point[1])) for point in decoded_points]

# Draw the candidate wind sites as red markers on a map.
marker_style = {'fill_color': 'red', 'stroke_color': 'red'}
wind = gmaps.symbol_layer(lat_long_list, **marker_style)

fig = gmaps.figure()
fig.add_layer(wind)
fig

![Wind](img/WindFarms.png)

In [16]:
%%time
df.createOrReplaceTempView("nam_small")
solar_location = spark.sql(
    f'''SELECT * FROM(SELECT Geohash, avg(total_cloud_cover_entire_atmosphere) as avg_cloud, avg(visibility_surface) as avg_vis, avg(categorical_rain_yes1_no0_surface) as avg_rain,
        avg(surface_wind_gust_surface) as avg_gust, avg(pressure_surface) as avg_pressure, avg(pressure_maximum_wind) as avg_max_pressure
        FROM nam_small 
        GROUP BY Geohash 
        Having avg(land_cover_land1_sea0_surface) = 1 and avg_cloud < -.2 and avg_vis > 21700 and avg_rain < .08 and avg_gust > 6.8 and avg_pressure > 98000 and avg_max_pressure > 22000) as t2
    ORDER BY avg_cloud, avg_vis DESC, avg_rain, avg_max_pressure DESC, avg_gust DESC, avg_pressure DESC
    Limit 3
    ''').collect()

for row in solar_location:
    print (row)

Row(Geohash='cbdgu8ssqukp', avg_cloud=-7.317099062034778e+17, avg_vis=22050.815116668327, avg_rain=0.046341463414634146, avg_gust=7.20288945211143, avg_pressure=98590.6243902439, avg_max_pressure=23063.607140815548)
Row(Geohash='cbdejvg4pgxb', avg_cloud=-5.249361021497615e+17, avg_vis=22233.35329998447, avg_rain=0.03937007874015748, avg_gust=6.960226039404631, avg_pressure=98365.70341207349, avg_max_pressure=23514.148401615814)
Row(Geohash='9zvjgu2w68pb', avg_cloud=-5.235619238718825e+17, avg_vis=21927.44379666713, avg_rain=0.03664921465968586, avg_gust=6.92882694561444, avg_pressure=98086.35340314136, avg_max_pressure=22601.759586673757)
CPU times: user 40.7 ms, sys: 9.65 ms, total: 50.4 ms
Wall time: 2min 51s


In [None]:
# Decode the geohash (first field of each result row) and convert the first
# two decoded components to floats for plotting. `solar_location` here holds
# whatever the most recently run query cell assigned to it.
# NOTE(review): presumably the components are (latitude, longitude) -- confirm against geohash2.
decoded_points = (gh.decode(winner[0]) for winner in solar_location)
lat_long_list = [(float(point[0]), float(point[1])) for point in decoded_points]

# Draw the candidate combined wind + solar sites as red markers on a map.
marker_style = {'fill_color': 'red', 'stroke_color': 'red'}
wind_sol = gmaps.symbol_layer(lat_long_list, **marker_style)

fig = gmaps.figure()
fig.add_layer(wind_sol)
fig

![Solar_Wind](img/Wind_Sol.png)

### Analysis

Many of the results were expected. The trend seems to be that SOLAR farms tend to be located in the south (unsurprisingly, San Diego is a great place for a solar farm :)).

However, the best places for WIND farms seem to be in less desirable locations, all in the cold north near the top of Canada.
When we combine the two and look for places suited to both wind and solar, those seem to be located in the middle, between the optimal places for solar and wind farms. (This also kind of makes sense, but was not really expected.)