In [22]:
import geohash

def parseLine(line):
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])
        #Energy is proportional to the third power of wind speed.
        #At here, we define wind_energy_factor = wind_speed * wind_speed * wind_speed
        wind_energy_factor = wind_speed * wind_speed * wind_speed;
        
        gh = geohash.encode(lat, lon)
        return (gh, wind_energy_factor, cloud_cover)
    except:
        return ('', 0, 0)
    
#text_fileTT = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
text_file = sc.textFile("hdfs://orion11:21001/3hr/2018/*")

# (GeoHash, wind_energy_factor, cloud_cover)
parsed_data = text_file \
    .map(lambda line: parseLine(line))

In [23]:
#we sort the geohashs by their cumulative wind_energy_factor to find out top3 wind farm
wind_data_top3 = parsed_data.map(lambda data: (data[0][0: 4], data[1])) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], False)

In [25]:
'''
top 3 best wind places
dxx7
dxwe
dxwk
'''
wind_data_top3.take(3)

[('dxx7', 30610549.790516056),
 ('dxwe', 30515688.971412513),
 ('dxwk', 30197750.69924766)]

In [26]:
#we sort the geohashs by their cumulative cloud_cover in decreasing order to find out top3 solar farm
solar_data_top3 = parsed_data.map(lambda data: (data[0][0: 4], data[2])) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], True)


In [27]:
'''
top 3 best solar energy places
dd93
d6bm
dd9g
'''
solar_data_top3.take(4)

[('', 0), ('dd93', 26282.0), ('d6bm', 27174.0), ('dd9g', 27642.0)]

In [54]:
#Average cumulative wind_energy_factor in all geohashs
avg_wind = wind_data_top3.map(lambda data: ("", data[1])) \
    .mapValues(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1]+y[1])) \
    .mapValues(lambda v: (v[0] / v[1]))

avg_wind.take(1)

[('', 5920975.497173111)]

In [59]:
#Average cumulative cloud_cover in all geohashs
avg_solar = solar_data_top3 \
    .map(lambda data: ("", data[1])) \
    .filter(lambda data: data[0] != "") \
    .mapValues(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1]+y[1])) \
    .mapValues(lambda v: (v[0] / v[1]))

avg_solar.take(1)

[]

In [60]:
#As you can see, the average cumulative wind_energy_factor in all geohashs is around 5920975
#the average cumulative cloud_cover in all geohashs is around 319382
#So we set the factor f = 185, we will us it later.
#Then, average_cumulative_wind_energy_factor - (f * average_cumulative_cloud_cover) = 0

#To answer the question: Locate the top 3 places for solar + wind farm.
#We define combine_energy_factor = wind_energy_factor - (f * cloud_cover).
#Then, try to find out the top3 geohashs with biggest cumulative_combine_energy_factor. 
#We regard them as top 3 places for solar + wind farm. 
#Because these places has big cumulative_wind_energy_factor with small cumulative_cloud_cover,
#which means they have more wind energy with less cloud cover.


import geohash

def parseLine(line):
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])
        #Energy is proportional to the third power of wind speed.
        #At here, we define wind_energy_factor = wind_speed * wind_speed * wind_speed
        wind_energy_factor = wind_speed * wind_speed * wind_speed;
        
        gh = geohash.encode(lat, lon)
        return (gh, wind_energy_factor - 185 * cloud_cover)
    except:
        return ('', 0)
    
#text_fileTT = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
text_file = sc.textFile("hdfs://orion11:21001/3hr/2018/*")

# (GeoHash, wind_energy_factor, cloud_cover)
parsed_data = text_file \
    .map(lambda line: parseLine(line))


wind_solar_top3 = parsed_data.map(lambda data: (data[0][0: 4], data[1])) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], False)

In [61]:
'''
Here is the top3 solar + wind farm places in our view
d6bm
dd93
9mr4
'''

wind_solar_top3.take(4)

[('', 0),
 ('d6bm', -3482372.517710123),
 ('dd93', -3613794.311647366),
 ('9mr4', -3808428.2687501083)]