In [22]:
import geohash

def parseLine(line):
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])
        #Energy is proportional to the third power of wind speed.
        #At here, we define wind_energy_factor = wind_speed * wind_speed * wind_speed
        wind_energy_factor = wind_speed * wind_speed * wind_speed;
        
        gh = geohash.encode(lat, lon)
        return (gh, wind_energy_factor, cloud_cover)
    except:
        return ('', 0, 0)
    
#text_fileTT = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
text_file = sc.textFile("hdfs://orion11:21001/3hr/2018/*")

# (GeoHash, wind_energy_factor, cloud_cover)
parsed_data = text_file \
    .map(lambda line: parseLine(line))

In [23]:
#we sort the geohashs by their cumulative wind_energy_factor to find out top3 wind farm
wind_data_top3 = parsed_data.map(lambda data: (data[0][0: 4], data[1])) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], False)

In [25]:
'''
top 3 best wind places
dxx7
dxwe
dxwk
'''
wind_data_top3.take(3)

[('dxx7', 30610549.790516056),
 ('dxwe', 30515688.971412513),
 ('dxwk', 30197750.69924766)]

In [26]:
#we sort the geohashs by their cumulative cloud_cover in decreasing order to find out top3 solar farm
solar_data_top3 = parsed_data.map(lambda data: (data[0][0: 4], data[2])) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], True)


In [27]:
'''
top 3 best solar energy places
dd93
d6bm
dd9g
'''
solar_data_top3.take(4)

[('', 0), ('dd93', 26282.0), ('d6bm', 27174.0), ('dd9g', 27642.0)]

In [28]:
wind_data_top3.take(1)

[('dxx7', 30610549.790516056)]

In [29]:
solar_data_top3.take(2)

[('', 0), ('dd93', 26282.0)]

In [30]:
#As you can see, the best wind farm's cumulative wind_energy_factor is around 30610550
#the best solar farm's cumulative cloud_cover is 26282.0
#So we set the factor we will us later f = 1165. 
#Then, max_cumulative_wind_energy_factor - (f * min_cumulative_cloud_cover) = 0

#To answer the question: Locate the top 3 places for solar + wind farm.
#We defind combine_energy_factor = wind_energy_factor - (f * cloud_cover).
#Then, try to find out the top3 geohashs with biggest cumulative_combine_energy_factor. 
#We regard them as top 3 places for solar + wind farm. 
#Because these places has big cumulative_wind_energy_factor with small cumulative_cloud_cover,
#which means they have more wind energy with less cloud cover.


import geohash

def parseLine(line):
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])
        #Energy is proportional to the third power of wind speed.
        #At here, we define wind_energy_factor = wind_speed * wind_speed * wind_speed
        wind_energy_factor = wind_speed * wind_speed * wind_speed;
        
        gh = geohash.encode(lat, lon)
        return (gh, wind_energy_factor - 1165 * cloud_cover)
    except:
        return ('', 0)
    
#text_fileTT = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
text_file = sc.textFile("hdfs://orion11:21001/3hr/2018/*")

# (GeoHash, wind_energy_factor, cloud_cover)
parsed_data = text_file \
    .map(lambda line: parseLine(line))


wind_solar_top3 = parsed_data.map(lambda data: (data[0][0: 4], data[1])) \
    .reduceByKey(lambda a, b: a + b) \
    .sortBy(lambda x: x[1], False)

In [32]:
'''
Here is the top3 solar + wind farm places in our view
dd93
d6bm
dd9g
'''

wind_solar_top3.take(4)

[('', 0),
 ('dd93', -29370154.311647378),
 ('d6bm', -30112892.517710146),
 ('dd9g', -31047502.468095403)]