In [7]:
import geohash
from datetime import datetime

def parseLine(line):
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        tem = float(variables[10])
        humidity = float(variables[8])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])   
        
        ts = int(variables[0][0: 10])
        # if you encounter a "year is out of range" error the timestamp
        # may be in milliseconds, try `ts /= 1000` in that case
        yearMonth = datetime.utcfromtimestamp(ts).strftime('%Y-%m')
        
        gh = geohash.encode(lat, lon)
        return (gh[0: 2] + '\t' + yearMonth, tem, humidity, wind_speed, cloud_cover)
    except:
        return ('z?', 0, 0, 0, 0)
    
#text_file = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
text_fileML = sc.textFile("hdfs://orion11:21001/3hr_sample/*")
#text_fileML = sc.textFile("hdfs://orion11:21001/3hr_sample/*")


In [4]:
#1: 8g
'''
original result:
#-----------------
geohash: 8g
humidity: 
[[ 1.         -0.07573358]
 [-0.07573358  1.        ]]
wind_speed: 
[[ 1.        -0.0757577]
 [-0.0757577  1.       ]]
could_cover: 
[[ 1.         -0.38374192]
 [-0.38374192  1.        ]]
-----------------
'''
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


# (GeoHash, wind_energy_factor, cloud_cover)
parsed_dataML8g = text_fileML \
    .map(lambda line: parseLine(line)) \
    .filter(lambda line: line[0].startswith('8g') == True)
#transform rdd to df
dfML8g = parsed_dataML8g.toDF()

#prepare data for ML
def prepare_data(dframe, predictors, target):
    assembler = VectorAssembler(inputCols=predictors, outputCol="features")
    output = assembler.transform(dframe)
    return output.select("features", target).withColumnRenamed(target, "label")

# Choose our dependent and independent variables:
#1: cloud
prepped8gCloud = prepare_data(dfML8g,
    ['_5'],
    '_2')

#prepped8gCloud.show()
(trainingData8gCloud, testData8gCloud) = prepped8gCloud.randomSplit([0.6, 0.4])

#2: humidity
prepped8gHumidity = prepare_data(dfML8g,
    ['_3'],
    '_2')

#prepped8gHumidity.show()
(trainingData8gHumidity, testData8gHumidity) = prepped8gHumidity.randomSplit([0.6, 0.4])

#3: wind_speed
prepped8gWind = prepare_data(dfML8g,
    ['_4'],
    '_2')

#prepped8gWind.show()
(trainingData8gWind, testData8Wind) = prepped8gWind.randomSplit([0.6, 0.4])


#start MLing

#We use same parameter for all predictors
rf8g = RandomForestRegressor(numTrees=100, maxDepth=5, maxBins=32)



#1: humidity
model = rf8g.fit(trainingData8gHumidity)
predictions = model.transform(testData8gHumidity)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Humidity: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#2: wind_speed
model = rf8g.fit(trainingData8gWind)
predictions = model.transform(testData8Wind)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Wind: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#3: cloud_cover
model = rf8g.fit(trainingData8gCloud)
predictions = model.transform(testData8gCloud)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Cloud: Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Humidity: Root Mean Squared Error (RMSE) on test data = 1.05988
Wind: Root Mean Squared Error (RMSE) on test data = 1.05709
Cloud: Root Mean Squared Error (RMSE) on test data = 1.07119


In [5]:
#2: 8x
'''
original result:
-----------------
geohash: 8x
humidity: 
[[ 1.         -0.81826103]
 [-0.81826103  1.        ]]
wind_speed: 
[[ 1.         -0.34704516]
 [-0.34704516  1.        ]]
could_cover: 
[[ 1.         -0.11887706]
 [-0.11887706  1.        ]]
-----------------
'''
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


# (GeoHash, wind_energy_factor, cloud_cover)
parsed_dataML8g = text_fileML \
    .map(lambda line: parseLine(line)) \
    .filter(lambda line: line[0].startswith('8x') == True)
#transform rdd to df
dfML8g = parsed_dataML8g.toDF()

#prepare data for ML
def prepare_data(dframe, predictors, target):
    assembler = VectorAssembler(inputCols=predictors, outputCol="features")
    output = assembler.transform(dframe)
    return output.select("features", target).withColumnRenamed(target, "label")

# Choose our dependent and independent variables:
#1: cloud
prepped8gCloud = prepare_data(dfML8g,
    ['_5'],
    '_2')

#prepped8gCloud.show()
(trainingData8gCloud, testData8gCloud) = prepped8gCloud.randomSplit([0.6, 0.4])

#2: humidity
prepped8gHumidity = prepare_data(dfML8g,
    ['_3'],
    '_2')

#prepped8gHumidity.show()
(trainingData8gHumidity, testData8gHumidity) = prepped8gHumidity.randomSplit([0.6, 0.4])

#3: wind_speed
prepped8gWind = prepare_data(dfML8g,
    ['_4'],
    '_2')

#prepped8gWind.show()
(trainingData8gWind, testData8Wind) = prepped8gWind.randomSplit([0.6, 0.4])


#start MLing

#We use same parameter for all predictors
rf8g = RandomForestRegressor(numTrees=100, maxDepth=5, maxBins=32)



#1: humidity
model = rf8g.fit(trainingData8gHumidity)
predictions = model.transform(testData8gHumidity)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Humidity: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#2: wind_speed
model = rf8g.fit(trainingData8gWind)
predictions = model.transform(testData8Wind)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Wind: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#3: cloud_cover
model = rf8g.fit(trainingData8gCloud)
predictions = model.transform(testData8gCloud)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Cloud: Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Humidity: Root Mean Squared Error (RMSE) on test data = 3.27765
Wind: Root Mean Squared Error (RMSE) on test data = 3.50637
Cloud: Root Mean Squared Error (RMSE) on test data = 3.45458


In [8]:
#2: 8x
'''
original result:
-----------------
geohash: 8x
humidity: 
[[ 1.         -0.81826103]
 [-0.81826103  1.        ]]
wind_speed: 
[[ 1.         -0.34704516]
 [-0.34704516  1.        ]]
could_cover: 
[[ 1.         -0.11887706]
 [-0.11887706  1.        ]]
-----------------
'''
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


# (GeoHash, wind_energy_factor, cloud_cover)
parsed_dataML8g = text_fileML \
    .map(lambda line: parseLine(line)) \
    .filter(lambda line: line[0].startswith('8x') == True)
#transform rdd to df
dfML8g = parsed_dataML8g.toDF()

#prepare data for ML
def prepare_data(dframe, predictors, target):
    assembler = VectorAssembler(inputCols=predictors, outputCol="features")
    output = assembler.transform(dframe)
    return output.select("features", target).withColumnRenamed(target, "label")

# Choose our dependent and independent variables:
#1: cloud + wind
prepped8gCloud = prepare_data(dfML8g,
    ['_5', '_4'],
    '_2')

#prepped8gCloud.show()
(trainingData8gCloud, testData8gCloud) = prepped8gCloud.randomSplit([0.6, 0.4])

#2: humidity + wind
prepped8gHumidity = prepare_data(dfML8g,
    ['_3', '_4'],
    '_2')

#prepped8gHumidity.show()
(trainingData8gHumidity, testData8gHumidity) = prepped8gHumidity.randomSplit([0.6, 0.4])

#3: humidity + cloud
prepped8gWind = prepare_data(dfML8g,
    ['_3', '_5'],
    '_2')

#prepped8gWind.show()
(trainingData8gWind, testData8Wind) = prepped8gWind.randomSplit([0.6, 0.4])


#start MLing

#We use same parameter for all predictors
rf8g = RandomForestRegressor(numTrees=100, maxDepth=5, maxBins=32)



#1: humidity
model = rf8g.fit(trainingData8gHumidity)
predictions = model.transform(testData8gHumidity)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Humidity: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#2: wind_speed
model = rf8g.fit(trainingData8gWind)
predictions = model.transform(testData8Wind)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Wind: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#3: cloud_cover
model = rf8g.fit(trainingData8gCloud)
predictions = model.transform(testData8gCloud)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Cloud: Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Humidity: Root Mean Squared Error (RMSE) on test data = 3.10963
Wind: Root Mean Squared Error (RMSE) on test data = 3.04392
Cloud: Root Mean Squared Error (RMSE) on test data = 3.3164
