In [1]:
import geohash
from datetime import datetime

def parseLine(line):
    """Parse one tab-separated record into a keyed tuple of readings.

    Fields are read by position: 0 = epoch timestamp, 1 = latitude,
    2 = longitude, 8 = humidity, 10 = `tem` (presumably temperature),
    12 = cloud cover, 17 = wind speed.

    Returns:
        (key, tem, humidity, wind_speed, cloud_cover), where key is the first
        two geohash characters, a tab, and the UTC 'YYYY-MM' of the timestamp.
        If any field is missing or cannot be converted, the placeholder row
        ('z?', 0, 0, 0, 0) is returned instead of raising.
    """
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        tem = float(variables[10])
        humidity = float(variables[8])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])

        # Only the first 10 characters are used, so a 13-digit millisecond
        # epoch is truncated to whole seconds before conversion.
        ts = int(variables[0][0: 10])
        yearMonth = datetime.utcfromtimestamp(ts).strftime('%Y-%m')

        gh = geohash.encode(lat, lon)
        return (gh[0: 2] + '\t' + yearMonth, tem, humidity, wind_speed, cloud_cover)
    except Exception:
        # Best-effort parsing: a malformed record becomes a placeholder row.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently converted to data.
        return ('z?', 0, 0, 0, 0)
    
#text_file = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
text_fileML = sc.textFile("hdfs://orion11:21001/3hr_sample/sampled_2018/*")

# Each parsed record is
# (geohash prefix + tab + year-month, tem, humidity, wind_speed, cloud_cover).
# Keep only records whose key starts with the '8g' geohash prefix; this also
# drops the 'z?' placeholder rows parseLine returns for unparseable lines.
parsed_dataML8g = text_fileML \
    .map(parseLine) \
    .filter(lambda record: record[0].startswith('8g'))

In [2]:
# Sanity check: print the Python type of the mapped and filtered dataset.
print(type(parsed_dataML8g))

<class 'pyspark.rdd.PipelinedRDD'>


In [3]:
# Convert the RDD of 5-tuples into a DataFrame. No column names are passed,
# so the columns get the positional default names _1 .. _5.
dfML8g = parsed_dataML8g.toDF()

In [4]:
# Sanity check: print the Python type of the converted dataset.
print(type(dfML8g))

<class 'pyspark.sql.dataframe.DataFrame'>


In [7]:
# Show the column names; later cells select predictors and the label by these names.
dfML8g.columns

['_1', '_2', '_3', '_4', '_5']

In [11]:
#1: 8g
'''
original result:
#-----------------
geohash: 8g
humidity: 
[[ 1.         -0.07573358]
 [-0.07573358  1.        ]]
wind_speed: 
[[ 1.        -0.0757577]
 [-0.0757577  1.       ]]
could_cover: 
[[ 1.         -0.38374192]
 [-0.38374192  1.        ]]
-----------------
'''
from pyspark.ml.feature import VectorAssembler

def prepare_data(dframe, predictors, target):
    """Build a two-column ("features", "label") frame for model fitting.

    Args:
        dframe: DataFrame holding the predictor and target columns.
        predictors: list of column names to pack into the "features" vector.
        target: name of the column to expose as "label".

    Returns:
        A DataFrame containing only the assembled "features" column and the
        target column renamed to "label".
    """
    vector_builder = VectorAssembler(outputCol="features", inputCols=predictors)
    assembled = vector_builder.transform(dframe)
    features_and_target = assembled.select("features", target)
    return features_and_target.withColumnRenamed(target, "label")

# Choose our dependent and independent variables.
# Columns follow the order of the tuple returned by parseLine:
#   _2 = tem (used as the label), _3 = humidity, _4 = wind_speed, _5 = cloud_cover.
# Each predictor is prepared on its own and split 90% training / 10% test.
SPLIT_WEIGHTS = [0.9, 0.1]
# Fixed seed so re-running the cell does not reshuffle the train/test rows.
SPLIT_SEED = 42

#1: cloud
prepped8gCloud = prepare_data(dfML8g,
    ['_5'],
    '_2')

prepped8gCloud.show()
(trainingData8gCloud, testData8gCloud) = prepped8gCloud.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

#2: humidity
prepped8gHumidity = prepare_data(dfML8g,
    ['_3'],
    '_2')

prepped8gHumidity.show()
(trainingData8gHumidity, testData8gHumidity) = prepped8gHumidity.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

#3: wind_speed
prepped8gWind = prepare_data(dfML8g,
    ['_4'],
    '_2')

prepped8gWind.show()
# NOTE: the test split is named testData8Wind (no 'g'); the name is kept
# because other cells refer to it by this spelling.
(trainingData8gWind, testData8Wind) = prepped8gWind.randomSplit(SPLIT_WEIGHTS, seed=SPLIT_SEED)

+--------+---------+
|features|    label|
+--------+---------+
|   [0.0]|297.67184|
|   [3.0]|297.52185|
|  [11.0]|297.53186|
|   [1.0]|299.29187|
|   [0.0]|299.15186|
|  [29.0]|298.06186|
|  [21.0]|297.39185|
|  [77.0]|298.10187|
|   [0.0]|297.51184|
| [100.0]|297.81186|
|   [3.0]|297.75186|
|  [65.0]|297.29187|
|  [22.0]|297.03186|
|  [21.0]|297.28186|
|   [0.0]|297.20184|
|   [5.0]|298.57187|
|   [0.0]|297.14185|
|  [11.0]|297.25186|
|  [24.0]|296.96185|
|  [28.0]|297.03186|
+--------+---------+
only showing top 20 rows

+--------+---------+
|features|    label|
+--------+---------+
|   [3.0]|297.67184|
|   [3.0]|297.52185|
|   [2.0]|297.53186|
|  [26.0]|299.29187|
|  [25.0]|299.15186|
|  [14.0]|298.06186|
|   [3.0]|297.39185|
|  [10.0]|298.10187|
|   [2.0]|297.51184|
|   [5.0]|297.81186|
|   [5.0]|297.75186|
|   [3.0]|297.29187|
|   [3.0]|297.03186|
|   [2.0]|297.28186|
|   [3.0]|297.20184|
|  [24.0]|298.57187|
|   [3.0]|297.14185|
|   [3.0]|297.25186|
|   [3.0]|296.96185|
|   [3.0

In [None]:
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# The same forest configuration is used for every predictor. The fixed seed
# makes repeated fits on the same training data reproducible.
rf8g = RandomForestRegressor(numTrees=100, maxDepth=5, maxBins=32, seed=42)

# A single evaluator is reused: RMSE between the "label" and "prediction" columns.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")

# (printed name, training split, held-out split) for each single-predictor model.
# testData8Wind (no 'g') is the name under which the wind test split was created.
splits8g = [
    ("Cloud", trainingData8gCloud, testData8gCloud),
    ("Humidity", trainingData8gHumidity, testData8gHumidity),
    ("Wind", trainingData8gWind, testData8Wind),
]

# Fit on the training split, predict on the test split, report RMSE.
# model / predictions / rmse end up holding the last (wind) results.
for predictorName, trainingData, testData in splits8g:
    model = rf8g.fit(trainingData)
    predictions = model.transform(testData)
    rmse = evaluator.evaluate(predictions)
    print("%s: Root Mean Squared Error (RMSE) on test data = %g" % (predictorName, rmse))

Cloud: Root Mean Squared Error (RMSE) on test data = 1.06573
Humidity: Root Mean Squared Error (RMSE) on test data = 1.06734
