In [1]:
'''
In this part, we tried to use ML to train some model that can predict 
the temperature of a specific region(two-character geohash) according to the humidity/wind_speed/could_cover
'''
'''As training a model even cost hours, at here, we choose geohash '8x' to analyzie'''


import geohash
from datetime import datetime

#transform or data to (geohash-yearMonth, temperature, humidity, wind_speed, cloud_cover) format
def parseLine(line):
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        tem = float(variables[10])
        humidity = float(variables[8])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])   
        
        ts = int(variables[0][0: 10])
        # if you encounter a "year is out of range" error the timestamp
        # may be in milliseconds, try `ts /= 1000` in that case
        yearMonth = datetime.utcfromtimestamp(ts).strftime('%Y-%m')
        
        gh = geohash.encode(lat, lon)
        return (gh[0: 2] + '\t' + yearMonth, tem, humidity, wind_speed, cloud_cover)
    except:
        return ('z?', 0, 0, 0, 0)
    
#text_file = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
#text_fileML = sc.textFile("hdfs://orion11:21001/3hr_sample/*")
#text_fileML = sc.textFile("hdfs://orion11:21001/3hr_sample/sampled_2018/*")
text_fileML = sc.textFile("hdfs://orion11:21001/3hr/2018/*")


In [3]:
'''
Method1: At here, we use humidity, wind_speed, and could_cover as predictors respectively

1 The feature you will predict/classify: temperature
2 Features used to train the model: humidity-temperature, wind_speed-temperature, and could_cover-temperature
3 How you partitioned your data: Use .randomSplit([0.8, 0.2]) to partition our data. 80% for training, 20% for testing
4 How the prediction/classification improves your analysis: 
    According to the correlation matrix below, we can konw that for geohash '8x', humidity is the most relevant parameter
And as the model test results shown below, the humidity-temperature model has the lowest RMSE, which means that
it is the best of these three models. So it corresponds with our correlation matrix.

    So the ML result tell us, the correlations between temperature and different parameters are different. For some region,
you can get a pretty good result when you are trying to use humidity to predict the temperature, it is because they are relevant.
But for some other parameters, as they are not such relevant to temperature, 
you cannot get too much information for temperature prediction.
'''


#1: 8x
'''
original result:
-----------------
geohash: 8x
humidity: 
[[ 1.         -0.81826103]
 [-0.81826103  1.        ]]
wind_speed: 
[[ 1.         -0.34704516]
 [-0.34704516  1.        ]]
could_cover: 
[[ 1.         -0.11887706]
 [-0.11887706  1.        ]]
-----------------
'''
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

parsed_dataML8g = text_fileML \
    .map(lambda line: parseLine(line)) \
    .filter(lambda line: line[0].startswith('8x') == True)
#transform rdd to df
dfML8g = parsed_dataML8g.toDF()

#prepare data for ML
def prepare_data(dframe, predictors, target):
    assembler = VectorAssembler(inputCols=predictors, outputCol="features")
    output = assembler.transform(dframe)
    return output.select("features", target).withColumnRenamed(target, "label")

# Choose our dependent and independent variables:
#1: cloud
prepped8gCloud = prepare_data(dfML8g,
    ['_5'],
    '_2')

#prepped8gCloud.show()
(trainingData8gCloud, testData8gCloud) = prepped8gCloud.randomSplit([0.8, 0.2])

#2: humidity
prepped8gHumidity = prepare_data(dfML8g,
    ['_3'],
    '_2')

#prepped8gHumidity.show()
(trainingData8gHumidity, testData8gHumidity) = prepped8gHumidity.randomSplit([0.8, 0.2])

#3: wind_speed
prepped8gWind = prepare_data(dfML8g,
    ['_4'],
    '_2')

#prepped8gWind.show()
(trainingData8gWind, testData8Wind) = prepped8gWind.randomSplit([0.8, 0.2])


#start MLing

#We use same parameter for all predictors
rf8g = RandomForestRegressor(numTrees=200, maxDepth=20, maxBins=128)



#1: humidity
model = rf8g.fit(trainingData8gHumidity)
predictions = model.transform(testData8gHumidity)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Humidity: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#2: wind_speed
model = rf8g.fit(trainingData8gWind)
predictions = model.transform(testData8Wind)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Wind: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#3: cloud_cover
model = rf8g.fit(trainingData8gCloud)
predictions = model.transform(testData8gCloud)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Cloud: Root Mean Squared Error (RMSE) on test data = %g" % rmse)

Humidity: Root Mean Squared Error (RMSE) on test data = 11.0485
Wind: Root Mean Squared Error (RMSE) on test data = 12.0159
Cloud: Root Mean Squared Error (RMSE) on test data = 11.6724


In [2]:
'''
Method2: At here, we combine humidity, wind_speed, and could_cover in pairs and use these pairs as predictors.

1 The feature you will predict/classify: temperature
2 Features used to train the model: (humidity, wind_speed)-temperature, 
(humidity, could_cover)-temperature, and (wind_speed, could_cover)-temperature
3 How you partitioned your data: Use .randomSplit([0.7, 0.3]) to partition our data. 80% for training, 20% for testing
4 How the prediction/classification improves your analysis: 
    According to the correlation matrix below, we can konw that for geohash '8x', humidity is the most relevant parameter, 
, wind_speed is the second, could_cover is the least relevant one.
As the model test results shown below, we can get the minimun RMSE with (humidity, wind_speed) as the predictors.
And the result of (humidity, could_cover) is better than (wind_speed, could_cover) - which is the pair with two pretty 
unrelevant parameters. So it corresponds with our correlation matrix.

    So the ML result tell us, the correlations between temperature and different parameters are different. For some region,
you can get a pretty good result when you are trying to use humidity to predict the temperature, it is because they are relevant.
But for some other parameters, as they are not such relevant to temperature, 
you cannot get too much information for temperature prediction.
'''

#1: 8x
'''
original result:
-----------------
geohash: 8x
humidity: 
[[ 1.         -0.81826103]
 [-0.81826103  1.        ]]
wind_speed: 
[[ 1.         -0.34704516]
 [-0.34704516  1.        ]]
could_cover: 
[[ 1.         -0.11887706]
 [-0.11887706  1.        ]]
-----------------
'''
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator


# (GeoHash, wind_energy_factor, cloud_cover)
parsed_dataML8gd = text_fileML \
    .map(lambda line: parseLine(line)) \
    .filter(lambda line: line[0].startswith('8x') == True)
#transform rdd to df
dfML8g = parsed_dataML8gd.toDF()

#prepare data for ML
def prepare_data(dframe, predictors, target):
    assembler = VectorAssembler(inputCols=predictors, outputCol="features")
    output = assembler.transform(dframe)
    return output.select("features", target).withColumnRenamed(target, "label")

# Choose our dependent and independent variables:
#1: cloud + wind
prepped8gCloudWind = prepare_data(dfML8g,
    ['_5', '_4'],
    '_2')

#prepped8gCloud.show()
(trainingData8gCloudWind, testData8gCloudWind) = prepped8gCloudWind.randomSplit([0.7, 0.3])

#2: humidity + wind
prepped8gHumidityWind = prepare_data(dfML8g,
    ['_3', '_4'],
    '_2')

#prepped8gHumidity.show()
(trainingData8gHumidityWind, testData8gHumidityWind) = prepped8gHumidityWind.randomSplit([0.7, 0.3])

#3: humidity + cloud
prepped8gHumidityCloud = prepare_data(dfML8g,
    ['_3', '_5'],
    '_2')

#prepped8gWind.show()
(trainingData8gHumidityCloud, testDataHumidityCloud) = prepped8gHumidityCloud.randomSplit([0.7, 0.3])


#start MLing

#We use same parameter for all predictors
rf8g = RandomForestRegressor(numTrees=200, maxDepth=20, maxBins=128)



#1: cloud + wind
model = rf8g.fit(trainingData8gCloudWind)
predictions = model.transform(testData8gCloudWind)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("cloud + wind: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#2: humidity + wind
model = rf8g.fit(trainingData8gHumidityWind)
predictions = model.transform(testData8gHumidityWind)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("humidity + wind: Root Mean Squared Error (RMSE) on test data = %g" % rmse)


#3: humidity + cloud
model = rf8g.fit(trainingData8gHumidityCloud)
predictions = model.transform(testDataHumidityCloud)

evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("humidity + cloud: Root Mean Squared Error (RMSE) on test data = %g" % rmse)

cloud + wind: Root Mean Squared Error (RMSE) on test data = 3.36441
humidity + wind: Root Mean Squared Error (RMSE) on test data = 3.10203
humidity + cloud: Root Mean Squared Error (RMSE) on test data = 3.18839
