In [3]:
import geohash
from datetime import datetime

def parseLine(line):
    """Turn one tab-separated input line into a keyed tuple of readings.

    Fields are read by position: 0 = timestamp, 1 = latitude, 2 = longitude,
    8 = humidity, 10 = temperature, 12 = cloud cover, 17 = wind speed.

    Returns:
        ``(key, temperature, humidity, wind_speed, cloud_cover)`` where
        ``key`` is the first two characters of the geohash, a tab, and the
        UTC year-month (``'%Y-%m'``) of the record.
        Records that cannot be parsed yield the sentinel ``('', 0, 0, 0, 0)``;
        callers are expected to filter out the empty key.
    """
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        tem = float(variables[10])
        humidity = float(variables[8])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])

        # Only the first 10 characters of the timestamp field are used.
        # NOTE(review): presumably this trims a millisecond epoch down to
        # seconds — confirm against the data. If the year still comes out
        # of range, the value may need `ts /= 1000` instead.
        ts = int(variables[0][0: 10])
        yearMonth = datetime.utcfromtimestamp(ts).strftime('%Y-%m')

        gh = geohash.encode(lat, lon)
        return (gh[0: 2] + '\t' + yearMonth, tem, humidity, wind_speed, cloud_cover)
    except Exception:
        # Best-effort parsing: any malformed record (missing column,
        # non-numeric field, bad timestamp) becomes the sentinel. Catching
        # Exception rather than using a bare `except:` lets
        # KeyboardInterrupt / SystemExit propagate.
        return ('', 0, 0, 0, 0)
    
# Alternative loader kept for reference:
#text_file = spark.read.load('hdfs://orion11:21001/3hr_sample/sampled_2015/*', format='csv', sep='\t', inferSchema=True, header=True)
text_file = sc.textFile("hdfs://orion11:21001/3hr_sample/*")

# One record per input line, as produced by parseLine:
# (geohash_prefix + '\t' + 'YYYY-MM', temperature, humidity, wind_speed, cloud_cover)
parsed_data = text_file.map(parseLine)

In [17]:
# Peek at the first few parsed records as a sanity check.
# NOTE(review): parseLine returns 5-field tuples, but the saved output below
# shows 6 fields per record — it appears to come from an earlier version of
# the parser; re-run this cell to confirm.
parsed_data.take(5)

[('', 0, 0, 0, 0, 0),
 ('d5', '2018-11', 302.01184, 15.0, 9.237024, 53.0),
 ('d6', '2018-11', 301.91187, 22.0, 8.837025, 0.0),
 ('ff', '2018-11', 277.56186, 83.0, 5.837025, 80.0),
 ('c9', '2018-11', 262.87186, 81.0, 10.137025, 94.0)]

In [4]:
# Per-key mean of the four readings:
#   1. key each record and pair its readings with a count of 1,
#   2. add readings element-wise and counts per key,
#   3. divide each summed reading by the count.
av_data = (
    parsed_data
    .map(lambda rec: (rec[0], (rec[1], rec[2], rec[3], rec[4])))
    .mapValues(lambda readings: (readings, 1))
    .reduceByKey(
        lambda left, right: (
            tuple(a + b for a, b in zip(left[0], right[0])),
            left[1] + right[1],
        )
    )
    .mapValues(lambda acc: tuple(total / acc[1] for total in acc[0]))
)

In [5]:
# Order the (geo-time key, averages) pairs by key, ascending.
# Value layout: (average_temperature, average_humidity, average_wind_speed, average_cloud_cover)
av_data = av_data.sortBy(keyfunc=lambda pair: pair[0], ascending=True)

In [67]:
import numpy as np
# Sanity check of np.corrcoef on two small, nearly linear series.
x = [1, 2, 3, 4, 5.5]
y = [3, 4, 5, 6.1, 7.3]

# Result is a 2x2 matrix; the off-diagonal entries hold the coefficient
# between x and y.
pccs = np.corrcoef(x, y)
print(pccs)

[[1.         0.99844958]
 [0.99844958 1.        ]]


In [15]:
import numpy as np
# Pull the sorted (key, averages) pairs to the driver.
# Each row pairs a string key with a 4-tuple of averages, so the nesting is
# ragged. dtype=object is required: without it NumPy >= 1.24 raises
# ValueError (inhomogeneous shape), while older versions only warned and
# built this same (N, 2) object array.
a = np.array(av_data.collect(), dtype=object)

In [16]:
# Geohash prefix -> list of averages tuples, in the order the rows appear in `a`.
raw = {}

for geo_time, averages in a:
    # Skip the empty key emitted for lines that failed to parse.
    if len(geo_time) == 0:
        continue
    raw.setdefault(geo_time[:2], []).append(averages)



In [19]:
# Per-geohash series of each averaged metric, in the order the rows appear
# in `a`. All four dicts are filled in a single pass instead of four
# copy-pasted loops over the same data.
temperature_array = {}   # geo -> average_temperature values
humidity_array = {}      # geo -> average_humidity values
wind_speed_array = {}    # geo -> average_wind_speed values
cloud_cover_array = {}   # geo -> average_cloud_cover values

for i in a:
    # Skip the empty key emitted for lines that failed to parse.
    if len(i[0]) > 0:
        geo = i[0][0: 2]
        averages = i[1]
        temperature_array.setdefault(geo, []).append(averages[0])
        humidity_array.setdefault(geo, []).append(averages[1])
        wind_speed_array.setdefault(geo, []).append(averages[2])
        cloud_cover_array.setdefault(geo, []).append(averages[3])


In [22]:
import numpy as np

# For every geohash prefix, print the 2x2 correlation matrix between the
# temperature series and each of the other metric series.
# Label fixed from 'could_cover' to 'cloud_cover'.
metric_series = (
    ('humidity: ', humidity_array),
    ('wind_speed: ', wind_speed_array),
    ('cloud_cover: ', cloud_cover_array),
)

for key in temperature_array:
    print('-----------------')
    print('geohash: ' + key)
    print('')

    for label, series in metric_series:
        print(label)
        pccs = np.corrcoef(temperature_array[key], series[key])
        print(pccs)
        print('')

-----------------
geohash: 8g

humidity: 
[[ 1.         -0.07573358]
 [-0.07573358  1.        ]]

wind_speed: 
[[ 1.        -0.0757577]
 [-0.0757577  1.       ]]

could_cover: 
[[ 1.         -0.38374192]
 [-0.38374192  1.        ]]

-----------------
geohash: 8u

humidity: 
[[ 1.         -0.26890238]
 [-0.26890238  1.        ]]

wind_speed: 
[[ 1.         -0.25467187]
 [-0.25467187  1.        ]]

could_cover: 
[[ 1.         -0.27426788]
 [-0.27426788  1.        ]]

-----------------
geohash: 8v

humidity: 
[[ 1.        -0.6180439]
 [-0.6180439  1.       ]]

wind_speed: 
[[ 1.         -0.57841825]
 [-0.57841825  1.        ]]

could_cover: 
[[ 1.         -0.61329266]
 [-0.61329266  1.        ]]

-----------------
geohash: 8x

humidity: 
[[ 1.         -0.81826103]
 [-0.81826103  1.        ]]

wind_speed: 
[[ 1.         -0.34704516]
 [-0.34704516  1.        ]]

could_cover: 
[[ 1.         -0.11887706]
 [-0.11887706  1.        ]]

-----------------
geohash: 8y

humidity: 
[[ 1.         -0.6