In [1]:
#Step1: Determine temperature trends over the past 5 years

import geohash
from datetime import datetime

def parseLine(line):
    """Turn one raw tab-separated record into a ``(key, temperature)`` pair.

    The key is the first two characters of the record's geohash followed by a
    tab character and the four-digit UTC year of the record's timestamp.

    Columns read (0-based): 0 = timestamp, 1 = latitude, 2 = longitude,
    10 = the value stored as ``tem``.

    Returns ``('', 0)`` when the record cannot be parsed (too few columns,
    non-numeric text, a timestamp ``datetime`` rejects, ...), so that a single
    bad line does not abort the whole job. Callers that do not want those
    records to form a group of their own should drop the empty key.
    """
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])
        tem = float(variables[10])

        # Only the first 10 characters of the timestamp field are used.
        ts = int(variables[0][0: 10])
        # if you encounter a "year is out of range" error the timestamp
        # may be in milliseconds, try `ts /= 1000` in that case
        year = datetime.utcfromtimestamp(ts).strftime('%Y')

        gh = geohash.encode(lat, lon)
        return (gh[0: 2] + '\t' + year, tem)
    except Exception:
        # `Exception` instead of a bare `except:` keeps the best-effort
        # behaviour without also trapping SystemExit / KeyboardInterrupt.
        return ('', 0)
    
# Load every raw sample file as an RDD of tab-separated text lines.
# (`sc` is expected to be the SparkContext provided by the notebook session.)
text_fileCC = sc.textFile("hdfs://orion11:21001/3hr_sample/*")

# One (geohash prefix + '\t' + year, temperature) pair per input line.
# Lines that parseLine could not read come back as ('', 0).
# parseLine is passed directly (not through a lambda) so the function that is
# current *now* is the one bound to this RDD, even though later cells define
# other functions with the same name.
parsed_dataCC = text_fileCC.map(parseLine)

# Average temperature per (geohash prefix, year), ordered by key.
# The empty key is dropped first: it only marks unparseable lines, and
# averaging those placeholder zeros would create a meaningless '' group.
av_data = parsed_dataCC \
    .filter(lambda kv: kv[0] != '') \
    .mapValues(lambda tem: (tem, 1)) \
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) \
    .mapValues(lambda v: v[0] / v[1]) \
    .sortBy(lambda x: x[0], True)



In [2]:
def parseLine(line, years=('2015', '2016', '2017', '2018', '2019')):
    """Spread one ``(geohash + '\\t' + year, value)`` pair into a per-year row.

    Parameters
    ----------
    line : sequence
        ``line[0]`` is the key (geohash prefix and year separated by a tab),
        ``line[1]`` is the value to place.
    years : sequence of str, optional
        The years that get a slot, in slot order. Defaults to 2015-2019.

    Returns
    -------
    tuple
        ``(geohash, v_0, ..., v_n)`` with ``value`` in the slot of the key's
        year and ``0`` in every other slot. If the key is malformed or its
        year is not in ``years``, the placeholder ``('', 0, ..., 0)`` of the
        same length is returned instead.
    """
    years = tuple(years)
    placeholder = ('',) + (0,) * len(years)

    try:
        key, value = line[0], line[1]
        fields = key.split("\t")
        geo, year = fields[0], fields[1]
    except Exception:
        # Missing tab, non-string key, non-indexable input, ...: best effort,
        # but without trapping SystemExit / KeyboardInterrupt.
        return placeholder

    if year not in years:
        return placeholder

    # No per-record debug printing here: this function runs once per record
    # when it is used inside an RDD map.
    slots = [0] * len(years)
    slots[years.index(year)] = value
    return (geo,) + tuple(slots)
    

# Pivot every (geohash + '\t' + year, average) pair into a per-year row, then
# merge the rows that share a geohash by adding them slot by slot.
trend_data = (
    av_data
    .map(lambda record: parseLine(record))
    .map(lambda row: (row[0], tuple(row[1:6])))
    .reduceByKey(lambda left, right: tuple(a + b for a, b in zip(left, right)))
)

#year_data.take(100)

In [3]:
#Answer Question1: temperature trends over the past 5 years.

import numpy as np
# Each collected row is (geohash, (five per-year values)): a string next to a
# tuple. That is a ragged nested sequence, so dtype=object is given explicitly;
# without it recent NumPy versions refuse to build the array.
a = np.array(trend_data.collect(), dtype=object)

for geo, trend in a:
    print('-----------------')
    print('geohash: ' + geo + '. Five years trend: ' + str(trend))
    print('-----------------')

-----------------
geohash: . Five years trend: (0, 0, 0, 0, 0)
-----------------
-----------------
geohash: 9t. Five years trend: (292.3740264019512, 292.0545433190392, 292.7062824710396, 292.40777327929044, 292.17809724301696)
-----------------
-----------------
geohash: b9. Five years trend: (281.94971430927603, 281.8657709841434, 280.8322968986304, 281.53038880701735, 282.33973934571424)
-----------------
-----------------
geohash: 9d. Five years trend: (301.21274428900006, 300.89027061326294, 300.5322136974001, 300.8009366723187, 300.3252629504862)
-----------------
-----------------
geohash: 8g. Five years trend: (297.69041888657705, 296.9503182832287, 296.97685361751866, 297.20653801726144, 297.20918538700556)
-----------------
-----------------
geohash: dz. Five years trend: (283.6980253782013, 283.6951032556055, 282.39626633726533, 282.6079796363637, 282.95916053800755)
-----------------
-----------------
geohash: 9x. Five years trend: (282.14881158584683, 280.75218591312796, 2

In [4]:
#Find the geohashes with increasing temperatures
#We regard a region as a "region with increasing temperatures" when its average temperature in 2019 is greater than its average temperature in 2015

def checkIncreasing(line):
    """Return True when the last of five values is greater than the first.

    ``line`` is indexed at positions 0 and 4 only; the three values in
    between are not inspected (the stricter year-over-year test is kept
    below for reference).

    Returns False when the comparison cannot be made, e.g. ``line`` is too
    short or not indexable.
    """
    try:
        #if line[0] < line[1] and line[1] < line[2] and line[2] < line[3] and line[3] < line[4]:
        return bool(line[0] < line[4])
    except Exception:
        # Best effort, but without trapping SystemExit / KeyboardInterrupt.
        return False
    
# Print the geohash of every row whose trend passes checkIncreasing.
for row in a:
    geo, trend = row[0], row[1]
    if not checkIncreasing(trend):
        continue
    print(geo)

b9
f6
9g
b8
f9
fd
dn
9u
fc


In [3]:
#Now we compute the Pearson correlation coefficients (PCC) for the regions with increasing temperatures

'''
geohashs with increasing temperatures:
b9
f6
9g
b8
f9
fd
dn
9u
fc
'''
import geohash
from datetime import datetime

def parseLine(line):
    """Turn one raw tab-separated record into a keyed feature tuple.

    Returns a 9-tuple ``(key, tem, humidity, wind_speed, cloud_cover,
    precipitable_water, pressure_surface, snow_depth, total_precipitation)``
    where ``key`` is the first two geohash characters, a tab character and
    the record's UTC ``YYYY-MM``.

    Columns read (0-based): 0 = timestamp, 1 = latitude, 2 = longitude,
    4, 6, 8, 9, 10, 12, 13 and 17 = the feature values, as assigned below.

    Returns ``('', 0, 0, 0, 0, 0, 0, 0, 0)`` when any of those columns is
    missing or not numeric. Columns that are not part of the result are not
    parsed at all, so a malformed value in one of them no longer causes an
    otherwise usable record to be discarded.
    """
    variables = line.split("\t")
    try:
        lat = float(variables[1])
        lon = float(variables[2])

        # Feature columns, listed in the order they appear in the result.
        tem = float(variables[10])
        humidity = float(variables[8])
        wind_speed = float(variables[17])
        cloud_cover = float(variables[12])
        precipitable_water_entire_atmosphere_single_layer = float(variables[4])
        pressure_surface = float(variables[6])
        snow_depth_surface = float(variables[9])
        total_precipitation_surface_3_hour_accumulation = float(variables[13])

        # Only the first 10 characters of the timestamp field are used.
        ts = int(variables[0][0: 10])
        # if you encounter a "year is out of range" error the timestamp
        # may be in milliseconds, try `ts /= 1000` in that case
        yearMonth = datetime.utcfromtimestamp(ts).strftime('%Y-%m')

        gh = geohash.encode(lat, lon)
        return (gh[0: 2] + '\t' + yearMonth, tem, humidity, wind_speed, cloud_cover,
                precipitable_water_entire_atmosphere_single_layer, pressure_surface,
                snow_depth_surface, total_precipitation_surface_3_hour_accumulation)
    except Exception:
        # Best effort per record, but without trapping SystemExit /
        # KeyboardInterrupt the way a bare `except:` would.
        return ('', 0, 0, 0, 0, 0, 0, 0, 0)
    
    
def isIncreaseGeo(line, prefixes=('b9', 'f6', '9g', 'b8', 'f9', 'fd', 'dn', '9u', 'fc')):
    """Return True when ``line`` starts with one of the given geohash prefixes.

    Parameters
    ----------
    line : str
        The text to test; only its beginning is inspected.
    prefixes : iterable of str, optional
        Accepted prefixes. Defaults to the nine two-character geohashes that
        were previously hard-coded in this function.

    Returns False for input that is not a string (for example ``None``).
    """
    wanted = tuple(prefixes)  # str.startswith accepts a tuple, not a list
    try:
        return line.startswith(wanted)
    except (AttributeError, TypeError):
        # Not a str (no .startswith, or bytes compared against str prefixes).
        return False
    
# Raw tab-separated sample records, one text line per RDD element.
text_fileCC = sc.textFile("hdfs://orion11:21001/3hr_sample/*")

# Parsed records, restricted to the keys accepted by isIncreaseGeo.
# Each record is (geohash prefix + '\t' + 'YYYY-MM', followed by eight values).
parsed_dataCC = (
    text_fileCC
    .map(lambda raw: parseLine(raw))
    .filter(lambda record: isIncreaseGeo(record[0]))
)

# Per key: add the eight values position by position while counting the
# records, then divide every sum by that count to get the averages.
av_data = (
    parsed_dataCC
    .map(lambda record: (record[0], tuple(record[1:9])))
    .mapValues(lambda features: (features, 1))
    .reduceByKey(lambda left, right: (
        tuple(a + b for a, b in zip(left[0], right[0])),
        left[1] + right[1],
    ))
    .mapValues(lambda acc: tuple(total / acc[1] for total in acc[0]))
)

# (geo-time key, eight per-key averages in parseLine's output order), sorted by key
av_data = av_data.sortBy(lambda pair: pair[0], True)

import numpy as np
# Each collected row is (key, (eight averages)): a string next to a tuple.
# That is a ragged nested sequence, so dtype=object is given explicitly;
# without it recent NumPy versions refuse to build the array.
a = np.array(av_data.collect(), dtype=object)

# One dict per feature, each mapping a two-character geohash to the list of
# that feature's averages. av_data was sorted by key (geohash, then YYYY-MM),
# so every list ends up in time order.

#geo - average_temperature in time order
temperature_array = {}
#geo - average_humidity in time order
humidity_array = {}
#geo - average_wind_speed in time order
wind_speed_array = {}
#geo - average_cloud_cover in time order
cloud_cover_array = {}
#geo - precipitable_water_entire_atmosphere_single_layer in time order
precipitable_water_entire_atmosphere_single_layer_array = {}
#geo - pressure_surface in time order
pressure_surface_array = {}
#geo - snow_depth_surface in time order
snow_depth_surface_array = {}
#geo - total_precipitation_surface_3_hour_accumulation in time order
total_precipitation_surface_3_hour_accumulation_array = {}

# Position k of this tuple receives element k of every row's averages tuple.
feature_arrays = (
    temperature_array,
    humidity_array,
    wind_speed_array,
    cloud_cover_array,
    precipitable_water_entire_atmosphere_single_layer_array,
    pressure_surface_array,
    snow_depth_surface_array,
    total_precipitation_surface_3_hour_accumulation_array,
)

# A single pass over the rows fills all eight dicts.
for i in a:
    if len(i[0]) > 0:
        geo = i[0][0: 2]
        for column, series in enumerate(feature_arrays):
            series.setdefault(geo, []).append(i[1][column])


In [14]:
#Answer Question2: correlation matrix

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt


# For every geohash: assemble its eight feature series into a DataFrame,
# print the Pearson correlation matrix and draw it as an annotated heatmap.
for key in temperature_array.keys():
    data = {'temperature': temperature_array[key],
            'humidity': humidity_array[key],
            'wind': wind_speed_array[key],
            'cloud': cloud_cover_array[key],
            'water': precipitable_water_entire_atmosphere_single_layer_array[key],
            'pressure': pressure_surface_array[key],
            'snow': snow_depth_surface_array[key],
            'precipitation': total_precipitation_surface_3_hour_accumulation_array[key]
            }

    df = pd.DataFrame(data, columns=['temperature','humidity','wind','cloud','water', 'pressure', 'snow', 'precipitation'])

    # 'pearson' is pandas' default; spelled out because the PCC is the point.
    corrMatrix = df.corr(method='pearson')
    print ('--------------------')
    print ('Geohash: ' + key)
    print (' ')
    print (corrMatrix)

    #Get the visualized matrix
    # Draw on an explicit, titled Axes so each figure can be identified on
    # its own, and close the figure afterwards so they do not accumulate.
    fig, ax = plt.subplots()
    sn.heatmap(corrMatrix, annot=True, ax=ax)
    ax.set_title('Geohash: ' + key)
    plt.show()
    plt.close(fig)
    print ('--------------------')

--------------------
Geohash: 9g
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000  0.674599 -0.161981 -0.265946  0.727523 -0.834945   
humidity          0.674599  1.000000 -0.370336  0.386288  0.953021 -0.462680   
wind             -0.161981 -0.370336  1.000000 -0.256084 -0.468118 -0.006234   
cloud            -0.265946  0.386288 -0.256084  1.000000  0.362238  0.255357   
water             0.727523  0.953021 -0.468118  0.362238  1.000000 -0.550121   
pressure         -0.834945 -0.462680 -0.006234  0.255357 -0.550121  1.000000   
snow             -0.285235 -0.092215  0.001559  0.216369 -0.109911  0.163431   
precipitation     0.406953  0.830720 -0.392452  0.587567  0.856538 -0.347455   

                   snow  precipitation  
temperature   -0.285235       0.406953  
humidity      -0.092215       0.830720  
wind           0.001559      -0.392452  
cloud          0.216369       0.587567  
water         -0.109911       0.856538 

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: 9u
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000  0.655561 -0.566284 -0.825942  0.919266 -0.721430   
humidity          0.655561  1.000000 -0.535912 -0.326373  0.829690 -0.229260   
wind             -0.566284 -0.535912  1.000000  0.582678 -0.610165  0.046104   
cloud            -0.825942 -0.326373  0.582678  1.000000 -0.629278  0.549334   
water             0.919266  0.829690 -0.610165 -0.629278  1.000000 -0.609070   
pressure         -0.721430 -0.229260  0.046104  0.549334 -0.609070  1.000000   
snow             -0.274563 -0.150930 -0.102134  0.211513 -0.208536  0.360022   
precipitation     0.356355  0.520325 -0.321806 -0.122385  0.578143 -0.296135   

                   snow  precipitation  
temperature   -0.274563       0.356355  
humidity      -0.150930       0.520325  
wind          -0.102134      -0.321806  
cloud          0.211513      -0.122385  
water         -0.2

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: b8
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000 -0.838987 -0.271854  0.157902  0.820941  0.299612   
humidity         -0.838987  1.000000  0.540365 -0.096683 -0.834809 -0.641970   
wind             -0.271854  0.540365  1.000000 -0.236586 -0.528897 -0.630052   
cloud             0.157902 -0.096683 -0.236586  1.000000  0.325860  0.144203   
water             0.820941 -0.834809 -0.528897  0.325860  1.000000  0.549459   
pressure          0.299612 -0.641970 -0.630052  0.144203  0.549459  1.000000   
snow                   NaN       NaN       NaN       NaN       NaN       NaN   
precipitation     0.315164 -0.040718  0.420911  0.187552  0.233953 -0.421804   

               snow  precipitation  
temperature     NaN       0.315164  
humidity        NaN      -0.040718  
wind            NaN       0.420911  
cloud           NaN       0.187552  
water           NaN       0.233953  
p

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: b9
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000 -0.803855 -0.271809  0.416043  0.851638  0.279872   
humidity         -0.803855  1.000000  0.401974 -0.170093 -0.767845 -0.597080   
wind             -0.271809  0.401974  1.000000 -0.197238 -0.465019 -0.558361   
cloud             0.416043 -0.170093 -0.197238  1.000000  0.565678  0.018257   
water             0.851638 -0.767845 -0.465019  0.565678  1.000000  0.500794   
pressure          0.279872 -0.597080 -0.558361  0.018257  0.500794  1.000000   
snow                   NaN       NaN       NaN       NaN       NaN       NaN   
precipitation     0.171967  0.181300  0.609593  0.246732  0.043912 -0.509464   

               snow  precipitation  
temperature     NaN       0.171967  
humidity        NaN       0.181300  
wind            NaN       0.609593  
cloud           NaN       0.246732  
water           NaN       0.043912  
p

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: dn
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000 -0.688499 -0.844617 -0.570840  0.959633 -0.652638   
humidity         -0.688499  1.000000  0.702450  0.780498 -0.579844  0.316028   
wind             -0.844617  0.702450  1.000000  0.624563 -0.881766  0.338685   
cloud            -0.570840  0.780498  0.624563  1.000000 -0.444544  0.279931   
water             0.959633 -0.579844 -0.881766 -0.444544  1.000000 -0.514274   
pressure         -0.652638  0.316028  0.338685  0.279931 -0.514274  1.000000   
snow             -0.715470  0.537608  0.516860  0.299412 -0.606296  0.654058   
precipitation     0.093135  0.372599  0.139018  0.639190  0.220496 -0.138439   

                   snow  precipitation  
temperature   -0.715470       0.093135  
humidity       0.537608       0.372599  
wind           0.516860       0.139018  
cloud          0.299412       0.639190  
water         -0.6

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: f6
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000 -0.276782  0.177585  0.696785  0.934133 -0.175326   
humidity         -0.276782  1.000000  0.148034  0.314745 -0.547242  0.210494   
wind              0.177585  0.148034  1.000000  0.366781  0.153408 -0.593045   
cloud             0.696785  0.314745  0.366781  1.000000  0.513987 -0.155771   
water             0.934133 -0.547242  0.153408  0.513987  1.000000 -0.247280   
pressure         -0.175326  0.210494 -0.593045 -0.155771 -0.247280  1.000000   
snow             -0.946995  0.251385 -0.267258 -0.717693 -0.882468  0.258397   
precipitation     0.744952 -0.173629  0.639757  0.620579  0.760966 -0.486045   

                   snow  precipitation  
temperature   -0.946995       0.744952  
humidity       0.251385      -0.173629  
wind          -0.267258       0.639757  
cloud         -0.717693       0.620579  
water         -0.8

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: f9
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000 -0.356012 -0.709369  0.428749  0.943913  0.454953   
humidity         -0.356012  1.000000  0.178969  0.530513 -0.498677 -0.202868   
wind             -0.709369  0.178969  1.000000 -0.303825 -0.710282 -0.589021   
cloud             0.428749  0.530513 -0.303825  1.000000  0.278506  0.199559   
water             0.943913 -0.498677 -0.710282  0.278506  1.000000  0.325894   
pressure          0.454953 -0.202868 -0.589021  0.199559  0.325894  1.000000   
snow             -0.939329  0.220891  0.570788 -0.507378 -0.821095 -0.415243   
precipitation     0.588229 -0.194101 -0.205083  0.416327  0.634145 -0.137585   

                   snow  precipitation  
temperature   -0.939329       0.588229  
humidity       0.220891      -0.194101  
wind           0.570788      -0.205083  
cloud         -0.507378       0.416327  
water         -0.8

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: fc
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000 -0.733868 -0.589044  0.044323  0.845200  0.318549   
humidity         -0.733868  1.000000  0.553261  0.364343 -0.784302 -0.273268   
wind             -0.589044  0.553261  1.000000 -0.121268 -0.825196 -0.597154   
cloud             0.044323  0.364343 -0.121268  1.000000  0.093666 -0.207068   
water             0.845200 -0.784302 -0.825196  0.093666  1.000000  0.397079   
pressure          0.318549 -0.273268 -0.597154 -0.207068  0.397079  1.000000   
snow             -0.882543  0.507031  0.368187 -0.243035 -0.685621 -0.056246   
precipitation     0.134380  0.020767  0.217158  0.322524  0.093805 -0.339642   

                   snow  precipitation  
temperature   -0.882543       0.134380  
humidity       0.507031       0.020767  
wind           0.368187       0.217158  
cloud         -0.243035       0.322524  
water         -0.6

<Figure size 640x480 with 2 Axes>

--------------------
--------------------
Geohash: fd
 
               temperature  humidity      wind     cloud     water  pressure  \
temperature       1.000000 -0.285120 -0.456755  0.605955  0.902736  0.276748   
humidity         -0.285120  1.000000  0.268457  0.191846 -0.564170  0.125936   
wind             -0.456755  0.268457  1.000000 -0.193343 -0.575221 -0.634834   
cloud             0.605955  0.191846 -0.193343  1.000000  0.501793  0.117815   
water             0.902736 -0.564170 -0.575221  0.501793  1.000000  0.224405   
pressure          0.276748  0.125936 -0.634834  0.117815  0.224405  1.000000   
snow             -0.967148  0.291414  0.397621 -0.602895 -0.872831 -0.220411   
precipitation     0.503763 -0.241640  0.282436  0.480821  0.507535 -0.252668   

                   snow  precipitation  
temperature   -0.967148       0.503763  
humidity       0.291414      -0.241640  
wind           0.397621       0.282436  
cloud         -0.602895       0.480821  
water         -0.8

<Figure size 640x480 with 2 Axes>

--------------------
