# D - *Forstbotanischer Garten* Stations

In [1]:
# To have interactive plots
%matplotlib notebook 

# To have static plots
# %matplotlib inline

import numpy
import pandas
from matplotlib import pyplot

from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.axes_grid1 import make_axes_locatable

numpy.set_printoptions( precision=4, suppress=True )

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [3]:
df = pandas.read_csv( '../data/foboga.csv', parse_dates=['Timestamp'] )
df.set_index( ['Timestamp'], inplace=True, drop=True )

In [4]:
datacols = [ 'Temperature (2m)', 'Humidity (2m)', 'PAR Radiation (2m)',
             'Temperature (-0.25m)', 'Soil moisture (-0.25m)',
             'Temperature (-0.55m)', 'Soil moisture (-0.55m)' ]

In [5]:
data = df[ datacols ].values

In [6]:
scaler = StandardScaler()
scaler.fit( data )
# print( scaler.mean_, scaler.var_ )
scaled_data = scaler.transform( data )

In [7]:
pca = PCA()
pca.fit( scaled_data ) # The function expects variables to be rows!

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [8]:
pca.components_

array([[ 0.4812, -0.3313,  0.2164,  0.467 , -0.3348,  0.4573, -0.2695],
       [ 0.119 , -0.1985,  0.5957,  0.0662,  0.5126,  0.0885,  0.5626],
       [-0.1404, -0.5458,  0.4492, -0.3928, -0.2088, -0.4109, -0.3374],
       [-0.1723,  0.5725,  0.5161,  0.0659,  0.2324,  0.1317, -0.5482],
       [ 0.1272, -0.3639, -0.359 ,  0.0147,  0.7253,  0.0278, -0.4421],
       [-0.8264, -0.3045, -0.0294,  0.2682, -0.0252,  0.3875,  0.0288],
       [-0.0765,  0.0037,  0.0205,  0.7394,  0.0287, -0.6679, -0.0122]])

In [9]:
pca.explained_variance_

array([3.752 , 1.2049, 1.0507, 0.5197, 0.3591, 0.1059, 0.008 ])

In [10]:
pca.explained_variance_ratio_

array([0.536 , 0.1721, 0.1501, 0.0742, 0.0513, 0.0151, 0.0011])

In [11]:
pca.explained_variance_ratio_.cumsum()

array([0.536 , 0.7081, 0.8582, 0.9324, 0.9837, 0.9989, 1.    ])

In [12]:
fig, ax = pyplot.subplots( nrows=1, ncols=1, figsize=(5,3) )

ax.bar( list( range( 7 )), pca.explained_variance_ratio_, color='black' )
ax.plot( list( range( 7 )), pca.explained_variance_ratio_.cumsum(), '-', marker='o', color='red' )
ax.hlines( 0.95, -1, 8, linestyle='--', color='black' )
ax.hlines( 0.99, -1, 8, linestyle='--', color='black' )

axR = ax.twinx()
axR.yaxis.tick_right()
axR.plot( list( range( 7 )), pca.explained_variance_, '-', marker='d', color='blue' )
axR.set_ylim( [0,5] )

ax.set_xticks( list( range( 7 )) )
ax.set_xticklabels( ['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5', 'PC 6', 'PC 7'] )

ax.set_ylabel( 'Explained variance (ratio)' )
axR.set_ylabel( 'Explained variance' )

ax.set_xlim( [-0.5, 6.5] )
pyplot.title( 'Station 4 - PCA' )
#plt.savefig( './scree/Station ' + str(station_n) + '_scree.png' )

pyplot.show()

<IPython.core.display.Javascript object>

In [13]:
projected_data = pca.transform( data )
print( projected_data.shape )

(26496, 7)


In [14]:
df_transformed = pandas.DataFrame( projected_data, columns=['PC 1', 'PC 2', 'PC 3', 'PC 4', 'PC 5', 'PC 6', 'PC 7'] )
df_transformed.shape

(26496, 7)

In [15]:
df_transformed[ 'Station' ] = df[ 'Station' ].values
df_transformed.index = df.index
df_transformed.shape

(26496, 8)

In [16]:
df_transformed.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,Station
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-07-01 00:00:00,-13.167484,-1.079219,-63.704756,46.489155,-27.507194,-30.181186,1.839435,1
2010-07-01 01:00:00,-15.333555,-2.178461,-66.316037,49.632752,-29.517007,-30.895358,1.923803,1
2010-07-01 02:00:00,-16.054603,-2.62622,-67.111216,50.43033,-30.034713,-30.971387,1.936542,1
2010-07-01 03:00:00,-16.704746,-3.051416,-68.133298,51.423594,-30.663841,-31.479742,1.923519,1
2010-07-01 04:00:00,-16.323881,-1.421455,-67.235854,53.451783,-32.044951,-31.826691,1.959292,1


In [17]:
df_transformed.tail()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,PC 4,PC 5,PC 6,PC 7,Station
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-12-31 19:00:00,-40.726943,-4.124235,-68.1389,58.287417,-32.510238,-29.504606,0.547082,6
2010-12-31 20:00:00,-40.727641,-4.130589,-68.123245,58.257641,-32.491046,-29.475312,0.548056,6
2010-12-31 21:00:00,-40.713836,-4.140884,-68.142181,58.254714,-32.4969,-29.513911,0.544577,6
2010-12-31 22:00:00,-40.575612,-4.066002,-68.091631,58.177201,-32.43755,-29.659257,0.535882,6
2010-12-31 23:00:00,-40.322944,-3.956462,-67.795271,57.836747,-32.228592,-29.587694,0.524185,6


## Using PCA to filter data

In [18]:
datacols = [ 'Temperature (2m)', 'Humidity (2m)', 'PAR Radiation (2m)',
             'Temperature (-0.25m)', 'Soil moisture (-0.25m)',
             'Temperature (-0.55m)', 'Soil moisture (-0.55m)' ]

In [19]:
data = df[ datacols ].values

In [20]:
scaler = StandardScaler()
scaler.fit( data )
# print( scaler.mean_, scaler.var_ )
scaled_data = scaler.transform( data )

pca = PCA( n_components=3 )
pca.fit( scaled_data ) # The function expects variables to be rows!

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [21]:
pca.components_

array([[ 0.4812, -0.3313,  0.2164,  0.467 , -0.3348,  0.4573, -0.2695],
       [ 0.119 , -0.1985,  0.5957,  0.0662,  0.5126,  0.0885,  0.5626],
       [-0.1404, -0.5458,  0.4492, -0.3928, -0.2088, -0.4109, -0.3374]])

In [22]:
pca.explained_variance_

array([3.752 , 1.2049, 1.0507])

In [23]:
pca.explained_variance_ratio_

array([0.536 , 0.1721, 0.1501])

In [24]:
pca.explained_variance_ratio_.cumsum()

array([0.536 , 0.7081, 0.8582])

In [25]:
projected_data = pca.transform( scaled_data )

filtered_data = pca.inverse_transform( projected_data )

print( 'Shape of the filtered (reconstructed) data: ', filtered_data.shape )

Shape of the filtered (reconstructed) data:  (26496, 7)


In [26]:
filtered_data = scaler.inverse_transform( filtered_data )

In [27]:
df_filtered = pandas.DataFrame( filtered_data, columns=df.columns[:7] )
df_filtered[ 'Station' ] = df[ 'Station' ].values
df_filtered.index = df.index
df_filtered.head()

Unnamed: 0_level_0,Temperature (2m),Humidity (2m),PAR Radiation (2m),Temperature (-0.25m),Soil moisture (-0.25m),Temperature (-0.55m),Soil moisture (-0.55m),Station
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-07-01 00:00:00,14.928891,86.296414,13.073067,14.290897,10.921906,13.540981,8.249136,1
2010-07-01 01:00:00,14.317739,88.744429,-9.963268,14.219694,11.129665,13.49271,8.464096,1
2010-07-01 02:00:00,14.103386,89.459382,-16.79963,14.176729,11.189283,13.459807,8.521683,1
2010-07-01 03:00:00,13.963432,90.280717,-24.478155,14.193107,11.254583,13.477514,8.597044,1
2010-07-01 04:00:00,13.934931,90.514607,-26.105711,14.204685,11.296173,13.489332,8.642297,1


In [28]:
fig, ax = pyplot.subplots( figsize=(6,4) )

station_n = 4
col = 'PAR Radiation (2m)'
#col = 'Temperature (2m)'

dfq = df[ df['Station']==station_n ]
ax.plot( dfq.index, dfq[col], color='blue', alpha=0.5 )

dfq = df_filtered[ df_filtered['Station']==station_n ]
ax.plot( dfq.index, dfq[col], color='red', alpha=0.5 )

pyplot.show()

<IPython.core.display.Javascript object>


To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()


In [30]:
fig, ax = pyplot.subplots( figsize=(5,5) )

col = 'PAR Radiation (2m)'
#col = 'Temperature (2m)'

ax.plot( [-1000, 2000], [-1000, 2000], color='black', zorder=-1 )
ax.scatter( df[col], df_filtered[col], marker='o', alpha=0.3 )
ax.set_xlim( [-200, 2000] )
ax.set_ylim( [-200, 2000] )

pyplot.show()

<IPython.core.display.Javascript object>

## Using PCA to look for similarities

In [31]:
datacols = [ 'Temperature (2m)', 'Humidity (2m)', 'PAR Radiation (2m)',
             'Temperature (-0.25m)', 'Soil moisture (-0.25m)',
             'Temperature (-0.55m)', 'Soil moisture (-0.55m)' ]

In [32]:
data = df[ datacols ].values

In [33]:
scaler = StandardScaler()
scaler.fit( data )
# print( scaler.mean_, scaler.var_ )
scaled_data = scaler.transform( data )

pca = PCA( n_components=3 )
pca.fit( scaled_data ) # The function expects variables to be rows!

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [34]:
pca.components_

array([[ 0.4812, -0.3313,  0.2164,  0.467 , -0.3348,  0.4573, -0.2695],
       [ 0.119 , -0.1985,  0.5957,  0.0662,  0.5126,  0.0885,  0.5626],
       [-0.1404, -0.5458,  0.4492, -0.3928, -0.2088, -0.4109, -0.3374]])

In [35]:
pca.explained_variance_

array([3.752 , 1.2049, 1.0507])

In [36]:
pca.explained_variance_ratio_

array([0.536 , 0.1721, 0.1501])

In [37]:
pca.explained_variance_ratio_.cumsum()

array([0.536 , 0.7081, 0.8582])

In [38]:
projected_data = pca.transform( scaled_data )

print( 'Shape of the filtered (reconstructed) data: ', projected_data.shape )

Shape of the filtered (reconstructed) data:  (26496, 3)


In [40]:
df_transformed = pandas.DataFrame( projected_data, columns=['PC 1', 'PC 2', 'PC 3' ] )
df_transformed.shape

(26496, 3)

In [41]:
df_transformed[ 'Station' ] = df[ 'Station' ].values
df_transformed.index = df.index
df_transformed.shape

(26496, 4)

In [42]:
df_transformed.head()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,Station
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-07-01 00:00:00,1.513727,-0.875933,-0.040877,1
2010-07-01 01:00:00,1.35201,-0.956414,-0.20859,1
2010-07-01 02:00:00,1.298912,-0.982408,-0.253104,1
2010-07-01 03:00:00,1.255977,-1.007134,-0.317075,1
2010-07-01 04:00:00,1.2432,-1.005362,-0.338161,1


In [43]:
df_transformed.tail()

Unnamed: 0_level_0,PC 1,PC 2,PC 3,Station
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-12-31 19:00:00,-2.643064,-0.366966,0.649147,6
2010-12-31 20:00:00,-2.643227,-0.366774,0.650815,6
2010-12-31 21:00:00,-2.640895,-0.367761,0.64996,6
2010-12-31 22:00:00,-2.629806,-0.362592,0.651509,6
2010-12-31 23:00:00,-2.609333,-0.357587,0.671688,6


In [51]:
colors = [ 'red', 'blue', 'green', 'yellow', 'cyan', 'magenta' ]

fig, ax = pyplot.subplots( figsize=(5,5) )

for station_n in range( 1, 7 ): # 6 Stations to be plotted

    dfq = df_transformed[ df_transformed[ 'Station' ]==station_n ]
    
    ax.scatter( dfq['PC 1'], dfq['PC 2'], color=colors[station_n-1], alpha=0.3, label='Station '+str(station_n) )
    
ax.legend()
pyplot.show()

<IPython.core.display.Javascript object>

In [None]:
colors = [ 'red', 'blue', 'green', 'yellow', 'cyan', 'magenta' ]

fig, ax = pyplot.subplots( figsize=(5,5) )

for station_n in range( 1, 7 ): # 6 Stations to be plotted

    dfq = df_transformed[ df_transformed[ 'Station' ]==station_n ]
    
    ax.scatter( dfq['PC 1'], dfq['PC 2'])
    
ax.legend()
pyplot.show()

In [58]:
fig = pyplot.figure()
ax = fig.add_subplot(111, projection='3d')

# for station_n in range( 1, 7 ): # 6 Stations to be plotted
for station_n in [1,2,4]: # Stations to be plotted

    dfq = df_transformed[ df_transformed[ 'Station' ]==station_n ]

    x = dfq[ 'PC 1' ]
    y = dfq[ 'PC 2' ]
    z = dfq[ 'PC 3' ]

    ax.scatter(x, y, z, color=colors[station_n-1], alpha=0.3, label='Station '+str(station_n) )

    ax.set_xlabel( 'PC 1' )
    ax.set_ylabel( 'PC 2' )
    ax.set_zlabel( 'PC 3' )

ax.legend()

pyplot.show()

<IPython.core.display.Javascript object>