In [1]:
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import sklearn, os

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
dataset_names = {'pCO2': 'pCO2_2D_mon_CESM001_1x1_198201-201701.nc',
                 'XCO2': 'XCO2_1D_mon_CESM001_native_198201-201701.nc',
                 'SST': 'SST_2D_mon_CESM001_1x1_198201-201701.nc',
                 'SSS': 'SSS_2D_mon_CESM001_1x1_198201-201701.nc',
                 'MLD': 'MLD_2D_mon_CESM001_1x1_198201-201701.nc',
                 'Chl': 'Chl_2D_mon_CESM001_1x1_198201-201701.nc'}

ds = {}
for dataset in dataset_names.keys():
    filename = os.path.join(dataset_names[dataset])
    ds[dataset] = xr.open_dataset(filename)

In [3]:
merged_dataset = xr.merge([ds[name][name] for name in ds.keys()])
df = merged_dataset.to_dataframe().reset_index()

df['A'], df['B'], df['C'] = np.sin(df['ylat']), np.sin(df['xlon'])*np.cos(df['ylat']), -np.cos(df['xlon'])*np.cos(df['ylat'])

df.dropna(subset=['pCO2', 'XCO2', 'SST', 'SSS', 'MLD', 'Chl'], inplace=True)
shift_param = int(df.shape[0]/df['time'].nunique())

df.drop(columns=['xlon', 'ylat', 'time', 'TLONG', 'TLAT'], inplace=True)

print('We shift the dataset by %i rows to incorporate the sequential nature of the data.' %shift_param)
print('The dataset contains %i years worth of data.' %((df.shape[0]/shift_param)/12))

We shift the dataset by 41070 rows to incorporate the sequential nature of the data.
The dataset contains 35 years worth of data.


In [4]:
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(df)
scaled_df = pd.DataFrame(df)

# shift the time series down by one timestep
scaled_df.columns = df.columns
scaled_df['pCO2(t)'] = scaled_df['pCO2']
scaled_df = pd.concat([scaled_df[scaled_df.columns[:-1]].shift(-shift_param), scaled_df['pCO2(t)']], axis=1)

# remove null values generated by shifting the time series
scaled_df.dropna(how='any', inplace=True)

In [35]:
values = scaled_df.values
n_train_months = shift_param * 12 * 30 # training on 30 years and evaluating on five years

train = values[:n_train_months, :]
test = values[n_train_months:, :]

train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

In [36]:
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

(14785200, 1, 9) (14785200,) (2464200, 1, 9) (2464200,)


In [37]:
model = Sequential()
model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(521, activation='relu'))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam',  metrics=['acc'])

history = model.fit(train_X, train_y, epochs=10, batch_size=64, validation_data=(test_X, test_y), shuffle=False)

Train on 14785200 samples, validate on 2464200 samples
Epoch 1/10
   30336/14785200 [..............................] - ETA: 32:10 - loss: 73.7911 - acc: 0.0000e+00

KeyboardInterrupt: 

In [None]:
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(loss) + 1)

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

The cells below widen the dataframe. This is a test approach for the LSTM input.

In [4]:
df['ylat'] = df['ylat'].astype(str)
df['xlon'] = df['xlon'].astype(str)

df['xlon_ylat'] = df['xlon'] + '_' + df['ylat']

df_test = df.loc[df['time'] == df['time'].unique()[0]]
df_test.drop(columns=['xlon', 'ylat'], inplace=True)

dummies_df = pd.get_dummies(df_test)

ylat_columns = [columns for columns in dummies_df.columns if 'ylat' in columns]
driver_columns = [columns for columns in df_test.columns if columns not in ['xlon_ylat']]
expanded_columns = [driver + ':' + ylat for driver in driver_columns for ylat in ylat_columns]

dataframes = []

for ind, row in dummies_df.iterrows():
    temp_columns, X = [], [] 
    for var in expanded_columns:
        driver, pos = var.split(':')[0], var.split(':')[1]
        
        if row[driver] * row[pos] != 0:
            temp_columns.append(var)
            X.append(row[driver] * row[pos])
        
    dataframes.append(pd.DataFrame([X], columns=temp_columns))
    print(pd.DataFrame([X], columns=temp_columns))

result = pd.concat(dataframes, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
