In [2]:
import os
import numpy as np
import xarray as xr
import pandas as pd
import datetime as dt
import tensorflow as tf

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

pd.set_option("display.max_columns", 100)

# Load Datasets and Convert to DataFrame

In [3]:
DATA_DIR = '../data/raw'
dataset_names = {'pCO2': 'pCO2_2D_mon_CESM001_1x1_198201-201701.nc',
                 'XCO2': 'XCO2_1D_mon_CESM001_native_198201-201701.nc',
                 'SST': 'SST_2D_mon_CESM001_1x1_198201-201701.nc',
                 'SSS': 'SSS_2D_mon_CESM001_1x1_198201-201701.nc',
                 'MLD': 'MLD_2D_mon_CESM001_1x1_198201-201701.nc',
                 'Chl': 'Chl_2D_mon_CESM001_1x1_198201-201701.nc'}
ds = {}
for dataset in dataset_names.keys():
    filename = os.path.join(DATA_DIR, dataset_names[dataset])
    ds[dataset] = xr.open_dataset(filename)

In [4]:
merged_dataset = xr.merge([ds[name][name] for name in ds.keys()])

# include a variable which tells if a grid is in socat location
# so that we can split a test set during the model training
merged_dataset = xr.merge([merged_dataset, ds['pCO2']['socat_mask']])

In [5]:
df = merged_dataset.to_dataframe().reset_index()

In [8]:
df.dropna(subset=['pCO2', 'XCO2', 'SST', 'SSS', 'MLD', 'Chl', 'time', 'xlon', 'ylat'], inplace=True)

In [9]:
print(df.shape)
df[200:205]

(17290470, 12)


Unnamed: 0,time,xlon,ylat,pCO2,TLONG,TLAT,XCO2,SST,SSS,MLD,Chl,socat_mask
327,1982-01-16 12:00:00,1.5,57.5,315.279871,73.062502,-26.035913,340.848541,7.094026,34.374374,50.79306,0.175069,0.0
328,1982-01-16 12:00:00,1.5,58.5,314.729753,73.062502,-26.035913,340.848541,7.197004,34.70377,54.556305,0.142559,0.0
329,1982-01-16 12:00:00,1.5,59.5,320.074456,73.062502,-26.035913,340.848541,7.542988,35.141186,72.361664,0.089886,0.0
330,1982-01-16 12:00:00,1.5,60.5,326.017223,73.062502,-26.035913,340.848541,7.77528,35.282021,77.93116,0.055659,0.0
331,1982-01-16 12:00:00,1.5,61.5,334.071207,73.062502,-26.035913,340.848541,7.967356,35.379269,88.206825,0.033811,0.0


# Add Temporal Data

In [10]:
df['t0'] = np.cos(df.time.dt.dayofyear * 2 * np.pi / 365)
df['t1'] = np.sin(df.time.dt.dayofyear * 2 * np.pi / 365)

In [11]:
df[200:205]

Unnamed: 0,time,xlon,ylat,pCO2,TLONG,TLAT,XCO2,SST,SSS,MLD,Chl,socat_mask,t0,t1
327,1982-01-16 12:00:00,1.5,57.5,315.279871,73.062502,-26.035913,340.848541,7.094026,34.374374,50.79306,0.175069,0.0,0.962309,0.271958
328,1982-01-16 12:00:00,1.5,58.5,314.729753,73.062502,-26.035913,340.848541,7.197004,34.70377,54.556305,0.142559,0.0,0.962309,0.271958
329,1982-01-16 12:00:00,1.5,59.5,320.074456,73.062502,-26.035913,340.848541,7.542988,35.141186,72.361664,0.089886,0.0,0.962309,0.271958
330,1982-01-16 12:00:00,1.5,60.5,326.017223,73.062502,-26.035913,340.848541,7.77528,35.282021,77.93116,0.055659,0.0,0.962309,0.271958
331,1982-01-16 12:00:00,1.5,61.5,334.071207,73.062502,-26.035913,340.848541,7.967356,35.379269,88.206825,0.033811,0.0,0.962309,0.271958


# Add Spatial Data

In [12]:
df['s0'] = np.sin(df.ylat)
df['s1'] = np.sin(df.xlon) * np.cos(df.ylat)
df['s2'] = -np.cos(df.xlon) * np.cos(df.ylat)

In [13]:
df[200:205]

Unnamed: 0,time,xlon,ylat,pCO2,TLONG,TLAT,XCO2,SST,SSS,MLD,Chl,socat_mask,t0,t1,s0,s1,s2
327,1982-01-16 12:00:00,1.5,57.5,315.279871,73.062502,-26.035913,340.848541,7.094026,34.374374,50.79306,0.175069,0.0,0.962309,0.271958,0.81419,0.579145,-0.04107
328,1982-01-16 12:00:00,1.5,58.5,314.729753,73.062502,-26.035913,340.848541,7.197004,34.70377,54.556305,0.142559,0.0,0.962309,0.271958,0.928466,-0.370488,0.026273
329,1982-01-16 12:00:00,1.5,59.5,320.074456,73.062502,-26.035913,340.848541,7.542988,35.141186,72.361664,0.089886,0.0,0.962309,0.271958,0.189115,-0.979495,0.069461
330,1982-01-16 12:00:00,1.5,60.5,326.017223,73.062502,-26.035913,340.848541,7.77528,35.282021,77.93116,0.055659,0.0,0.962309,0.271958,-0.724108,-0.687959,0.048787
331,1982-01-16 12:00:00,1.5,61.5,334.071207,73.062502,-26.035913,340.848541,7.967356,35.379269,88.206825,0.033811,0.0,0.962309,0.271958,-0.971589,0.236083,-0.016742


# Define Training and Testing Data

In [23]:
X = df[['XCO2', 'SST', 'SSS', 'MLD', 'Chl', 't0', 't1', 's0', 's1', 's2']]
y = df.pCO2.values

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [25]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [26]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(32, activation=tf.nn.relu, kernel_initializer='glorot_normal'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(16, activation=tf.nn.relu, kernel_initializer='glorot_normal'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, kernel_initializer='truncated_normal')])

model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mse'])

In [28]:
history = model.fit(X_train_scaled, y_train, epochs=10, batch_size=320)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
history = model.fit(X_train_scaled, y_train, epochs=5, batch_size=320)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [1]:
904.3755 ** .5

30.072836580542248