In [33]:
import os
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 100

# Load datasets

In [2]:
DATA_DIR = '../data/raw'

# Variable name -> NetCDF file (inside DATA_DIR) that stores that variable.
dataset_names = {
    'pCO2': 'pCO2_2D_mon_CESM001_1x1_198201-201701.nc',
    'XCO2': 'XCO2_1D_mon_CESM001_native_198201-201701.nc',
    'SST': 'SST_2D_mon_CESM001_1x1_198201-201701.nc',
    'SSS': 'SSS_2D_mon_CESM001_1x1_198201-201701.nc',
    'MLD': 'MLD_2D_mon_CESM001_1x1_198201-201701.nc',
    'Chl': 'Chl_2D_mon_CESM001_1x1_198201-201701.nc',
}

# Open every file and keep the resulting xarray Dataset under its variable name.
ds = {}
for dataset, basename in dataset_names.items():
    filename = os.path.join(DATA_DIR, basename)
    ds[dataset] = xr.open_dataset(filename)

# Merge datasets by coordinates (time, longitude, latitude)

In [12]:
# From each opened dataset take the variable that shares its key, then merge
# those variables into a single Dataset.
primary_variables = [source[variable] for variable, source in ds.items()]
merged_dataset = xr.merge(primary_variables)

# Also bring in `socat_mask` (stored in the pCO2 file): it flags whether a grid
# cell is at a SOCAT location, which is used later to split off the test set.
socat_mask = ds['pCO2']['socat_mask']
merged_dataset = xr.merge([merged_dataset, socat_mask])

In [13]:
# Show the merged Dataset summary (dimensions, coordinates, data variables).
merged_dataset

<xarray.Dataset>
Dimensions:     (time: 421, xlon: 360, ylat: 180)
Coordinates:
  * xlon        (xlon) float64 0.5 1.5 2.5 3.5 4.5 ... 356.5 357.5 358.5 359.5
  * ylat        (ylat) float64 -89.5 -88.5 -87.5 -86.5 ... 86.5 87.5 88.5 89.5
  * time        (time) datetime64[ns] 1982-01-16T12:00:00 ... 2017-01-16T12:00:00
    TLONG       float64 ...
    TLAT        float64 ...
Data variables:
    pCO2        (time, ylat, xlon) float64 ...
    XCO2        (time) float32 ...
    SST         (time, ylat, xlon) float32 ...
    SSS         (time, ylat, xlon) float32 ...
    MLD         (time, ylat, xlon) float32 ...
    Chl         (time, ylat, xlon) float32 ...
    socat_mask  (time, ylat, xlon) float64 ...

# Convert to dataframe

In [14]:
# Flatten the gridded Dataset into a table, then turn the index levels
# (time, xlon, ylat) into ordinary columns.
df = merged_dataset.to_dataframe()
df = df.reset_index()

In [18]:
# Report (rows, columns), then preview five rows selected by position.
print(df.shape)
df.iloc[200:205]

(27280800, 12)


Unnamed: 0,time,xlon,ylat,pCO2,TLONG,TLAT,XCO2,SST,SSS,MLD,Chl,socat_mask
200,1982-01-16 12:00:00,1.5,-69.5,256.084833,73.062502,-26.035913,340.848541,-1.538383,33.624409,12.393412,1.066948,0.0
201,1982-01-16 12:00:00,1.5,-68.5,261.690707,73.062502,-26.035913,340.848541,-1.621055,33.651684,19.549749,0.839482,0.0
202,1982-01-16 12:00:00,1.5,-67.5,261.93586,73.062502,-26.035913,340.848541,-1.59843,33.552433,18.862717,0.584478,0.0
203,1982-01-16 12:00:00,1.5,-66.5,263.325143,73.062502,-26.035913,340.848541,-1.462761,33.398407,19.353331,0.383653,0.0
204,1982-01-16 12:00:00,1.5,-65.5,267.600604,73.062502,-26.035913,340.848541,-1.137036,33.302494,20.290634,0.342793,0.0


# Drop unused columns (`TLONG`, `TLAT`)

In [27]:
# TLONG and TLAT are scalar coordinates of the merged dataset (they have no
# dimensions), so they are not used as features.
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so this cell can be re-run without raising KeyError.
df = df.drop(columns=['TLONG', 'TLAT'], errors='ignore')
df.iloc[200:205]

Unnamed: 0,time,xlon,ylat,pCO2,XCO2,SST,SSS,MLD,Chl,socat_mask
200,1982-01-16 12:00:00,1.5,-69.5,256.084833,340.848541,-1.538383,33.624409,12.393412,1.066948,0.0
201,1982-01-16 12:00:00,1.5,-68.5,261.690707,340.848541,-1.621055,33.651684,19.549749,0.839482,0.0
202,1982-01-16 12:00:00,1.5,-67.5,261.93586,340.848541,-1.59843,33.552433,18.862717,0.584478,0.0
203,1982-01-16 12:00:00,1.5,-66.5,263.325143,340.848541,-1.462761,33.398407,19.353331,0.383653,0.0
204,1982-01-16 12:00:00,1.5,-65.5,267.600604,340.848541,-1.137036,33.302494,20.290634,0.342793,0.0


# Drop rows with a null value in any of the columns `pCO2`, `XCO2`, `SST`, `SSS`, `MLD`, or `Chl`

In [31]:
# Columns that must all be present for a row to be kept.
required_columns = ['pCO2', 'XCO2', 'SST', 'SSS', 'MLD', 'Chl']
df.dropna(subset=required_columns, inplace=True)

# Shape after removing incomplete rows.
df.shape

(17290470, 10)

# Define train and test datasets (train: `socat_mask == 1`, test: `socat_mask == 0`)

In [32]:
features = ['XCO2', 'SST', 'SSS', 'MLD', 'Chl']
target = 'pCO2'

# Evaluate each mask once and pick rows and columns in a single .loc call,
# rather than filtering the whole frame four times before selecting columns.
is_socat = df['socat_mask'] == 1
is_not_socat = df['socat_mask'] == 0

# Rows at SOCAT locations form the training set; all other rows are the test set.
X_train = df.loc[is_socat, features]
y_train = df.loc[is_socat, target]
X_test = df.loc[is_not_socat, features]
y_test = df.loc[is_not_socat, target]

# A row whose socat_mask is neither 0 nor 1 (e.g. NaN) would silently fall out
# of both splits, so fail loudly if that ever happens.
assert len(X_train) + len(X_test) == len(df), "socat_mask has values other than 0 and 1"

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(245577, 5) (245577,)
(17044893, 5) (17044893,)


# Train and evaluate the model

In [34]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so the test set does not influence the scaling.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [36]:
# Fit a linear regression of pCO2 on the scaled training features.
reg = LinearRegression()
reg.fit(X_train_scaled, y_train)

In [37]:
# Score the fitted model on the test split (for LinearRegression, `score`
# returns the coefficient of determination, R^2).
r2_test = reg.score(X_test_scaled, y_test)
r2_test

0.34890652906572994