In [1]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster of six single-threaded worker processes and
# attach a client to it.
cluster = LocalCluster(
    n_workers=6,
    threads_per_worker=1,
    processes=True,
)
client = Client(cluster)



In [2]:
# Directories holding the Parquet shards of the training set.
folders = [
    'train0_25',
    'train25_50',
    'train50_75',
    'train75_100',
]

# One lazy Dask DataFrame per directory ...
dfs = list(map(dd.read_parquet, folders))

# ... stacked into a single Dask DataFrame.
data = dd.concat(dfs)

In [3]:
# Base names of inputs that are stored as 60 columns each (suffix _0 ... _59).
feat60 = ['state_t', 'state_q0001', 'state_q0002', 'state_q0003', 'state_u', 'state_v', 'pbuf_ozone', 'pbuf_CH4', 'pbuf_N2O']
# Inputs that are stored as a single column.
feat1 = ['state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_TAUX', 'pbuf_TAUY', 'pbuf_COSZRS', 'cam_in_ALDIF', 'cam_in_ALDIR', 'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP', 'cam_in_ICEFRAC', 'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHLAND']

# Same split for the targets.
target60 = ['ptend_t', 'ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_u', 'ptend_v']
target1 = ['cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECSC', 'cam_out_PRECC', 'cam_out_SOLS', 'cam_out_SOLL', 'cam_out_SOLSD', 'cam_out_SOLLD']

# Expanded column names, variable-major: name_0 ... name_59 for each base name.
features60 = [f'{base}_{k}' for base in feat60 for k in range(60)]
allF = [*features60, *feat1]

targets60 = [f'{base}_{k}' for base in target60 for k in range(60)]
allT = [*targets60, *target1]

# Target columns excluded from allT2: indices 0-11 of these variables ...
targetsToDrop12 = ['ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_u', 'ptend_v']
# ... plus three extra ptend_q0002 columns.
dropT = ['ptend_q0002_12', 'ptend_q0002_13', 'ptend_q0002_14']  # attention, I think i also need to predict _15
dropT.extend(f'{base}_{k}' for base in targetsToDrop12 for k in range(12))

allT2 = [name for name in allT if name not in dropT]

In [4]:
np.random.seed(42)

# Partition indices in random order; np.random.shuffle permutes the list in place.
orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)

# First 95% of the shuffled partitions are used for training, the rest for testing.
trainSep = int(0.95 * data.npartitions)
valEnd = data.npartitions

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

In [5]:
# Number of base variables in each group (60-column and single-column
# features and targets).
n60Feat, n1dFeat, n60Targ, n1dTarg = (
    len(group) for group in (feat60, feat1, target60, target1)
)

# data processing

In [6]:
import tensorflow as tf

from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Concatenate,BatchNormalization, Reshape
from tensorflow.keras.models import Model

In [7]:
from keras import backend as K

def r2_scoretf(y_true, y_pred):
    """Coefficient of determination (R²) per output, reduced over axis 0.

    Args:
        y_true: Tensor of ground-truth values; axis 0 indexes the samples.
        y_pred: Tensor of predictions with the same shape as ``y_true``.

    Returns:
        Tensor with the shape of one sample, holding one R² value per output.
        For an output whose targets are constant (zero total sum of squares)
        the score is 1.0 when the predictions match exactly and 0.0
        otherwise, instead of inf/NaN.
    """
    sum_squares_residuals = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    # The reference mean must be taken per output (axis=0); a global mean over
    # every element would mix outputs of different magnitude.
    per_output_mean = tf.reduce_mean(y_true, axis=0)
    sum_squares_total = tf.reduce_sum(tf.square(y_true - per_output_mean), axis=0)

    r2 = 1 - tf.math.divide_no_nan(sum_squares_residuals, sum_squares_total)
    # Replacement value for constant targets: 1 for a perfect fit, else 0.
    constant_target_score = tf.where(
        tf.equal(sum_squares_residuals, 0.0),
        tf.ones_like(sum_squares_residuals),
        tf.zeros_like(sum_squares_residuals),
    )
    return tf.where(tf.equal(sum_squares_total, 0.0), constant_target_score, r2)

def r2_scoreTrain(y_true, y_pred):
    """Loss derived from R²: mean over outputs of SS_res / SS_tot (= 1 - R²).

    The value is non-negative and smaller is better.

    Args:
        y_true: Tensor of ground-truth values; axis 0 indexes the samples.
        y_pred: Tensor of predictions with the same shape as ``y_true``.

    Returns:
        Scalar tensor: the ratio of residual to total sum of squares,
        averaged over all outputs. Outputs whose targets are constant in the
        batch contribute 0 instead of inf/NaN.
    """
    sum_squares_residuals = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    # Per-output mean (axis=0); a global mean would mix outputs of different
    # magnitude into one reference value.
    per_output_mean = tf.reduce_mean(y_true, axis=0)
    sum_squares_total = tf.reduce_sum(tf.square(y_true - per_output_mean), axis=0)
    # divide_no_nan returns 0 where the denominator is 0.
    ratio = tf.math.divide_no_nan(sum_squares_residuals, sum_squares_total)
    return tf.reduce_mean(ratio)

class RSquaredMetric(tf.keras.metrics.Metric):
    """Streaming R² (coefficient of determination), averaged over all outputs.

    The metric accumulates, per output, the residual sum of squares and the
    total sum of squares around the mean of *all* samples seen so far. The
    total sum of squares is merged batch by batch with the pairwise update

        M2 <- M2 + M2_batch + delta**2 * n_prev * n_batch / (n_prev + n_batch)

    where ``delta`` is the difference between the batch mean and the running
    mean. Summing per-batch totals taken around each batch's own mean would
    underestimate the total and bias R² downwards.

    Args:
        shape: Shape of one target sample, e.g. ``(60, 6)`` or ``(8,)``.
            A bare int ``n`` is accepted and treated as ``(n,)``.
        name: Metric name.
        **kwargs: Forwarded to ``tf.keras.metrics.Metric``.
    """

    def __init__(self, shape, name='r_squared', **kwargs):
        super().__init__(name=name, **kwargs)
        # Normalise so that RSquaredMetric(8) and RSquaredMetric((8,)) are equivalent.
        shape = (shape,) if isinstance(shape, int) else tuple(shape)
        self.total_sum_squares = self.add_weight(name='total_sum_squares', initializer='zeros', shape=shape)
        self.residual_sum_squares = self.add_weight(name='residual_sum_squares', initializer='zeros', shape=shape)
        self.running_mean = self.add_weight(name='running_mean', initializer='zeros', shape=shape)
        self.num_samples = self.add_weight(name="num_samples", initializer='zeros', dtype=tf.int32)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Fold one batch into the accumulators.

        Args:
            y_true: Ground truth; axis 0 indexes the samples and the remaining
                axes match ``shape``.
            y_pred: Predictions, same shape as ``y_true``.
            sample_weight: Accepted for API compatibility; not used.
        """
        y_true = tf.cast(y_true, self.dtype)
        y_pred = tf.cast(y_pred, self.dtype)

        batch_size = tf.shape(y_true)[0]
        batch_n = tf.cast(batch_size, self.dtype)
        prev_n = tf.cast(self.num_samples, self.dtype)
        # Share of this batch in all samples seen so far (0 for an empty batch).
        batch_share = tf.math.divide_no_nan(batch_n, prev_n + batch_n)

        batch_mean = tf.math.divide_no_nan(tf.reduce_sum(y_true, axis=0), batch_n)
        batch_sum_squares_total = tf.reduce_sum(tf.square(y_true - batch_mean), axis=0)
        batch_sum_squares_residuals = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)

        # Shift between the batch mean and the running mean of earlier batches.
        delta = batch_mean - tf.convert_to_tensor(self.running_mean)

        self.total_sum_squares.assign_add(
            batch_sum_squares_total + tf.square(delta) * prev_n * batch_share
        )
        self.running_mean.assign_add(delta * batch_share)
        self.residual_sum_squares.assign_add(batch_sum_squares_residuals)
        self.num_samples.assign_add(batch_size)

    def result(self):
        """Return the mean R² over all outputs as a scalar tensor.

        An output with zero total sum of squares (constant targets) scores
        1.0 when its residual sum of squares is also zero and 0.0 otherwise.
        """
        residual = tf.convert_to_tensor(self.residual_sum_squares)
        total = tf.convert_to_tensor(self.total_sum_squares)

        r_squared = 1 - tf.math.divide_no_nan(residual, total)
        constant_target_score = tf.where(
            tf.equal(residual, 0.0), tf.ones_like(residual), tf.zeros_like(residual)
        )
        r_squared = tf.where(tf.equal(total, 0.0), constant_target_score, r_squared)
        return tf.reduce_mean(r_squared)

    def reset_state(self):
        """Zero every accumulator, including the running mean and the sample count."""
        self.total_sum_squares.assign(tf.zeros_like(self.total_sum_squares))
        self.residual_sum_squares.assign(tf.zeros_like(self.residual_sum_squares))
        self.running_mean.assign(tf.zeros_like(self.running_mean))
        self.num_samples.assign(tf.zeros_like(self.num_samples))

In [24]:
# Manual check of the metric on the 2d validation targets.
# NOTE: y2d_val and y2d_pred are created by cells further down the notebook,
# so this cell cannot run on a fresh kernel before those cells have run.
m = RSquaredMetric(shape=(60, n60Targ))
m.update_state(y_true=y2d_val, y_pred=y2d_pred)
m.result()

<tf.Tensor: shape=(), dtype=float32, numpy=-2.0725546e+28>

In [None]:
# Sanity check: scoring the targets against themselves leaves every residual
# at zero, so the expected result is 1.0.
# The weight shape is passed as a tuple; a bare int is not a valid shape for
# every Keras version.
# NOTE: y1d_val is created by a cell further down the notebook.
m = RSquaredMetric((n1dTarg,))
m.update_state(y1d_val, y1d_val)
m.result()

In [8]:
def getTensorData(data, partPerLoop, startPartIdx,sampledPartIdx):
    """Load consecutive sampled partitions and stack them into model arrays.

    Reads the module-level names ``features60``, ``targets60``, ``feat1``,
    ``target1``, ``n60Feat`` and ``n60Targ``.

    Args:
        data: Dask DataFrame; each partition is assumed to compute to a pandas
            DataFrame containing all feature and target columns.
        partPerLoop: Number of partitions to load.
        startPartIdx: Position in ``sampledPartIdx`` of the first partition.
        sampledPartIdx: Sequence of partition numbers of ``data``.

    Returns:
        Tuple ``(X1d, X2d, y1d, y2d)`` of NumPy arrays with shapes
        ``(rows, len(feat1))``, ``(rows, 60, n60Feat)``,
        ``(rows, len(target1))`` and ``(rows, 60, n60Targ)``.
        If ``partPerLoop`` is 0 all four entries are ``None``.

    Raises:
        IndexError: If ``sampledPartIdx`` has fewer than
            ``startPartIdx + partPerLoop`` entries. This is checked before
            any partition is loaded.
    """
    endPartIdx = startPartIdx + partPerLoop
    if endPartIdx > len(sampledPartIdx):
        raise IndexError(
            f"requested partitions {startPartIdx}..{endPartIdx - 1} but only "
            f"{len(sampledPartIdx)} sampled partitions are available"
        )

    # Collect the per-partition pieces and concatenate once at the end;
    # concatenating inside the loop re-copies everything loaded so far.
    X1dParts, X2dParts, y1dParts, y2dParts = [], [], [], []
    for j in range(partPerLoop):
        part = data.get_partition(int(sampledPartIdx[startPartIdx + j])).compute()
        nRows = part.shape[0]

        # Columns are ordered variable-major (var_0 ... var_59 per variable):
        # reshape to (rows, variable, 60), then swap to (rows, 60, variable).
        feat2d = part[features60].to_numpy().reshape(nRows, n60Feat, 60)
        X2dParts.append(np.transpose(feat2d, (0, 2, 1)))
        targ2d = part[targets60].to_numpy().reshape(nRows, n60Targ, 60)
        y2dParts.append(np.transpose(targ2d, (0, 2, 1)))

        # Always convert to ndarray so the return type does not depend on how
        # many partitions were loaded.
        X1dParts.append(part[feat1].to_numpy())
        y1dParts.append(part[target1].to_numpy())

    if not X1dParts:
        return None, None, None, None

    X1d = np.concatenate(X1dParts, axis=0)
    X2d = np.concatenate(X2dParts, axis=0)
    y1d = np.concatenate(y1dParts, axis=0)
    y2d = np.concatenate(y2dParts, axis=0)
    return X1d, X2d, y1d, y2d

In [9]:
# validation data: one chunk of 70 partitions taken from the test split
partPerLoop = 70

i = 0  # index of the (single) chunk
startPartIdx = i * partPerLoop
X1d_val, X2d_val, y1d_val, y2d_val = getTensorData(
    data, partPerLoop, startPartIdx, sampledPartIdxTest
)



In [10]:
# training sequentially: one chunk of 100 partitions taken from the training split
partPerLoop = 100

i = 0  # index of the (single) chunk
startPartIdx = i * partPerLoop
X1d, X2d, y1d, y2d = getTensorData(
    data, partPerLoop, startPartIdx, sampledPartIdxTrain
)

# simple fully connected model
- doesn't work at all so far

In [None]:
tf.random.set_seed(42)

OneDInput = Input(shape=(n1dFeat,))
TwoDInput = Input(shape=(60,n60Feat))

# 2d trunk: Dense layers act on the last axis, so the same weights are applied
# to each of the 60 positions along axis 1.
x = BatchNormalization()(TwoDInput)
x = Dense(n60Feat, activation='relu')(x)
print(x.shape)
for i in range(6):
    x = Dense(n60Feat, activation='relu')(x)
    print(x.shape)

# add info to 1d output
x0 = Dense(1, activation='relu')(x) #reduce to 1d
x0 = x0[:,:,0]  # (batch, 60, 1) -> (batch, 60)
y = BatchNormalization()(OneDInput)
y = Dense(n1dFeat, activation='relu')(y)

y = Concatenate(axis=1)([x0, y])
print(y.shape)
for i in range(6):
    y = Dense(60+n1dFeat, activation='relu')(y)
    print(y.shape)
y = Dense(n1dTarg, activation='linear', name='1d')(y)

x = Dense(n60Targ, activation='linear',name='2d')(x)
print(x.shape)
output2d =x
output1d =y

model = Model(inputs=[TwoDInput, OneDInput], outputs=[output2d, output1d])
# `r2_score` in this notebook is scikit-learn's NumPy function and cannot be
# used as a Keras metric on tensors. Use the streaming Keras metric instead,
# one instance per output (keyed by the output layer names) because the two
# outputs have different shapes.
model.compile(
    optimizer='adam',
    loss='mse',
    metrics={
        '2d': [RSquaredMetric((60, n60Targ))],
        '1d': [RSquaredMetric((n1dTarg,))],
    },
)

hist = model.fit([X2d, X1d], [y2d, y1d], epochs=10, batch_size=32, validation_data=([X2d_val, X1d_val],[y2d_val, y1d_val]))

In [None]:
[y2d_pred, y1d_pred] = model.predict([X2d_val, X1d_val])

# Flatten into separate names so that y2d_pred keeps the 3-d shape returned by
# the model; other cells pass it to the metric together with the 3-d y2d_val.
y2d_pred0 = np.reshape(y2d_pred, (y2d_pred.shape[0],-1))
y2d_val0 = np.reshape(y2d_val, (y2d_val.shape[0],-1))
r2_scores = []
# Column names in the same order as the flattened arrays: for each of the 60
# positions, one entry per target variable.
f = np.reshape(np.reshape(np.array(targets60), (n60Targ,60)).transpose(), (1,-1))
for i in range(y2d_val0.shape[1]):
    r2 = r2_score(y2d_val0[:, i], y2d_pred0[:, i])
    print(f[0][i], r2)
    r2_scores.append(r2)

# y1d_val may be a DataFrame or an ndarray; positional slicing needs an ndarray.
y1d_val0 = np.asarray(y1d_val)
r2_scores1d = []
for i in range(y1d_pred.shape[1]):
    r2 = r2_score(y1d_val0[:, i], y1d_pred[:, i])
    print(target1[i], r2)
    r2_scores1d.append(r2)


In [None]:
# Average R² over all 2d columns and over all 1d targets.
for label, scores in (('mean 2d', r2_scores), ('mean 1d', r2_scores1d)):
    print(label, np.mean(np.array(scores)))

#oss: 2140.1865 - 2d_loss: 1.4499e-05 - 1d_loss: 2140.1865 - 2d_r2_score: -28348.7305 - 1d_r2_score: 0.9105 - val_loss: 2108.1790 - val_2d_loss: 1.6521e-06 - val_1d_loss: 2108.1790 - val_2d_r2_score: -3997.2546 - val_1d_r2_score: 0.9232

In [None]:
# Predictions (a) and ground truth (b) of the 1d targets as labelled frames.
a, b = (pd.DataFrame(values, columns=target1) for values in (y1d_pred, y1d_val))

In [None]:
# Overlay predicted (a) and true (b) cam_out_PRECSC.
a['cam_out_PRECSC'].plot()
b['cam_out_PRECSC'].plot()

# network for 1d and 2d
1d:
- snow & rain rate have big problems

## 1d

In [None]:
tf.random.set_seed(42)

OneDInput = Input(shape=(n1dFeat,))
TwoDInput = Input(shape=(60,n60Feat), name='input')
x = Reshape((60 * n60Feat,),name='inputReshape')(TwoDInput)

x = BatchNormalization()(x)

# Layer widths must be integers; `(60/(1))*n60Feat` is a float and is not
# accepted as `units` by every Keras version.
flatWidth = 60 * n60Feat
x = Dense(flatWidth, activation='relu')(x)
print(x.shape)
for i in range(2):
    x = Dense(flatWidth, activation='relu')(x)
    print(x.shape)

# add info to 1d output
y = BatchNormalization()(OneDInput)
y = Dense(n1dFeat, activation='relu')(y)

commonLayer = Concatenate(axis=1)([x, y])
commonLayerSize = 60*n60Targ+n1dFeat
y = Dense(commonLayerSize, activation='relu')(commonLayer)
print('y',y.shape)
for i in range(2):
    y = Dense(int(commonLayerSize / (2*(i+1))), activation='relu')(y)
    print('y',y.shape)
# The ReLU on the output layer restricts the 1d predictions to values >= 0.
y = Dense(n1dTarg, activation='relu', name='1d')(y)
print('y',y.shape)
output1d =y

# NOTE: this 2d head is built here but is not part of model1 (only output1d
# is passed to Model below).
x = Dense(commonLayerSize, activation='relu')(commonLayer)
x = Dense(60*n60Targ, activation='linear')(x)
print(x.shape)
output2d = Reshape((60, n60Targ),name='2d')(x)
print(output2d.shape)


model1 = Model(inputs=[TwoDInput, OneDInput], outputs=[output1d])
# The metric's weight shape is passed as a tuple rather than a bare int.
model1.compile(optimizer='adam', loss='mse', metrics=[RSquaredMetric((n1dTarg,))])

hist = model1.fit([X2d, X1d], y1d, epochs=10, batch_size=32, validation_data=([X2d_val, X1d_val],y1d_val))

In [None]:
y1d_pred = model1.predict([X2d_val, X1d_val])

# R² of every 1d target on the validation data, followed by their mean.
r2_scores1d = []
for col in range(y1d_pred.shape[1]):
    score = r2_score(y1d_val[:, col], y1d_pred[:, col])
    print(target1[col], score)
    r2_scores1d.append(score)
print('mean 1d', np.mean(np.array(r2_scores1d)))

## 2d

In [27]:
tf.random.set_seed(42)

OneDInput = Input(shape=(n1dFeat,))
TwoDInput = Input(shape=(60,n60Feat), name='input')
x = Reshape((60 * n60Feat,),name='inputReshape')(TwoDInput)

x = BatchNormalization()(x)

# Layer widths must be integers; `(60/(1))*n60Feat` is a float and is not
# accepted as `units` by every Keras version.
flatWidth = 60 * n60Feat
x = Dense(flatWidth, activation='relu')(x)
print(x.shape)
for i in range(2):
    x = Dense(flatWidth, activation='relu')(x)
    print(x.shape)

# add info to 1d output
y = BatchNormalization()(OneDInput)
y = Dense(n1dFeat, activation='relu')(y)

commonLayer = Concatenate(axis=1)([x, y])
commonLayerSize = 60*n60Targ+n1dFeat
# NOTE: this 1d head is built here but is not part of model2 (only output2d
# is passed to Model below).
y = Dense(commonLayerSize, activation='relu')(commonLayer)
print('y',y.shape)
for i in range(2):
    y = Dense(int(commonLayerSize / (2*(i+1))), activation='relu')(y)
    print('y',y.shape)
y = Dense(n1dTarg, activation='relu', name='1d')(y)
print('y',y.shape)
output1d =y

# 2d head: one hidden layer on the shared features, then a linear layer that
# is reshaped to (60, n60Targ).
x = Dense(commonLayerSize, activation='relu')(commonLayer)
x = Dense(60*n60Targ, activation='linear')(x)
print(x.shape)
output2d = Reshape((60, n60Targ),name='2d')(x)
print(output2d.shape)


model2 = Model(inputs=[TwoDInput, OneDInput], outputs=output2d)
model2.compile(optimizer='adam', loss='mse', metrics=[RSquaredMetric((60,n60Targ))])

hist = model2.fit([X2d, X1d], y2d, epochs=15, batch_size=256, validation_data=([X2d_val, X1d_val],y2d_val))

(None, 540)
(None, 540)
(None, 540)
y (None, 376)
y (None, 188)
y (None, 94)
y (None, 8)
(None, 360)
(None, 60, 6)
Epoch 1/15

: 

: 

In [12]:
y2d_pred = model2.predict([X2d_val, X1d_val])

# Flatten every sample to one row so each column is a single scalar target.
y2d_pred0 = y2d_pred.reshape(y2d_pred.shape[0], -1)
y2d_val0 = np.asarray(y2d_val).reshape(y2d_val.shape[0], -1)

# Column labels in the order of the flattened arrays.
f = np.array(targets60).reshape(n60Targ, 60).T.reshape(1, -1)

r2_scores = []
for col in range(y2d_val0.shape[1]):
    score = r2_score(y2d_val0[:, col], y2d_pred0[:, col])
    print(f[0][col], score)
    r2_scores.append(score)

ptend_t_0 -0.02979062499247309
ptend_q0001_0 -4.094335223717862e+16
ptend_q0002_0 0.0
ptend_q0003_0 0.0
ptend_u_0 0.0
ptend_v_0 0.0
ptend_t_1 -2.864382991364356
ptend_q0001_1 -3.147757394981592e+16
ptend_q0002_1 0.0
ptend_q0003_1 0.0
ptend_u_1 0.0
ptend_v_1 0.0
ptend_t_2 -0.8658040209168187
ptend_q0001_2 -1.1277540411804138e+17
ptend_q0002_2 0.0
ptend_q0003_2 0.0
ptend_u_2 0.0
ptend_v_2 0.0
ptend_t_3 -0.08846489113757028
ptend_q0001_3 -2.253743688348976e+21
ptend_q0002_3 0.0
ptend_q0003_3 0.0
ptend_u_3 0.0
ptend_v_3 0.0
ptend_t_4 -0.5032827644305575
ptend_q0001_4 0.0
ptend_q0002_4 0.0
ptend_q0003_4 0.0
ptend_u_4 0.0
ptend_v_4 0.0
ptend_t_5 -0.37687703840364284
ptend_q0001_5 0.0
ptend_q0002_5 0.0
ptend_q0003_5 0.0
ptend_u_5 0.0
ptend_v_5 0.0
ptend_t_6 -0.4581587529575726
ptend_q0001_6 0.0
ptend_q0002_6 0.0
ptend_q0003_6 0.0
ptend_u_6 0.0
ptend_v_6 0.0
ptend_t_7 -0.09010278762341395
ptend_q0001_7 0.0
ptend_q0002_7 0.0
ptend_q0003_7 0.0
ptend_u_7 0.0
ptend_v_7 0.0
ptend_t_8 -1.42239709163

In [None]:
# Worst and average R² over the flattened 2d target columns.
(min(r2_scores), np.mean(r2_scores))

In [None]:
# `RSquaredMetric2D` is not defined anywhere in this notebook; the metric
# class defined above is `RSquaredMetric`, which takes the shape of one sample.
m = RSquaredMetric((60,n60Targ))
m.update_state(y2d_val, y2d_pred)
m.result()

In [None]:
# Show the `total_sum_squares` weight of the metric instance `m` created in the previous cell.
m.total_sum_squares