In [None]:
import dask.dataframe as dd
import matplotlib.pyplot as plt
import os
from sklearn.metrics import r2_score
import dask_ml
import dask
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=UserWarning, message=".*Sending large graph.*")

from dask.distributed import Client, LocalCluster
import dask.multiprocessing

# Start a local Dask cluster with 6 worker processes (one thread each) and
# connect a client to it.
cluster = LocalCluster(processes=True,n_workers=6, threads_per_worker=1)
client = Client(cluster)

# data reading

In [None]:
# Directories containing the training data as Parquet files.
# NOTE(review): the names suggest each folder holds one quarter of the training
# set — confirm.
folders = [
    'train0_25',
    'train25_50',
    'train50_75',
    'train75_100'
]

# Read the Parquet files of each folder into its own (lazy) Dask DataFrame.
dfs = [dd.read_parquet(folder) for folder in folders]

# Concatenate the per-folder DataFrames into a single Dask DataFrame.
data = dd.concat(dfs)

In [None]:
# Input variables that are expanded into 60 columns each ("<name>_<0..59>").
feat60 = [
    'state_t', 'state_q0001', 'state_q0002', 'state_q0003',
    'state_u', 'state_v', 'pbuf_ozone', 'pbuf_CH4', 'pbuf_N2O',
]
# Input variables stored as a single column each.
feat1 = [
    'state_ps', 'pbuf_SOLIN', 'pbuf_LHFLX', 'pbuf_SHFLX', 'pbuf_TAUX',
    'pbuf_TAUY', 'pbuf_COSZRS', 'cam_in_ALDIF', 'cam_in_ALDIR',
    'cam_in_ASDIF', 'cam_in_ASDIR', 'cam_in_LWUP', 'cam_in_ICEFRAC',
    'cam_in_LANDFRAC', 'cam_in_OCNFRAC', 'cam_in_SNOWHLAND',
]

# Target variables expanded into 60 columns each, and single-column targets.
target60 = ['ptend_t', 'ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_u', 'ptend_v']
target1 = [
    'cam_out_NETSW', 'cam_out_FLWDS', 'cam_out_PRECSC', 'cam_out_PRECC',
    'cam_out_SOLS', 'cam_out_SOLL', 'cam_out_SOLSD', 'cam_out_SOLLD',
]

# Column names are ordered variable-major: name_0 .. name_59, next_0 .. next_59, ...
features60 = [f'{name}_{level}' for name in feat60 for level in range(60)]
allF = features60 + feat1

targets60 = [f'{name}_{level}' for name in target60 for level in range(60)]
allT = targets60 + target1

# Targets excluded from prediction: indices 0-11 of the variables below,
# plus indices 12-14 of ptend_q0002.
targetsToDrop12 = ['ptend_q0001', 'ptend_q0002', 'ptend_q0003', 'ptend_u', 'ptend_v']
dropT = ['ptend_q0002_12', 'ptend_q0002_13', 'ptend_q0002_14']  # attention, I think i also need to predict _15
dropT += [f'{name}_{level}' for name in targetsToDrop12 for level in range(12)]

# Remaining targets, in their original order.
allT2 = [column for column in allT if column not in dropT]

In [None]:
# Shuffle the partition indices reproducibly and split them 95 % / 5 % into
# training and held-out partitions.
np.random.seed(42)

orig_partitions = list(range(int(data.npartitions)))
np.random.shuffle(orig_partitions)  # shuffles the list in place

trainSep = int(0.95 * data.npartitions)
valEnd = data.npartitions  # the held-out split runs to the last partition

sampledPartIdxTrain = orig_partitions[:trainSep]
sampledPartIdxTest = orig_partitions[trainSep:valEnd]

In [None]:
# Number of 60-column and single-column variables on the input and target side.
n60Feat = len(feat60)
n1dFeat = len(feat1)
n60Targ = len(target60)
n1dTarg = len(target1)

# metrics

In [None]:
import tensorflow as tf

from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, LSTM, Embedding, Concatenate,BatchNormalization, Reshape
from tensorflow.keras.models import Model

from keras import backend as K

def r2_scoretf(y_true, y_pred):
    """Return R^2 = 1 - SS_res / SS_tot, reduced over the first (sample) axis.

    Args:
        y_true: tensor of reference values; axis 0 is reduced.
        y_pred: tensor of predictions, broadcast-compatible with `y_true`.

    Returns:
        A tensor with the shape of `y_true` minus its first axis, holding one
        R^2 value per output. No guard is applied when SS_tot is zero.
    """
    sum_squares_residuals = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    # Centre every output on its own mean over axis 0 (same as RSquaredMetric);
    # using one global mean would mix the scales of unrelated outputs.
    y_mean = tf.reduce_mean(y_true, axis=0)
    sum_squares_total = tf.reduce_sum(tf.square(y_true - y_mean), axis=0)
    r2 = 1 - (sum_squares_residuals / sum_squares_total)
    return r2

def r2_scoreTrain(y_true, y_pred):
    """Return the mean of SS_res / SS_tot over all outputs (i.e. mean of 1 - R^2).

    The ratio is non-negative and smaller is better, so the value can be
    minimised directly.

    Args:
        y_true: tensor of reference values; axis 0 is reduced.
        y_pred: tensor of predictions, broadcast-compatible with `y_true`.

    Returns:
        A scalar tensor. No guard is applied when SS_tot is zero.
    """
    sum_squares_residuals = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
    # Centre every output on its own mean over axis 0 (same as RSquaredMetric);
    # using one global mean would mix the scales of unrelated outputs.
    y_mean = tf.reduce_mean(y_true, axis=0)
    sum_squares_total = tf.reduce_sum(tf.square(y_true - y_mean), axis=0)
    ratio = sum_squares_residuals / sum_squares_total
    return tf.reduce_mean(ratio)

class RSquaredMetric(tf.keras.metrics.Metric):
    """Streaming R^2 metric, averaged over every output position.

    Residual and total sums of squares are accumulated per output (axis 0 of
    each batch is reduced) and combined in `result`.

    Caveats visible in the implementation:
      * The total sum of squares is centred on each batch's own mean, so the
        accumulated value is a sum of per-batch terms rather than the
        dataset-wide total sum of squares.
      * A per-batch total sum of squares of exactly 0 is replaced by 1.
      * `num_samples` is declared but never updated.
    """

    def __init__(self, name='r_squared', **kwargs):
        super().__init__(name=name, **kwargs)
        # The accumulators are created on the first update_state call, because
        # their shape is taken from the first batch.
        self.total_sum_squares = None
        self.residual_sum_squares = None
        self.num_samples = self.add_weight(name="num_samples", initializer='zeros',dtype=tf.int32)

    def update_state(self, y_true, y_pred, sample_weight=None):
        """Add one batch to the accumulators; `sample_weight` is ignored."""
        y_true = tf.cast(y_true, self._dtype)
        y_pred = tf.cast(y_pred, self._dtype)

        sum_squares_residuals = tf.reduce_sum(tf.square(y_true - y_pred), axis=0)
        sum_squares_total = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true, axis=0)), axis=0)
        # Avoid dividing by zero for outputs that are constant within the batch.
        sum_squares_total = tf.where(tf.equal(sum_squares_total, 0.0), tf.ones_like(sum_squares_total), sum_squares_total)

        if self.total_sum_squares is None:
            self.total_sum_squares = self.add_weight(name='total_sum_squares', initializer='zeros', shape=sum_squares_total.shape)
            self.residual_sum_squares = self.add_weight(name='residual_sum_squares', initializer='zeros', shape=sum_squares_residuals.shape)

        self.total_sum_squares.assign_add(sum_squares_total)
        self.residual_sum_squares.assign_add(sum_squares_residuals)

    def result(self):
        """Return the mean R^2 over all outputs (0 before any batch was seen)."""
        if self.total_sum_squares is None:
            # Nothing accumulated yet: return a neutral scalar instead of failing.
            return tf.zeros((), dtype=self._dtype)
        r_squared = 1 - (self.residual_sum_squares / self.total_sum_squares)
        # 0/0 (e.g. right after a reset) is reported as a perfect score of 1.
        r_squared = tf.where(tf.math.is_nan(r_squared), tf.ones_like(r_squared), r_squared)
        return tf.reduce_mean(r_squared)

    def reset_state(self):
        """Zero the accumulators; a no-op if they have not been created yet."""
        if self.total_sum_squares is None:
            return
        self.total_sum_squares.assign(tf.zeros_like(self.total_sum_squares))
        self.residual_sum_squares.assign(tf.zeros_like(self.residual_sum_squares))

# data preprocessing

In [None]:
def getTensorData(data, partPerLoop, startPartIdx,sampledPartIdx):
    """Materialise several Dask partitions as NumPy arrays for training.

    Partitions `sampledPartIdx[startPartIdx : startPartIdx + partPerLoop]` are
    computed one by one and stacked along the sample axis. Column selections
    come from the notebook-level `features60`, `targets60`, `feat1` and
    `target1` lists; `n60Feat` / `n60Targ` give the number of 60-column variables.

    Args:
        data: object exposing `get_partition(i).compute()` that returns a
            pandas DataFrame (a Dask DataFrame in this notebook).
        partPerLoop: number of partitions to load.
        startPartIdx: offset into `sampledPartIdx` of the first partition.
        sampledPartIdx: sequence of partition indices.

    Returns:
        Tuple `(X1d, X2d, y1d, y2d)` of NumPy arrays:
          * X1d: (samples, len(feat1))
          * X2d: (samples, 60, n60Feat)
          * y1d: (samples, len(target1))
          * y2d: (samples, 60, n60Targ)
        If `partPerLoop` is not positive, four `None` values are returned.

    Raises:
        IndexError: if `startPartIdx + partPerLoop` exceeds `len(sampledPartIdx)`.
    """
    if partPerLoop <= 0:
        # Nothing to load; the previous implementation also returned None here.
        return None, None, None, None

    X1d_parts, X2d_parts, y1d_parts, y2d_parts = [], [], [], []
    for j in range(partPerLoop):
        frame = data.get_partition(int(sampledPartIdx[startPartIdx + j])).compute()
        n_rows = frame.shape[0]

        # Columns are variable-major (name_0..name_59, next_0..): reshape to
        # (rows, variable, position) and swap to (rows, position, variable).
        X2d_parts.append(
            frame[features60].to_numpy().reshape(n_rows, n60Feat, 60).transpose(0, 2, 1))
        y2d_parts.append(
            frame[targets60].to_numpy().reshape(n_rows, n60Targ, 60).transpose(0, 2, 1))

        # Convert explicitly so the result is an ndarray even for one partition.
        X1d_parts.append(frame[feat1].to_numpy())
        y1d_parts.append(frame[target1].to_numpy())

    # One concatenation per output instead of re-copying the growing array on
    # every loop iteration.
    X1d = np.concatenate(X1d_parts, axis=0)
    X2d = np.concatenate(X2d_parts, axis=0)
    y1d = np.concatenate(y1d_parts, axis=0)
    y2d = np.concatenate(y2d_parts, axis=0)
    return X1d, X2d, y1d, y2d

In [None]:
# Validation data: load the first `partPerLoop` partitions of the held-out
# split into memory.
# NOTE(review): getTensorData indexes sampledPartIdxTest[0 .. partPerLoop-1], so
# this needs at least 60 held-out partitions — confirm for this dataset.
partPerLoop = 60

# Only one chunk is loaded; the loop form allows loading further chunks.
for i in range(1):
    startPartIdx = i*partPerLoop
    X1d_val, X2d_val, y1d_val, y2d_val = getTensorData(data, partPerLoop, startPartIdx, sampledPartIdxTest)

In [None]:
# Training data: load the first `partPerLoop` partitions of the training split
# into memory. Note that this rebinds `partPerLoop` (60 in the validation cell).
partPerLoop = 100

# Only one chunk is loaded; the loop form allows loading further chunks.
for i in range(1):
    startPartIdx = i*partPerLoop
    X1d, X2d, y1d, y2d = getTensorData(data, partPerLoop, startPartIdx, sampledPartIdxTrain)  

# transformer

In [None]:
# 2d positional encoding (feature & position)
def positional_encoding(length, depth):
  """Build a sinusoidal position table.

  The first half of the last axis holds sines, the second half cosines, of
  `position / 10000**(k / (depth/2))`.

  Args:
    length: number of positions (rows).
    depth: encoding width; for an even `depth` the result has `depth` columns.

  Returns:
    A float32 tensor with `length` rows.
  """
  half_depth = depth / 2

  pos = np.arange(length)[:, np.newaxis]                          # (length, 1)
  scaled_idx = np.arange(half_depth)[np.newaxis, :] / half_depth  # (1, depth/2)

  angles = pos * (1 / (10000**scaled_idx))                        # (length, depth/2)

  table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
  return tf.cast(table, dtype=tf.float32)

def positional_encoding_2d(seq_length, num_features, wave_length=10000):
    """Return a (seq_length, num_features) float32 position/feature table.

    Entry (p, f) equals `p * exp(-2*pi*f / wave_length)`: the position index
    scaled by a factor that decays with the feature index.
    """
    row_idx = np.arange(seq_length).reshape(-1, 1)      # (seq_length, 1)
    col_idx = np.arange(num_features).reshape(1, -1)    # (1, num_features)
    decay = np.exp(-2 * np.pi * col_idx / wave_length)
    return tf.cast(row_idx * decay, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
  """Adds a fixed positional table to an already-embedded sequence.

  No token embedding is applied: the input is a numeric series, not word ids.
  The table is `positional_encoding_2d(sequence_length, d_model)`.
  """
  def __init__(self, sequence_length, d_model):
    super().__init__()
    self.d_model = d_model  # model depth = width of the encoding vector
    self.pos_encoding = positional_encoding_2d(sequence_length, d_model)

    # TODO implement an encoding for the store and family

  def call(self, x):
    """Return `x` plus the positional table for its sequence length.

    `x` is indexed as (batch, seq_len, d_model); `seq_len` may be at most the
    `sequence_length` given at construction.
    """
    length = tf.shape(x)[1]
    # Slice the table to the actual sequence length so that inputs shorter than
    # `sequence_length` are accepted instead of failing to broadcast.
    x = x + self.pos_encoding[tf.newaxis, :length, :]
    return x
class BaseAttention(tf.keras.layers.Layer):
  """Common sub-layers for the attention blocks: multi-head attention, a
  residual Add and a LayerNormalization. Subclasses implement `call`."""
  def __init__(self, **kwargs):
    # All keyword arguments are forwarded to MultiHeadAttention
    # (e.g. num_heads, key_dim, dropout); none reach the base Layer.
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

class CrossAttention(BaseAttention):
  """Attention from `x` (queries) onto `context` (keys and values), followed
  by a residual connection and layer normalisation."""
  def call(self, x, context):
    attended, scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)

    # Keep the attention scores of the latest call for later inspection.
    self.last_attn_scores = scores

    return self.layernorm(self.add([x, attended]))

class GlobalSelfAttention(BaseAttention):
  """Unmasked self-attention over `x` with a residual connection and layer
  normalisation."""
  def call(self, x):
    attended = self.mha(query=x, value=x, key=x)
    return self.layernorm(self.add([x, attended]))

# Self-attention in which a position attends only to itself and earlier
# positions, never to later ones.
class CausalSelfAttention(BaseAttention):
  """Causally masked self-attention with a residual connection and layer
  normalisation."""
  def call(self, x):
    attended = self.mha(query=x, value=x, key=x, use_causal_mask=True)
    return self.layernorm(self.add([x, attended]))

class FeedForward(tf.keras.layers.Layer):
  """Position-wise feed-forward block: Dense(dff, relu) -> Dense(d_model) ->
  Dropout, wrapped in a residual connection and layer normalisation."""
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    hidden = tf.keras.layers.Dense(dff, activation='relu')
    project = tf.keras.layers.Dense(d_model)
    drop = tf.keras.layers.Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([hidden, project, drop])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    return self.layer_norm(self.add([x, self.seq(x)]))

class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: causal self-attention, cross-attention onto the
  encoder output, then a feed-forward block.

  Args (keyword-only):
    d_model: used as `key_dim` of both attention layers and as the
      feed-forward output width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout rate of the attention layers and the feed-forward block.
  """
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__(name='decoderlayer')

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block honours the configured
    # value instead of always falling back to its own default.
    self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # Cache the last attention scores for plotting later
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

class Decoder(tf.keras.layers.Layer):
  """Positional encoding and dropout followed by `num_layers` decoder blocks.

  The input `x` receives no token embedding here, so it presumably already has
  shape (batch, target_seq_len, d_model) — TODO confirm against callers.
  """
  def __init__(self, *, num_layers, d_model, num_heads, dff, sequence_length,
               dropout_rate=0.1):
    super(Decoder, self).__init__(name='decoder')

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        sequence_length=sequence_length, d_model=d_model)
    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)
    ]

    self.last_attn_scores = None

  def call(self, x, context):
    hidden = self.dropout(self.pos_embedding(x))

    for block in self.dec_layers:
      hidden = block(hidden, context)

    # Expose the cross-attention scores of the final block.
    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    return hidden

class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: global self-attention followed by a feed-forward block.

  Args (keyword-only):
    d_model: used as `key_dim` of the attention layer and as the feed-forward
      output width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward block.
    dropout_rate: dropout rate of the attention layer and the feed-forward block.
  """
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__(name='encoderlayer')

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward dropout_rate so the feed-forward block honours the configured
    # value instead of always falling back to its own default.
    self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

class Encoder(tf.keras.layers.Layer):
  """Positional encoding and dropout followed by `num_layers` encoder blocks.

  The input `x` receives no token embedding here, so it presumably already has
  shape (batch, seq_len, d_model) — TODO confirm against callers.
  """
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, sequence_length, dropout_rate=0.1):
    super().__init__(name='encoder')

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        sequence_length=sequence_length, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)
    ]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    hidden = self.dropout(self.pos_embedding(x))

    for block in self.enc_layers:
      hidden = block(hidden)

    return hidden  # Shape `(batch_size, seq_len, d_model)`.

class Transformer(tf.keras.Model):
  """Encoder-decoder model with a final Dense projection.

  Args (keyword-only):
    enc_dec_num_layers: number of blocks in both the encoder and the decoder.
    input_seq_dim_enc: `d_model` of the encoder.
    input_seq_dim_dec: `d_model` of the decoder.
    num_heads: attention heads per block.
    fully_connected_size: hidden width (`dff`) of the feed-forward blocks.
    input_sequence_len: sequence length for the encoder's positional table.
    output_sequence_len: sequence length for the decoder's positional table;
      also the number of units of the final Dense layer.
    dropout_rate: dropout rate passed to encoder and decoder.
  """
  def __init__(self, *, enc_dec_num_layers, input_seq_dim_enc, input_seq_dim_dec, num_heads, fully_connected_size,
               input_sequence_len, output_sequence_len, dropout_rate=0.1):
    super().__init__(name='transformerBase')
    self.encoder = Encoder(
        num_layers=enc_dec_num_layers,
        d_model=input_seq_dim_enc,
        num_heads=num_heads,
        dff=fully_connected_size,
        sequence_length=input_sequence_len,
        dropout_rate=dropout_rate)

    self.decoder = Decoder(
        num_layers=enc_dec_num_layers,
        d_model=input_seq_dim_dec,
        num_heads=num_heads,
        dff=fully_connected_size,
        sequence_length=output_sequence_len,
        dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(output_sequence_len, name='finallayer')

  def call(self, inputs):
    # Keras `.fit` passes all inputs through the first argument, so the
    # encoder and decoder inputs arrive together as a pair.
    encoder_in, decoder_in = inputs

    memory = self.encoder(encoder_in)             # (batch, context_len, d_model)
    decoded = self.decoder(decoder_in, memory)    # (batch, target_len, d_model)

    # Linear projection of the decoder output; no activation is applied.
    return self.final_layer(decoded)

# end to end transformer

In [None]:
from tensorflow.keras.layers import Layer, Dense, Input, Concatenate, GlobalAveragePooling1D
from tensorflow.keras.models import Model

In [None]:
tf.random.set_seed(42)
class PositionalEncoding(Layer):
    """Adds a linear position ramp (`position_index / 60`) to every feature.

    `d_model` is only printed; the stored table has shape (1, position, 1) and
    is tiled across the feature axis in `call`.
    NOTE(review): the divisor 60 is hard-coded and equals the sequence length
    used in this notebook — confirm whether it should follow `position`.
    """
    def __init__(self, position, d_model):
        super(PositionalEncoding, self).__init__()
        print('d_model', d_model)
        self.positional_encoding = self.get_positional_encoding(position, d_model)

    def get_positional_encoding(self, position, d_model):
        """Return the float32 ramp 0/60, 1/60, ... with shape (1, position, 1)."""
        print('d_model', d_model)
        ramp = np.arange(position).reshape(position, 1) / 60
        return tf.cast(ramp[np.newaxis, ...], dtype=tf.float32)

    def get_angles(self, pos, i, d_model):
        """Sinusoidal angle helper; not used by `get_positional_encoding`."""
        rates = 1 / np.power(10000, (2 * (i // 2)) / np.float32(d_model))
        return pos * rates

    def call(self, inputs):
        """Return `inputs` (batch, seq, features) plus the position ramp."""
        dims = tf.shape(inputs)
        # Cut the ramp to the input's sequence length, then repeat it for
        # every feature column.
        ramp = self.positional_encoding[:, :dims[1], 0]
        ramp = tf.expand_dims(ramp, axis=-1)
        return inputs + tf.tile(ramp, [1, 1, dims[2]])

class TransformerBlock(Layer):
    """Encoder block: self-attention and a two-layer feed-forward network, each
    followed by dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: `key_dim` of the attention layer and output width of the
            feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        rate: dropout rate of both dropout layers.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim)
        ])
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        `training` defaults to None so the layer can be called without it (as
        the model builder does); Keras then decides the dropout mode itself.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

def build_transformer_model(input_dim2d, const_input_dim, seq_output_dim, const_output_dim, featureSpaceSeq=512, featureSpaceConst=16, num_heads=8, ff_dim=2048, max_len=60):
    """Build a single-block transformer encoder over sequence and constant inputs.

    The constant inputs are embedded, repeated for every sequence position and
    concatenated with the embedded sequence before positional encoding and one
    TransformerBlock.

    Args:
        input_dim2d: number of features per sequence position.
        const_input_dim: number of constant (per-sample) input features.
        seq_output_dim: number of output features per sequence position.
        const_output_dim: number of constant output values (head 2).
        featureSpaceSeq: embedding width of the sequence input.
        featureSpaceConst: embedding width of the constant input.
        num_heads: attention heads in the transformer block.
        ff_dim: hidden width of the block's feed-forward network.
        max_len: sequence length.

    Returns:
        A Keras Model with inputs `[seq_input, const_input]` and ONE output, the
        sequence head of shape (max_len, seq_output_dim). The constant head is
        constructed (its shapes are printed) but is not part of the outputs.
    """
    # Inputs
    seq_input = Input(shape=(max_len, input_dim2d))
    const_input = Input(shape=(const_input_dim,))

    # Embeddings
    seq_embedding = Dense(featureSpaceSeq)(seq_input)
    print('seq_embedding',seq_embedding.shape)
    # Repeat the embedded constants for each of the max_len positions so they
    # can influence every position of the sequence.
    const_embedding = Dense(featureSpaceConst)(const_input)
    print('const_embedding',const_embedding.shape)
    # RepeatVector maps (batch, features) -> (batch, max_len, features). It is a
    # Keras layer, so it can be applied to symbolic tensors directly, unlike
    # raw tf.tile / tf.expand_dims.
    repeated_const_input = tf.keras.layers.RepeatVector(max_len)(const_embedding)
    print('repeated_const_input',repeated_const_input.shape)
    combined_input = Concatenate()([seq_embedding, repeated_const_input])
    print('combined_input',combined_input.shape)

    # Positional encoding
    d_model = featureSpaceSeq + featureSpaceConst
    print('d_model', d_model)
    positional_encoding = PositionalEncoding(max_len, d_model)
    seq_positional_encoded = positional_encoding(combined_input)
    print('seq_positional_encoded',seq_positional_encoded.shape)

    # Transformer encoder: output is (max_len, d_model) per sample
    transformer_block = TransformerBlock(d_model, num_heads, ff_dim)
    encoder_output = transformer_block(seq_positional_encoded)
    print('encoder_output',encoder_output.shape)

    # Head 1: sequence output of shape (max_len, seq_output_dim)
    seq_output = Dense(seq_output_dim, name='2d')(encoder_output)
    print('seq_output',seq_output.shape)

    # Head 2: const_output_dim constant target values (currently not returned)
    avg_pool = GlobalAveragePooling1D()(encoder_output)
    print('avg_pool',avg_pool.shape)
    combined = Concatenate()([avg_pool, const_input])
    print('combined',combined.shape)
    const_dense = Dense(d_model, activation='relu')(combined)
    print('const_dense',const_dense.shape)
    const_output = Dense(const_output_dim, name='1d')(const_dense)
    print('const_output',const_output.shape)

    # Build model: only the sequence head is exposed.
    model = Model(inputs=[seq_input, const_input], outputs=[seq_output])#, const_output])
    return model

# Instantiate and compile the end-to-end model with this notebook's dimensions.
input_dim2d = n60Feat       # number of features in the input sequence
const_input_dim = n1dFeat   # number of constant input features
seq_output_dim = n60Targ    # number of features in the output sequence
const_output_dim = n1dTarg  # number of constant output features
max_len = 60                # sequence length (positions per sample)
featureSpaceSeq = 64        # embedding width of the sequence input
featureSpaceConst = 16#2*n1dFeat
ffDim = 128                 # hidden width of the feed-forward network

model = build_transformer_model(input_dim2d, const_input_dim, seq_output_dim, const_output_dim, max_len=max_len,featureSpaceSeq=featureSpaceSeq,featureSpaceConst=featureSpaceConst, ff_dim=ffDim)
# MSE loss; the streaming R^2 metric is reported alongside it.
model.compile(optimizer='adam', loss='mse', metrics=[RSquaredMetric()])
model.summary()

In [None]:
# The model exposes only the sequence head, so it is trained against y2d alone
# (passing y1d as a second target does not match its single output).
hist = model.fit([X2d, X1d], [y2d], epochs=1, batch_size=32, validation_data=([X2d_val, X1d_val],[y2d_val]))

In [None]:
# Results recorded from earlier runs, per configuration:
# 16 feature space seq   
#loss: 5.1093e-04 - r_squared: -4239921345867022336.0000 - val_loss: 3.2163e-07 - val_r_squared: -2053396349583360.0000
# without embedding
#loss: 6.1256e-04 - r_squared: -8781051153384210432.0000 - val_loss: 3.4702e-07 - val_r_squared: -874387024642048.0000
# with embedding,  fspace=64,constf=16,ffdim=128
#2.9717e-04 - r_squared: -4653134858015473664.0000 - val_loss: 1.1647e-07 - val_r_squared: -414988464291840.0000
# NOTE(review): the extremely large negative r_squared values are presumably
# caused by outputs whose total sum of squares is close to zero — confirm.
tf.random.set_seed(42)
# Train the sequence head for 5 epochs; `model` keeps any weights from earlier fits.
hist = model.fit([X2d, X1d], [y2d], epochs=5, batch_size=128, validation_data=([X2d_val, X1d_val],[y2d_val]))

In [None]:
# Predict the sequence targets for the validation samples.
y2d_pred = model.predict([X2d_val, X1d_val])

In [None]:
# Flatten predictions and truth to one column per (position, variable) pair.
y2d_pred0 = np.reshape(y2d_pred, (y2d_pred.shape[0], -1))
y2d_val0 = np.reshape(y2d_val, (y2d_val.shape[0], -1))

# Column labels in the flattened order: targets60 is variable-major, so it is
# reshaped to (variable, position) and transposed to position-major.
f = np.array(targets60).reshape(n60Targ, 60).T.reshape(1, -1)

# R^2 of every flattened column, printed next to its target name.
r2_scores = []
for i, name in enumerate(f[0]):
    r2 = r2_score(y2d_val0[:, i], y2d_pred0[:, i])
    print(name, r2)
    r2_scores.append(r2)

In [None]:
# Labelled frames of the flattened validation predictions (a) and truth (b).
a = pd.DataFrame(y2d_pred0, columns=f[0])
b = pd.DataFrame(y2d_val0, columns=f[0])

In [None]:
# Plot predicted (a) against true (b) ptend_t_0 over the validation samples.
# Original note: the data looks odd.
a.ptend_t_0.plot()
b.ptend_t_0.plot()

In [None]:
# Result recorded from an earlier run with a 64-wide sequence feature space:
# seq loss: 0.0012 - r_squared: -9318544663843438592.0000
# One more epoch with batch size 32; `model` keeps the weights from earlier fits.
hist = model.fit([X2d, X1d], [y2d], epochs=1, batch_size=32, validation_data=([X2d_val, X1d_val],[y2d_val]))

In [None]:
# Sanity check of PositionalEncoding on an all-ones input: show the first
# three positions of the first two features.
# Note: this rebinds the name `positional_encoding`, which was the function
# defined in the transformer section, to a layer instance.
positional_encoding = PositionalEncoding(max_len, 512)
inputs = np.ones((1, 60, 512))#.astype(np.float32)
out = positional_encoding(inputs)
out[0,0:3,0:2]

# LSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LayerNormalization, Dropout
from tensorflow.keras.models import Model

# LSTM encoder-decoder for continuous sequences.
# The inputs are real-valued profiles rather than integer token ids, so each
# time step is projected with a Dense layer instead of an Embedding lookup, and
# the output is a linear regression head trained with MSE.
embedding_dim = 32   # width of the per-step projection (tunable)
hidden_units = 64    # LSTM state size (tunable)

# Encoder: reduce the (60, 9) input sequence to the final hidden and cell state.
encoder_inputs = tf.keras.layers.Input(shape=(60, 9))
encoder_embedding = Dense(embedding_dim)(encoder_inputs)
encoder_output, state_h, state_c = tf.keras.layers.LSTM(units=hidden_units, return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder: runs over a variable-length sequence of 6 values per step,
# starting from the encoder's final state.
decoder_inputs = tf.keras.layers.Input(shape=(None, 6))
decoder_embedding = Dense(embedding_dim)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(units=hidden_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# Linear head: one value per target variable at every decoder step.
decoder_dense = Dense(6)
decoder_outputs = decoder_dense(decoder_outputs)

# Note: this rebinds `model`, replacing the transformer model built earlier.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='mse')
