<a href="https://colab.research.google.com/github/myPar/NSU_Practice/blob/dev/baseline/RNNbaseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Prepare

In [128]:
import pandas as pd
import numpy as np
import statistics as st
from google.colab import drive
from pathlib import Path

import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, Input, Dense, TimeDistributed
from tensorflow.keras import activations
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

import plotly.express as px
import plotly.graph_objects as go

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [129]:
# normilize data ('min/max' normalization):
interval_th = [-1, 1]     # normalization interval for 'th' activation function
interval_sigmoid = [0, 1] # normalization interval for 'sigmoid' activation function

In [130]:
dir_name = '/content/drive/MyDrive/Colab Notebooks/'
file_name = 'data.txt'
file_path = Path(dir_name + file_name)
df = pd.read_csv(filepath_or_buffer=file_path, sep=' ', header=0)

df.head

<bound method NDFrame.head of              Date      Time  qo_lc[m3/d]  qw_lc[m3/d]  qg_lc[m3/d]  \
0     07-May-2014  10:13:26      109.280    -0.312890       3810.3   
1     07-May-2014  10:14:26      113.460    -0.437350       3774.0   
2     07-May-2014  10:15:26      131.630    -0.533550       3892.5   
3     07-May-2014  10:16:26      140.670     0.160910       4043.1   
4     07-May-2014  10:17:26      150.020    -1.145600       4134.6   
...           ...       ...          ...          ...          ...   
1264  08-May-2014  07:34:35       98.744     0.310070       3174.5   
1265  08-May-2014  07:35:35       72.045     0.162760       3134.1   
1266  08-May-2014  07:36:35       87.370    -0.125770       3340.4   
1267  08-May-2014  07:37:35       32.121     0.021522       2834.1   
1268  08-May-2014  07:38:35       35.914     0.228020       2612.4   

      qo_sc[Sm3/d]  qw_sc[Sm3/d]  qg_sc[Sm3/d]  qo_scnp[Sm3/d]  \
0          108.430     -0.314870       92700.0         108.430 

In [131]:
print(df.columns)

Index(['Date', 'Time', 'qo_lc[m3/d]', 'qw_lc[m3/d]', 'qg_lc[m3/d]',
       'qo_sc[Sm3/d]', 'qw_sc[Sm3/d]', 'qg_sc[Sm3/d]', 'qo_scnp[Sm3/d]',
       'qw_scnp[Sm3/d]', 'qg_scnp[Sm3/d]', 'Fo_lc[%]', 'Fw_lc[%]', 'Fg_lc[%]',
       'WLR[%]', 'GVF[%]', 'GLR[m3/m3]', 'BSW[%]', 'GOR[Sm3/Sm3]',
       'GOR1[Sm3/Sm3]', 'mo_lc[kg/d]', 'mw_lc[kg/d]', 'mg_lc[kg/d]',
       'm_lc[kg/d]', 'mo_sc[kg/d]', 'mw_sc[kg/d]', 'mg_sc[kg/d]',
       'Mu_o_lc[cP]', 'Mu_l_lc[cP]', 'Do_lc[g/cm3]', 'Dw_lc[g/cm3]',
       'Dg_lc[g/cm3]', 'Dm_lc[g/cm3]', 'Dl_lc[g/cm3]', 'bo[Sm3/m3]',
       'bw[Sm3/m3]', 'bg[Sm3/m3]', 'Z', 'Rst[Sm3/Sm3]', 'Rwst[Sm3/Sm3]',
       'rgmp[Sm3/Sm3]', 'N32[cps]', 'N81[cps]', 'N356[cps]', 'NTotal[cps]',
       'DeadTime[s]', 'SampleTime[s]', 'DPV[mbar]', 'PL[bara]', 'TL[DegC]',
       'TAMB[DegC]', 'Dyn_DP[mbar]', 'OperatingPointLE[1/m]',
       'OperatingPointHE[1/m]', 'OilPointLE[1/m]', 'OilPointHE[1/m]',
       'WaterPointLE[1/m]', 'WaterPointHE[1/m]', 'GasPointLE[1/m]',
       'GasPoin

In [132]:
selected_rows = ['DPV[mbar]', 'PL[bara]', 'qg_sc[Sm3/d]', 'qo_sc[Sm3/d]', 'TL[DegC]']
outputs = ['qg_sc[Sm3/d]', 'qo_sc[Sm3/d]']
inputs = np.setdiff1d(selected_rows, outputs)

df = df[selected_rows]

print(df.head)
print("inputs: " + str(inputs))
print("outputs: " + str(outputs))

<bound method NDFrame.head of       DPV[mbar]  PL[bara]  qg_sc[Sm3/d]  qo_sc[Sm3/d]  TL[DegC]
0       1448.70    21.481       92700.0       108.430   -2.5452
1       1462.10    21.473       91895.0       112.580   -2.5323
2       1592.30    21.672       95860.0       130.550   -2.3931
3       1703.30    21.926      100690.0       139.440   -2.0451
4       1821.70    22.196      104290.0       148.640   -1.8009
...         ...       ...           ...           ...       ...
1264    1147.10    23.997       87402.0        97.703   -3.3542
1265    1090.50    23.919       85594.0        71.288   -3.2890
1266    1279.10    24.477       93660.0        86.386   -3.2191
1267     976.62    23.808       76564.0        31.789   -3.2819
1268        NaN       NaN       69609.0        35.567       NaN

[1269 rows x 5 columns]>
inputs: ['DPV[mbar]' 'PL[bara]' 'TL[DegC]']
outputs: ['qg_sc[Sm3/d]', 'qo_sc[Sm3/d]']


## Help reused functions

In [133]:
class Data(object):
  def __init__(self, train_input, train_output, test_input, test_output):
    self.train_input = train_input
    self.train_output = train_output
    self.test_input = test_input
    self.test_output = test_output

##### min/max data normalization function
def normilize_df(df, normilize_interval):
  return ((df - df.min()) / (df.max() - df.min())) * (normilize_interval[1] - normilize_interval[0]) + normilize_interval[0]

##### returns splitted from src dataframe train test data in numpy representation
def get_train_test(df, test_share, inputs, outputs):
  train_df, test_df = train_test_split(df, test_size=test_share, shuffle=False)

  train_input_data = train_df[inputs].to_numpy()
  train_output_data = train_df[outputs].to_numpy()

  test_input_data = test_df[inputs].to_numpy()
  test_output_data = test_df[outputs].to_numpy()

  return Data(train_input_data, train_output_data, test_input_data, test_output_data)

## help function for data sequence getting:
def get_sequence(data, timesteps):
  return np.array([data[i*timesteps : i * timesteps + timesteps] for i in range(data.shape[0] // timesteps)])

def get_remaining_data(data, sequences_count, timesteps):
  return np.array(data[timesteps * sequences_count:])

##### returns sequences data vectors; timesteps - the length of sequence
def get_data_sequence(data, timesteps, inputs, outputs):
  sequences = list(get_sequence(data, timesteps))
  seq_count = len(sequences)
  
  if seq_count * timesteps < data.shape[0]:
    remaining_input_sequence = get_remaining_data(data, seq_count, timesteps)
    sequences.append(remaining_input_sequence)
  
  return sequences

##### print data sequences function
def print_batches_shapes(in_tensor, out_tensor):
  print(" input batches shapes:")

  for i in range(len(in_tensor)):
    print(" " + str(in_tensor[i].shape))

  print(" ------------")
  print(" output batches shapes:")

  for i in range(len(out_tensor)):
    print(" " + str(out_tensor[i].shape))

##### model training function (returns loss list for graphic)
def train_model(model, input_sequences, output_sequences, epoch_count):
  assert len(input_sequences) == len(output_sequences)
  batch_count = len(input_sequences)
  loss_list = []

  in_sh = input_sequences[0].shape
  out_sh = output_sequences[0].shape
  in_rem_sh = input_sequences[batch_count - 1].shape
  out_rem_sh = output_sequences[batch_count - 1].shape

  input_shape_remaining = (1, in_rem_sh[0], in_rem_sh[1])
  output_shape_remaining = (1, out_rem_sh[0], out_rem_sh[1])

  input_shape = (1, in_sh[0], in_sh[1])
  output_shape = (1, out_sh[0], out_sh[1])

  for epoch in range(epoch_count):
    for batch_idx in range(batch_count):
      in_shape = input_shape
      out_shape = output_shape

      if batch_idx == batch_count - 1:
        in_shape = input_shape_remaining 
        out_shape = output_shape_remaining

      input_batch = input_sequences[batch_idx].reshape(in_shape)
      output_batch = output_sequences[batch_idx].reshape(out_shape)

      loss = model.train_on_batch(input_batch, output_batch, reset_metrics=False, return_dict=True)
    loss_list.append(loss['loss'])

    print("epoch-" + str(epoch) + ";" + str(loss))
  
  return loss_list

# testing model (returns accumulate loss)
def test_model(model, input_test_sequences, output_test_sequences):
  batch_count = len(input_test_sequences)

  in_sh = input_test_sequences[0].shape
  out_sh = output_test_sequences[0].shape
  in_rem_sh = input_test_sequences[batch_count - 1].shape
  out_rem_sh = output_test_sequences[batch_count - 1].shape

  input_shape = (1, in_sh[0], in_sh[1])
  output_shape = (1, out_sh[0], out_sh[1])
  
  input_shape_remaining = (1, in_rem_sh[0], in_rem_sh[1])
  output_shape_remaining = (1, out_rem_sh[0], out_rem_sh[1])

  loss = 0

  for batch_idx in range(batch_count):
    in_shape = input_shape
    out_shape = output_shape

    if batch_idx == batch_count - 1:
      in_shape = input_shape_remaining 
      out_shape = output_shape_remaining

    input_batch = input_test_sequences[batch_idx].reshape(in_shape)
    output_batch = output_test_sequences[batch_idx].reshape(out_shape)

    loss = model.test_on_batch(input_batch, output_batch, return_dict=True, reset_metrics=False)
  
  return loss

def predict(model, input_data_sequences):
  batch_count = len(input_data_sequences)
  predictions = []

  in_sh = input_data_sequences[0].shape
  in_rem_sh = input_data_sequences[batch_count - 1].shape

  input_shape = (1, in_sh[0], in_sh[1])
  input_shape_remaining = (1, in_rem_sh[0], in_rem_sh[1])

  for i in range(batch_count):
    in_shape = input_shape

    if i == batch_count - 1:
      in_shape = input_shape_remaining 
    
    input_batch = input_data_sequences[i].reshape(in_shape)

    prediction = model.predict_on_batch(input_batch)
    predictions.append(prediction)
  
  return predictions

##### checking attributes cardinality, skip percent and categoricality. retutns list of categorical attributes
def check_attributes_info(df, cardinality_hold, expected_col_size):
  categorical_attributes = []

  for column in df.columns:
    col_data = df[column].to_numpy()
    size = len(col_data)
    
    assert size == expected_col_size
    cleared_data = col_data[~np.isnan(col_data)]

    card = len(np.unique(cleared_data))  # cardinality without Nan items
    skip_count = size - len(cleared_data)

    print(column + ": card=" + str(card) + "; skip: percent=" + str(skip_count * 100 / size) + ", count=" + str(skip_count))
    
    if card / size < cardinality_hold:
      categorical_attributes.append(card)

  return categorical_attributes

##### 'nan' items supplement function
def supplement_data(df, categorical_attributes):
  for column in df.columns:
    data =  np.array(df[column].to_numpy(), dtype=np.float64)

    for i in range(len(data)):
      if str(data[i]) == 'nan':
        if len(categorical_attributes) > 0 and column in categorical_attributes:
          data[i] = st.mode(data[~np.isnan(data)])
        else:
          data[i] = np.mean(data[~np.isnan(data)])
    df.drop([column], axis=1, inplace=True)
    df.insert(0, column, data)



### check attrubutes and supplement data

In [134]:
cardinality_hold = 0.15
expected_col_size = 1269

print("before supplement:")
categorical_attributes = check_attributes_info(df, cardinality_hold, expected_col_size)
print("categorical attributes: " + str(categorical_attributes) + "\n")

supplement_data(df, categorical_attributes)

print("after supplement:")
categorical_attributes = check_attributes_info(df, cardinality_hold, expected_col_size)
print("categorical attributes: " + str(categorical_attributes))

before supplement:
DPV[mbar]: card=1163; skip: percent=0.07880220646178093, count=1
PL[bara]: card=1042; skip: percent=0.07880220646178093, count=1
qg_sc[Sm3/d]: card=1250; skip: percent=0.0, count=0
qo_sc[Sm3/d]: card=1246; skip: percent=0.0, count=0
TL[DegC]: card=1214; skip: percent=0.07880220646178093, count=1
categorical attributes: []

after supplement:
TL[DegC]: card=1215; skip: percent=0.0, count=0
qo_sc[Sm3/d]: card=1246; skip: percent=0.0, count=0
qg_sc[Sm3/d]: card=1250; skip: percent=0.0, count=0
PL[bara]: card=1043; skip: percent=0.0, count=0
DPV[mbar]: card=1164; skip: percent=0.0, count=0
categorical attributes: []




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## Create model

In [135]:
# model parameters
batch_size = 60
timesteps = 60
input_size = len(inputs)
output_size = len(outputs)
lr = 0.001

mse = tf.keras.losses.MeanSquaredError()

model = tf.keras.Sequential()
model.add(Input((None, input_size)))
model.add(SimpleRNN(units=120, activation='tanh', return_sequences=True))
model.add(SimpleRNN(units=120, activation='tanh', return_sequences=True))
model.add(SimpleRNN(units=2, activation='tanh', return_sequences=True))
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_18 (SimpleRNN)   (None, None, 120)         14880     
                                                                 
 simple_rnn_19 (SimpleRNN)   (None, None, 120)         28920     
                                                                 
 simple_rnn_20 (SimpleRNN)   (None, None, 2)           246       
                                                                 
Total params: 44,046
Trainable params: 44,046
Non-trainable params: 0
_________________________________________________________________


In [136]:
model.compile(loss=mse, metrics=[mse], optimizer=Adam(learning_rate=lr))

## Prepare data

In [137]:
result_interval = interval_th
df = normilize_df(df, result_interval)

data = get_train_test(df, 0.3, inputs, outputs)

train_input_data = data.train_input
train_output_data = data.train_output

test_input_data = data.test_input
test_output_data = data.test_output


print("data shapes:")
print(train_input_data.shape)
print(test_input_data.shape)

print(train_output_data.shape)
print(test_output_data.shape)

data shapes:
(888, 3)
(381, 3)
(888, 2)
(381, 2)


In [138]:
input_train_sequences = get_data_sequence(train_input_data, timesteps, inputs, outputs)
output_train_sequences = get_data_sequence(train_output_data, timesteps, inputs, outputs)

input_test_sequences = get_data_sequence(test_input_data, timesteps, inputs, outputs)
output_test_sequences = get_data_sequence(test_output_data, timesteps, inputs, outputs)

train_sequences_count = len(input_train_sequences)
assert train_sequences_count == len(output_train_sequences)

test_sequences_count = len(input_test_sequences)
assert test_sequences_count == len(output_test_sequences)

print("train batches shapes: ")
print_batches_shapes(input_train_sequences, output_train_sequences)
print()
print("test batches shapes: ")
print_batches_shapes(input_test_sequences, output_test_sequences)

train batches shapes: 
 input batches shapes:
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (48, 3)
 ------------
 output batches shapes:
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (48, 2)

test batches shapes: 
 input batches shapes:
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (21, 3)
 ------------
 output batches shapes:
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (21, 2)


## Train model

In [139]:
epoch_count = 500
batch_count = len(input_train_sequences)

loss_list = train_model(model, input_train_sequences, output_train_sequences, epoch_count)

epoch-0;{'loss': 0.6766645312309265, 'mean_squared_error': 0.6766645312309265}
epoch-1;{'loss': 0.48173797130584717, 'mean_squared_error': 0.48173797130584717}
epoch-2;{'loss': 0.37623924016952515, 'mean_squared_error': 0.37623924016952515}
epoch-3;{'loss': 0.3199055790901184, 'mean_squared_error': 0.3199055790901184}
epoch-4;{'loss': 0.2703857123851776, 'mean_squared_error': 0.2703857123851776}
epoch-5;{'loss': 0.23593738675117493, 'mean_squared_error': 0.23593738675117493}
epoch-6;{'loss': 0.21409985423088074, 'mean_squared_error': 0.21409985423088074}
epoch-7;{'loss': 0.20386332273483276, 'mean_squared_error': 0.20386332273483276}
epoch-8;{'loss': 0.188686802983284, 'mean_squared_error': 0.188686802983284}
epoch-9;{'loss': 0.1755930781364441, 'mean_squared_error': 0.1755930781364441}
epoch-10;{'loss': 0.1639619916677475, 'mean_squared_error': 0.1639619916677475}
epoch-11;{'loss': 0.1534063071012497, 'mean_squared_error': 0.1534063071012497}
epoch-12;{'loss': 0.1434733271598816, 'mea

In [140]:
def plot_loss(loss_list):
  # plot loss graphic
  x = np.arange(1, len(loss_list) + 1)  
  y = np.array(loss_list)

  assert len(x) == len(y)

  fig = px.line(x=x, y=y, labels={'x':'epoch', 'y':'loss'})
  fig.show()

In [141]:
plot_loss(loss_list)

## Test model

In [142]:
# test on batches
test_loss = test_model(model, input_test_sequences, output_test_sequences)
print("test loss=" + str(test_loss['loss']))

test loss=0.007948346436023712


## Making predictions

In [143]:
input_data = df[inputs].to_numpy()
output_data = df[outputs].to_numpy()

print("data shapes:")
print(input_data.shape)
print(output_data.shape)

input_sequences = get_data_sequence(input_data, timesteps, inputs, outputs)
output_sequences = get_data_sequence(output_data, timesteps, inputs, outputs)

seq_count = len(input_sequences)
assert seq_count == len(output_sequences)

print("sequence count=" + str(seq_count))
print_batches_shapes(input_sequences, output_sequences)

predictions = predict(model, input_sequences)

data shapes:
(1269, 3)
(1269, 2)
sequence count=22
 input batches shapes:
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (9, 3)
 ------------
 output batches shapes:
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (9, 2)


In [144]:
# plot approximation graphic
def get_1d_data(tensor):
  size = len(tensor)
  result = tensor[0][0]

  for i in range(size - 1):
    item = tensor[i + 1][0]
    result = np.concatenate((result, item), axis=0)

  return result

def get_attribute_data(attribute_idx, one_dim_data):
  result = []
  
  for item in one_dim_data:
    result.append(item[attribute_idx])
  
  return np.array(result)

def plot_approximation(attribute_name, predicted_data, mode):
  comparison_graphic = go.Figure()

  x = np.arange(1, input_data.shape[0])
  y_actual = df[attribute_name].to_numpy()
  y_predicted = predicted_data

  comparison_graphic.add_trace(go.Scatter(x=x, y=y_actual,
                    mode=mode,
                    name=attribute_name + ' actual graphic',))

  comparison_graphic.add_trace(go.Scatter(x=x, y=y_predicted,
                    mode=mode,
                    name=attribute_name + ' predicted',))

  comparison_graphic.update_layout(
    height=900,
    width = 4000,
    title_text=attribute_name
  )
  return comparison_graphic

In [145]:
one_dim_predicted_data = get_1d_data(predictions)

print(one_dim_predicted_data.shape)

predicted_data1 = get_attribute_data(0, one_dim_predicted_data)
predicted_data2 = get_attribute_data(1, one_dim_predicted_data)

print("predicted data shapes: " + str(predicted_data1.shape) + " " + str(predicted_data2.shape))

(1269, 2)
predicted data shapes: (1269,) (1269,)


In [146]:
plot_approximation(outputs[0], predicted_data1, 'markers').show()

In [147]:
plot_approximation(outputs[1], predicted_data2, 'markers').show()

In [148]:
# plot as graphics
plot_approximation(outputs[0], predicted_data1, 'lines').show()

In [149]:
plot_approximation(outputs[1], predicted_data2, 'lines').show()

In [150]:
# calc R^2 for predected attributes
def get_determination(actual_data, predicted_data):
  data_size = len(actual_data)
  assert data_size == len(predicted_data)

  actual_mean = np.mean(actual_data)
  actual_mean_vector = np.full((data_size,), actual_mean)

  arg1 = np.linalg.norm((actual_data - predicted_data).reshape(1,data_size), ord='fro')**2

  arg2 = np.linalg.norm((actual_mean_vector - actual_data).reshape(1, data_size), ord='fro')**2

  return 1 - arg1 / arg2

In [151]:
actual_data1 = df[outputs[0]].to_numpy()
actual_data2 = df[outputs[1]].to_numpy()

k_1 = get_determination(predicted_data1, actual_data1)
k_2 = get_determination(predicted_data2, actual_data2)

print(outputs[0] + " determination=" + str(k_1))
print(outputs[1] + " determination=" + str(k_2))

qg_sc[Sm3/d] determination=0.9439729004086674
qo_sc[Sm3/d] determination=0.9248704129151507


## Save model

In [152]:
# save model in the default TensorFlow SavedModel format
need_safe = False
path = Path('/content/drive/MyDrive/Colab Notebooks/RNN_model')

if need_safe:
  model.save(path)

## Elman net baseline

### prepare data

In [153]:
file_name = 'DataVX.xlsx'
file_path = Path(dir_name + file_name)
print(file_path)

/content/drive/MyDrive/Colab Notebooks/DataVX.xlsx


In [154]:
df1 = pd.read_excel(file_path, sheet_name=0)
df1

Unnamed: 0,Date & Time,qo_lc[m3/d],qw_lc[m3/d],qg_lc[m3/d],qo_sc[Sm3/d],qw_sc[Sm3/d],qg_sc[Sm3/d],qo_scnp[Sm3/d],qw_scnp[Sm3/d],qg_scnp[Sm3/d],...,OperatingPointHE[1/m],OilPointLE[1/m],OilPointHE[1/m],WaterPointLE[1/m],WaterPointHE[1/m],GasPointLE[1/m],GasPointHE[1/m],co1min_qo_lc[m3/d],co1min_qw_lc[m3/d],co1min_qg_lc[m3/d]
0,07-May-2014 10:13:26,109.280,-0.312890,3810.3,108.430,-0.314870,92700,108.430,-0.314870,91298,...,1.00060,23.568,14.781,171.01,25.097,0.49280,0.32761,222.740,,
1,07-May-2014 10:14:26,113.460,-0.437350,3774.0,112.580,-0.440110,91895,112.580,-0.440110,90438,...,0.98577,23.570,14.782,171.01,25.097,0.49042,0.32603,354.370,,
2,07-May-2014 10:15:26,131.630,-0.533550,3892.5,130.550,-0.536910,95860,130.550,-0.536910,94153,...,1.04660,23.568,14.781,171.01,25.097,0.49134,0.32665,272.300,,
3,07-May-2014 10:16:26,140.670,0.160910,4043.1,139.440,0.161910,100690,139.440,0.161910,98844,...,1.11880,23.562,14.777,171.00,25.096,0.49673,0.33023,422.320,,
4,07-May-2014 10:17:26,150.020,-1.145600,4134.6,148.640,-1.152700,104290,148.640,-1.152700,102300,...,1.14540,23.553,14.771,170.99,25.094,0.50210,0.33380,289.400,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,08-May-2014 10:09:35,80.103,-0.336630,2803.4,79.417,-0.338870,73266,79.417,-0.338870,72160,...,0.83218,23.581,14.789,171.06,25.105,0.52021,0.34584,240.814,,
1420,08-May-2014 10:10:35,115.080,-0.604240,3057.9,114.010,-0.608230,81615,114.010,-0.608230,80000,...,1.09900,23.575,14.785,171.06,25.105,0.52916,0.35179,216.820,,
1421,08-May-2014 10:11:35,101.740,-0.369300,3188.5,100.750,-0.371710,84923,100.750,-0.371710,83494,...,1.14230,23.564,14.778,171.05,25.103,0.53543,0.35595,257.738,,
1422,08-May-2014 10:12:35,40.918,-0.060369,2357.4,40.563,-0.060761,59662,40.563,-0.060761,59109,...,0.94348,23.562,14.777,171.04,25.101,0.52875,0.35152,59.997,,


In [155]:
df2 = pd.read_excel(file_path, sheet_name=1)
df2

Unnamed: 0,qo_lc[m3/d],qw_lc[m3/d],qg_lc[m3/d],Date,co1min_qo_lc[m3/d],co1min_qw_lc[m3/d],co1min_qg_lc[m3/d],qo_lc_avg10,qo_lc_avg5,qo_lc_avg20,...,qw_lc_avg10,qw_lc_avg5,qw_lc_avg20,qw_lc_avg30,qw_lc_avg60,qg_lc_avg10,qg_lc_avg5,qg_lc_avg20,qg_lc_avg30,qg_lc_avg60
0,109.280,-0.312890,3810.3,07-May-2014 10:13:26,222.740,-0.750240,7584.3,109.7028,129.0120,103.18940,...,-0.368397,-0.453696,-0.342605,-0.350181,-0.374134,3868.08,3930.90,3775.645,3688.586667,3481.415000
1,113.460,-0.437350,3774.0,07-May-2014 10:14:26,354.370,-1.283790,11476.8,96.6760,90.3936,84.34390,...,-0.316812,-0.283099,-0.634228,-0.398086,2.503358,3683.21,3805.26,3379.860,3274.243333,3364.313333
2,131.630,-0.533550,3892.5,07-May-2014 10:15:26,495.040,-1.122880,15519.9,81.4646,103.3322,83.16180,...,-0.365334,-0.456182,-0.145568,3.357279,0.561701,3514.47,3820.22,3288.740,3295.953333,3465.146667
3,140.670,0.160910,4043.1,07-May-2014 10:16:26,645.060,-2.268480,19654.5,87.2232,90.0198,86.27455,...,-0.903123,-0.177442,0.162339,1.649437,0.611184,3245.25,3546.20,3247.110,3432.673333,3452.058333
4,150.020,-1.145600,4134.6,07-May-2014 10:17:26,784.440,-3.109510,23994.7,73.3470,69.6328,94.84145,...,-0.312707,-0.540187,7.118775,0.338276,0.184388,3153.03,3489.38,3419.875,3432.516667,3475.510000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1420,115.080,-0.604240,3057.9,08-May-2014 10:10:35,114101.902,-50.094090,4590911.4,,,,...,,,,,,,,,,
1421,101.740,-0.369300,3188.5,08-May-2014 10:11:35,114142.820,-50.154459,4593268.8,,,,...,,,,,,,,,,
1422,40.918,-0.060369,2357.4,08-May-2014 10:12:35,114161.899,-50.054654,4595193.8,,,,...,,,,,,,,,,
1423,19.079,0.099805,1925.0,08-May-2014 10:13:35,114290.911,-50.054654,4595193.8,,,,...,,,,,,,,,,


In [156]:
selected_rows = ['co1min_qo_lc[m3/d]', 'co1min_qg_lc[m3/d]', 'DPV[mbar]', 'PL[bara]', 'TL[DegC]']
outputs = ['co1min_qo_lc[m3/d]', 'co1min_qg_lc[m3/d]']
inputs = np.setdiff1d(selected_rows, outputs)

print("inputs: " + str(inputs))
print("outputs: " + str(outputs))

inputs: ['DPV[mbar]' 'PL[bara]' 'TL[DegC]']
outputs: ['co1min_qo_lc[m3/d]', 'co1min_qg_lc[m3/d]']


In [157]:
remove_spaces = lambda x: x.replace(" ", "")

# remove spaces from columns
for col in df1.columns:
  df1 = df1.rename(columns={col:remove_spaces(col)})

for col in df2.columns:
  df2 = df2.rename(columns={col:remove_spaces(col)})

In [158]:
df1

Unnamed: 0,Date&Time,qo_lc[m3/d],qw_lc[m3/d],qg_lc[m3/d],qo_sc[Sm3/d],qw_sc[Sm3/d],qg_sc[Sm3/d],qo_scnp[Sm3/d],qw_scnp[Sm3/d],qg_scnp[Sm3/d],...,OperatingPointHE[1/m],OilPointLE[1/m],OilPointHE[1/m],WaterPointLE[1/m],WaterPointHE[1/m],GasPointLE[1/m],GasPointHE[1/m],co1min_qo_lc[m3/d],co1min_qw_lc[m3/d],co1min_qg_lc[m3/d]
0,07-May-2014 10:13:26,109.280,-0.312890,3810.3,108.430,-0.314870,92700,108.430,-0.314870,91298,...,1.00060,23.568,14.781,171.01,25.097,0.49280,0.32761,222.740,,
1,07-May-2014 10:14:26,113.460,-0.437350,3774.0,112.580,-0.440110,91895,112.580,-0.440110,90438,...,0.98577,23.570,14.782,171.01,25.097,0.49042,0.32603,354.370,,
2,07-May-2014 10:15:26,131.630,-0.533550,3892.5,130.550,-0.536910,95860,130.550,-0.536910,94153,...,1.04660,23.568,14.781,171.01,25.097,0.49134,0.32665,272.300,,
3,07-May-2014 10:16:26,140.670,0.160910,4043.1,139.440,0.161910,100690,139.440,0.161910,98844,...,1.11880,23.562,14.777,171.00,25.096,0.49673,0.33023,422.320,,
4,07-May-2014 10:17:26,150.020,-1.145600,4134.6,148.640,-1.152700,104290,148.640,-1.152700,102300,...,1.14540,23.553,14.771,170.99,25.094,0.50210,0.33380,289.400,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419,08-May-2014 10:09:35,80.103,-0.336630,2803.4,79.417,-0.338870,73266,79.417,-0.338870,72160,...,0.83218,23.581,14.789,171.06,25.105,0.52021,0.34584,240.814,,
1420,08-May-2014 10:10:35,115.080,-0.604240,3057.9,114.010,-0.608230,81615,114.010,-0.608230,80000,...,1.09900,23.575,14.785,171.06,25.105,0.52916,0.35179,216.820,,
1421,08-May-2014 10:11:35,101.740,-0.369300,3188.5,100.750,-0.371710,84923,100.750,-0.371710,83494,...,1.14230,23.564,14.778,171.05,25.103,0.53543,0.35595,257.738,,
1422,08-May-2014 10:12:35,40.918,-0.060369,2357.4,40.563,-0.060761,59662,40.563,-0.060761,59109,...,0.94348,23.562,14.777,171.04,25.101,0.52875,0.35152,59.997,,


In [159]:
df2

Unnamed: 0,qo_lc[m3/d],qw_lc[m3/d],qg_lc[m3/d],Date,co1min_qo_lc[m3/d],co1min_qw_lc[m3/d],co1min_qg_lc[m3/d],qo_lc_avg10,qo_lc_avg5,qo_lc_avg20,...,qw_lc_avg10,qw_lc_avg5,qw_lc_avg20,qw_lc_avg30,qw_lc_avg60,qg_lc_avg10,qg_lc_avg5,qg_lc_avg20,qg_lc_avg30,qg_lc_avg60
0,109.280,-0.312890,3810.3,07-May-2014 10:13:26,222.740,-0.750240,7584.3,109.7028,129.0120,103.18940,...,-0.368397,-0.453696,-0.342605,-0.350181,-0.374134,3868.08,3930.90,3775.645,3688.586667,3481.415000
1,113.460,-0.437350,3774.0,07-May-2014 10:14:26,354.370,-1.283790,11476.8,96.6760,90.3936,84.34390,...,-0.316812,-0.283099,-0.634228,-0.398086,2.503358,3683.21,3805.26,3379.860,3274.243333,3364.313333
2,131.630,-0.533550,3892.5,07-May-2014 10:15:26,495.040,-1.122880,15519.9,81.4646,103.3322,83.16180,...,-0.365334,-0.456182,-0.145568,3.357279,0.561701,3514.47,3820.22,3288.740,3295.953333,3465.146667
3,140.670,0.160910,4043.1,07-May-2014 10:16:26,645.060,-2.268480,19654.5,87.2232,90.0198,86.27455,...,-0.903123,-0.177442,0.162339,1.649437,0.611184,3245.25,3546.20,3247.110,3432.673333,3452.058333
4,150.020,-1.145600,4134.6,07-May-2014 10:17:26,784.440,-3.109510,23994.7,73.3470,69.6328,94.84145,...,-0.312707,-0.540187,7.118775,0.338276,0.184388,3153.03,3489.38,3419.875,3432.516667,3475.510000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1420,115.080,-0.604240,3057.9,08-May-2014 10:10:35,114101.902,-50.094090,4590911.4,,,,...,,,,,,,,,,
1421,101.740,-0.369300,3188.5,08-May-2014 10:11:35,114142.820,-50.154459,4593268.8,,,,...,,,,,,,,,,
1422,40.918,-0.060369,2357.4,08-May-2014 10:12:35,114161.899,-50.054654,4595193.8,,,,...,,,,,,,,,,
1423,19.079,0.099805,1925.0,08-May-2014 10:13:35,114290.911,-50.054654,4595193.8,,,,...,,,,,,,,,,


In [160]:
sheet1_columns = df1.columns
sheet2_columns = df2.columns
space = "     "

def print_columns(columns):
  for i in range(len(columns)):
    col = columns[i]
    if (i + 1) % 6 == 0:
      print(str(col) + space)
    else:
      print(str(col) + space, end='')

print("sheet1 columns: ")
print_columns(sheet1_columns)
print('\n\n')
print("sheet2 columns: ")
print_columns(sheet2_columns)

sheet1 columns: 
Date&Time     qo_lc[m3/d]     qw_lc[m3/d]     qg_lc[m3/d]     qo_sc[Sm3/d]     qw_sc[Sm3/d]     
qg_sc[Sm3/d]     qo_scnp[Sm3/d]     qw_scnp[Sm3/d]     qg_scnp[Sm3/d]     Fo_lc[%]     Fw_lc[%]     
Fg_lc[%]     WLR[%]     GVF[%]     GLR[m3/m3]     BSW[%]     GOR[Sm3/Sm3]     
GOR1[Sm3/Sm3]     mo_lc[kg/d]     mw_lc[kg/d]     mg_lc[kg/d]     m_lc[kg/d]     mo_sc[kg/d]     
mw_sc[kg/d]     mg_sc[kg/d]     Mu_o_lc[cP]     Mu_l_lc[cP]     Do_lc[g/cm3]     Dw_lc[g/cm3]     
Dg_lc[g/cm3]     Dm_lc[g/cm3]     Dl_lc[g/cm3]     bo[Sm3/m3]     bw[Sm3/m3]     bg[Sm3/m3]     
Z     Rst[Sm3/Sm3]     Rwst[Sm3/Sm3]     rgmp[Sm3/Sm3]     N32[cps]     N81[cps]     
N356[cps]     NTotal[cps]     DeadTime[s]     SampleTime[s]     DPV[mbar]     PL[bara]     
TL[DegC]     TAMB[DegC]     Dyn_DP[mbar]     OperatingPointLE[1/m]     OperatingPointHE[1/m]     OilPointLE[1/m]     
OilPointHE[1/m]     WaterPointLE[1/m]     WaterPointHE[1/m]     GasPointLE[1/m]     GasPointHE[1/m]     co1min_qo_lc

In [161]:
def check_containes(columns, attributes):
  for attr in attributes:
    contains = False

    for col in columns:
      if attr == col:
        print(str(attr) + ": contains")
        contains = True
        break
    if not contains:
      print(str(attr) + ": not contains")
  print()

print("check input in sheet1:")
check_containes(sheet1_columns, inputs)

print("check output in sheet1:")
check_containes(sheet1_columns, outputs)

print("check input in sheet2:")
check_containes(sheet2_columns, inputs)

print("check input in sheet2:")
check_containes(sheet2_columns, outputs)

check input in sheet1:
DPV[mbar]: contains
PL[bara]: contains
TL[DegC]: contains

check output in sheet1:
co1min_qo_lc[m3/d]: contains
co1min_qg_lc[m3/d]: contains

check input in sheet2:
DPV[mbar]: not contains
PL[bara]: not contains
TL[DegC]: not contains

check input in sheet2:
co1min_qo_lc[m3/d]: contains
co1min_qg_lc[m3/d]: contains



In [162]:
#check output data in both dataframes
df1[outputs]

Unnamed: 0,co1min_qo_lc[m3/d],co1min_qg_lc[m3/d]
0,222.740,
1,354.370,
2,272.300,
3,422.320,
4,289.400,
...,...,...
1419,240.814,
1420,216.820,
1421,257.738,
1422,59.997,


In [163]:
df2[outputs]

Unnamed: 0,co1min_qo_lc[m3/d],co1min_qg_lc[m3/d]
0,222.740,7584.3
1,354.370,11476.8
2,495.040,15519.9
3,645.060,19654.5
4,784.440,23994.7
...,...,...
1420,114101.902,4590911.4
1421,114142.820,4593268.8
1422,114161.899,4595193.8
1423,114290.911,4595193.8


In [164]:
print("rows count:")
print("df1: " + str(df1.shape[0]))
print("df2: " + str(df2.shape[0]))

rows count:
df1: 1424
df2: 1425


In [165]:
# constract dataframe (get inputs from df1 and ouputs from df2)
df2.drop([df2.shape[0] - 1], axis=0, inplace=True) # last row in df2 is redurand
df = pd.concat([df1[inputs], df2[outputs]], axis=1)
df

Unnamed: 0,DPV[mbar],PL[bara],TL[DegC],co1min_qo_lc[m3/d],co1min_qg_lc[m3/d]
0,1448.70,21.481,-2.5452,222.740,7584.3
1,1462.10,21.473,-2.5323,354.370,11476.8
2,1592.30,21.672,-2.3931,495.040,15519.9
3,1703.30,21.926,-2.0451,645.060,19654.5
4,1821.70,22.196,-1.8009,784.440,23994.7
...,...,...,...,...,...
1419,916.33,22.816,-3.9957,114000.162,4587722.9
1420,1093.20,23.209,-3.6189,114101.902,4590911.4
1421,1118.00,23.271,-3.2079,114142.820,4593268.8
1422,697.57,22.292,-3.1773,114161.899,4595193.8


In [166]:
print("rows count=" + str(df.shape[0]))

rows count=1424


### plot attributes distributions

In [167]:
def plot_attribute_distribution(df, attribute_name):
  y_data = df[attribute_name].to_numpy()
  quantile_first = np.quantile(y_data, 0.25)
  quantile_third = np.quantile(y_data, 0.75)

  x_data = np.arange(1, len(y_data) + 1)

  fig = go.Figure()

  fig.add_trace(go.Scatter(y=y_data, x=x_data, name=attribute_name, mode='markers'))
  fig.add_trace(go.Scatter(x=[1, len(y_data) + 1], y=[quantile_first, quantile_first], name='quantile first')) # add first quantile
  fig.add_trace(go.Scatter(x=[1, len(y_data) + 1], y=[quantile_third, quantile_third], name='quantile third')) # add third quantile
  fig.show()

In [168]:
plot_attribute_distribution(df, 'DPV[mbar]')

In [169]:
plot_attribute_distribution(df, 'PL[bara]')

In [170]:
plot_attribute_distribution(df, 'TL[DegC]')

In [171]:
plot_attribute_distribution(df, 'co1min_qo_lc[m3/d]')

In [172]:
plot_attribute_distribution(df, 'co1min_qg_lc[m3/d]')

### supplement skip

In [173]:
cardinality_hold = 0.15
expected_col_size = 1424

print("before supplement:")
categorical_attributes = check_attributes_info(df, cardinality_hold, expected_col_size)

print("categorical attributes:")
print(str(categorical_attributes))

supplement_data(df, categorical_attributes)

print("after supplement:")
categorical_attributes = check_attributes_info(df, cardinality_hold, expected_col_size)

print("categorical attributes:")
print(str(categorical_attributes))

before supplement:
DPV[mbar]: card=1302; skip: percent=0.0, count=0
PL[bara]: card=1150; skip: percent=0.0, count=0
TL[DegC]: card=1367; skip: percent=0.0, count=0
co1min_qo_lc[m3/d]: card=1424; skip: percent=0.0, count=0
co1min_qg_lc[m3/d]: card=1423; skip: percent=0.0, count=0
categorical attributes:
[]
after supplement:
co1min_qg_lc[m3/d]: card=1423; skip: percent=0.0, count=0
co1min_qo_lc[m3/d]: card=1424; skip: percent=0.0, count=0
TL[DegC]: card=1367; skip: percent=0.0, count=0
PL[bara]: card=1150; skip: percent=0.0, count=0
DPV[mbar]: card=1302; skip: percent=0.0, count=0
categorical attributes:
[]


In [174]:
result_interval = interval_sigmoid
df = ((df - df.min()) / (df.max() - df.min())) * (result_interval[1] - result_interval[0]) + result_interval[0]
df

Unnamed: 0,co1min_qg_lc[m3/d],co1min_qo_lc[m3/d],TL[DegC],PL[bara],DPV[mbar]
0,0.000000,0.000000,0.833583,0.131079,0.601399
1,0.000848,0.001154,0.835535,0.129928,0.608992
2,0.001730,0.002387,0.856602,0.158561,0.682774
3,0.002631,0.003702,0.909270,0.195108,0.745675
4,0.003577,0.004924,0.946228,0.233957,0.812770
...,...,...,...,...,...
1419,0.998372,0.997451,0.614060,0.323165,0.299716
1420,0.999067,0.998343,0.671086,0.379712,0.399944
1421,0.999580,0.998702,0.733288,0.388633,0.413998
1422,1.000000,0.998869,0.737919,0.247770,0.175750


In [175]:

data = get_train_test(df, 0.3, inputs, outputs)

train_input_data = data.train_input
train_output_data = data.train_output
test_input_data = data.test_input
test_output_data = data.test_output

print("data shapes:")
print(train_input_data.shape)
print(train_output_data.shape)

print(test_input_data.shape)
print(test_output_data.shape)

data shapes:
(996, 3)
(996, 2)
(428, 3)
(428, 2)


In [176]:
timesteps = 60
input_train_sequences = get_data_sequence(train_input_data, timesteps, inputs, outputs)
output_train_sequences = get_data_sequence(train_output_data, timesteps, inputs, outputs)

input_test_sequences = get_data_sequence(test_input_data, timesteps, inputs, outputs)
output_test_sequences = get_data_sequence(test_output_data, timesteps, inputs, outputs)

print("train batches:")
print_batches_shapes(input_train_sequences, output_train_sequences)
print()
print("test batches:")
print_batches_shapes(input_test_sequences, output_test_sequences)


train batches:
 input batches shapes:
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (36, 3)
 ------------
 output batches shapes:
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (36, 2)

test batches:
 input batches shapes:
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (8, 3)
 ------------
 output batches shapes:
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (8, 2)


## Create model

In [177]:
epoch_count = 500
input_size = len(inputs)
output_size = len(outputs)
lr = 0.001

mse = tf.keras.losses.MeanSquaredError()

model = tf.keras.Sequential()
model.add(Input((None, input_size)))
model.add(SimpleRNN(units=120, activation=activations.sigmoid, return_sequences=True))
model.add(SimpleRNN(units=120, activation=activations.sigmoid, return_sequences=True))

output_dense_layer = Dense(len(outputs), activation=activations.sigmoid)
model.add(TimeDistributed(output_dense_layer))

model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_21 (SimpleRNN)   (None, None, 120)         14880     
                                                                 
 simple_rnn_22 (SimpleRNN)   (None, None, 120)         28920     
                                                                 
 time_distributed_9 (TimeDis  (None, None, 2)          242       
 tributed)                                                       
                                                                 
Total params: 44,042
Trainable params: 44,042
Non-trainable params: 0
_________________________________________________________________


In [178]:
model.compile(loss=mse, metrics=[mse], optimizer=Adam(learning_rate=lr))

In [179]:
loss_list = train_model(model, input_train_sequences, output_train_sequences, epoch_count)

epoch-0;{'loss': 0.13305428624153137, 'mean_squared_error': 0.13305428624153137}
epoch-1;{'loss': 0.08031371980905533, 'mean_squared_error': 0.08031371980905533}
epoch-2;{'loss': 0.082490473985672, 'mean_squared_error': 0.082490473985672}
epoch-3;{'loss': 0.0753120705485344, 'mean_squared_error': 0.0753120705485344}
epoch-4;{'loss': 0.07167946547269821, 'mean_squared_error': 0.07167946547269821}
epoch-5;{'loss': 0.06949924677610397, 'mean_squared_error': 0.06949924677610397}
epoch-6;{'loss': 0.0676262155175209, 'mean_squared_error': 0.0676262155175209}
epoch-7;{'loss': 0.06625227630138397, 'mean_squared_error': 0.06625227630138397}
epoch-8;{'loss': 0.06513819098472595, 'mean_squared_error': 0.06513819098472595}
epoch-9;{'loss': 0.0642031878232956, 'mean_squared_error': 0.0642031878232956}
epoch-10;{'loss': 0.0634128674864769, 'mean_squared_error': 0.0634128674864769}
epoch-11;{'loss': 0.06272850930690765, 'mean_squared_error': 0.06272850930690765}
epoch-12;{'loss': 0.062127385288476944

## Test on batches

In [180]:
# test on batches
test_loss = test_model(model, input_test_sequences, output_test_sequences)
print("test loss=" + str(test_loss['loss']))

test loss=0.04200601950287819


## Make predicttions

In [181]:
input_data = df[inputs].to_numpy()
output_data = df[outputs].to_numpy()

print("data shapes:")
print(input_data.shape)
print(output_data.shape)

input_sequences = get_data_sequence(input_data, timesteps, inputs, outputs)
output_sequences = get_data_sequence(output_data, timesteps, inputs, outputs)

seq_count = len(input_sequences)
assert seq_count == len(output_sequences)

print("sequence count=" + str(seq_count))
print_batches_shapes(input_sequences, output_sequences)

predictions = predict(model, input_sequences)

data shapes:
(1424, 3)
(1424, 2)
sequence count=24
 input batches shapes:
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (60, 3)
 (44, 3)
 ------------
 output batches shapes:
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (60, 2)
 (44, 2)


In [182]:
one_dim_predicted_data = get_1d_data(predictions)

print(one_dim_predicted_data.shape)

predicted_data1 = get_attribute_data(0, one_dim_predicted_data)
predicted_data2 = get_attribute_data(1, one_dim_predicted_data)

print("predicted data shapes: " + str(predicted_data1.shape) + " " + str(predicted_data2.shape))

(1424, 2)
predicted data shapes: (1424,) (1424,)


## Plot approximations

In [183]:
# plot as graphics
plot_approximation(outputs[0], predicted_data1, 'lines').show()

In [184]:
# plot as graphics
plot_approximation(outputs[1], predicted_data2, 'lines').show()

## Calc determination coefficient

In [185]:
actual_data1 = df[outputs[0]].to_numpy()
actual_data2 = df[outputs[1]].to_numpy()

k_1 = get_determination(predicted_data1, actual_data1)
k_2 = get_determination(predicted_data2, actual_data2)

print(outputs[0] + " determination=" + str(k_1))
print(outputs[1] + " determination=" + str(k_2))

co1min_qo_lc[m3/d] determination=-151.1892477209643
co1min_qg_lc[m3/d] determination=-96.30250613144345
