In [2]:
import os
import math

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.python.keras.api import keras
from tensorflow.keras.utils import timeseries_dataset_from_array

from hpcscripts.sharedutils.trainingutils import LoadModel, SetLowTFVerbose, MakeSinglePrediction
from hpcscripts.sharedutils.nomalization import DF_Nomalize, denorm
from hpcscripts.sharedutils.modelutils import SelectModelPrompt
from hpcscripts.option import pathhandler as ph
from hpcscripts.option import globalparams as G_PARAMS

# Ask the project helper to lower TensorFlow's log verbosity.
# NOTE(review): the helper's implementation is not visible here, and this cell's
# output still contains TensorFlow I/W messages — confirm whether C++-level
# logging (e.g. TF_CPP_MIN_LOG_LEVEL) has to be configured before
# `import tensorflow` for it to take effect.
SetLowTFVerbose()

2022-05-10 02:46:38.681511: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-05-10 02:46:38.725588: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-05-10 02:46:38.725612: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-05-10 02:46:38.726135: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

In [5]:
train_dir = ph.GetProcessedPath("Selected")
test_dir  = ph.GetProcessedPath("Test")

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# makes the file order (and any later slice such as test_list[:3]) reproducible
# across machines and runs.
test_files = sorted(os.listdir(test_dir))
held_out   = set(test_files)

# Every file that also appears in the test directory is held out of training.
# Filtering by membership (instead of list.remove) is O(1) per file and does not
# raise when a test file has no counterpart in the train directory.
train_list = [
    os.path.join(train_dir, name)
    for name in sorted(os.listdir(train_dir))
    if name not in held_out
]
test_list = [os.path.join(test_dir, name) for name in test_files]

print ("Train count: {}".format(len(train_list)))
print ("Test count : {}".format(len(test_list)))

Train count: 350
Test count : 61


In [6]:
# Work with only the first three test files while experimenting.
# (The training list is left at full size; slice it the same way for a quick run.)
test_list = test_list[0:3]

print(
    "Train count: {}".format(len(train_list)),
    "Test count : {}".format(len(test_list)),
    sep="\n",
)

Train count: 350
Test count : 3


In [14]:
# Function Version

from typing import List, Optional

def to_datasets(files: List[str],
                columns: Optional[List[str]] = None,
                sequence_length: int = 5):
    """Read CSV files and stack their sliding windows into one NumPy array.

    Each file is windowed independently (stride 1), so no window ever spans
    two files. The windows of all files are then concatenated along axis 0.

    Args:
        files: Paths of the CSV files to read.
        columns: Column names to keep, in output order. Defaults to
            ["hralt_m", "theta_rad", "aoac_rad", "cas_mps", "elv_l_rad"].
        sequence_length: Number of consecutive rows per window.

    Returns:
        Array of shape (total_windows, sequence_length, len(columns)). It can be
        wrapped with `tf.data.Dataset.from_tensor_slices` by the caller.

    Raises:
        Exception: If a file cannot be read or lacks one of `columns`
            (the underlying error is chained as `__cause__`).
        ValueError: If `sequence_length` < 1 or no file yields any window.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be >= 1, got {}".format(sequence_length))

    if columns is None:
        columns = ["hralt_m", "theta_rad", "aoac_rad", "cas_mps", "elv_l_rad"]
    else:
        columns = list(columns)

    np_data = []
    # Row offsets inside one window: 0, 1, ..., sequence_length - 1.
    offsets = np.arange(sequence_length)

    for i, file in enumerate(files):
        try:
            data = pd.read_csv(file)
            values = data.loc[:, columns].to_numpy()
        except Exception as err:
            # Keep the original exception type/message, but preserve the cause
            # and let KeyboardInterrupt/SystemExit propagate untouched.
            raise Exception("Can't process {}".format(file)) from err

        # Files shorter than one window contribute nothing.
        n_windows = values.shape[0] - sequence_length + 1
        if n_windows <= 0:
            continue

        # windows[w, t, c] == values[w + t, c]; built in one shot so that every
        # window of the file is kept, however long the file is.
        starts = np.arange(n_windows)[:, None]
        windows = values[starts + offsets]
        np_data.append(windows)
        print ("{}, i: {}".format(windows.shape, i), end='\r')

    print()
    if not np_data:
        raise ValueError(
            "No windows produced: `files` is empty or every file has fewer "
            "than {} rows".format(sequence_length)
        )
    return np.concatenate(np_data)


In [15]:
# Stack the windows of every training file into a single NumPy array.
train_ds = to_datasets(train_list)

# A shuffle buffer as large as the whole array shuffles all windows together.
shuffle_size = len(train_ds)
print(shuffle_size)

# Convert to a tf.data pipeline step by step: slice -> shuffle -> fixed-size batches.
train_ds = tf.data.Dataset.from_tensor_slices(train_ds)
train_ds = train_ds.shuffle(shuffle_size)
train_ds = train_ds.batch(32, drop_remainder=True)
train_ds.element_spec

(397, 5, 5), i: 349
108904


TensorSpec(shape=(32, 5, 5), dtype=tf.float64, name=None)

In [16]:
# Walk the dataset once to count its batches; `count` stays 0 if it is empty.
count = 0
for count, elem in enumerate(train_ds, start=1):
    # '\r' keeps the progress on a single, continuously overwritten line.
    print("{}, Count: {}".format(elem.shape, count), end='\r')

print()
# Pad to 100 characters so leftovers of the progress line are overwritten.
print(str(count).ljust(100, " "))
# Every batch holds exactly 32 windows because drop_remainder=True.
print("Total: {:.0f}".format(count*32))

(32, 5, 5), Count: 3403
3403                                                                                                
Total: 108896


### WindowGenerator class version

In [7]:
# The WindowGenerator definition below is based on the official TensorFlow
# tutorial at:
#        https://www.tensorflow.org/tutorials/structured_data/time_series
# which is licensed under the Apache License, Version 2.0.



# Copyright 2019 The TensorFlow Authors

#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import List

class WindowGenerator():
    """Builds windowed `tf.data.Dataset`s from lists of CSV files.

    Each CSV file is cut into sliding windows of `input_width + shift` rows
    (stride 1, never spanning two files). The windows of all files are
    concatenated, batched, and split into `(inputs, labels)` pairs by
    `split_window`.
    """

    # Columns that may be used as inputs or labels (taken from the project config).
    USABLE_COLUMNS  = set(G_PARAMS.FEATURE_COLUMNS + G_PARAMS.SEQUENTIAL_LABELS)
    # Columns used as model inputs, in this order.
    FEATURE_COLUMNS = G_PARAMS.FEATURE_COLUMNS

    # Internal chunk size used while materialising the windows of one file.
    _FILE_CHUNK_SIZE = 99999

    def __init__(self, input_width:int, label_width:int=1, shift:int=1,
                train_list=None, test_list=None, val_list=None,
                label_columns=None,
                shuffle_train:bool=True,
                batch_size:int=32):
        """Store the file lists and work out the window geometry.

        Args:
            input_width: Number of time steps fed to the model.
            label_width: Number of time steps (at the end of the window) used as labels.
            shift: Number of extra steps after the inputs; the total window is
                `input_width + shift` rows long.
            train_list, test_list, val_list: Lists of CSV file paths.
            label_columns: Names of the label columns; None keeps every column.
            shuffle_train: Whether the `train` dataset is shuffled.
            batch_size: Number of windows per batch (incomplete batches are dropped).

        Raises:
            ValueError: If no file list contains a file to read the column
                layout from.
        """
        # Store list of the data.
        self.train_list = train_list
        self.test_list = test_list
        self.val_list = val_list

        # Set shuffle_train and batching.
        self.shuffle_train = shuffle_train
        self.batch_size = batch_size

        # Work out the label column indices.
        self.input_columns = self.FEATURE_COLUMNS
        self.label_columns = label_columns
        # Always defined so that attribute access never fails when no labels are given.
        self.label_columns_indices = None
        if label_columns is not None:
            self.label_columns_indices = {name: i for i, name in enumerate(label_columns)}

        self.column_indices = self.__get_column_indices()

        # Work out the window parameters.
        self.input_width = input_width
        self.label_width = label_width
        self.shift = shift

        self.total_window_size = input_width + shift

        self.input_slice = slice(0, input_width)
        self.input_indices = np.arange(self.total_window_size)[self.input_slice]

        # Labels are the last `label_width` steps of the window.
        self.label_start = self.total_window_size - self.label_width
        self.labels_slice = slice(self.label_start, None)
        self.label_indices = np.arange(self.total_window_size)[self.labels_slice]

    def __get_column_indices(self):
        """Map each usable column name to its position in the raw CSV layout.

        The positions refer to the full CSV column order, which is what
        `make_dataset` passes on via `DataFrame.to_numpy()`.
        """
        # Use the first available file as the sample; the train list is
        # preferred, the other lists are a fallback when it is missing.
        for file_list in (self.train_list, self.val_list, self.test_list):
            if file_list:
                df = pd.read_csv(file_list[0])
                return {name: i for i, name in enumerate(df.columns) if name in self.USABLE_COLUMNS}
        raise ValueError(
            "WindowGenerator needs at least one non-empty file list "
            "(train_list, val_list or test_list) to determine the column layout"
        )

    def split_window(self, features):
        """Split a batch of windows (batch, time, columns) into (inputs, labels)."""
        inputs = features[:, self.input_slice, :]
        labels = features[:, self.labels_slice, :]
        if self.input_columns is not None:
            inputs = tf.stack(
                            [inputs[:, :, self.column_indices[name]] for name in self.input_columns],
                            axis=-1
                    )
        if self.label_columns is not None:
            labels = tf.stack(
                            [labels[:, :, self.column_indices[name]] for name in self.label_columns],
                            axis=-1
                    )

        # Slicing doesn't preserve static shape information, so set the shapes
        # manually. This way the `tf.data.Datasets` are easier to inspect.
        inputs.set_shape([None, self.input_width, None])
        labels.set_shape([None, self.label_width, None])

        return inputs, labels

    def make_dataset(self, file_list: List[str], do_shuffle:bool = False):
        """Build a batched dataset of `(inputs, labels)` from CSV files.

        Args:
            file_list: Paths of the CSV files to window.
            do_shuffle: Shuffle the windows (across all files) before batching.

        Returns:
            A `tf.data.Dataset` of `(inputs, labels)` batches with
            `self.batch_size` windows each; a trailing partial batch is dropped.

        Raises:
            ValueError: If `file_list` is None/empty or yields no window.
            Exception: If a file cannot be read (original error chained).
        """
        if not file_list:
            raise ValueError(
                "file_list is None or empty; pass the corresponding list "
                "(train_list / val_list / test_list) to WindowGenerator"
            )

        np_data = []
        for file in file_list:
            try:
                data = pd.read_csv(file)
                #
                # PREPROCESS HERE,
                # NORMALIZATION, ETC.
            except Exception as err:
                # Preserve the cause; do not trap KeyboardInterrupt/SystemExit.
                raise Exception("Can't process {}".format(file)) from err

            ds = timeseries_dataset_from_array(
                        data=data.to_numpy(),
                        targets=None,
                        sequence_length=self.total_window_size,
                        sequence_stride=1,
                        batch_size=self._FILE_CHUNK_SIZE
                    )

            # Consume every chunk: taking only the first one would silently
            # drop windows of files with more than _FILE_CHUNK_SIZE windows.
            for elem in ds:
                np_data.append(
                        elem.numpy()
                    )

        if not np_data:
            raise ValueError("None of the given files produced a window")

        concated_np = np.concatenate(np_data)
        # A buffer as large as the data shuffles all windows together.
        buffer_size = concated_np.shape[0]

        ds = tf.data.Dataset.from_tensor_slices(concated_np)
        if do_shuffle:
            ds = ds.shuffle(buffer_size)
        ds = ds.batch(self.batch_size, drop_remainder=True)
        ds = ds.map(self.split_window)

        return ds

    def __repr__(self):
        return '\n'.join([
            f'Total window size: {self.total_window_size}',
            f'Input indices: {self.input_indices}',
            f'Label indices: {self.label_indices}',
            f'Label column name(s): {self.label_columns}'])

    # NOTE: each property access rebuilds the dataset and re-reads every CSV;
    # keep the returned dataset in a variable instead of re-accessing it.
    @property
    def train(self):
        """Training dataset (shuffled when `shuffle_train` is set)."""
        return self.make_dataset(self.train_list, self.shuffle_train)

    @property
    def val(self):
        """Validation dataset (never shuffled)."""
        return self.make_dataset(self.val_list)

    @property
    def test(self):
        """Test dataset (never shuffled)."""
        return self.make_dataset(self.test_list)



In [8]:
# Single-step window: one input row, with the labels taken from that same row
# (shift=0), read from the training files in their original order.
w1 = WindowGenerator(
    input_width=1,
    shift=0,
    train_list=train_list,
    label_columns=["elv_l_rad", "theta_rad"],
    shuffle_train=False,
)
w1

Total window size: 1
Input indices: [0]
Label indices: [0]
Label column name(s): ['elv_l_rad', 'theta_rad']

In [12]:
# Accessing `.train` calls make_dataset(train_list, shuffle_train), which reads
# every CSV in train_list on each access — keep the result in a variable.
train_ds = w1.train

In [15]:
# Peek at the first batch: report the tensor shapes, then show the first window.
for example_inputs, example_labels in train_ds.take(1):
    print(
        f'Inputs shape (batch, time, features): {example_inputs.shape}',
        f'Labels shape (batch, time, features): {example_labels.shape}',
        sep='\n',
    )

    # Index 0 selects the first window of the batch (all time steps and features).
    print(example_inputs[0])
    print(example_labels[0])


Inputs shape (batch, time, features): (32, 1, 4)
Labels shape (batch, time, features): (32, 1, 2)
tf.Tensor([[ 5.51992800e+02  1.15045175e-03 -3.29803196e-02  8.74548000e+01]], shape=(1, 4), dtype=float64)
tf.Tensor([[-0.07680304  0.00115045]], shape=(1, 2), dtype=float64)


In [14]:
# Cross-check the window printed above against the first rows of the first file.
_df = pd.read_csv(train_list[0])
# Select with an ordered list rather than the set itself: set indexers are
# deprecated in pandas 1.5 and raise TypeError from 2.0, and a set has no
# stable order. Using the file's own column order keeps the display reproducible.
_df = _df.loc[:, [name for name in _df.columns if name in w1.USABLE_COLUMNS]]
_df.head()

Unnamed: 0,theta_rad,cas_mps,aoac_rad,hralt_m,elv_l_rad
0,0.00115,87.4548,-0.03298,551.9928,-0.076803
1,-0.001342,87.390495,-0.038349,552.6024,-0.077874
2,-0.003451,87.229732,-0.037582,552.2976,-0.075732
3,-0.003835,87.32619,-0.042184,546.2016,-0.073589
4,-0.003643,87.390495,-0.029145,546.5064,-0.072518
