In [19]:
import os

# Framework toggles: a truthy value enables the matching setup branch below.
TENSORFLOW = 1
TORCH = 1
if TORCH:
    # An empty CUDA_VISIBLE_DEVICES hides every CUDA device from code that
    # initialises CUDA after this assignment.
    # NOTE(review): this runs before the TensorFlow import below, so on a fresh
    # kernel TensorFlow would presumably see no GPU either. The recorded output
    # ("1 Physical GPUs") suggests CUDA was already initialised when this cell
    # ran - confirm the intended behaviour with Restart & Run All.
    os.environ['CUDA_VISIBLE_DEVICES']=''

if TENSORFLOW:
    import os
    import tensorflow as tf


    gpus = tf.config.list_physical_devices('GPU')
    if gpus:
        try:
            # Make only the first physical GPU visible to TensorFlow.
            tf.config.set_visible_devices(gpus[0], 'GPU')
            # Back that GPU with a single logical device capped at
            # memory_limit=10000 (megabytes according to the TensorFlow API
            # documentation - TODO confirm this fits the card in use).
            tf.config.set_logical_device_configuration(
                gpus[0],
                [tf.config.LogicalDeviceConfiguration(memory_limit=10000)])
            logical_gpus = tf.config.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            # Presumably raised when the devices were already initialised and
            # can no longer be reconfigured; the message is printed and the
            # notebook carries on with the existing configuration.
            print(e)


1 Physical GPUs, 1 Logical GPUs


In [20]:
import pandas as pd
from functools import reduce
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.metrics import mean_squared_error
import math
import tensorflow as tf


# Report how many physical GPUs TensorFlow can see in this kernel.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Month value columns, in calendar order.
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Column layout passed to read_csv: five leading identifier columns, then for
# every month its value column immediately followed by its '<month>_SYM'
# companion column.
column_names = ['ID', 'PARAM', 'TYPE', 'YEAR', 'DD'] + [
    label
    for month in MONTHS
    for label in (month, f'{month}_SYM')
]

# Load the raw export: the first two lines of the file are skipped and the
# columns are labelled from `column_names` rather than from a header row.
data = pd.read_csv(
    'data/credit_hydrometric_data_all_stations.csv',
    names=column_names,
    skiprows=2,
)

Num GPUs Available:  1


In [21]:
# One frame per station ID, each re-indexed from 0 so later positional access
# does not depend on the row positions in `data`.
_02HB018 = data.loc[data['ID'].eq('02HB018')].reset_index(drop=True)
_02HB025 = data.loc[data['ID'].eq('02HB025')].reset_index(drop=True)
_02HB031 = data.loc[data['ID'].eq('02HB031')].reset_index(drop=True)
_02HB001 = data.loc[data['ID'].eq('02HB001')].reset_index(drop=True)
_02HB013 = data.loc[data['ID'].eq('02HB013')].reset_index(drop=True)

# Station whose water level is the prediction target further down.
_02HB029 = data.loc[data['ID'].eq('02HB029')].reset_index(drop=True)  ### TARGET


In [22]:
import pandas as pd
import calendar

def is_valid_date(year, month, day):
    """Return True when (year, month, day) is a real calendar date.

    Args:
        year: Calendar year (integer-like).
        month: Month number, 1-12.
        day: Day of the month.

    Returns:
        bool: True only if `day` lies between 1 and the last day of the given
        month (inclusive). Returns False for days below 1, NaN days, month
        numbers outside 1-12, and arguments of a type the calendar module
        cannot handle.
    """
    try:
        last_day = calendar.monthrange(year, month)[1]
        # The lower bound matters: without it day 0, negative days and NaN
        # (for which every comparison is False) were reported as valid.
        return bool(1 <= day <= last_day)
    except (TypeError, ValueError):
        # monthrange raises a ValueError subclass for an out-of-range month and
        # TypeError for non-integer input; both simply mean "not a valid date".
        return False


def split_by_param_1_2(df: pd.DataFrame):
    """Split a station frame by its 'PARAM' code.

    Args:
        df: Frame with a 'PARAM' column.

    Returns:
        tuple: (rows where PARAM == 1, rows where PARAM == 2), each with a
        fresh 0-based index. Rows with any other PARAM value are dropped.
    """
    param_codes = df['PARAM']
    subsets = [
        df.loc[param_codes == code].reset_index(drop=True)
        for code in (1, 2)
    ]
    return subsets[0], subsets[1]

def wateroffice_to_timeseries(df : pd.DataFrame, months=None):
    """Reshape a wide year/day-by-month frame into a long (timestamp, value) frame.

    Each input row carries a 'YEAR', a day-of-month 'DD', and one value column
    per month. Every (row, month) pair that forms a real calendar date becomes
    one output row; impossible dates (e.g. 30 February) are skipped.

    The work is done with whole-array NumPy operations instead of a Python
    loop over rows, so it does not depend on the frame's index labels (which
    may be non-unique) and avoids parsing one date string per cell.

    Args:
        df: Frame with 'YEAR', 'DD' and the month value columns.
        months: Optional sequence of month column labels in calendar order
            (position 0 is January). Defaults to the module-level `MONTHS`.

    Returns:
        pd.DataFrame: columns 'timestamp' (datetime64[ns]) and 'value', ordered
        row by row and, within a row, month by month - the same order the
        input is laid out in. Rows whose 'YEAR' or 'DD' is missing or not a
        whole number contribute nothing.
    """
    month_columns = list(MONTHS if months is None else months)
    n_rows, n_months = len(df), len(month_columns)

    # Expand to one entry per (row, month) cell in row-major order.
    year_raw = pd.to_numeric(df['YEAR'], errors='coerce').to_numpy(dtype='float64')
    day_raw = pd.to_numeric(df['DD'], errors='coerce').to_numpy(dtype='float64')
    years = np.repeat(year_raw, n_months)
    days = np.repeat(day_raw, n_months)
    month_numbers = np.tile(np.arange(1, n_months + 1, dtype='int64'), n_rows)
    values = df[month_columns].to_numpy().reshape(-1)

    # Only whole, finite year/day values and month numbers up to 12 can form a date.
    usable = np.isfinite(years) & np.isfinite(days)
    usable &= (years == np.floor(years)) & (days == np.floor(days))
    usable &= month_numbers <= 12
    # Placeholders keep the integer casts well-defined for unusable cells;
    # those cells are masked out before the result is built.
    year_int = np.where(usable, years, 1970).astype('int64')
    day_int = np.where(usable, days, 0).astype('int64')

    # Months since 1970-01 -> first day of each month -> length of that month.
    month_start = ((year_int - 1970) * 12 + (month_numbers - 1)).astype('datetime64[M]')
    first_day = month_start.astype('datetime64[D]')
    next_first_day = (month_start + np.timedelta64(1, 'M')).astype('datetime64[D]')
    days_in_month = (next_first_day - first_day).astype('int64')
    usable &= (day_int >= 1) & (day_int <= days_in_month)

    timestamps = first_day + (day_int - 1).astype('timedelta64[D]')
    return pd.DataFrame({
        'timestamp': timestamps[usable].astype('datetime64[ns]'),
        'value': values[usable],
    })


def merge_param_1_2(df_left : pd.DataFrame, df_right : pd.DataFrame):
    """Inner-join two time-series frames on their shared 'timestamp' column.

    Only timestamps present in both frames survive the join.
    """
    joined = df_left.merge(df_right, on='timestamp', how='inner')
    return joined



In [23]:
# Split every station frame into its PARAM == 1 (suffix _1) and PARAM == 2
# (suffix _2) subsets.
_02HB018_1, _02HB018_2 = split_by_param_1_2(_02HB018)
_02HB025_1, _02HB025_2 = split_by_param_1_2(_02HB025)
_02HB031_1, _02HB031_2 = split_by_param_1_2(_02HB031)
_02HB001_1, _02HB001_2 = split_by_param_1_2(_02HB001)
_02HB013_1, _02HB013_2 = split_by_param_1_2(_02HB013)
_02HB029_1, _02HB029_2 = split_by_param_1_2(_02HB029)



In [24]:
# Convert each wide station subset into a (timestamp, value) series and rename
# 'value' per station: PARAM 1 subsets become 'discharge_<station>', PARAM 2
# subsets become 'water_level_<station>'.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# the outputs further down presumably relied on variables from an earlier
# kernel session - confirm with Restart & Run All.
_02HB018_1_ts = wateroffice_to_timeseries(_02HB018_1).rename(columns={'value': 'discharge_02HB018'})
_02HB018_2_ts = wateroffice_to_timeseries(_02HB018_2).rename(columns={'value': 'water_level_02HB018'})
_02HB025_1_ts = wateroffice_to_timeseries(_02HB025_1).rename(columns={'value': 'discharge_02HB025'})
_02HB025_2_ts = wateroffice_to_timeseries(_02HB025_2).rename(columns={'value': 'water_level_02HB025'})
_02HB031_1_ts = wateroffice_to_timeseries(_02HB031_1).rename(columns={'value': 'discharge_02HB031'})
_02HB031_2_ts = wateroffice_to_timeseries(_02HB031_2).rename(columns={'value': 'water_level_02HB031'})
_02HB001_1_ts = wateroffice_to_timeseries(_02HB001_1).rename(columns={'value': 'discharge_02HB001'})
_02HB001_2_ts = wateroffice_to_timeseries(_02HB001_2).rename(columns={'value': 'water_level_02HB001'})
_02HB013_1_ts = wateroffice_to_timeseries(_02HB013_1).rename(columns={'value': 'discharge_02HB013'})
_02HB013_2_ts = wateroffice_to_timeseries(_02HB013_2).rename(columns={'value': 'water_level_02HB013'})
_02HB029_1_ts = wateroffice_to_timeseries(_02HB029_1).rename(columns={'value': 'discharge_02HB029'})
_02HB029_2_ts = wateroffice_to_timeseries(_02HB029_2).rename(columns={'value': 'water_level_02HB029'})

KeyboardInterrupt: 

In [None]:
# NOTE(review): this repeats the `merge_param_1_2` definition from the helper
# cell earlier in the notebook; being later, this is the one in effect.
def merge_param_1_2(df_left : pd.DataFrame, df_right : pd.DataFrame):
    """Inner-join two time-series frames on their shared 'timestamp' column.

    Only timestamps present in both frames survive the join.
    """
    joined = df_left.merge(df_right, on='timestamp', how='inner')
    return joined

In [None]:
# Per station, join the discharge and water-level series on timestamp; the
# inner join keeps only dates for which both measurements exist.
merged_02HB018 = merge_param_1_2(_02HB018_1_ts, _02HB018_2_ts)
merged_02HB025 = merge_param_1_2(_02HB025_1_ts, _02HB025_2_ts)
merged_02HB031 = merge_param_1_2(_02HB031_1_ts, _02HB031_2_ts)
merged_02HB001 = merge_param_1_2(_02HB001_1_ts, _02HB001_2_ts)
merged_02HB013 = merge_param_1_2(_02HB013_1_ts, _02HB013_2_ts)
merged_02HB029 = merge_param_1_2(_02HB029_1_ts, _02HB029_2_ts)


In [None]:
# Combine all stations into one frame keyed by timestamp. Every station
# contributes its own discharge_* / water_level_* columns, so no column is
# duplicated. The outer join keeps dates that are missing for some stations.
# The target station (02HB029) is merged last, which makes its water level the
# final column.
df_merged = reduce(
    lambda combined, station: combined.merge(station, on=['timestamp'], how='outer'),
    [
        merged_02HB018,
        merged_02HB025,
        merged_02HB031,
        merged_02HB001,
        merged_02HB013,
        merged_02HB029,
    ],
)

In [None]:
# Sanity check: distinct gaps (in days) between consecutive rows. A result of
# [nan, 1.] means every step is exactly one day (the NaN is the first row,
# which has no predecessor).
print(df_merged['timestamp'].diff().dt.days.unique())

[nan  1.]


In [None]:
# Move 'timestamp' from a column to the index. The guard makes this cell safe
# to re-run: once the column has become the index, an unguarded set_index
# would raise KeyError.
if 'timestamp' in df_merged.columns:
    df_merged.set_index('timestamp', inplace=True)

In [None]:
# Fill gaps left by the outer join: carry the last known value forward first,
# then back-fill whatever is still missing at the very start of the frame.
# NOTE(review): back-filling copies later observations into earlier rows -
# confirm that is acceptable for a forecasting setup.
df_merged.ffill(inplace=True)
df_merged.bfill(inplace=True)

In [None]:
# Preview the first two rows of the merged, gap-filled frame.
df_merged.head(2)

Unnamed: 0_level_0,discharge_02HB018,water_level_02HB018,discharge_02HB025,water_level_02HB025,discharge_02HB031,water_level_02HB031,discharge_02HB001,water_level_02HB001,discharge_02HB013,water_level_02HB013,discharge_02HB029,water_level_02HB029
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2002-01-01,2.5,3.944,3.27,2.829,0.288,2.014,1.2,1.475,0.38,11.205,12.1,4.563
2002-01-02,2.47,3.983,3.24,2.857,0.288,2.014,1.19,1.473,0.375,11.204,12.1,4.563


In [None]:


from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional
from keras.optimizers import Adam


# Rescale every column to the [0, 1] range. The scaler is fitted on the whole
# frame (no train/test split is made in this cell) and is reused below to map
# predictions back to original units.
scaler = MinMaxScaler(feature_range=(0, 1))
df_scaled = scaler.fit_transform(df_merged)

def create_dataset(data: np.ndarray, look_back=1):
    """Build sliding-window samples for next-step prediction.

    Args:
        data: 2-D array of shape (n_steps, n_features); the last column is the
            prediction target (water_level_02HB029 in this notebook).
        look_back: Number of consecutive time steps in each input window.

    Returns:
        tuple: (X, Y) where X[i] is data[i:i + look_back, :] and Y[i] is the
        last-column value of the row that immediately follows that window.
    """
    n_samples = len(data) - look_back
    windows = [data[start:start + look_back, :] for start in range(n_samples)]
    targets = [data[start + look_back, -1] for start in range(n_samples)]
    return np.array(windows), np.array(targets)

# Each sample uses a single preceding time step to predict the next value of
# the last column.
look_back = 1
X, Y = create_dataset(df_scaled, look_back)


# Two stacked LSTM layers (100 units each, with input and recurrent dropout)
# followed by a single-unit regression head. The first layer returns the full
# sequence so the second LSTM receives one vector per time step.
model = Sequential()
model.add(LSTM(100, return_sequences=True, input_shape=(look_back, df_scaled.shape[1]), dropout=0.2, recurrent_dropout=0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1))
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
adam = Adam(learning_rate=0.001)
model.compile(loss='mean_squared_error', optimizer=adam)

# Fit the model on every window. With batch_size=1 each sample is its own
# gradient step, which is why the recorded log shows 8034 steps per epoch.
model.fit(X, Y, epochs=100, batch_size=1, verbose=2)

# Predict on the training windows themselves (this is a training score, not a
# held-out score).
predictions = model.predict(X)
# The scaler was fitted on all columns, so inverse_transform needs a full-width
# array: pad with zeros for every column except the last, which holds the
# predictions. MinMaxScaler rescales each column independently, so the padding
# does not influence the last column.
predictions = scaler.inverse_transform(np.hstack((np.zeros((predictions.shape[0], df_scaled.shape[1]-1)), predictions)))

# Same zero-padding trick for the true targets. Note that this overwrites Y
# with values in original units.
Y = scaler.inverse_transform(np.hstack((np.zeros((Y.shape[0], df_scaled.shape[1]-1)), Y.reshape(-1, 1))))[:, -1]
# Keep only the target column of the padded prediction array.
predictions = predictions[:, -1]

# Root-mean-squared error in the target's original units.
rmse = math.sqrt(mean_squared_error(Y, predictions))
print(f'Train Score: {rmse:.2f} RMSE')







Epoch 1/100


2024-05-02 10:44:55.286609: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x767268762160 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-02 10:44:55.286633: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3060, Compute Capability 8.6
2024-05-02 10:44:55.290147: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-05-02 10:44:55.431859: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8600
2024-05-02 10:44:55.499623: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


8034/8034 - 39s - loss: 0.0068 - 39s/epoch - 5ms/step
Epoch 2/100
8034/8034 - 35s - loss: 0.0048 - 35s/epoch - 4ms/step
Epoch 3/100
8034/8034 - 35s - loss: 0.0040 - 35s/epoch - 4ms/step
Epoch 4/100
8034/8034 - 36s - loss: 0.0037 - 36s/epoch - 4ms/step
Epoch 5/100


KeyboardInterrupt: 