# Modeling

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, GRU, SimpleRNN
from keras.layers import Dense, Dropout, Normalization, BatchNormalization, LayerNormalization, Input
from tcn import TCN, tcn_full_summary
from catboost import CatBoostRegressor


from sklearn.preprocessing import MinMaxScaler

## Prepare the Data Sets
We want to create two separate datasets; one for our CatBoost model and one for our Keras models.

In [2]:
df = pd.read_parquet("../data/structured/general/combined_data.parquet")
df = df.dropna()
df

Unnamed: 0_level_0,Unit_4_Power,Unit_4_Reactive Power,Turbine_Guide Vane Opening,Turbine_Pressure Drafttube,Turbine_Pressure Spiral Casing,Turbine_Rotational Speed,Bolt_1_Tensile,Bolt_2_Tensile,Bolt_3_Tensile,Bolt_4_Tensile,...,Power / vane opening,seconds_since_last_data,seconds_since_last_start,Power / Drafttube pressure,Bolt_1_Tensile_adj,Bolt_2_Tensile_adj,Bolt_3_Tensile_adj,Bolt_4_Tensile_adj,Bolt_5_Tensile_adj,Bolt_6_Tensile_adj
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-12-19 09:51:45,262.104319,3.344630,82.277248,173.989815,5311.219755,107.964273,1598.477449,1480.989528,1684.261611,1601.366508,...,3.185623,1.0,1.0,1.506435,115.477449,43.989528,72.261611,3.366508,6.588478,38.823883
1970-12-19 09:51:46,262.004330,3.790223,82.274520,174.024413,5311.640329,107.964269,1598.479316,1481.003188,1684.270504,1601.374254,...,3.184514,1.0,2.0,1.505561,115.479316,44.003188,72.270504,3.374254,6.583464,38.841318
1970-12-19 09:51:47,261.904340,4.235817,82.271792,174.059012,5312.060902,107.964264,1598.490184,1481.028827,1684.270683,1601.383179,...,3.183404,1.0,3.0,1.504687,115.490184,44.028827,72.270683,3.383179,6.581384,38.843245
1970-12-19 09:51:48,261.804351,4.064759,82.269064,174.153819,5312.405938,107.964259,1598.494073,1481.059017,1684.271062,1601.378391,...,3.182294,1.0,4.0,1.503294,115.494073,44.059017,72.271062,3.378391,6.591746,38.872300
1970-12-19 09:51:49,261.704362,3.170510,82.266336,174.422046,5312.533396,107.964254,1598.498916,1481.075521,1684.276622,1601.380601,...,3.181184,1.0,5.0,1.500409,115.498916,44.075521,72.276622,3.380601,6.607884,38.924469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971-01-25 11:06:44,308.716025,3.974309,94.425666,157.927905,5280.929965,108.057498,1637.386115,1504.557822,1701.651420,1606.276545,...,3.269408,1.0,19788.0,1.954791,154.386115,67.557822,89.651420,8.276545,11.704071,54.014705
1971-01-25 11:06:45,308.746393,4.103262,94.429003,157.974925,5280.633358,108.057492,1637.365865,1504.546091,1701.654301,1606.271877,...,3.269614,1.0,19789.0,1.954401,154.365865,67.546091,89.654301,8.271877,11.711250,54.017029
1971-01-25 11:06:46,308.776762,4.472929,94.432340,158.021945,5280.336751,108.057486,1637.384133,1504.538696,1701.656143,1606.250028,...,3.269820,1.0,19790.0,1.954012,154.384133,67.538696,89.656143,8.250028,11.699142,54.002008
1971-01-25 11:06:47,308.807131,4.842597,94.435677,158.068966,5280.040144,108.057479,1637.357141,1504.531582,1701.662201,1606.245665,...,3.270026,1.0,19791.0,1.953623,154.357141,67.531582,89.662201,8.245665,11.685782,53.995135


In [3]:
y_cols = [c for c in df if c.endswith("Tensile")]
adj_cols = [c for c in df if c.endswith("Tensile_adj")]

### Create CatBoost dataset

In [4]:
extra_cols = ["seconds_since_start", "month", "day_of_month", "day_of_week"]

In [9]:
lookback = 2

cX = df.drop(columns=y_cols+adj_cols)
X_cols = cX.drop(columns=extra_cols).columns
for i in range(1, lookback+1):
    cX.loc[:, [f"{c} (t-{i})" for c in X_cols]] = cX[X_cols].shift(i).rename(columns={c: f"{c} (t-{i})" for c in X_cols})

cy = df[y_cols]

cX

Unnamed: 0_level_0,Unit_4_Power,Unit_4_Reactive Power,Turbine_Guide Vane Opening,Turbine_Pressure Drafttube,Turbine_Pressure Spiral Casing,Turbine_Rotational Speed,seconds_since_start,month,day_of_month,day_of_week,...,Turbine_Guide Vane Opening (t-2),Turbine_Pressure Drafttube (t-2),Turbine_Pressure Spiral Casing (t-2),Turbine_Rotational Speed (t-2),is_starting (t-2),Netto Power (t-2),Power / vane opening (t-2),seconds_since_last_data (t-2),seconds_since_last_start (t-2),Power / Drafttube pressure (t-2)
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-12-19 09:51:45,262.104319,3.344630,82.277248,173.989815,5311.219755,107.964273,1.0,12,19,5,...,,,,,,,,,,
1970-12-19 09:51:46,262.004330,3.790223,82.274520,174.024413,5311.640329,107.964269,2.0,12,19,5,...,,,,,,,,,,
1970-12-19 09:51:47,261.904340,4.235817,82.271792,174.059012,5312.060902,107.964264,3.0,12,19,5,...,82.277248,173.989815,5311.219755,107.964273,0,258.759689,3.185623,1.0,1.0,1.506435
1970-12-19 09:51:48,261.804351,4.064759,82.269064,174.153819,5312.405938,107.964259,4.0,12,19,5,...,82.274520,174.024413,5311.640329,107.964269,0,258.214106,3.184514,1.0,2.0,1.505561
1970-12-19 09:51:49,261.704362,3.170510,82.266336,174.422046,5312.533396,107.964254,5.0,12,19,5,...,82.271792,174.059012,5312.060902,107.964264,0,257.668524,3.183404,1.0,3.0,1.504687
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971-01-25 11:06:44,308.716025,3.974309,94.425666,157.927905,5280.929965,108.057498,2769300.0,1,25,0,...,94.418992,157.924633,5281.305653,108.057511,0,303.845320,3.268996,1.0,19786.0,1.954447
1971-01-25 11:06:45,308.746393,4.103262,94.429003,157.974925,5280.633358,108.057492,2769301.0,1,25,0,...,94.422329,157.885326,5281.215929,108.057505,0,304.303431,3.269202,1.0,19787.0,1.955126
1971-01-25 11:06:46,308.776762,4.472929,94.432340,158.021945,5280.336751,108.057486,2769302.0,1,25,0,...,94.425666,157.927905,5280.929965,108.057498,0,304.741716,3.269408,1.0,19788.0,1.954791
1971-01-25 11:06:47,308.807131,4.842597,94.435677,158.068966,5280.040144,108.057479,2769303.0,1,25,0,...,94.429003,157.974925,5280.633358,108.057492,0,304.643131,3.269614,1.0,19789.0,1.954401


In [10]:
def train_test_split(X, y, test_percent=0.1, offset_percent=0):
    
    test_start = int(len(df) * offset_percent)
    test_end = int(len(df) * (offset_percent + test_percent))

    train = df.iloc[:test_start]
    test = df.iloc[test_start:test_end]

    X_train, X_test = X.iloc[:test_start], X.iloc[test_start:test_end]
    y_train, y_test = y.iloc[:test_start], y.iloc[test_start:test_end]
    
    return X_train, X_test, y_train, y_test

cX_train, cX_test, cy_train, cy_test = train_test_split(cX, cy, test_percent=0.1, offset_percent=0.9)

### CatBoost Modeling

In [11]:
def train_catboost(X_train, y_train, eval_set=None, params={}):
    
    model = CatBoostRegressor(**params)
    model.fit(X_train, y_train, eval_set=eval_set)
    
    return model

In [12]:
def plot_error(X_test, y_test, model):

    y_test = y_tests[j].copy()

    pred = model.predict(X_test)

    plt.subplots(figsize=(15, 10))
    plt.scatter(y_test.index, y_test, label="real", s=2)
    plt.scatter(y_test.index, pred, label="pred", s=2)
    plt.legend()
    plt.show()

In [20]:
def cv_catboost(X, y, n=4, start_offset=0.5, verbose=False, params={}):

    test_percent = (1 - start_offset) / n

    all_results = []
    for i in range(n):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_percent = test_percent,
                                                            offset_percent = start_offset + i*test_percent)
        y_trains = [y_train[c] for c in y_train]
        y_tests = [y_test[c] for c in y_test]

        results = []
        for j in range(len(y_trains)):
            model = CatBoostRegressor(**params)
            model.fit(X_train, y_trains[j], eval_set=(X_test, y_tests[j]), verbose=verbose)

            pred = model.predict(X_test)
            mape = 100 * ((y_tests[j] - pred).abs() / y_tests[j]).mean()
            results.append(mape)
            print(f"iteration {i}, bolt {j}: MAPE={mape}")
        all_results.append(results)
    all_results = np.array(all_results)
    
    return all_results

params = {
    "loss_function": "MAPE",
    "iterations": 1000,
    "depth": 5
}

results = cv_catboost(cX, cy, params=params)

CatBoostError: features data: pandas.DataFrame column 'is_starting (t-1)' has dtype 'category' but is not in  cat_features list

In [21]:
cX.loc[:, [c for c in cX if "is_starting" in c]] = cX.loc[:, [c for c in cX if "is_starting" in c]].astype(int)

ValueError: Cannot convert float NaN to integer

In [None]:
results.mean()

In [None]:
results.mean()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(cX,
                                                    cy,
                                                    test_percent = 0.125,
                                                    offset_percent = 0.5 + 3*0.125)

y_trains = [y_train[c] for c in y_train]
y_tests = [y_test[c] for c in y_test]

model = CatBoostRegressor(**params)
model.fit(X_train, y_trains[0], eval_set=(X_test, y_tests[0]), verbose=True)

In [None]:
y_trains[0]

In [None]:
y_tests[0]

In [None]:
pred = model.predict(X_test)
((y_tests[j] - pred).abs() / y_tests[j]).mean()

In [None]:
y_tests[0].plot()

In [None]:
plt.plot(y_tests[0].index, )

In [None]:
y_test

## Create Keras datasets

In [None]:
test_size = 1000

kX = df[X_cols].copy()
ky = df[y_cols]

kX_train_df, kX_test_df, ky_train_df, ky_test_df = train_test_split(kX, ky, test_percent=0.1, offset_percent=0.9)
kX_train_df

In [None]:
def lstm_data_transform(x_data, y_data, num_steps=5):
    """ Changes data to the format for LSTM training 
for sliding window approach """
    # Prepare the list for the transformed data
    X, y = list(), list()
    # Loop of the entire data set
    for i in range(x_data.shape[0]):
        # compute a new (sliding window) index
        end_ix = i + num_steps
        # if index is larger than the size of the dataset, we stop
        if end_ix >= x_data.shape[0]:
            break
        # Get a sequence of data for x
        seq_X = x_data[i:end_ix]
        # Get only the last element of the sequency for y
        seq_y = y_data[end_ix]
        # Append the list with sequencies
        X.append(seq_X)
        y.append(seq_y)
    # Make final arrays
    x_array = np.array(X)
    y_array = np.array(y)
    return x_array, y_array

ky_trains = [ky_train_df[c] for c in ky_train_df]
ky_tests = [ky_test_df[c] for c in ky_test_df]

j = 0
kX_train, ky_train = lstm_data_transform(kX_train_df, ky_trains[j], num_steps=30)
kX_test, ky_test = lstm_data_transform(kX_test_df, ky_tests[j], num_steps=30)

In [None]:
kX_train.shape

In [None]:
kX_train

In [None]:
def plot_error(model, X_test, y_test):
    pred = pd.DataFrame(model.predict(X_test)).iloc[:, 0]
    pd.concat([pred, pd.DataFrame(y_test)], axis=1).plot()

In [None]:
def get_mse(model, X_test, y_test, scaler=None):
    pred = model.predict(X_test)
    if scaler is not None:
        pred = scaler.inverse_transform(pred)
        y_test = scaler.inverse_transform(y_test)
    return ((pred - y_test)**2).mean()

## Train Keras models

In [None]:
def train_keras(first, X_train, y_train, eval_set, epochs=20, batch_size=2048):
    
    norm = Normalization()
    norm.adapt(first(X_train))
    
    model = Sequential([
        first,
        norm,
        Dense(3),
        Dropout(0.4),
        Dense(3),
        Dense(3),
        Dense(3),
        Dense(3),
        Dense(1)
    ])

    model.compile(loss='mean_absolute_percentage_error', optimizer='adam')
    model.fit(X_train, y_train, validation_data=eval_set, epochs=epochs, batch_size=batch_size, verbose=1)
    
    return model

In [None]:
lstm_model = train_keras(
    LSTM(5, input_shape=(kX_train.shape[1], kX_train.shape[2])),
    X_train = kX_train,
    y_train = ky_train,
    eval_set = (kX_test, ky_test),
    epochs=20
)

In [None]:
gru_model = train_keras(
    GRU(10, input_shape=(kX_train.shape[1], kX_train.shape[2])),
    X_train = kX_train,
    y_train = ky_train,
    eval_set = (kX_test, ky_test),
    epochs=15
)

In [None]:
tcn_model = train_keras(
    TCN(nb_filters=4, input_shape=(kX_train.shape[1], kX_train.shape[2])),
    X_train = kX_train,
    y_train = ky_train,
    eval_set = (kX_test, ky_test),
    epochs=10
)