# Modeling

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from keras.models import Sequential
from keras.layers import LSTM, GRU, SimpleRNN
from keras.layers import Dense, Dropout, Normalization, BatchNormalization, LayerNormalization, Input
from tcn import TCN, tcn_full_summary
from catboost import CatBoostRegressor


from sklearn.preprocessing import MinMaxScaler

## Prepare the Data Sets
We create two separate datasets: one for the CatBoost model and one for the Keras models.

In [3]:
# Read the parquet input and discard every row that has at least one missing value.
df = pd.read_parquet("../data/raw/input_dataset-2.parquet").dropna()
df

Unnamed: 0_level_0,Unit_4_Power,Unit_4_Reactive Power,Turbine_Guide Vane Opening,Turbine_Pressure Drafttube,Turbine_Pressure Spiral Casing,Turbine_Rotational Speed,mode,Bolt_1_Steel tmp,Bolt_1_Tensile,Bolt_2_Tensile,...,Bolt_5_Tensile,Bolt_6_Tensile,Bolt_1_Torsion,Bolt_2_Torsion,Bolt_3_Torsion,Bolt_4_Torsion,Bolt_5_Torsion,Bolt_6_Torsion,lower_bearing_vib_vrt,turbine_bearing_vib_vrt
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-12-29 10:59:57,311.093257,4.949223,94.206187,150.827828,5305.873472,108.033198,operation,2.458908,1609.140569,1488.522739,...,1637.373796,1678.107726,177.730733,163.324591,146.500052,226.013417,298.403158,161.914265,0.154292,0.517703
1970-12-29 10:59:58,311.103996,5.051777,94.206457,150.774664,5305.690188,108.033197,operation,2.458729,1609.127944,1488.494639,...,1637.353554,1678.100380,177.736039,163.323607,146.498639,226.013106,298.403272,161.912569,0.155838,0.530280
1970-12-29 10:59:59,311.114735,5.154330,94.206726,150.559452,5305.466701,108.033196,operation,2.459334,1609.138758,1488.493451,...,1637.360655,1678.106863,177.735616,163.328645,146.494156,226.013736,298.397846,161.912716,0.169547,0.540085
1970-12-29 11:00:00,311.125475,5.256883,94.206995,150.344239,5305.243213,108.033195,operation,2.459329,1609.121478,1488.488279,...,1637.365372,1678.078172,177.738552,163.331201,146.491341,226.017247,298.400658,161.912572,0.173561,0.526994
1970-12-29 11:00:01,311.136214,5.359436,94.207264,150.129027,5305.019725,108.033194,operation,2.459337,1609.113766,1488.488892,...,1637.354723,1678.067193,177.741082,163.329769,146.498284,226.013418,298.402517,161.915946,0.166955,0.524617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971-01-25 11:06:44,308.716025,3.974309,94.425666,157.927905,5280.929965,108.057498,operation,4.193937,1637.386115,1504.557822,...,1640.704071,1690.014705,183.204777,178.082932,145.759475,225.351989,300.667011,160.949816,0.159366,0.491265
1971-01-25 11:06:45,308.746393,4.103262,94.429003,157.974925,5280.633358,108.057492,operation,4.193254,1637.365865,1504.546091,...,1640.711250,1690.017029,183.203293,178.082287,145.766584,225.348279,300.674243,160.947868,0.155962,0.497242
1971-01-25 11:06:46,308.776762,4.472929,94.432340,158.021945,5280.336751,108.057486,operation,4.193261,1637.384133,1504.538696,...,1640.699142,1690.002008,183.212397,178.081678,145.764007,225.354785,300.674078,160.947644,0.141150,0.501525
1971-01-25 11:06:47,308.807131,4.842597,94.435677,158.068966,5280.040144,108.057479,operation,4.192795,1637.357141,1504.531582,...,1640.685782,1689.995135,183.212669,178.080734,145.763103,225.355483,300.675584,160.944036,0.160915,0.508167


In [4]:
# Input (feature) columns used by the models.
X_cols = [
    "Unit_4_Power",
    "Unit_4_Reactive Power",
    "Turbine_Guide Vane Opening",
    "Turbine_Pressure Drafttube",
    "Turbine_Pressure Spiral Casing",
    "Turbine_Rotational Speed",
]
# Target columns: every column of df whose name ends with "Tensile".
y_cols = [name for name in df.columns if name.endswith("Tensile")]

### Create CatBoost dataset

In [5]:
lookback = 20  # number of past time steps turned into lag features

# Build every lagged copy first and join them with a single concat.
# Inserting the columns one shift at a time grows the frame repeatedly, which
# can fragment it (pandas may emit a PerformanceWarning) and depends on .loc
# creating columns that do not exist yet.
lagged_frames = [
    df[X_cols].shift(i).add_suffix(f" (t-{i})")
    for i in range(1, lookback + 1)
]

# Only the lagged values are kept as features; the un-shifted X_cols are not
# part of cX. Rows near the start contain NaN wherever a lag reaches back
# before the first row of df.
cX = pd.concat(lagged_frames, axis=1)
cy = df[y_cols]

cX

Unnamed: 0_level_0,Unit_4_Power (t-1),Unit_4_Reactive Power (t-1),Turbine_Guide Vane Opening (t-1),Turbine_Pressure Drafttube (t-1),Turbine_Pressure Spiral Casing (t-1),Turbine_Rotational Speed (t-1),Unit_4_Power (t-2),Unit_4_Reactive Power (t-2),Turbine_Guide Vane Opening (t-2),Turbine_Pressure Drafttube (t-2),...,Turbine_Guide Vane Opening (t-19),Turbine_Pressure Drafttube (t-19),Turbine_Pressure Spiral Casing (t-19),Turbine_Rotational Speed (t-19),Unit_4_Power (t-20),Unit_4_Reactive Power (t-20),Turbine_Guide Vane Opening (t-20),Turbine_Pressure Drafttube (t-20),Turbine_Pressure Spiral Casing (t-20),Turbine_Rotational Speed (t-20)
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-12-29 10:59:57,,,,,,,,,,,...,,,,,,,,,,
1970-12-29 10:59:58,311.093257,4.949223,94.206187,150.827828,5305.873472,108.033198,,,,,...,,,,,,,,,,
1970-12-29 10:59:59,311.103996,5.051777,94.206457,150.774664,5305.690188,108.033197,311.093257,4.949223,94.206187,150.827828,...,,,,,,,,,,
1970-12-29 11:00:00,311.114735,5.154330,94.206726,150.559452,5305.466701,108.033196,311.103996,5.051777,94.206457,150.774664,...,,,,,,,,,,
1970-12-29 11:00:01,311.125475,5.256883,94.206995,150.344239,5305.243213,108.033195,311.114735,5.154330,94.206726,150.559452,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971-01-25 11:06:44,308.685656,4.382225,94.422329,157.885326,5281.215929,108.057505,308.655287,4.809967,94.418992,157.924633,...,94.362264,158.728007,5282.350036,108.057619,308.108652,4.311835,94.358928,158.501036,5282.203114,108.057625
1971-01-25 11:06:45,308.716025,3.974309,94.425666,157.927905,5280.929965,108.057498,308.685656,4.382225,94.422329,157.885326,...,94.365601,158.954979,5282.496959,108.057612,308.139020,4.309002,94.362264,158.728007,5282.350036,108.057619
1971-01-25 11:06:46,308.746393,4.103262,94.429003,157.974925,5280.633358,108.057492,308.716025,3.974309,94.425666,157.927905,...,94.368938,159.181950,5282.643881,108.057606,308.169389,4.357300,94.365601,158.954979,5282.496959,108.057612
1971-01-25 11:06:47,308.776762,4.472929,94.432340,158.021945,5280.336751,108.057486,308.746393,4.103262,94.429003,157.974925,...,94.372275,159.283704,5282.734255,108.057600,308.199758,4.405598,94.368938,159.181950,5282.643881,108.057606


In [10]:
def train_test_split(X, y, test_percent=0.1, offset_percent=0):
    """Split X and y by row position into a leading train part and a test window.

    The test window starts at ``offset_percent`` of the rows and spans
    ``test_percent`` of them. The training part is every row *before* the
    window; rows after the window belong to neither part. With the default
    ``offset_percent=0`` the training part is therefore empty.

    Parameters
    ----------
    X, y : pandas.DataFrame or pandas.Series
        Row-aligned features and targets; both are sliced with ``.iloc``.
    test_percent : float
        Fraction of the rows that make up the test window.
    offset_percent : float
        Fraction of the rows that precede the test window.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If X and y do not have the same number of rows.
    """
    n_rows = len(X)
    if len(y) != n_rows:
        raise ValueError(
            f"X and y must have the same number of rows, got {n_rows} and {len(y)}"
        )

    # Boundaries are derived from the data being split, not from a
    # notebook-level frame, so the function works for any aligned X/y pair.
    test_start = int(n_rows * offset_percent)
    test_end = int(n_rows * (offset_percent + test_percent))

    X_train, X_test = X.iloc[:test_start], X.iloc[test_start:test_end]
    y_train, y_test = y.iloc[:test_start], y.iloc[test_start:test_end]

    return X_train, X_test, y_train, y_test

# The test window starts at 90% of the rows and spans 10% of them;
# everything before it is the training part.
cX_train, cX_test, cy_train, cy_test = train_test_split(
    cX,
    cy,
    test_percent=0.1,
    offset_percent=0.9,
)

### CatBoost Modeling

In [None]:
# Settings shared by every per-target CatBoost regressor.
params = dict(loss_function="MAPE", iterations=100, depth=5)

# One target Series per column of the train / test target frames.
cy_trains = [cy_train[name] for name in cy_train.columns]
cy_tests = [cy_test[name] for name in cy_test.columns]

# Fit an independent regressor for each target column, in column order.
# NOTE(review): the test split is passed as eval_set. Confirm it does not
# influence the fitted model (e.g. CatBoost best-iteration selection);
# otherwise scores on this split are optimistic.
models = []
for train_target, test_target in zip(cy_trains, cy_tests):
    model = CatBoostRegressor(**params)
    model.fit(
        cX_train,
        train_target,
        eval_set=(cX_test, test_target),
        verbose=False,
    )
    models.append(model)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.