# Modeling

In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from catboost import CatBoostRegressor
import shap


from sklearn.preprocessing import MinMaxScaler

## Prepare the Data Sets
We want to create two separate datasets; one for our CatBoost model and one for our Keras models.

In [4]:
df = pd.read_parquet("../data/raw/input_dataset-2.parquet")
df = df.dropna()

In [5]:
X_cols = ["Unit_4_Power", "Unit_4_Reactive Power", "Turbine_Guide Vane Opening", "Turbine_Pressure Drafttube", "Turbine_Pressure Spiral Casing", "Turbine_Rotational Speed"]
y_cols = [c for c in df if c.endswith("Tensile")]

### Create CatBoost dataset

In [6]:
lookback = 20

cX = df[X_cols].copy()
for i in range(1, lookback+1):
    cX.loc[:, [f"{c} (t-{i})" for c in X_cols]] = cX[X_cols].shift(i).rename(columns={c: f"{c} (t-{i})" for c in X_cols})

cX = cX[[c for c in cX if c not in X_cols]]
cy = df[y_cols]

cX

Unnamed: 0_level_0,Unit_4_Power (t-1),Unit_4_Reactive Power (t-1),Turbine_Guide Vane Opening (t-1),Turbine_Pressure Drafttube (t-1),Turbine_Pressure Spiral Casing (t-1),Turbine_Rotational Speed (t-1),Unit_4_Power (t-2),Unit_4_Reactive Power (t-2),Turbine_Guide Vane Opening (t-2),Turbine_Pressure Drafttube (t-2),...,Turbine_Guide Vane Opening (t-19),Turbine_Pressure Drafttube (t-19),Turbine_Pressure Spiral Casing (t-19),Turbine_Rotational Speed (t-19),Unit_4_Power (t-20),Unit_4_Reactive Power (t-20),Turbine_Guide Vane Opening (t-20),Turbine_Pressure Drafttube (t-20),Turbine_Pressure Spiral Casing (t-20),Turbine_Rotational Speed (t-20)
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-12-29 10:59:57,,,,,,,,,,,...,,,,,,,,,,
1970-12-29 10:59:58,311.093257,4.949223,94.206187,150.827828,5305.873472,108.033198,,,,,...,,,,,,,,,,
1970-12-29 10:59:59,311.103996,5.051777,94.206457,150.774664,5305.690188,108.033197,311.093257,4.949223,94.206187,150.827828,...,,,,,,,,,,
1970-12-29 11:00:00,311.114735,5.154330,94.206726,150.559452,5305.466701,108.033196,311.103996,5.051777,94.206457,150.774664,...,,,,,,,,,,
1970-12-29 11:00:01,311.125475,5.256883,94.206995,150.344239,5305.243213,108.033195,311.114735,5.154330,94.206726,150.559452,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971-01-25 11:06:44,308.685656,4.382225,94.422329,157.885326,5281.215929,108.057505,308.655287,4.809967,94.418992,157.924633,...,94.362264,158.728007,5282.350036,108.057619,308.108652,4.311835,94.358928,158.501036,5282.203114,108.057625
1971-01-25 11:06:45,308.716025,3.974309,94.425666,157.927905,5280.929965,108.057498,308.685656,4.382225,94.422329,157.885326,...,94.365601,158.954979,5282.496959,108.057612,308.139020,4.309002,94.362264,158.728007,5282.350036,108.057619
1971-01-25 11:06:46,308.746393,4.103262,94.429003,157.974925,5280.633358,108.057492,308.716025,3.974309,94.425666,157.927905,...,94.368938,159.181950,5282.643881,108.057606,308.169389,4.357300,94.365601,158.954979,5282.496959,108.057612
1971-01-25 11:06:47,308.776762,4.472929,94.432340,158.021945,5280.336751,108.057486,308.746393,4.103262,94.429003,157.974925,...,94.372275,159.283704,5282.734255,108.057600,308.199758,4.405598,94.368938,159.181950,5282.643881,108.057606


In [7]:
def train_test_split(X, y, test_percent=0.1, offset_percent=0):
    
    test_start = int(len(df) * offset_percent)
    test_end = int(len(df) * (offset_percent + test_percent))

    X_train, X_test = X.iloc[:test_start], X.iloc[test_start:test_end]
    y_train, y_test = y.iloc[:test_start], y.iloc[test_start:test_end]
    
    return X_train, X_test, y_train, y_test

cX_train, cX_test, cy_train, cy_test = train_test_split(cX, cy, test_percent=0.1, offset_percent=0.9)

### CatBoost Modeling

In [8]:
params = {
    "loss_function": "MAPE",
    "iterations": 100,
    "depth": 5
}

cy_trains = [cy_train[c] for c in cy_train]
cy_tests = [cy_test[c] for c in cy_test]

models = []

for j in range(len(cy_trains)):
    model = CatBoostRegressor(**params)
    model.fit(cX_train, cy_trains[j], eval_set=(cX_test, cy_tests[j]), verbose=False)
    models.append(model)

In [9]:
model = models[0]

In [10]:
explainer = shap.Explainer(model)
shap_values = explainer(cX_train)

In [11]:
shap_values.shape

(849537, 120)

In [12]:
shap_values


.values =
array([[ 0.        ,  0.001404  ,  0.        , ..., -0.07108331,
         0.08506613, -0.00089547],
       [ 0.        ,  0.0060514 ,  0.        , ..., -0.11826315,
         0.12227749, -0.00089547],
       [ 0.        ,  0.0060514 ,  0.        , ..., -0.11826315,
         0.12227749, -0.00089547],
       ...,
       [ 0.        ,  0.05691564,  0.        , ...,  0.3750445 ,
         0.1728815 ,  0.00076257],
       [ 0.        ,  0.05691564,  0.        , ...,  0.3750445 ,
         0.1728815 ,  0.00076257],
       [ 0.        ,  0.05691564,  0.        , ...,  0.3750445 ,
         0.1728815 ,  0.00076257]])

.base_values =
array([1619.37954954, 1619.37954954, 1619.37954954, ..., 1619.37954954,
       1619.37954954, 1619.37954954])

.data =
array([[            nan,             nan,             nan, ...,
                    nan,             nan,             nan],
       [ 3.11093257e+02,  4.94922343e+00,  9.42061873e+01, ...,
                    nan,             nan,             

In [13]:
sdf = pd.DataFrame(shap_values.values, columns=cX_train.columns).abs().sum()
sdf

Unit_4_Power (t-1)                            0.000000
Unit_4_Reactive Power (t-1)               15806.326818
Turbine_Guide Vane Opening (t-1)              0.000000
Turbine_Pressure Drafttube (t-1)          99645.988634
Turbine_Pressure Spiral Casing (t-1)      18674.403082
                                             ...      
Unit_4_Reactive Power (t-20)              15827.065905
Turbine_Guide Vane Opening (t-20)         24343.842640
Turbine_Pressure Drafttube (t-20)        102119.733197
Turbine_Pressure Spiral Casing (t-20)     94245.823148
Turbine_Rotational Speed (t-20)            3731.454099
Length: 120, dtype: float64

In [29]:
sdf_ = pd.Series(sdf.index, index=sdf.index).apply(lambda s: s[:s.index("(")-1])
#sdf.groupby(sdf_).sum().sort_values()


TypeError: 'Index' object is not callable

In [33]:
sdf[[c for c in sdf.index if "Reactive Power" in c]].sort_values()

Unit_4_Reactive Power (t-8)      1609.965377
Unit_4_Reactive Power (t-7)      2928.844220
Unit_4_Reactive Power (t-12)     3095.025894
Unit_4_Reactive Power (t-3)      3373.090075
Unit_4_Reactive Power (t-10)     3760.947625
Unit_4_Reactive Power (t-16)     4393.168707
Unit_4_Reactive Power (t-5)      4783.449326
Unit_4_Reactive Power (t-6)      5467.979970
Unit_4_Reactive Power (t-9)      7103.728111
Unit_4_Reactive Power (t-13)     7792.596052
Unit_4_Reactive Power (t-15)     8839.078864
Unit_4_Reactive Power (t-4)      9468.345591
Unit_4_Reactive Power (t-17)     9650.702509
Unit_4_Reactive Power (t-14)    11794.061985
Unit_4_Reactive Power (t-18)    12163.559782
Unit_4_Reactive Power (t-1)     15806.326818
Unit_4_Reactive Power (t-20)    15827.065905
Unit_4_Reactive Power (t-11)    16357.576279
Unit_4_Reactive Power (t-2)     16371.071230
Unit_4_Reactive Power (t-19)    20551.026578
dtype: float64

In [None]:
#shap.plots.beeswarm(shap_values, max_display=50, order=shap_values.abs.mean(0))

In [None]:
#shap.plots.bar(shap_values.abs.mean(0))