# Modeling

In [32]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from catboost import CatBoostRegressor
import shap


from sklearn.preprocessing import MinMaxScaler

## Prepare the Data Sets
We want to create two separate datasets; one for our CatBoost model and one for our Keras models.

In [33]:
df = pd.read_parquet("../data/structured/general/combined_data.parquet")
df = df.dropna()

In [34]:
X_cols = ["Unit_4_Power", "Unit_4_Reactive Power", "Turbine_Guide Vane Opening", "Turbine_Pressure Drafttube", "Turbine_Pressure Spiral Casing", "Turbine_Rotational Speed"]
y_cols = [c for c in df if c.endswith("Tensile")]

### Create CatBoost dataset

In [35]:
X_cols = ["Unit_4_Power", "Unit_4_Reactive Power", "Turbine_Guide Vane Opening", "Turbine_Pressure Drafttube", "Turbine_Pressure Spiral Casing", "Turbine_Rotational Speed"]
extra_cols = ["seconds_since_start", "month", "day_of_month", "day_of_week"]

In [36]:
lookback = 20

iterate_cols = list(df.drop(columns=extra_cols+y_cols).columns)

cX = df.drop(columns=y_cols).copy()
for i in range(1, lookback+1):
    cX.loc[:, [f"{c} (t-{i})" for c in iterate_cols]] = cX[iterate_cols].shift(i).rename(columns={c: f"{c} (t-{i})" for c in iterate_cols})

cX = cX[[c for c in cX if c not in X_cols]]
cy = df[y_cols]

cX

Unnamed: 0_level_0,seconds_since_start,month,day_of_month,day_of_week,is_starting,Netto Power,Power / vane opening,seconds_since_last_data,seconds_since_last_start,Power / Drafttube pressure,...,Power / vane opening (t-2),seconds_since_last_data (t-2),seconds_since_last_start (t-2),Power / Drafttube pressure (t-2),Bolt_1_Tensile_adj (t-2),Bolt_2_Tensile_adj (t-2),Bolt_3_Tensile_adj (t-2),Bolt_4_Tensile_adj (t-2),Bolt_5_Tensile_adj (t-2),Bolt_6_Tensile_adj (t-2)
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-12-19 09:51:45,1.0,12,19,5,0,258.759689,3.185623,1.0,1.0,1.506435,...,,,,,,,,,,
1970-12-19 09:51:46,2.0,12,19,5,0,258.214106,3.184514,1.0,2.0,1.505561,...,,,,,,,,,,
1970-12-19 09:51:47,3.0,12,19,5,0,257.668524,3.183404,1.0,3.0,1.504687,...,3.185623,1.0,1.0,1.506435,115.477449,43.989528,72.261611,3.366508,6.588478,38.823883
1970-12-19 09:51:48,4.0,12,19,5,0,257.739592,3.182294,1.0,4.0,1.503294,...,3.184514,1.0,2.0,1.505561,115.479316,44.003188,72.270504,3.374254,6.583464,38.841318
1970-12-19 09:51:49,5.0,12,19,5,0,258.533851,3.181184,1.0,5.0,1.500409,...,3.183404,1.0,3.0,1.504687,115.490184,44.028827,72.270683,3.383179,6.581384,38.843245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971-01-25 11:06:44,2769300.0,1,25,0,0,304.741716,3.269408,1.0,19788.0,1.954791,...,3.268996,1.0,19786.0,1.954447,154.385160,67.558810,89.666022,8.277246,11.734150,54.051294
1971-01-25 11:06:45,2769301.0,1,25,0,0,304.643131,3.269614,1.0,19789.0,1.954401,...,3.269202,1.0,19787.0,1.955126,154.395516,67.565687,89.660031,8.271519,11.718269,54.027884
1971-01-25 11:06:46,2769302.0,1,25,0,0,304.303833,3.269820,1.0,19790.0,1.954012,...,3.269408,1.0,19788.0,1.954791,154.386115,67.557822,89.651420,8.276545,11.704071,54.014705
1971-01-25 11:06:47,2769303.0,1,25,0,0,303.964534,3.270026,1.0,19791.0,1.953623,...,3.269614,1.0,19789.0,1.954401,154.365865,67.546091,89.654301,8.271877,11.711250,54.017029


In [37]:
def train_test_split(X, y, test_percent=0.1, offset_percent=0):
    
    test_start = int(len(df) * offset_percent)
    test_end = int(len(df) * (offset_percent + test_percent))

    X_train, X_test = X.iloc[:test_start], X.iloc[test_start:test_end]
    y_train, y_test = y.iloc[:test_start], y.iloc[test_start:test_end]
    
    return X_train, X_test, y_train, y_test

cX_train, cX_test, cy_train, cy_test = train_test_split(cX, cy, test_percent=0.1, offset_percent=0.9)

### CatBoost Modeling

In [38]:
params = {
    "loss_function": "MAPE",
    "iterations": 100,
    "depth": 5
}

cy_trains = [cy_train[c] for c in cy_train]
cy_tests = [cy_test[c] for c in cy_test]

models = []

for j in range(len(cy_trains)):
    model = CatBoostRegressor(**params)
    model.fit(cX_train, cy_trains[j], eval_set=(cX_test, cy_tests[j]), verbose=False)
    models.append(model)

In [39]:
model = models[0]

In [40]:
explainer = shap.Explainer(model)
shap_values = explainer(cX_train)

In [None]:
shap_values.shape

(849537, 120)

In [None]:
shap_values


.values =
array([[ 0.        ,  0.001404  ,  0.        , ..., -0.07108331,
         0.08506613, -0.00089547],
       [ 0.        ,  0.0060514 ,  0.        , ..., -0.11826315,
         0.12227749, -0.00089547],
       [ 0.        ,  0.0060514 ,  0.        , ..., -0.11826315,
         0.12227749, -0.00089547],
       ...,
       [ 0.        ,  0.05691564,  0.        , ...,  0.3750445 ,
         0.1728815 ,  0.00076257],
       [ 0.        ,  0.05691564,  0.        , ...,  0.3750445 ,
         0.1728815 ,  0.00076257],
       [ 0.        ,  0.05691564,  0.        , ...,  0.3750445 ,
         0.1728815 ,  0.00076257]])

.base_values =
array([1619.37954954, 1619.37954954, 1619.37954954, ..., 1619.37954954,
       1619.37954954, 1619.37954954])

.data =
array([[            nan,             nan,             nan, ...,
                    nan,             nan,             nan],
       [ 3.11093257e+02,  4.94922343e+00,  9.42061873e+01, ...,
                    nan,             nan,             

In [None]:
sdf = pd.DataFrame(shap_values.values, columns=cX_train.columns).abs().sum()
sdf

Unit_4_Power (t-1)                            0.000000
Unit_4_Reactive Power (t-1)               15806.326818
Turbine_Guide Vane Opening (t-1)              0.000000
Turbine_Pressure Drafttube (t-1)          99645.988634
Turbine_Pressure Spiral Casing (t-1)      18674.403082
                                             ...      
Unit_4_Reactive Power (t-20)              15827.065905
Turbine_Guide Vane Opening (t-20)         24343.842640
Turbine_Pressure Drafttube (t-20)        102119.733197
Turbine_Pressure Spiral Casing (t-20)     94245.823148
Turbine_Rotational Speed (t-20)            3731.454099
Length: 120, dtype: float64

In [None]:
sdf_ = pd.Series(sdf.index, index=sdf.index).apply(lambda s: s[:s.index("(")-1])
#sdf.groupby(sdf_).sum().sort_values()


TypeError: 'Index' object is not callable

In [None]:
sdf[[c for c in sdf.index if "Reactive Power" in c]].sort_values()

Unit_4_Reactive Power (t-8)      1609.965377
Unit_4_Reactive Power (t-7)      2928.844220
Unit_4_Reactive Power (t-12)     3095.025894
Unit_4_Reactive Power (t-3)      3373.090075
Unit_4_Reactive Power (t-10)     3760.947625
Unit_4_Reactive Power (t-16)     4393.168707
Unit_4_Reactive Power (t-5)      4783.449326
Unit_4_Reactive Power (t-6)      5467.979970
Unit_4_Reactive Power (t-9)      7103.728111
Unit_4_Reactive Power (t-13)     7792.596052
Unit_4_Reactive Power (t-15)     8839.078864
Unit_4_Reactive Power (t-4)      9468.345591
Unit_4_Reactive Power (t-17)     9650.702509
Unit_4_Reactive Power (t-14)    11794.061985
Unit_4_Reactive Power (t-18)    12163.559782
Unit_4_Reactive Power (t-1)     15806.326818
Unit_4_Reactive Power (t-20)    15827.065905
Unit_4_Reactive Power (t-11)    16357.576279
Unit_4_Reactive Power (t-2)     16371.071230
Unit_4_Reactive Power (t-19)    20551.026578
dtype: float64

In [None]:
shap.plots.beeswarm(shap_values, max_display=50, order=shap_values.abs.mean(0))

In [None]:
shap.plots.bar(shap_values.abs.mean(0))