In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from catboost import CatBoostRegressor

In [18]:
# Input locations, relative to the notebook's working directory.
combined_data_path = "../data/structured/general/combined_data.parquet"
prediction_input_path = "../data/raw/prediction_input.parquet"

# `df` is the frame the models are fitted on further down;
# `pred_in` holds the rows that predictions are produced for.
df = pd.read_parquet(combined_data_path)
pred_in = pd.read_parquet(prediction_input_path)

pred_in

Unnamed: 0_level_0,Unit_4_Power,Unit_4_Reactive Power,Turbine_Guide Vane Opening,Turbine_Pressure Drafttube,Turbine_Pressure Spiral Casing,Turbine_Rotational Speed,mode
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1971-01-25 11:06:49,308.867868,5.592261,94.442351,158.159044,5279.876581,108.057467,operation
1971-01-25 11:06:50,308.898237,6.251969,94.445687,158.202829,5279.930843,108.057460,operation
1971-01-25 11:06:51,308.928605,7.037091,94.449024,158.246614,5279.985105,108.057454,operation
1971-01-25 11:06:52,308.958974,7.822213,94.452361,158.290399,5280.039368,108.057448,operation
1971-01-25 11:06:53,308.989343,8.607335,94.455698,158.302931,5280.058748,108.057442,operation
...,...,...,...,...,...,...,...
1971-02-03 20:16:19,113.553752,-20.230184,44.871419,100.034072,5419.023758,106.746818,operation
1971-02-03 20:16:20,110.520494,-20.911359,43.954699,100.034548,5419.328873,106.734902,operation
1971-02-03 20:16:21,107.487618,-21.592784,43.037925,100.035024,5419.633988,106.722986,operation
1971-02-03 20:16:22,104.454741,-22.274208,42.121151,100.035287,5420.094237,106.711070,operation


## Structuring the prediction input data

In [20]:
# Derive the engineered feature columns on `pred_in` (added in place).

# Timestamps as a Series aligned with the frame, so the timedelta accessors
# (.dt) can be used on differences between rows.
index_series = pd.Series(pred_in.index, index=pred_in.index)

# Whole days elapsed since the earliest timestamp in the prediction input.
# Differencing row-to-row and truncating every step to whole days before
# summing gives 0 for each sub-day step, so elapsed time is measured directly.
# NOTE(review): if the training frame has a days_since_start column built with
# the per-row formula, regenerate it before using this column as a feature.
pred_in["days_since_start"] = (index_series - index_series.min()).dt.days

# 1 on rows whose mode is "start", 0 on "operation". Any other value maps to
# NaN and makes astype(int) raise, so unexpected modes are not silently encoded.
pred_in["is_starting"] = pred_in["mode"].map({"operation": 0, "start": 1}).astype(int)

pred_in["Netto Power"] = pred_in["Unit_4_Power"] - pred_in["Unit_4_Reactive Power"]

pred_in["Power / vane opening"] = pred_in["Unit_4_Power"] / pred_in["Turbine_Guide Vane Opening"]

# Running count of start rows; every row belongs to the group opened by the
# most recent start (rows before the first start fall in group 0).
pred_in["startnr"] = (pred_in["is_starting"] == 1).cumsum()

# Full gap to the previous row in seconds. total_seconds() is used because
# `.dt.seconds` only returns the sub-day component and would truncate gaps of
# one day or longer. The first row has no predecessor and stays NaN.
pred_in["seconds_since_last_data"] = index_series.diff().dt.total_seconds()

# Accumulate the gaps within each start group, then pin the start rows to 0.
pred_in["seconds_since_last_start"] = pred_in.groupby("startnr")["seconds_since_last_data"].cumsum()
pred_in.loc[pred_in["is_starting"] == 1, "seconds_since_last_start"] = 0

pred_in["Power / Drafttube pressure"] = pred_in["Unit_4_Power"] / pred_in["Turbine_Pressure Drafttube"]

## Training a model

In [10]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

df

Unnamed: 0_level_0,Unit_4_Power,Unit_4_Reactive Power,Turbine_Guide Vane Opening,Turbine_Pressure Drafttube,Turbine_Pressure Spiral Casing,Turbine_Rotational Speed,Bolt_1_Tensile,Bolt_2_Tensile,Bolt_3_Tensile,Bolt_4_Tensile,...,Power / vane opening,seconds_since_last_data,seconds_since_last_start,Power / Drafttube pressure,Bolt_1_Tensile_adj,Bolt_2_Tensile_adj,Bolt_3_Tensile_adj,Bolt_4_Tensile_adj,Bolt_5_Tensile_adj,Bolt_6_Tensile_adj
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-12-19 09:51:45,262.104319,3.344630,82.277248,173.989815,5311.219755,107.964273,1598.477449,1480.989528,1684.261611,1601.366508,...,3.185623,1.0,1.0,1.506435,115.477449,43.989528,72.261611,3.366508,6.588478,38.823883
1970-12-19 09:51:46,262.004330,3.790223,82.274520,174.024413,5311.640329,107.964269,1598.479316,1481.003188,1684.270504,1601.374254,...,3.184514,1.0,2.0,1.505561,115.479316,44.003188,72.270504,3.374254,6.583464,38.841318
1970-12-19 09:51:47,261.904340,4.235817,82.271792,174.059012,5312.060902,107.964264,1598.490184,1481.028827,1684.270683,1601.383179,...,3.183404,1.0,3.0,1.504687,115.490184,44.028827,72.270683,3.383179,6.581384,38.843245
1970-12-19 09:51:48,261.804351,4.064759,82.269064,174.153819,5312.405938,107.964259,1598.494073,1481.059017,1684.271062,1601.378391,...,3.182294,1.0,4.0,1.503294,115.494073,44.059017,72.271062,3.378391,6.591746,38.872300
1970-12-19 09:51:49,261.704362,3.170510,82.266336,174.422046,5312.533396,107.964254,1598.498916,1481.075521,1684.276622,1601.380601,...,3.181184,1.0,5.0,1.500409,115.498916,44.075521,72.276622,3.380601,6.607884,38.924469
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1971-01-25 11:06:44,308.716025,3.974309,94.425666,157.927905,5280.929965,108.057498,1637.386115,1504.557822,1701.651420,1606.276545,...,3.269408,1.0,19788.0,1.954791,154.386115,67.557822,89.651420,8.276545,11.704071,54.014705
1971-01-25 11:06:45,308.746393,4.103262,94.429003,157.974925,5280.633358,108.057492,1637.365865,1504.546091,1701.654301,1606.271877,...,3.269614,1.0,19789.0,1.954401,154.365865,67.546091,89.654301,8.271877,11.711250,54.017029
1971-01-25 11:06:46,308.776762,4.472929,94.432340,158.021945,5280.336751,108.057486,1637.384133,1504.538696,1701.656143,1606.250028,...,3.269820,1.0,19790.0,1.954012,154.384133,67.538696,89.656143,8.250028,11.699142,54.002008
1971-01-25 11:06:47,308.807131,4.842597,94.435677,158.068966,5280.040144,108.057479,1637.357141,1504.531582,1701.662201,1606.245665,...,3.270026,1.0,19791.0,1.953623,154.357141,67.531582,89.662201,8.245665,11.685782,53.995135


In [16]:
# Model inputs.
X_cols = ["Turbine_Pressure Drafttube", "seconds_since_last_start"]

# Model targets: every column of `df` whose name ends in "Tensile"
# (names ending in "Tensile_adj" do not match).
y_cols = [name for name in df.columns if name.endswith("Tensile")]

In [17]:
# Feature matrix and target frame, both taken from the same rows of `df`.
cX = df.loc[:, X_cols]
cy = df.loc[:, y_cols]

cX

Unnamed: 0_level_0,Turbine_Pressure Drafttube,seconds_since_last_start
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1
1970-12-19 09:51:45,173.989815,1.0
1970-12-19 09:51:46,174.024413,2.0
1970-12-19 09:51:47,174.059012,3.0
1970-12-19 09:51:48,174.153819,4.0
1970-12-19 09:51:49,174.422046,5.0
...,...,...
1971-01-25 11:06:44,157.927905,19788.0
1971-01-25 11:06:45,157.974925,19789.0
1971-01-25 11:06:46,158.021945,19790.0
1971-01-25 11:06:47,158.068966,19791.0


In [26]:
# Keyword arguments shared by every regressor.
params = {"iterations": 10}

# One target Series per column of `cy`, in column order.
cys = [cy[name] for name in cy.columns]

# Fit one independent regressor per target on the shared feature frame;
# `models` ends up in the same order as `cys`.
models = []
for target in cys:
    model = CatBoostRegressor(**params)
    model.fit(cX, target)
    models.append(model)

Learning rate set to 0.5
0:	learn: 8.2289109	total: 125ms	remaining: 1.12s
1:	learn: 7.3456984	total: 244ms	remaining: 975ms
2:	learn: 6.9886526	total: 365ms	remaining: 852ms
3:	learn: 6.7764799	total: 467ms	remaining: 700ms
4:	learn: 6.6189124	total: 572ms	remaining: 572ms
5:	learn: 6.5038226	total: 678ms	remaining: 452ms
6:	learn: 6.4411594	total: 773ms	remaining: 331ms
7:	learn: 6.3195371	total: 890ms	remaining: 223ms
8:	learn: 6.2739347	total: 1.01s	remaining: 113ms
9:	learn: 6.2229001	total: 1.12s	remaining: 0us
Learning rate set to 0.5
0:	learn: 4.9698139	total: 130ms	remaining: 1.17s
1:	learn: 4.3998574	total: 251ms	remaining: 1s
2:	learn: 4.1967540	total: 372ms	remaining: 867ms
3:	learn: 4.0740293	total: 486ms	remaining: 729ms
4:	learn: 3.9873659	total: 583ms	remaining: 583ms
5:	learn: 3.9293274	total: 709ms	remaining: 472ms
6:	learn: 3.8608290	total: 811ms	remaining: 348ms
7:	learn: 3.8214697	total: 930ms	remaining: 232ms
8:	learn: 3.7912100	total: 1.03s	remaining: 114ms
9:	le

In [35]:
# Select the feature columns once; the selection is the same for every model.
pred_features = pred_in[X_cols]

# One prediction array per fitted model, in the order the models were trained.
# Iterating `models` directly (rather than indexing by the length of the
# target list) keeps this cell correct even if `cys` was rebuilt separately.
preds = [fitted.predict(pred_features) for fitted in models]

In [36]:
# Put the per-model prediction arrays side by side (one column per model),
# labelled with the "Tensile" column names of `df` and indexed like `pred_in`.
tensile_names = [name for name in df.columns if name.endswith("Tensile")]
preds_df = pd.DataFrame(
    np.column_stack(preds),
    index=pred_in.index,
    columns=tensile_names,
)

preds_df

Unnamed: 0_level_0,Bolt_1_Tensile,Bolt_2_Tensile,Bolt_3_Tensile,Bolt_4_Tensile,Bolt_5_Tensile,Bolt_6_Tensile
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1971-01-25 11:06:49,1624.247311,1493.557197,1692.857855,1605.919493,1636.641968,1685.689701
1971-01-25 11:06:50,1624.247311,1493.557197,1692.857855,1605.919493,1636.641968,1685.689701
1971-01-25 11:06:51,1624.247311,1493.557197,1692.857855,1605.919493,1636.641968,1685.689701
1971-01-25 11:06:52,1624.247311,1493.557197,1692.857855,1605.919493,1636.641968,1685.689701
1971-01-25 11:06:53,1624.247311,1493.557197,1692.857855,1605.919493,1636.641968,1685.689701
...,...,...,...,...,...,...
1971-02-03 20:16:19,1617.827118,1493.173026,1692.512238,1603.140820,1636.743049,1680.850251
1971-02-03 20:16:20,1617.827118,1493.173026,1692.512238,1603.140820,1636.743049,1680.850251
1971-02-03 20:16:21,1617.827118,1493.173026,1692.512238,1603.140820,1636.743049,1680.850251
1971-02-03 20:16:22,1617.827118,1493.173026,1692.512238,1603.140820,1636.743049,1680.850251
