In [22]:
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
import catboost as cb
import numpy as np

In [5]:
# Read the training table and the test feature table from the processed-data
# directory (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed/train.csv', '../data/processed/X_test.csv')
)

In [6]:
# Replace every ":" in the column names with "_" so both frames receive the
# same renaming.
# NOTE(review): presumably needed because a downstream library rejects ":" in
# feature names — confirm.
for frame in (train_data, test_data):
    frame.columns = [name.replace(":", "_") for name in frame.columns]

In [7]:



# --- LightGBM: fit one regressor on the full training table -----------------

# Every column except the target is a model feature; computed once and reused
# for both the native Dataset and the scikit-learn style fit below.
lgb_feature_columns = [
    column for column in train_data.columns if column != "pv_measurement"
]
lgb_features = train_data[lgb_feature_columns]
# 1-D target (Series) rather than a one-column DataFrame.
lgb_target = train_data["pv_measurement"]

# Native LightGBM dataset over the same features/target.
# NOTE(review): not referenced again in the visible cells — kept in case a
# later cell relies on it.
train_lgb = lgb.Dataset(data=lgb_features, label=lgb_target)

# Ordered 5-split cross-validation splitter. `tss` is also handed to the
# CatBoost cross-validation cell further down.
tss = TimeSeriesSplit(5)
# Generator of (train_idx, test_idx) pairs; it is exhausted after one pass.
folds = tss.split(train_data)

# Tuned hyper-parameters for LGBMRegressor.
params = {
    "objective": "regression",
    "max_depth": 27,
    "n_estimators": 180,
    "num_leaves": 69,
    # Fixed key: the scikit-learn wrapper's name is `colsample_bytree`; the
    # previous spelling `col_sample_bytree` is not a LightGBM parameter.
    "colsample_bytree": 1.0,
    "subsample": 1.0,
    "reg_alpha": 5e-2,
    "reg_lambda": 0,
    "min_split_gain": 0,
    "min_child_samples": 39,
    "learning_rate": 6e-2,
}

# Values as recorded from tuning, in the key order above (max_depth onward):
# 27  180  69  10E-1  10E-1  5E-1  0  0  39  6E-2
# NOTE(review): this row lists reg_alpha as 5E-1 but `params` uses 5e-2 —
# confirm which one is the tuned value.

model = lgb.LGBMRegressor(**params).fit(lgb_features, lgb_target)



In [8]:
# Score the test set with the fitted LightGBM model and write the predictions
# to CSV.
# The feature columns are selected by name, in the training table's order, so
# the prediction does not depend on the two CSV files happening to share the
# same column order; a missing feature raises a KeyError instead of being
# scored against the wrong column.
lgb_test_features = test_data[
    [column for column in train_data.columns if column != "pv_measurement"]
]
lgb_predictions = pd.DataFrame(model.predict(lgb_test_features))
# index=False: only the single prediction column (headed "0") is written.
lgb_predictions.to_csv("../data/processed/lgb_hypertune.csv", index=False)

In [141]:
def _prepare_catboost_frame(frame):
    """Return a copy of ``frame`` with the feature layout used for CatBoost.

    The same steps are applied to the training and the test table so both end
    up with an identical schema and dtypes:

    * the one-hot ``A``/``B``/``C`` columns are collapsed into one string
      column ``location`` and then dropped;
    * ``hour``, ``month``, ``dayofweek`` and ``dayofmonth`` are multiplied by
      23, 11, 6 and 30 respectively, rounded and cast to ``int``;
    * ``hours_since_11`` and ``months_since_june`` are derived from the
      integer ``hour`` and ``month`` columns;
    * ``is_in_shadow_idx`` and ``is_day_idx`` are cast to ``bool``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformed columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``frame``.
    """
    prepared = frame.copy()

    # Rows with A == 1 become "A", otherwise rows with B == 1 become "B",
    # and every remaining row is labelled "C".
    prepared["location"] = np.where(
        prepared["A"] == 1, "A", np.where(prepared["B"] == 1, "B", "C")
    )
    prepared = prepared.drop(["A", "B", "C"], axis=1)

    # Scale the calendar columns to integer codes.
    # NOTE(review): assumes these columns arrive min-max scaled to [0, 1] —
    # confirm against the preprocessing step.
    for column, scale in (
        ("hour", 23),
        ("month", 11),
        ("dayofweek", 6),
        ("dayofmonth", 30),
    ):
        prepared[column] = (prepared[column] * scale).round(0).astype(int)

    # Distance features, computed from the integer codes created above.
    prepared["hours_since_11"] = np.abs(11 - prepared["hour"])
    # NOTE(review): month codes produced above run from 0 to 11; if code 0 is
    # January then June is 5, not 6 — confirm the intended reference month.
    prepared["months_since_june"] = np.abs(6 - prepared["month"])

    for column in ("is_in_shadow_idx", "is_day_idx"):
        prepared[column] = prepared[column].astype(bool)

    return prepared


# Previously only the training frame received the re-coding and derived
# columns, leaving the test frame with a different schema; both frames now go
# through the same helper.
cb_train_data = _prepare_catboost_frame(train_data)
cb_test_data = _prepare_catboost_frame(test_data)


In [180]:
# Row position at 90% of the CatBoost training frame's length (truncated to int).
split_index = int(len(cb_train_data) * 0.9)

In [171]:
# --- CatBoost: cross-validate on the prepared training frame ----------------

# Features are every column of the CatBoost frame except the target.
cb_feature_columns = [
    name for name in cb_train_data.columns if name != "pv_measurement"
]

# Columns CatBoost should treat as categorical.
cb_categorical_columns = [
    "location",
    "hour",
    "month",
    "dayofweek",
    "dayofmonth",
    "is_day_idx",
    "is_in_shadow_idx",
]

# Training parameters passed to every fold.
# NOTE(review): CatBoost documents `max_leaves` as a Lossguide option while
# `grow_policy` is left at its default here — confirm it is accepted/used.
cb_cv_params = {
    "eval_metric": "MAE",
    "loss_function": "MAE",
    "has_time": True,
    "one_hot_max_size": 24,
    "iterations": 300,
    "depth": 10,
    "learning_rate": 0.1,
    "l2_leaf_reg": 0.1,
    "border_count": 32,
    "bagging_temperature": 0.2,
    "random_strength": 0.2,
    "max_leaves": 31,
    # "grow_policy": "SymmetricTree",
    # "min_data_in_leaf": 1,
    # "bootstrap_type": "Bernoulli",
    # "subsample": 0.8,
}

pool = cb.Pool(
    data=cb_train_data[cb_feature_columns],
    label=cb_train_data[["pv_measurement"]],
    cat_features=cb_categorical_columns,
)

# Fold indices come from `tss`, the TimeSeriesSplit object created in the
# LightGBM cell.
# NOTE(review): that cell must have been run first for `tss` to exist.
results = cb.cv(
    pool=pool,
    params=cb_cv_params,
    folds=tss,
    early_stopping_rounds=50,
    inverted=False,
    shuffle=False,
    stratified=False,
    as_pandas=True,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/8]
0:	learn: 0.1014749	test: 0.1114965	best: 0.1114965 (0)	total: 24.9ms	remaining: 7.45s
1:	learn: 0.0928546	test: 0.1024707	best: 0.1024707 (1)	total: 44.7ms	remaining: 6.66s
2:	learn: 0.0863090	test: 0.0954331	best: 0.0954331 (2)	total: 63.2ms	remaining: 6.25s
3:	learn: 0.0796152	test: 0.0886611	best: 0.0886611 (3)	total: 78.3ms	remaining: 5.79s
4:	learn: 0.0739783	test: 0.0828825	best: 0.0828825 (4)	total: 94.8ms	remaining: 5.59s
5:	learn: 0.0684744	test: 0.0774672	best: 0.0774672 (5)	total: 110ms	remaining: 5.39s
6:	learn: 0.0635662	test: 0.0719944	best: 0.0719944 (6)	total: 122ms	remaining: 5.1s
7:	learn: 0.0596957	test: 0.0680467	best: 0.0680467 (7)	total: 135ms	remaining: 4.93s
8:	learn: 0.0560229	test: 0.0640540	best: 0.0640540 (8)	total: 148ms	remaining: 4.79s
9:	learn: 0.0528592	test: 0.0606275	best: 0.0606275 (9)	total: 162ms	remaining: 4.69s
10:	learn: 0.0502505	test: 0.0579920	best: 0.0579920 (10)	total: 177ms	remaining: 4.65s
11:	learn: 0.0477452	test

In [170]:
# Display the cross-validation metrics frame returned by cb.cv (as_pandas=True).
# NOTE(review): this cell's saved execution count (170) is lower than the CV
# cell's (171), so the saved table may come from an earlier run — re-run in order.
results

Unnamed: 0,iterations,test-MAE-mean,test-MAE-std,train-MAE-mean,train-MAE-std
0,0,0.044571,0.042848,0.085800,0.019920
1,1,0.041379,0.039262,0.078922,0.018144
2,2,0.038541,0.036200,0.072830,0.016734
3,3,0.035964,0.033329,0.067201,0.015308
4,4,0.033077,0.031518,0.062207,0.014105
...,...,...,...,...,...
295,295,0.020951,0.014022,0.017221,0.003320
296,296,0.020953,0.014024,0.017212,0.003327
297,297,0.020953,0.014024,0.017208,0.003327
298,298,0.020954,0.014024,0.017198,0.003326
