# Notebook for the custom stacking model, using catboost an LGBM

In [172]:
from catboost import CatBoostRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd 
%store -r dm

dm = dm


## Our model 

We are training two models in parallel, then we are training a model on the weighted sum of the predictions of the two other models. 

catboost -> catboost_preds
adaBoost -> adaboost_preds

-> randomforest -> final_preds

In [173]:

cat_A = CatBoostRegressor()
ada_A = AdaBoostRegressor()

cat_B = CatBoostRegressor()
ada_B = AdaBoostRegressor()

cat_C = CatBoostRegressor()
ada_C = AdaBoostRegressor()

forest_A = RandomForestRegressor()
forest_B = RandomForestRegressor()
forest_C = RandomForestRegressor()

In [174]:
#preparing data

X_A = dm.data_A.iloc[:,2:-1] #independent columns
y_A = dm.data_A.iloc[:,0]   #target column i.e pv measurement

X_B = dm.data_B.iloc[:,2:-1] #independent columns
y_B = dm.data_B.iloc[:,0]    #target column i.e pv measurement

X_C = dm.data_C.iloc[:,2:-1] #independent columns
y_C = dm.data_C.iloc[:,0]   #target column i.e pv measurement


X_A_train, X_A_test, y_A_train, y_A_test = train_test_split(X_A, y_A)
X_B_train, X_B_test, y_B_train, y_B_test = train_test_split(X_B, y_B)
X_C_train, X_C_test, y_C_train, y_C_test = train_test_split(X_C, y_C)

X_test_A = dm.X_test_estimated_a[dm.X_test_estimated_a.columns.intersection(X_A_train.columns)]
X_test_B = dm.X_test_estimated_b[X_B_train.columns.intersection(dm.X_test_estimated_b.columns)]
X_test_C = dm.X_test_estimated_c[X_C_train.columns.intersection(dm.X_test_estimated_c.columns)]

print(len(X_A_train.columns),  len(X_test_A.columns))

print(len(X_A_train.columns.intersection(dm.X_test_estimated_a.columns)))

print(X_A_train.columns)

print(dm.X_test_estimated_a.columns)

25 25
25
Index(['absolute_humidity_2m:gm3', 'ceiling_height_agl:m', 'cloud_base_agl:m',
       'dew_or_rime:idx', 'dew_point_2m:K', 'fresh_snow_1h:cm',
       'fresh_snow_24h:cm', 'is_in_shadow:idx', 'msl_pressure:hPa',
       'precip_type_5min:idx', 'pressure_50m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_melt_10min:mm', 'snow_water:kgm2', 'sun_elevation:d',
       'total_cloud_cover:p', 'visibility:m', 'wind_speed_10m:ms',
       'wind_speed_w_1000hPa:ms', 'month', 'hours', 'sum_rad:W'],
      dtype='object')
Index(['absolute_humidity_2m:gm3', 'ceiling_height_agl:m', 'cloud_base_agl:m',
       'dew_or_rime:idx', 'dew_point_2m:K', 'fresh_snow_1h:cm',
       'fresh_snow_24h:cm', 'is_in_shadow:idx', 'msl_pressure:hPa',
       'precip_type_5min:idx', 'pressure_50m:hPa', 'prob_rime:p',
       'rain_water:kgm2', 'relative_humidity_1000hPa:p', 'sfc_pressure:hPa',
       'snow_melt_10min:mm', 'snow_water:kgm2', 'sun_elevatio

In [175]:

cat_A.fit(X_A_train, y_A_train)
cat_B.fit(X_B_train, y_B_train)
cat_C.fit(X_C_train, y_C_train)
# cat.fit(X_B_train, y_B_train)
# cat.fit(X_C_train, y_C_train)

ada_A.fit(X_A_train, y_A_train)
ada_B.fit(X_B_train, y_B_train)
ada_C.fit(X_C_train, y_C_train)
# ada.fit(X_B_train, y_B_train)
# ada.fit(X_C_train, y_C_train)


Learning rate set to 0.085047
0:	learn: 1077.3741627	total: 6.76ms	remaining: 6.75s
1:	learn: 1008.1650553	total: 10.9ms	remaining: 5.42s
2:	learn: 947.1828666	total: 15.7ms	remaining: 5.22s
3:	learn: 890.5636830	total: 20.2ms	remaining: 5.02s
4:	learn: 839.5416016	total: 25ms	remaining: 4.97s
5:	learn: 794.4291799	total: 29.7ms	remaining: 4.92s
6:	learn: 754.6284838	total: 34.5ms	remaining: 4.9s
7:	learn: 718.1384766	total: 40ms	remaining: 4.96s
8:	learn: 684.8454154	total: 44.8ms	remaining: 4.94s
9:	learn: 655.4563346	total: 49.2ms	remaining: 4.87s
10:	learn: 630.0464678	total: 53.6ms	remaining: 4.82s
11:	learn: 606.5792286	total: 58.1ms	remaining: 4.79s
12:	learn: 586.1909715	total: 62.8ms	remaining: 4.77s
13:	learn: 568.1366411	total: 67.6ms	remaining: 4.76s
14:	learn: 552.8929011	total: 72.4ms	remaining: 4.75s
15:	learn: 539.3381071	total: 77.2ms	remaining: 4.75s
16:	learn: 526.7715448	total: 82.5ms	remaining: 4.77s
17:	learn: 515.9738846	total: 87.6ms	remaining: 4.78s
18:	learn: 

In [176]:

cat_preds_A = cat_A.predict(X_A_test)
ada_preds_A = ada_A.predict(X_A_test)

cat_preds_B = cat_B.predict(X_B_test)
ada_preds_B = ada_B.predict(X_B_test)

cat_preds_C = cat_C.predict(X_C_test)
ada_preds_C = ada_C.predict(X_C_test)

cat_A_df = pd.DataFrame(data=cat_preds_A, columns=["pv_preds"])
ada_A_df = pd.DataFrame(data=ada_preds_A, columns=["pv_preds"])

cat_B_df = pd.DataFrame(data=cat_preds_B, columns=["pv_preds"])
ada_B_df = pd.DataFrame(data=ada_preds_B, columns=["pv_preds"])

cat_C_df = pd.DataFrame(data=cat_preds_C, columns=["pv_preds"])
ada_C_df = pd.DataFrame(data=ada_preds_C, columns=["pv_preds"])

df_A = (cat_A_df + ada_A_df) / 2
df_B = (cat_B_df + ada_B_df) / 2
df_C = (cat_C_df + ada_C_df) / 2



In [177]:

X_A_train_forest, X_A_test_forest, y_A_train_forest, y_A_test_forest = train_test_split(df_A, y_A_test)
X_B_train_forest, X_B_test_forest, y_B_train_forest, y_B_test_forest = train_test_split(df_B, y_B_test)
X_C_train_forest, X_C_test_forest, y_C_train_forest, y_C_test_forest = train_test_split(df_C, y_C_test)




In [178]:
forest_A.fit(X_A_train_forest, y_A_train_forest)
forest_B.fit(X_B_train_forest, y_B_train_forest)
forest_C.fit(X_C_train_forest, y_C_train_forest)

In [179]:
final_score_A = forest_A.score(X_A_test_forest, y_A_test_forest)
final_score_B = forest_B.score(X_B_test_forest, y_B_test_forest)
final_score_C = forest_C.score(X_C_test_forest, y_C_test_forest)

print(final_score_A)
print(final_score_B)
print(final_score_C)

0.8528702196039609
0.8299273055178487
0.887030156281403


In [181]:
## Specify the model to be used

cat_preds_A = cat_A.predict(X_test_A)
ada_preds_A = ada_A.predict(X_test_A)

cat_preds_B = cat_B.predict(X_test_B)
ada_preds_B = ada_B.predict(X_test_B)

cat_preds_C = cat_C.predict(X_test_C)
ada_preds_C = ada_C.predict(X_test_C)

cat_A_df = pd.DataFrame(data=cat_preds_A, columns=["pv_preds"])
ada_A_df = pd.DataFrame(data=ada_preds_A, columns=["pv_preds"])

cat_B_df = pd.DataFrame(data=cat_preds_B, columns=["pv_preds"])
ada_B_df = pd.DataFrame(data=ada_preds_B, columns=["pv_preds"])

cat_C_df = pd.DataFrame(data=cat_preds_C, columns=["pv_preds"])
ada_C_df = pd.DataFrame(data=ada_preds_C, columns=["pv_preds"])

df_A = (cat_A_df + ada_A_df) / 2
df_B = (cat_B_df + ada_B_df) / 2
df_C = (cat_C_df + ada_C_df) / 2

pred_A = forest_A.predict(df_A)
pred_B = forest_B.predict(df_B)
pred_C = forest_C.predict(df_C)

df_A = pd.DataFrame()

df_A["prediction"] = pred_A
df_A["location"] = "A"

df_B = pd.DataFrame()

df_B["prediction"] = pred_B
df_B["location"] = "B"

df_C = pd.DataFrame()

df_C["prediction"] = pred_C
df_C["location"] = "C"

df_mid = pd.concat([df_A, df_B], ignore_index=True)

df = pd.concat([df_mid, df_C], join="inner", ignore_index=True)



df = df.drop("location", axis=1)


# df["id"] = test["id"]

# df = df[["id", "prediction"]]

df[df<0] = 0

# NAME THE FILE 
df.to_csv("sub19.csv")

df

Unnamed: 0,prediction
0,2.40680
1,11.88000
2,3.31540
3,228.19115
4,344.24170
...,...
2155,54.49535
2156,12.47295
2157,3.08700
2158,6.44350
