In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, StackingRegressor, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, log_loss, precision_score, f1_score, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import os

In [19]:
os.chdir("D:\\meridianthe4\\PML\\Cases\\Concrete_Strength")

In [20]:
concrete = pd.read_csv("Concrete_Data.csv")
X, y = concrete.drop("Strength", axis=1), concrete["Strength"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=25)

In [21]:
lr = LinearRegression()
rr = Ridge()
lr = Lasso()
en = ElasticNet()
rfr = RandomForestRegressor(random_state=25)
xgb = XGBRegressor(random_state=25)
lgbm = LGBMRegressor(random_state=25)
catb = CatBoostRegressor(random_state=25, verbose=False)

In [17]:
final_ests = [rfr, xgb, lgbm, catb]
scores = []
for final_est in tqdm(final_ests):
    for passthrough in [True, False]:
        stack = StackingRegressor(estimators=[('lr', lr), ('rr', rr), ('lr_lasso', lr), ('en', en)], final_estimator=final_est, passthrough=passthrough)
        stack.fit(X_train, y_train)
        y_pred = stack.predict(X_test)
        mae = mean_absolute_error(y_test, y_pred)
        scores.append([final_est, passthrough, mae])

 75%|███████▌  | 3/4 [00:01<00:00,  2.68it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1743
[LightGBM] [Info] Number of data points in the train set: 721, number of used features: 12
[LightGBM] [Info] Start training from score 35.938960
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 721, number of used features: 4
[LightGBM] [Info] Start training from score 35.938960


100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


In [12]:
scores_df = pd.DataFrame(scores, columns=["Final Estimator", "Passthrough", "score"])
scores_df.sort_values(by="score")

Unnamed: 0,Final Estimator,Passthrough,score
6,<catboost.core.CatBoostRegressor object at 0x0...,True,3.179003
4,LGBMRegressor(random_state=25),True,3.402113
2,"XGBRegressor(base_score=None, booster=None, ca...",True,3.701099
0,RandomForestRegressor(random_state=25),True,4.092146
7,<catboost.core.CatBoostRegressor object at 0x0...,False,7.555002
5,LGBMRegressor(random_state=25),False,7.932917
1,RandomForestRegressor(random_state=25),False,8.808664
3,"XGBRegressor(base_score=None, booster=None, ca...",False,9.382875


In [22]:
final_ests = [rfr, xgb, lgbm, catb]
scores = []
for final_est in tqdm(final_ests):
    for passthrough in [True, False]:
        stack = StackingRegressor(estimators=[('lr', lr), ('rr', rr), ('lr_lasso', lr), ('en', en)], final_estimator=final_est, passthrough=passthrough)
        stack.fit(X_train, y_train)
        y_pred = stack.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        scores.append([final_est, passthrough, r2])

 75%|███████▌  | 3/4 [00:01<00:00,  2.48it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000092 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1743
[LightGBM] [Info] Number of data points in the train set: 721, number of used features: 12
[LightGBM] [Info] Start training from score 35.938960
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 721, number of used features: 4
[LightGBM] [Info] Start training from score 35.938960


100%|██████████| 4/4 [00:04<00:00,  1.04s/it]


In [23]:
scores_df = pd.DataFrame(scores, columns=["Final Estimator", "Passthrough", "score"])
scores_df.sort_values(by="score", ascending=False)

Unnamed: 0,Final Estimator,Passthrough,score
6,<catboost.core.CatBoostRegressor object at 0x0...,True,0.915112
4,LGBMRegressor(random_state=25),True,0.908096
2,"XGBRegressor(base_score=None, booster=None, ca...",True,0.89305
0,RandomForestRegressor(random_state=25),True,0.879136
7,<catboost.core.CatBoostRegressor object at 0x0...,False,0.639851
5,LGBMRegressor(random_state=25),False,0.622372
1,RandomForestRegressor(random_state=25),False,0.513304
3,"XGBRegressor(base_score=None, booster=None, ca...",False,0.451402
