# autogluon.tabular

In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
train = pd.read_csv('../data/train.csv', sep=",")
test = pd.read_csv('../data/test.csv')

import sklearn
y = train['SalePrice']
X = train.drop('SalePrice', axis=1)
X=X[set(X.columns.values.tolist())]
categorical = [var for var in X.columns if X[var].dtype=='O']
numerical = [var for var in X.columns if X[var].dtype!='O']
X[categorical] = X[categorical].fillna('None')

# auto-sklearn can not deal with categorical variables
X= pd.concat([pd.get_dummies(X[categorical], dummy_na=True), X[numerical]], axis=1)

y = np.log1p(y)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state=42, test_size=0.2)

  X=X[set(X.columns.values.tolist())]


In [9]:
from autogluon.tabular import TabularDataset, TabularPredictor
X_train = pd.concat([X_train, y_train], axis=1)
X_test = pd.concat([X_test, y_test], axis=1)
train_data = TabularDataset(X_train)
test_data = TabularDataset(X_test)
predictor = TabularPredictor(label='SalePrice',eval_metric = 'root_mean_squared_error').fit(train_data, time_limit=300,
                                                   presets='best_quality',auto_stack=True,)  # For a fair comparison with auto-sklearn
# leaderboard = predictor.leaderboard(test_data)


No path specified. Models will be saved in: "AutogluonModels/ag-20230421_121610/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=20
Beginning AutoGluon training ... Time limit = 300s
AutoGluon will save models to "AutogluonModels/ag-20230421_121610/"
AutoGluon Version:  0.7.0
Python Version:     3.9.16
Operating System:   Linux
Platform Machine:   aarch64
Platform Version:   #1 SMP PREEMPT Tue Sep 13 07:51:32 UTC 2022
Train Data Rows:    1168
Train Data Columns: 347
Label Column: SalePrice
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (13.521140839642674, 10.460270761075149, 12.03066, 0.39061)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['bi

Repeating k-fold bagging: 17/20
Repeating k-fold bagging: 18/20
Repeating k-fold bagging: 19/20
Repeating k-fold bagging: 20/20
Completed 20/20 k-fold bagging repeats ...
Fitting model: WeightedEnsemble_L2 ... Training model for up to 299.83s of the 297.06s of remaining time.
	-0.1412	 = Validation score   (-root_mean_squared_error)
	0.04s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting 9 L2 models ...
Fitting model: LightGBMXT_BAG_L2 ... Training model for up to 297.01s of the 297.01s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
		`import lightgbm` failed. A quick tip is to install via `pip install autogluon.tabular[lightgbm]==0.7.0`.
Fitting model: LightGBM_BAG_L2 ... Training model for up to 296.97s of the 296.96s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
		`import lightgbm` failed. A quick tip is to install via `pip install autogluon.tabular[light

In [10]:
y_test = test_data['SalePrice']  # values to predict
test_data_nolab = test_data.drop(columns=['SalePrice'])  # delete label column to prove we're not cheating
y_pred = predictor.predict(test_data_nolab)

perf = predictor.evaluate_predictions(y_true=y_test, y_pred=y_pred, auxiliary_metrics=True)

Evaluation: root_mean_squared_error on test data: -0.1363192172330769
	Note: Scores are always higher_is_better. This metric score can be multiplied by -1 to get the metric value.
Evaluations on test data:
{
    "root_mean_squared_error": -0.1363192172330769,
    "mean_squared_error": -0.018582928987038814,
    "mean_absolute_error": -0.09058484638885887,
    "r2": 0.9004188921159217,
    "pearsonr": 0.9499610277743544,
    "median_absolute_error": -0.05549754284556929
}


In [11]:
results = predictor.fit_summary(show_plot=True)

*** Summary of fit() ***
Estimated performance of each model:
                    model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L3  -0.133724       0.296334  2.999137                0.000138           0.022182            3       True          8
1    ExtraTreesMSE_BAG_L2  -0.134864       0.223110  2.018322                0.071800           0.599221            2       True          7
2  RandomForestMSE_BAG_L2  -0.136211       0.224396  2.377734                0.073086           0.958633            2       True          6
3     WeightedEnsemble_L2  -0.141171       0.149018  1.451817                0.000154           0.036620            2       True          5
4  RandomForestMSE_BAG_L1  -0.145216       0.072193  0.837244                0.072193           0.837244            1       True          3
5    ExtraTreesMSE_BAG_L1  -0.145484       0.074117  0.574159                0.074117           0.



# Scikit-Optimize

Probably you'll have to reload the notebook for the changes being in place. Scikit-Optimize works only with sklearn 0.23.2

In [12]:
!pip install scikit-optimize



In [13]:
import skopt
import importlib
import sklearn
display(skopt.__version__)
importlib.reload(sklearn)
display(sklearn.__version__)

# Since BayesSearchCV can not deal with missing values, we have to impute them before:

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np


numeric_transformer = SimpleImputer(missing_values=np.nan, strategy='mean')

X_train.loc[:, numerical] = numeric_transformer.fit_transform(X_train.loc[:, numerical])
X_test.loc[:, numerical] = numeric_transformer.fit_transform(X_test.loc[:, numerical])

from skopt.space import Real, Categorical, Integer
from skopt import BayesSearchCV
from sklearn.ensemble import GradientBoostingRegressor

regressor = BayesSearchCV(
    estimator = GradientBoostingRegressor(),
      search_spaces ={
         'learning_rate': Real(0.1,0.3),
         'loss': Categorical(['ls']),
         'max_depth': Integer(3,6),
         'n_estimators': Integer(100, 600),
         'subsample': Real(0.6, 1.0),
         'max_features': Real(0.6, 1.0) 
      },
    n_iter=64,
    random_state=0,
    verbose=1, iid=True,
    cv=5, n_jobs=-1
  )
regressor.fit(X_train,y_train)

'0.9.0'

'1.2.2'

  X_train.loc[:, numerical] = numeric_transformer.fit_transform(X_train.loc[:, numerical])
  X_test.loc[:, numerical] = numeric_transformer.fit_transform(X_test.loc[:, numerical])


AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [None]:
predictions = regressor.predict(X_test)
print("mean-squared-error:", sklearn.metrics.mean_squared_error(y_test, predictions, squared=False))

# TPOT (Tree-based Pipeline Optimization Tool)

!pip install tpot

the steps below require a new-start of the notebook - if not the changes are not in place

!pip install --upgrade ipywidgets
!jupyter nbextension enable --py widgetsnbextension

from tpot import TPOTRegressor
from sklearn.datasets import load_digits 
from sklearn.model_selection import train_test_split 

tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=42, n_jobs=-1)
tpot.fit(X_train, y_train)
print(np.sqrt(-tpot.score(X_test, y_test)))

# FLAML

!pip install flaml

from sklearn.model_selection import train_test_split
train = pd.read_csv('../data/train.csv', sep=",")
test = pd.read_csv('../data/test.csv')

import sklearn
y = train['SalePrice']
X = train.drop('SalePrice', axis=1)
y = np.log1p(y)
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, random_state=42, test_size=0.2)

from flaml import AutoML
from flaml.default import LGBMRegressor
automl = AutoML()
# X_train, X_test, y_train, y_test
automl.fit(X_train, y_train, task="regression", estimator_list=["lgbm", "rf", "extra_tree"],
           train_time_limit=60,
           time_budget=600,
           n_jobs=-1,
          ## ensemble=dict(final_estimator= LGBMRegressor(),
          ## passthrough = True), 
           log_file_name='flaml_.log', log_type='all')

print(automl.best_estimator)

print(automl.best_config)

print(automl.best_config_per_estimator)

from flaml.data import get_output_from_log

time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = \
    get_output_from_log(filename='flaml_logfile', time_budget=120)

!pip install matplotlib

import matplotlib.pyplot as plt
import numpy as np

plt.title("Learning Curve")
plt.xlabel("Wall Clock Time (s)")
plt.ylabel("Validation Accuracy")
plt.step(time_history, 1 - np.array(best_valid_loss_history), where="post")
plt.show()

prediction = automl.predict(X_test)

np.sqrt(np.mean((prediction - y_test)**2))

# OPTUNA

!pip install optuna

import lightgbm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import optuna
from sklearn.model_selection import cross_val_score

train = pd.read_csv('../data/train.csv', sep=",")
categorical = [var for var in train.columns if train[var].dtype=='O']

for cat_feat in categorical:
    train[cat_feat] = train[cat_feat].astype('category')
    
y = train['SalePrice']
X = train.drop('SalePrice', axis=1)
y = np.log1p(y)
X_train, X_test, y_train, y_test = \
        train_test_split(X, y, random_state=42, test_size=0.2)

def objective(trial):

    x_tr, x_te, y_tr, y_te = \
        train_test_split(X_train, y_train, random_state=42, test_size=0.2)
    
    model = lightgbm.LGBMRegressor()

    param = {
        "objective": "regression",
        "metric": "'neg_mean_squared_error'",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "n_estimators" : trial.suggest_int("n_estimators", 100, 400),
        "num_leaves": trial.suggest_int("num_leaves", 4, 30),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "subsample_freq": trial.suggest_int("subsample_freq",0, 8),
        "colsample_bytree": trial.suggest_float("colsample_bytree",0.5,1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0)
    }
    model.set_params(**param)
    clf = cross_val_score(
    model,
         x_tr, y_tr, scoring = 'neg_mean_squared_error'
    )

    return np.mean(np.sqrt(-clf))

study = optuna.create_study()
study.optimize(objective , n_trials =70)
trial = study.best_trial
model = lightgbm.LGBMRegressor()
model.set_params(**trial.params)
for k,v in trial.params.items():
    print(f'{k}: {v}')
model.fit(X_train, y_train)
print(f'result on hold-out set after HPO: {np.sqrt(mean_squared_error(y_test, model.predict(X_test)))}')