### **FlexML:** Regression Experiment with California House Value Prediction Dataset

In [1]:
from flexml import Regression
from sklearn.datasets import fetch_california_housing

In [2]:
# Load the California housing dataset as one pandas DataFrame
# (it includes the "MedHouseVal" target column) and preview the first rows.
df = fetch_california_housing(as_frame=True).frame
df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [3]:
# By default, FlexML runs in 'quick' mode, which trains the most popular and fast
# models for faster results; the value is passed explicitly below.
# The default evaluation metric for Regression tasks is 'r2'.
#
# You can change the model evaluation parameters, such as:
#   cv_method  (default = 'kfold' for Regression, 'stratified_kfold' for Classification)
#   n_folds    (default = 5)
#   test_size  (default = 0.25 if cv_method is 'holdout', else None)
#   groups_col (default = None)
reg_exp = Regression(
    data=df,
    target_col="MedHouseVal",
)
reg_exp.start_experiment(experiment_size="quick")

INFO | 2025-04-06 14:08:32 | [PROCESS] Training the ML models with kfold validation
INFO | Training Progress:  | ██████████ | 100%
INFO | 2025-04-06 14:08:46 | [PROCESS] Model training is finished!


Unnamed: 0,Model Name,R2,MAE,MSE,RMSE,MAPE,Time (sec)
1,CatBoostRegressor,0.852024,0.291575,0.196942,0.443684,0.162991,6.93
2,LGBMRegressor,0.837282,0.309548,0.216548,0.465227,0.175458,2.24
3,XGBRegressor,0.834695,0.307479,0.21999,0.468914,0.171952,1.66
4,DecisionTreeRegressor,0.613782,0.459651,0.514159,0.717004,0.250717,0.7
5,Ridge,0.601385,0.531686,0.530564,0.728245,0.317594,0.02
6,LinearRegression,0.601378,0.531677,0.530572,0.728251,0.317588,0.0
7,ElasticNet,0.422752,0.679422,0.768426,0.876568,0.453446,0.0
8,Lasso,0.284865,0.767992,0.952078,0.975702,0.520122,0.01
9,HuberRegressor,-2.166381,0.5909,4.220209,1.535683,0.34381,1.79


In [4]:
# Once the experiment is done, the model stats can be displayed again using the
# evaluation metric of your choice (the default is "r2").
reg_exp.show_model_stats(eval_metric="r2")

Unnamed: 0,Model Name,R2,MAE,MSE,RMSE,MAPE,Time (sec)
1,CatBoostRegressor,0.852024,0.291575,0.196942,0.443684,0.162991,6.93
2,LGBMRegressor,0.837282,0.309548,0.216548,0.465227,0.175458,2.24
3,XGBRegressor,0.834695,0.307479,0.21999,0.468914,0.171952,1.66
4,DecisionTreeRegressor,0.613782,0.459651,0.514159,0.717004,0.250717,0.7
5,Ridge,0.601385,0.531686,0.530564,0.728245,0.317594,0.02
6,LinearRegression,0.601378,0.531677,0.530572,0.728251,0.317588,0.0
7,ElasticNet,0.422752,0.679422,0.768426,0.876568,0.453446,0.0
8,Lasso,0.284865,0.767992,0.952078,0.975702,0.520122,0.01
9,HuberRegressor,-2.166381,0.5909,4.220209,1.535683,0.34381,1.79


In [5]:
# Retrieve the best model(s) with get_best_models():
#   - top_n_models (default = 1): when only one model is requested you get a single
#     model object; otherwise you get a list of model objects.
#   - By default, the best model(s) are chosen with the evaluation metric that was
#     used in start_experiment().
#   - You can reorder the models by passing the eval_metric parameter to get_best_models().
best_model = reg_exp.get_best_models(top_n_models=1)

# Optionally, fetch any model object by its "Model Name" value from the
# show_model_stats() output.
example_model = reg_exp.get_model_by_name("CatBoostRegressor")

In [7]:
# Plot model evaluation graphs for a chosen model.
#
# Available 'kind' values for Regression:
#   - "feature_importance" (default)
#   - "residuals"
#   - "prediction_error"
#   - "calibration_curve"
#   - "shap_summary"
#   - "shap_violin"
#
# Some kinds accept extra parameters; see the docstring of plot() for details.
reg_exp.plot(model="CatBoostRegressor")

In [8]:
reg_exp.tune_model(tuning_method="optuna")
# If you do not change any validation setting here (e.g. cv_method, n_folds, test_size,
# groups_col), tuning uses the same validation strategy as start_experiment().
# If you do change one, the model leaderboard is cleared, because the validation
# strategy is then different.

INFO | 2025-04-06 14:08:48 | [PROCESS] Model Tuning process started with 'optuna' method


Unnamed: 0,Model Name,R2,MAE,MSE,RMSE,MAPE,Time (sec)
1,CatBoostRegressor_(optuna)_(n_iter=10),0.852599,0.289819,0.196189,0.442813,0.161855,49.18
2,CatBoostRegressor,0.852024,0.291575,0.196942,0.443684,0.162991,6.93
3,LGBMRegressor,0.837282,0.309548,0.216548,0.465227,0.175458,2.24
4,XGBRegressor,0.834695,0.307479,0.21999,0.468914,0.171952,1.66
5,DecisionTreeRegressor,0.613782,0.459651,0.514159,0.717004,0.250717,0.7
6,Ridge,0.601385,0.531686,0.530564,0.728245,0.317594,0.02
7,LinearRegression,0.601378,0.531677,0.530572,0.728251,0.317588,0.0
8,ElasticNet,0.422752,0.679422,0.768426,0.876568,0.453446,0.0
9,Lasso,0.284865,0.767992,0.952078,0.975702,0.520122,0.01
10,HuberRegressor,-2.166381,0.5909,4.220209,1.535683,0.34381,1.79


INFO | 2025-04-06 14:09:38 | [PROCESS] Model Tuning process is finished successfully


In [9]:
reg_exp.tune_model(model="LGBMRegressor", tuning_method="randomized_search", n_iter=5)
# From here on you are free to experiment: you can pass any model object and any tuning
# method you want, and all results appear in the same model leaderboard for comparison.
# (This call selects the model by its name.)

INFO | 2025-04-06 14:09:38 | [PROCESS] Model Tuning process started with 'randomized_search' method


Unnamed: 0,Model Name,R2,MAE,MSE,RMSE,MAPE,Time (sec)
1,LGBMRegressor_(randomized_search)_(n_iter=5),0.855076,0.286317,0.192833,0.439004,0.159922,41.98
2,CatBoostRegressor_(optuna)_(n_iter=10),0.852599,0.289819,0.196189,0.442813,0.161855,49.18
3,CatBoostRegressor,0.852024,0.291575,0.196942,0.443684,0.162991,6.93
4,LGBMRegressor,0.837282,0.309548,0.216548,0.465227,0.175458,2.24
5,XGBRegressor,0.834695,0.307479,0.21999,0.468914,0.171952,1.66
6,DecisionTreeRegressor,0.613782,0.459651,0.514159,0.717004,0.250717,0.7
7,Ridge,0.601385,0.531686,0.530564,0.728245,0.317594,0.02
8,LinearRegression,0.601378,0.531677,0.530572,0.728251,0.317588,0.0
9,ElasticNet,0.422752,0.679422,0.768426,0.876568,0.453446,0.0
10,Lasso,0.284865,0.767992,0.952078,0.975702,0.520122,0.01


INFO | 2025-04-06 14:10:20 | [PROCESS] Model Tuning process is finished successfully


In [10]:
# A tuned model can be retrieved with get_model_by_name(), using its name as shown
# in the model leaderboard.

my_tuned_lgb_model = reg_exp.get_model_by_name("LGBMRegressor_(randomized_search)_(n_iter=5)")
my_tuned_lgb_model

## Prediction and Model Saving

In [11]:
# Let's assume you have new data to predict on; for this example we reuse the same
# data that was used for training.

X = reg_exp.X

# The full_train parameter (default=True) trains the model on the whole dataset before
# predicting; otherwise the model trained on the last X_train, y_train fold is used.
preds = reg_exp.predict(X)
preds

INFO | 2025-04-06 14:10:20 | Training the model using the whole data


array([4.30071943, 3.85183199, 4.16394166, ..., 0.96663076, 0.89024036,
       0.86868   ])

In [12]:
# Saves the model together with the feature engineering pipeline;
# set model_only to True if you want to save only the model.
reg_exp.save_model(my_tuned_lgb_model)

INFO | 2025-04-06 14:10:21 | No save path provided. Using default: pipeline.pkl
INFO | 2025-04-06 14:10:21 | Training the model using the whole data
INFO | 2025-04-06 14:10:23 | Pipeline saved successfully at pipeline.pkl
