In [16]:
# Import necessary libraries
from pycaret.regression import setup, compare_models, create_model, predict_model, evaluate_model, finalize_model, save_model
from sklearn.datasets import load_iris
import pandas as pd

In [None]:
# Build the Iris frame: one column per measured feature, plus the integer
# class codes from `data.target` stored under 'species'.
data = load_iris()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    species=data.target
)

In [None]:
# 'species' is a class label, so this cell needs PyCaret's classification API.
# The notebook's first cell binds these names from pycaret.regression, which
# would fit regressors to the class codes and rank them by R2 rather than the
# 'Accuracy' this cell expects. Re-bind them here so the cell is correct on a
# fresh kernel.
from pycaret.classification import (
    setup,
    compare_models,
    create_model,
    evaluate_model,
    predict_model,
    finalize_model,
)

# Setup the environment; session_id pins the seed (same value the other
# experiments in this notebook use) so the split and scores are repeatable
clf1 = setup(df, target='species', session_id=123, verbose=False)

# Compare baseline models and return top 3 based on 'Accuracy'
top3 = compare_models(n_select=3)

# Create a model
model = create_model('rf')  # 'rf' stands for Random Forest

# Evaluate the model
evaluate_model(model)

# Make predictions on holdout set
predictions = predict_model(model)

# Finalize the model for deployment
final_model = finalize_model(model)


In [None]:
# Classification Functional API Example
# loading sample dataset ('juice' is fetched through PyCaret's dataset helper)
from pycaret.datasets import get_data
data = get_data('juice')
# init setup
# NOTE(review): this wildcard import presumably rebinds setup, compare_models,
# evaluate_model, predict_model, save_model, etc. to the classification API,
# replacing the pycaret.regression versions imported in the first cell --
# confirm that any later regression cell re-imports what it needs.
from pycaret.classification import *
# number of rows in the sample dataset
print(len(data))


In [None]:
# Initialise the experiment on the juice data: 'Purchase' is the label column
# and session_id fixes the seed for the run.
s = setup(
    data,
    target='Purchase',
    session_id=123,
)

In [None]:
# model training and selection
# compare_models() is called with its defaults; the returned model is kept as
# `best` and reused by the evaluation, prediction and save cells that follow.
best = compare_models()

In [None]:
# evaluate trained model (`best` comes from the compare_models() cell above);
# left as the cell's last expression so the notebook displays its output
evaluate_model(best)

In [None]:
# Predictions for the hold-out/test set
pred_holdout = predict_model(best)

# Predictions for "new" data: the same rows with the 'Purchase' label removed
new_data = data.drop(columns='Purchase')
predictions = predict_model(best, data=new_data)

# Save the model under the name 'best_pipeline'
save_model(best, 'best_pipeline')

In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_housing_data():
    """Return the housing dataset as a DataFrame, fetching it when needed.

    The CSV is read from ``datasets/housing/housing.csv``. If that file is
    missing, it is extracted from ``datasets/housing.tgz``; if the archive is
    missing too, the archive is downloaded first. Each step runs only when its
    result is absent, so:

    * an archive left behind without an extracted CSV (e.g. an interrupted
      earlier run) is extracted instead of failing at ``read_csv``;
    * an already-extracted CSV is read without any network access.

    Returns:
        pandas.DataFrame: the parsed contents of ``housing.csv``.

    Raises:
        urllib.error.URLError: if the download is needed and fails.
        tarfile.TarError: if the archive cannot be read.
    """
    tarball_path = Path("datasets/housing.tgz")
    csv_path = Path("datasets/housing/housing.csv")
    if not csv_path.is_file():
        if not tarball_path.is_file():
            Path("datasets").mkdir(parents=True, exist_ok=True)
            url = "https://github.com/ageron/data/raw/main/housing.tgz"
            urllib.request.urlretrieve(url, tarball_path)
        with tarfile.open(tarball_path) as housing_tarball:
            # The archive comes from the network: where the interpreter
            # supports extraction filters, use the "data" filter so members
            # cannot escape the target directory. Older interpreters without
            # the feature keep the previous unfiltered behaviour.
            if hasattr(tarfile, "data_filter"):
                housing_tarball.extractall(path="datasets", filter="data")
            else:
                housing_tarball.extractall(path="datasets")
    return pd.read_csv(csv_path)

# Load the housing CSV into a DataFrame (load_housing_data() downloads the
# archive first when it is not already on disk)
housing = load_housing_data()

In [8]:
# Preview the first rows of the housing frame
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [11]:
# An earlier cell runs `from pycaret.classification import *`, which rebinds
# setup / compare_models / evaluate_model / predict_model / save_model to the
# classification API. 'median_house_value' is a continuous target, so bind the
# regression versions again here; without this, a top-to-bottom run on a fresh
# kernel would not reproduce the regression experiment recorded in the outputs.
from pycaret.regression import (
    setup,
    compare_models,
    evaluate_model,
    predict_model,
    save_model,
)

# Initialise the regression experiment; session_id fixes the seed for the run
s = setup(housing, target='median_house_value', session_id=123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,median_house_value
2,Target type,Regression
3,Original data shape,"(20640, 10)"
4,Transformed data shape,"(20640, 14)"
5,Transformed train set shape,"(14447, 14)"
6,Transformed test set shape,"(6193, 14)"
7,Numeric features,8
8,Categorical features,1
9,Rows with missing values,1.0%


In [12]:
# model training and selection
# compare_models() is called with its defaults; the returned model is kept as
# `best` (Light Gradient Boosting Machine tops the recorded results table).
best = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,31965.221,2290310010.0155,47813.5842,0.8282,0.2374,0.18,0.369
rf,Random Forest Regressor,32392.7457,2488626259.8746,49849.3961,0.8133,0.2418,0.1813,0.665
et,Extra Trees Regressor,35266.9227,2811146927.0735,52976.6391,0.789,0.2551,0.1975,0.264
gbr,Gradient Boosting Regressor,38085.2975,3033267318.3205,55040.2351,0.7726,0.27,0.2142,0.242
lr,Linear Regression,49577.4363,4716853793.9212,68608.9368,0.646,0.3772,0.2868,0.183
lasso,Lasso Regression,49577.7955,4716860016.021,68609.0056,0.646,0.3771,0.2868,0.024
ridge,Ridge Regression,49578.7243,4716870746.7228,68609.2087,0.646,0.3787,0.2868,0.009
llar,Lasso Least Angle Regression,49577.7917,4716860741.5232,68609.011,0.646,0.3767,0.2868,0.009
br,Bayesian Ridge,49591.1185,4719175049.5751,68626.582,0.6458,0.3766,0.2869,0.009
lar,Least Angle Regression,49633.114,4791193594.6773,69149.7096,0.6403,0.389,0.2846,0.009


In [13]:
# evaluate trained model; in the recorded run this renders an interactive
# widget with a 'Plot Type' selector
evaluate_model(best)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [17]:
# Predictions for the hold-out/test set
pred_holdout = predict_model(best)

# Predictions for "new" data: the same rows with the target column removed
new_data = housing.drop(columns='median_house_value')
predictions = predict_model(best, data=new_data)

# Save the model under the name 'best_housing_predictor'
save_model(best, 'best_housing_predictor')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Light Gradient Boosting Machine,32307.2773,2297304651.9079,47930.206,0.8264,0.2341,0.1791


Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['longitude', 'latitude',
                                              'housing_median_age',
                                              'total_rooms', 'total_bedrooms',
                                              'population', 'households',
                                              'median_income'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy...
                  LGBMRegres