In [1]:
# Project #2, Abalone classification by sex
!pip install pycaret



In [2]:
# Import libraries
import pandas as pd
from pycaret.classification import *

In [3]:
# Load the dataset from UCI
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data"
columns = [
    "Sex", "Length", "Diameter", "Height",
    "WholeWeight", "ShuckedWeight", "VisceraWeight", "ShellWeight", "Rings"
]
abalone = pd.read_csv(url, names=columns)

In [4]:
# Check data
abalone.head()

Unnamed: 0,Sex,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
# Check for class imbalance
abalone['Sex'].value_counts(normalize=True) * 100

Unnamed: 0_level_0,proportion
Sex,Unnamed: 1_level_1
M,36.581278
I,32.128322
F,31.2904


In [6]:
# Initialize PyCaret for classification
s = setup(
    data=abalone,
    target='Sex',
    session_id=123,
    fix_imbalance=True,     # handle class imbalance automatically
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Sex
2,Target type,Multiclass
3,Target mapping,"F: 0, I: 1, M: 2"
4,Original data shape,"(4177, 9)"
5,Transformed data shape,"(4461, 9)"
6,Transformed train set shape,"(3207, 9)"
7,Transformed test set shape,"(1254, 9)"
8,Numeric features,8
9,Preprocess,True


In [7]:
# Compare models
best_model = compare_models(sort='F1') # Weighted F1. Reward balanced performance across all classes
print(best_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.5416,0.7436,0.5416,0.5341,0.5361,0.3123,0.3133,0.414
gbc,Gradient Boosting Classifier,0.5419,0.0,0.5419,0.5322,0.5345,0.3134,0.3148,1.578
lr,Logistic Regression,0.5501,0.0,0.5501,0.541,0.5331,0.3299,0.3372,0.917
lda,Linear Discriminant Analysis,0.544,0.0,0.544,0.5336,0.5323,0.3188,0.3227,0.047
rf,Random Forest Classifier,0.5323,0.7385,0.5323,0.5277,0.5288,0.2983,0.299,0.682
xgboost,Extreme Gradient Boosting,0.5265,0.726,0.5265,0.5227,0.5236,0.2896,0.2902,0.517
lightgbm,Light Gradient Boosting Machine,0.5234,0.7337,0.5234,0.5178,0.5194,0.2849,0.2855,1.068
knn,K Neighbors Classifier,0.5163,0.7021,0.5163,0.5171,0.5127,0.2769,0.2793,0.072
ridge,Ridge Classifier,0.5385,0.0,0.5385,0.5294,0.5124,0.3143,0.3263,0.047
qda,Quadratic Discriminant Analysis,0.5299,0.0,0.5299,0.518,0.5047,0.2938,0.3042,0.045


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)


In [8]:
# Tune the best model
tuned_model = tune_model(best_model, optimize='F1')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.4983,0.7541,0.4983,0.4738,0.4756,0.2531,0.2589
1,0.5529,0.7651,0.5529,0.5414,0.5448,0.3309,0.3325
2,0.4983,0.7519,0.4983,0.4835,0.4763,0.2548,0.2634
3,0.5342,0.7524,0.5342,0.5199,0.5087,0.3074,0.3177
4,0.5034,0.7502,0.5034,0.489,0.4857,0.2604,0.2659
5,0.5274,0.7423,0.5274,0.5213,0.5145,0.2959,0.3013
6,0.5788,0.76,0.5788,0.5708,0.5699,0.3708,0.3741
7,0.5514,0.757,0.5514,0.5396,0.5329,0.3319,0.3394
8,0.5411,0.749,0.5411,0.5321,0.5317,0.3143,0.317
9,0.5445,0.7754,0.5445,0.5315,0.5307,0.3199,0.3242


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [9]:
# Stack top models
top_models = compare_models(n_select=3, sort='F1')  # pick top 3

for i in range(len(top_models)):
    print(f"Top {i+1}: {top_models[i]}")

stacked_model = stack_models(top_models, optimize='F1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.5416,0.7436,0.5416,0.5341,0.5361,0.3123,0.3133,0.52
gbc,Gradient Boosting Classifier,0.5419,0.0,0.5419,0.5322,0.5345,0.3134,0.3148,1.571
lr,Logistic Regression,0.5501,0.0,0.5501,0.541,0.5331,0.3299,0.3372,0.122
lda,Linear Discriminant Analysis,0.544,0.0,0.544,0.5336,0.5323,0.3188,0.3227,0.05
rf,Random Forest Classifier,0.5323,0.7385,0.5323,0.5277,0.5288,0.2983,0.299,0.708
xgboost,Extreme Gradient Boosting,0.5265,0.726,0.5265,0.5227,0.5236,0.2896,0.2902,0.294
lightgbm,Light Gradient Boosting Machine,0.5234,0.7337,0.5234,0.5178,0.5194,0.2849,0.2855,0.853
knn,K Neighbors Classifier,0.5163,0.7021,0.5163,0.5171,0.5127,0.2769,0.2793,0.075
ridge,Ridge Classifier,0.5385,0.0,0.5385,0.5294,0.5124,0.3143,0.3263,0.045
qda,Quadratic Discriminant Analysis,0.5299,0.0,0.5299,0.518,0.5047,0.2938,0.3042,0.046


Processing:   0%|          | 0/67 [00:00<?, ?it/s]

Top 1: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)
Top 2: GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                    

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.57,0.0,0.57,0.5692,0.5695,0.3531,0.3531
1,0.5392,0.0,0.5392,0.5461,0.5423,0.3065,0.3067
2,0.5597,0.0,0.5597,0.5627,0.561,0.3386,0.3387
3,0.5514,0.0,0.5514,0.5417,0.5443,0.3277,0.329
4,0.5274,0.0,0.5274,0.5223,0.5244,0.2905,0.2907
5,0.524,0.0,0.524,0.5185,0.5182,0.2883,0.2903
6,0.5685,0.0,0.5685,0.564,0.5661,0.3517,0.3518
7,0.5514,0.0,0.5514,0.5397,0.543,0.3255,0.3269
8,0.5479,0.0,0.5479,0.545,0.5464,0.3207,0.3208
9,0.5582,0.0,0.5582,0.5554,0.5532,0.3331,0.3355


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [10]:
# Evaluate the final stacked model
evaluate_model(stacked_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [11]:
# Predict on new data
predictions = predict_model(stacked_model, data=abalone)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.8602,0.9696,0.8602,0.8596,0.8599,0.7898,0.7898


In [12]:
# Inspect top rows of the predictions DataFrame
predictions.head()

Unnamed: 0,Length,Diameter,Height,WholeWeight,ShuckedWeight,VisceraWeight,ShellWeight,Rings,Sex,prediction_label,prediction_score
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15,M,M,0.752
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7,M,M,0.5823
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9,F,F,0.9097
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10,M,M,0.7806
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7,I,I,0.9399


In [13]:
# Save the best trained model pipeline to a file
save_model(best_model, 'abalone_sex_classifier_pipeline')

# Load the saved model pipeline from file
loaded_model = load_model('abalone_sex_classifier_pipeline')

# Display the loaded model object
print("Loaded model pipeline:")
print(loaded_model)

Transformation Pipeline and Model Successfully Saved
Transformation Pipeline and Model Successfully Loaded
Loaded model pipeline:
Pipeline(memory=FastMemory(location=/tmp/joblib),
         steps=[('label_encoding',
                 TransformerWrapperWithInverse(exclude=None, include=None,
                                               transformer=LabelEncoder())),
                ('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['Length', 'Diameter', 'Height',
                                             'WholeWeight', 'ShuckedWeight',
                                             'VisceraWeight', 'ShellWeight',
                                             'Rings'],
                                    transformer=SimpleImputer(add_i...
                 ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                      class_weight=None, criterion='gini',
                                      max_d