In [None]:
pip install pycaret



In [None]:
# ==============================
# 1. Imports
# ==============================
import pandas as pd
from pycaret.classification import *

# ==============================
# 2. Load Data
# ==============================
!wget -q https://archive.ics.uci.edu/static/public/186/wine+quality.zip
!unzip -oq wine+quality.zip
df_wine = pd.read_csv("winequality-red.csv", sep=';')

print("Original dataset shape:", df_wine.shape)
print(df_wine['quality'].value_counts().sort_index())

# ==============================
# 3. Feature Engineering
# ==============================
df_wine['total_acidity'] = df_wine['fixed acidity'] + df_wine['volatile acidity']
df_wine['alcohol_to_density'] = df_wine['alcohol'] / df_wine['density']
df_wine['sulphate_to_chloride'] = df_wine['sulphates'] / (df_wine['chlorides'] + 1e-5)

print("After feature engineering:", df_wine.shape)

# ==============================
# 4. Setup PyCaret
# ==============================
clf_setup = setup(
    data=df_wine,
    target='quality',
    session_id=42,
    normalize=True,
    normalize_method='zscore',
    fix_imbalance=False,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.95,
    fold=10,
    fold_shuffle=True,
    use_gpu=False,
    verbose=True
)

# ==============================
# 5. Compare Models
# ==============================
best_model = compare_models(sort='F1')
print("Best model selected:", best_model)

# ==============================
# 6. Tune the Best Model
# ==============================
tuned_model = tune_model(best_model, optimize='F1', n_iter=50)
print("Tuned model:", tuned_model)


# ==============================
# 7. Blend Top Models
# ==============================
top3_models = compare_models(n_select=3, sort='F1')
blended_model = blend_models(top3_models, optimize='F1')
print("Blended model created.")

# ==============================
# 8. Finalize Model
# ==============================
final_model = finalize_model(blended_model)
print("Final model ready for deployment.")

# ==============================
# 9. Predictions
# ==============================
preds = predict_model(final_model)
print(preds.head())


Original dataset shape: (1599, 12)
quality
3     10
4     53
5    681
6    638
7    199
8     18
Name: count, dtype: int64
After feature engineering: (1599, 15)


Unnamed: 0,Description,Value
0,Session id,42
1,Target,quality
2,Target type,Multiclass
3,Target mapping,"3: 0, 4: 1, 5: 2, 6: 3, 7: 4, 8: 5"
4,Original data shape,"(1599, 15)"
5,Transformed data shape,"(1599, 13)"
6,Transformed train set shape,"(1119, 13)"
7,Transformed test set shape,"(480, 13)"
8,Numeric features,14
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6908,0.6,0.6908,0.6579,0.6681,0.4952,0.5003,0.35
rf,Random Forest Classifier,0.6756,0.5937,0.6756,0.6461,0.6531,0.4705,0.4761,0.427
lightgbm,Light Gradient Boosting Machine,0.6586,0.5803,0.6586,0.6323,0.642,0.4505,0.4535,2.155
xgboost,Extreme Gradient Boosting,0.6541,0.5795,0.6541,0.6282,0.6383,0.4442,0.4468,0.539
gbc,Gradient Boosting Classifier,0.6399,0.0,0.6399,0.6175,0.6243,0.4211,0.4247,2.344
dt,Decision Tree Classifier,0.5826,0.4805,0.5826,0.5834,0.5816,0.3505,0.3514,0.113
lr,Logistic Regression,0.5961,0.0,0.5961,0.5677,0.5722,0.3343,0.3389,0.083
lda,Linear Discriminant Analysis,0.5756,0.0,0.5756,0.557,0.5608,0.3183,0.321,0.056
knn,K Neighbors Classifier,0.5746,0.5109,0.5746,0.5541,0.5594,0.3132,0.3158,0.079
nb,Naive Bayes,0.5496,0.5021,0.5496,0.5635,0.5528,0.3152,0.3175,0.056


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

Best model selected: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6161,0.7723,0.6161,0.5922,0.5923,0.36,0.3649
1,0.6071,0.8163,0.6071,0.5851,0.5862,0.3473,0.3517
2,0.7232,0.8939,0.7232,0.6849,0.701,0.5558,0.5598
3,0.6696,0.8425,0.6696,0.6341,0.6484,0.4661,0.4706
4,0.6518,0.8502,0.6518,0.6314,0.6247,0.4254,0.4382
5,0.6607,0.8167,0.6607,0.6287,0.6346,0.4408,0.4466
6,0.6786,0.0,0.6786,0.6392,0.6499,0.471,0.4779
7,0.5982,0.0,0.5982,0.5632,0.5762,0.3426,0.3457
8,0.6429,0.0,0.6429,0.6151,0.6191,0.4116,0.4171
9,0.6847,0.8056,0.6847,0.6544,0.6534,0.4744,0.4847


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 50 candidates, totalling 500 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
Tuned model: ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='sqrt',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     monotonic_cst=None, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.6908,0.6,0.6908,0.6579,0.6681,0.4952,0.5003,0.313
rf,Random Forest Classifier,0.6756,0.5937,0.6756,0.6461,0.6531,0.4705,0.4761,0.428
lightgbm,Light Gradient Boosting Machine,0.6586,0.5803,0.6586,0.6323,0.642,0.4505,0.4535,2.048
xgboost,Extreme Gradient Boosting,0.6541,0.5795,0.6541,0.6282,0.6383,0.4442,0.4468,0.492
gbc,Gradient Boosting Classifier,0.6399,0.0,0.6399,0.6175,0.6243,0.4211,0.4247,2.336
dt,Decision Tree Classifier,0.5826,0.4805,0.5826,0.5834,0.5816,0.3505,0.3514,0.09
lr,Logistic Regression,0.5961,0.0,0.5961,0.5677,0.5722,0.3343,0.3389,0.074
lda,Linear Discriminant Analysis,0.5756,0.0,0.5756,0.557,0.5608,0.3183,0.321,0.056
knn,K Neighbors Classifier,0.5746,0.5109,0.5746,0.5541,0.5594,0.3132,0.3158,0.08
nb,Naive Bayes,0.5496,0.5021,0.5496,0.5635,0.5528,0.3152,0.3175,0.055


Processing:   0%|          | 0/67 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6964,0.8418,0.6964,0.6608,0.6755,0.5025,0.5054
1,0.6696,0.8551,0.6696,0.6568,0.6613,0.4674,0.4687
2,0.6875,0.8972,0.6875,0.6642,0.671,0.5091,0.5132
3,0.6607,0.8525,0.6607,0.6369,0.6447,0.4619,0.4664
4,0.7232,0.8748,0.7232,0.7,0.7038,0.5506,0.5594
5,0.6786,0.8491,0.6786,0.6526,0.66,0.4774,0.4811
6,0.6696,0.0,0.6696,0.639,0.6506,0.4663,0.4701
7,0.625,0.0,0.625,0.5987,0.608,0.3916,0.3946
8,0.6429,0.0,0.6429,0.6094,0.6213,0.417,0.4215
9,0.6216,0.8093,0.6216,0.5987,0.6014,0.3787,0.3854


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Blended model created.
Final model ready for deployment.


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0


      fixed acidity  volatile acidity  citric acid  residual sugar  chlorides  \
416            10.6              0.48         0.64             2.2      0.111   
749             7.3              0.51         0.18             2.1      0.070   
1321            5.0              0.74         0.00             1.2      0.041   
1064            8.2              0.74         0.09             2.0      0.067   
659             7.1              0.84         0.02             4.4      0.096   

      free sulfur dioxide  total sulfur dioxide  density    pH  sulphates  \
416                   6.0                  20.0  0.99700  3.26       0.66   
749                  12.0                  28.0  0.99768  3.52       0.73   
1321                 16.0                  46.0  0.99258  4.01       0.59   
1064                  5.0                  10.0  0.99418  3.28       0.57   
659                   5.0                  13.0  0.99700  3.41       0.57   

      alcohol  total_acidity  alcohol_to_density  

In [None]:
predictions = predict_model(blended_model, data=df_wine)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8993,0.9846,0.8993,0.9002,0.8987,0.8414,0.8418
