## 使用PyCaret建立红酒分类模型

In [1]:
import pandas as pd
import numpy as np

# Load the red-wine quality dataset from the working directory.
# The file uses ';' as its field separator, hence the explicit `sep`.
wine_df = pd.read_csv('winequality-red.csv', sep=';')

# Summarise columns, dtypes and non-null counts to check for missing values.
wine_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


### 将 quality 目标列转换为二分类标签（Good 或 Bad）

In [2]:
# Binarize the target column: quality >= 6 becomes 'Good', everything else 'Bad'.
# The numeric-dtype guard keeps this cell idempotent: after the first run the
# column holds strings, and re-running the comparison against the integer 6
# would no longer be meaningful (pandas rejects str >= int comparisons).
if pd.api.types.is_numeric_dtype(wine_df['quality']):
    wine_df['quality'] = np.where(wine_df['quality'] >= 6, 'Good', 'Bad')

# Preview the first rows to confirm the new labels.
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,Bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,Bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,Good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Bad


### 通过PyCaret建立管道任务并比较模型

In [3]:
# NOTE(review): the wildcard import hides where setup, compare_models,
# create_model, evaluate_model, predict_model and save_model come from;
# explicit imports in the top import cell would be clearer.
from pycaret.classification import *

# Initialise the first experiment on the raw data with `quality` as the target.
# session_id=123 is presumably the random seed that makes runs repeatable —
# confirm against the PyCaret documentation.
exp_clf101 = setup(data = wine_df, target = 'quality', session_id=123)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,quality
2,Target Type,Binary
3,Label Encoded,"Bad: 0, Good: 1"
4,Original Data,"(1599, 12)"
5,Missing Values,False
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [4]:
# Baseline comparison on the un-normalized experiment: the cell output is a
# table of candidate classifiers with their metrics, listed by descending Accuracy.
# `best` presumably holds the top-ranked model — confirm against the PyCaret docs.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8222,0.8973,0.8384,0.8357,0.8364,0.6416,0.6429,0.079
et,Extra Trees Classifier,0.8159,0.9044,0.8319,0.8302,0.8306,0.629,0.6299,0.064
lightgbm,Light Gradient Boosting Machine,0.8132,0.8849,0.8204,0.8346,0.8266,0.6242,0.6257,0.022
gbc,Gradient Boosting Classifier,0.7855,0.8593,0.799,0.8071,0.8018,0.5682,0.5703,0.036
ridge,Ridge Classifier,0.7569,0.0,0.7497,0.791,0.7688,0.5131,0.5151,0.004
lr,Logistic Regression,0.7515,0.8177,0.748,0.7837,0.7648,0.5019,0.5033,0.695
lda,Linear Discriminant Analysis,0.7489,0.8173,0.7513,0.7779,0.7635,0.496,0.4974,0.005
dt,Decision Tree Classifier,0.7444,0.7411,0.7809,0.7568,0.7684,0.4835,0.4841,0.005
nb,Naive Bayes,0.7418,0.8043,0.7646,0.7615,0.7621,0.4798,0.4811,0.005
ada,Ada Boost Classifier,0.7363,0.8126,0.7645,0.7548,0.7578,0.4684,0.4711,0.027


#### 对数据进行归一化和变换后进行第二次尝试

In [5]:
# Second experiment: same data, target and session_id as the first setup call,
# but with feature normalization and transformation switched on.
# NOTE(review): this presumably replaces the active experiment, so every
# following PyCaret call operates on the preprocessed data — confirm.
exp_clf102 = setup(data = wine_df, target = 'quality', session_id=123,
                  normalize = True, 
                  transformation = True)

Unnamed: 0,Description,Value
0,session_id,123
1,Target,quality
2,Target Type,Binary
3,Label Encoded,"Bad: 0, Good: 1"
4,Original Data,"(1599, 12)"
5,Missing Values,False
6,Numeric Features,11
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [6]:
# Re-run the model comparison on the normalized/transformed experiment.
# This rebinds `best`, so the result of the earlier comparison is no longer
# reachable under that name.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8231,0.9036,0.8402,0.836,0.8375,0.6433,0.6444,0.066
rf,Random Forest Classifier,0.8222,0.8976,0.8351,0.838,0.8359,0.6418,0.643,0.077
lightgbm,Light Gradient Boosting Machine,0.8141,0.8835,0.8237,0.8337,0.8275,0.626,0.6277,0.02
gbc,Gradient Boosting Classifier,0.7873,0.8597,0.7991,0.8095,0.803,0.572,0.5741,0.036
lr,Logistic Regression,0.7525,0.8201,0.7727,0.7719,0.7711,0.5015,0.5032,0.006
qda,Quadratic Discriminant Analysis,0.7507,0.8123,0.776,0.7679,0.7711,0.4972,0.4985,0.005
ridge,Ridge Classifier,0.7498,0.0,0.7595,0.775,0.7659,0.4972,0.499,0.005
lda,Linear Discriminant Analysis,0.7498,0.8214,0.7595,0.775,0.7659,0.4972,0.499,0.005
dt,Decision Tree Classifier,0.7444,0.7413,0.7793,0.7578,0.768,0.4837,0.4844,0.005
nb,Naive Bayes,0.7373,0.8128,0.7119,0.7854,0.7461,0.4753,0.4787,0.004


### 选择表现最好的Extra Trees分类器，并创建模型实例

In [7]:
# Build the Extra Trees classifier ('et' is its ID in the comparison table).
# The cell output lists the per-fold metrics for folds 0-9.
et_model = create_model('et')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8304,0.9082,0.7869,0.8889,0.8348,0.6618,0.667
1,0.8214,0.919,0.8361,0.8361,0.8361,0.64,0.64
2,0.8482,0.9357,0.8525,0.8667,0.8595,0.6945,0.6946
3,0.8393,0.9037,0.8689,0.8413,0.8548,0.6749,0.6754
4,0.8304,0.8939,0.8689,0.8281,0.848,0.6563,0.6573
5,0.8393,0.9248,0.8361,0.8644,0.85,0.677,0.6775
6,0.8125,0.9031,0.8525,0.8125,0.832,0.6202,0.6211
7,0.7768,0.872,0.8333,0.7692,0.8,0.5484,0.5506
8,0.8036,0.8817,0.8167,0.8167,0.8167,0.6051,0.6051
9,0.8288,0.8943,0.85,0.8361,0.843,0.6549,0.655


#### 对该模型进行可视化评估

In [8]:
# Open the interactive evaluation widget for the model; the plot type is
# chosen with the toggle buttons rendered in the cell output.
evaluate_model(et_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

### 在验证集上验证，观察效果

In [9]:
# Score the model without passing new data; presumably this uses the hold-out
# split created by setup() — confirm against the PyCaret documentation.
# The output shows the aggregate metrics plus a frame with the predicted
# `Label` and its `Score` appended to the preprocessed features.
predict_model(et_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.7833,0.8852,0.8347,0.7667,0.7992,0.5649,0.5673


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Label,Score
0,1.020581,-0.262296,0.879957,0.909533,-0.811347,-1.540063,-1.767548,-0.001569,0.179457,-0.026236,1.507610,Good,Good,0.89
1,0.803863,0.365869,0.069877,0.182416,0.169895,0.853209,1.990033,0.562903,-0.216541,-0.678425,-1.071247,Bad,Bad,0.94
2,-0.548122,1.883332,-0.893325,-0.877416,-0.243956,0.627023,0.590837,-0.012133,0.309834,-0.575997,-1.403918,Bad,Bad,0.81
3,0.107496,-1.862383,0.790677,-0.626785,-0.545447,-0.666586,-1.164982,-1.619910,-0.619242,-0.783883,1.558460,Good,Good,0.85
4,-0.094433,-0.848963,-0.143037,-0.877416,-0.079076,0.853209,0.377183,-0.235514,-0.150025,0.344380,-1.234925,Bad,Bad,0.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,-0.548122,-0.140170,0.322105,-0.184957,-1.025574,1.330907,0.533355,0.282230,-0.017590,0.205903,-0.205644,Good,Good,1.00
476,1.829882,-0.021413,1.097942,0.007672,0.466399,-1.351650,-1.357754,1.774646,-0.754683,-2.001296,-0.760154,Bad,Bad,1.00
477,-0.629846,-0.781016,0.222956,-0.877416,-0.301560,1.330907,0.934478,0.056549,0.761128,-0.287096,-0.912961,Good,Good,1.00
478,0.936322,-1.341851,0.654246,-1.148800,-1.025574,-1.735218,-1.697012,-1.047378,-1.095523,-0.476619,1.112168,Good,Good,0.89


### 将模型保存为本地pickle文件

In [10]:
# Persist the preprocessing pipeline together with the trained model under
# the name 'extra_tree_model' (the output confirms both were saved).
# NOTE(review): the artifact is a pickle — only load it from trusted sources.
save_model(et_model, model_name = 'extra_tree_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='quality',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_stra...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nod