## 使用PyCaret建立红酒分类模型

In [1]:
import pandas as pd
import numpy as np

# Load the red-wine quality dataset; the CSV file is semicolon-delimited.
wine_df = pd.read_csv('winequality-red.csv', sep=';')

# Print a summary of the columns, their dtypes and non-null counts.
wine_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


### 将quality feature转为2分类（good or bad）

In [2]:
# Collapse the integer quality score into a binary label: scores of 6 and
# above become 'Good', everything else becomes 'Bad'.
# The dtype guard keeps this cell idempotent: once the column holds strings,
# re-running the `>= 6` comparison would raise a TypeError.
if pd.api.types.is_numeric_dtype(wine_df['quality']):
    wine_df['quality'] = np.where(wine_df['quality'] >= 6, 'Good', 'Bad')
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,Bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,Bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,Good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Bad


### 通过PyCaret建立建模管道并比较各模型

In [3]:
# Import only the PyCaret entry points this notebook actually calls instead of
# using a wildcard import, so the origin of every name is explicit.
from pycaret.classification import (
    compare_models,
    create_model,
    evaluate_model,
    finalize_model,
    predict_model,
    save_model,
    setup,
)

# Initialize the classification experiment on the wine data with 'quality' as
# the target; session_id is fixed so repeated runs are reproducible.
exp_clf101 = setup(data = wine_df, target = 'quality', session_id=123)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"Bad: 0, Good: 1"
3,Original Data,"(1599, 12)"
4,Missing Values,False
5,Numeric Features,11
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [4]:
# Compare the available classifiers on the current experiment and keep the
# model returned by compare_models() in `best`.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8222,0.8973,0.8384,0.8357,0.8364,0.6416,0.6429,0.074
et,Extra Trees Classifier,0.8159,0.9044,0.8319,0.8302,0.8306,0.629,0.6299,0.064
lightgbm,Light Gradient Boosting Machine,0.8132,0.8849,0.8204,0.8346,0.8266,0.6242,0.6257,0.02
gbc,Gradient Boosting Classifier,0.7855,0.8593,0.799,0.8071,0.8018,0.5682,0.5703,0.037
ridge,Ridge Classifier,0.7569,0.0,0.7497,0.791,0.7688,0.5131,0.5151,0.004
lr,Logistic Regression,0.7515,0.8177,0.748,0.7837,0.7648,0.5019,0.5033,0.679
lda,Linear Discriminant Analysis,0.7489,0.8173,0.7513,0.7779,0.7635,0.496,0.4974,0.005
dt,Decision Tree Classifier,0.7444,0.7411,0.7809,0.7568,0.7684,0.4835,0.4841,0.005
nb,Naive Bayes,0.7418,0.8043,0.7646,0.7615,0.7621,0.4798,0.4811,0.005
ada,Ada Boost Classifier,0.7363,0.8126,0.7645,0.7548,0.7578,0.4684,0.4711,0.027


#### 对数据进行归一化和变换后，进行第二次尝试

In [4]:
# Re-initialize the experiment on the same data, target and seed, this time
# enabling PyCaret's `normalize` and `transformation` preprocessing options.
exp_clf102 = setup(data = wine_df, target = 'quality', session_id=123,
                  normalize = True, 
                  transformation = True)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"Bad: 0, Good: 1"
3,Original Data,"(1599, 12)"
4,Missing Values,False
5,Numeric Features,11
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [5]:
# Re-run the model comparison under the new preprocessing setup; `best` is
# rebound to the result of this second comparison.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Extra Trees Classifier,0.8079,0.889,0.823,0.82,0.8206,0.6138,0.6155,0.1337
1,Extreme Gradient Boosting,0.79,0.8718,0.8112,0.8022,0.8054,0.5774,0.5797,0.1784
2,CatBoost Classifier,0.7882,0.8628,0.7929,0.8098,0.7998,0.5751,0.5774,2.1053
3,Light Gradient Boosting Machine,0.7793,0.8686,0.7962,0.7935,0.7939,0.5564,0.558,0.1084
4,Random Forest Classifier,0.7676,0.8451,0.7577,0.8011,0.7772,0.5347,0.538,0.0278
5,Gradient Boosting Classifier,0.7659,0.8493,0.7762,0.7881,0.78,0.5298,0.5334,0.1913
6,Quadratic Discriminant Analysis,0.7525,0.8081,0.7612,0.7724,0.766,0.5035,0.5047,0.0026
7,Logistic Regression,0.7418,0.8146,0.7661,0.756,0.7599,0.4805,0.4821,0.0136
8,Ridge Classifier,0.7391,0.0,0.7561,0.7575,0.7552,0.4758,0.4778,0.0046
9,Linear Discriminant Analysis,0.7391,0.8147,0.7561,0.7575,0.7552,0.4758,0.4778,0.004


### 选择最好的ExtraTree作为分类器，并创建实例

In [6]:
# Create an Extra Trees classifier (model id 'et') within the current experiment.
et_model = create_model('et')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8393,0.9224,0.85,0.85,0.85,0.6769,0.6769
1,0.8036,0.8816,0.8,0.8276,0.8136,0.6061,0.6065
2,0.6875,0.7675,0.7167,0.7049,0.7107,0.371,0.371
3,0.8214,0.8864,0.85,0.8226,0.8361,0.6401,0.6405
4,0.7589,0.8651,0.7333,0.8,0.7652,0.5185,0.5205
5,0.7857,0.8715,0.8,0.8,0.8,0.5692,0.5692
6,0.8571,0.9207,0.8667,0.8667,0.8667,0.7128,0.7128
7,0.8661,0.9447,0.8333,0.9091,0.8696,0.7325,0.7354
8,0.8393,0.9041,0.8644,0.8361,0.85,0.677,0.6775
9,0.8198,0.9258,0.9153,0.7826,0.8437,0.634,0.6449


#### 对该模型进行可视化评估

In [7]:
# Open PyCaret's interactive plot selector to inspect the fitted model visually.
evaluate_model(et_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

### 在验证集上验证，观察效果

In [8]:
# Score the model without passing `data`; presumably this evaluates on the
# hold-out split created by setup() — confirm against the PyCaret docs.
# The result adds `Label` and `Score` columns next to the true `quality`.
predict_model(et_model)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.8146,0.8983,0.8288,0.8256,0.8272,0.6272,0.6272


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,Label,Score
0,-0.005604,0.394596,0.136158,1.140747,-0.747088,0.742605,1.668267,0.302880,-0.576412,-0.792626,-1.086355,Good,Bad,0.17
1,-0.769331,0.600717,-0.999232,-0.883571,-0.126655,-0.796673,-0.707976,0.565035,1.088545,-0.904344,-1.749849,Bad,Bad,0.31
2,0.249192,1.056111,-0.293609,0.344431,-0.126655,-0.479653,-0.003577,0.329166,0.019022,0.145317,-1.086355,Bad,Bad,0.22
3,-0.522435,0.063170,0.336652,-0.184197,-0.482036,2.201986,0.964234,-1.371097,0.212476,1.130120,1.826208,Good,Good,0.91
4,-0.443413,2.926164,-1.502162,1.623971,0.754724,-1.325466,-1.326805,-0.067088,1.995006,-0.792626,0.494160,Bad,Bad,0.08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,0.678236,0.177116,0.287344,0.185945,1.145283,-0.332051,-0.003577,0.139514,-0.442392,-0.578801,0.658770,Bad,Bad,0.46
476,-0.443413,-0.968102,0.187128,0.612182,0.399926,1.089754,1.006003,-0.146889,0.529757,0.426514,0.735837,Bad,Good,0.83
477,1.888491,-0.692787,1.199605,-0.397363,-1.267137,-1.142205,-0.707976,1.395468,-1.814347,-0.190145,-1.245039,Good,Good,0.67
478,-1.495902,0.650741,0.287344,-1.158059,0.440161,0.183230,0.830077,0.091949,0.965523,1.003909,-1.245039,Bad,Bad,0.37


### 使用finalize_model在完整数据集上训练最终模型

In [9]:
# finalize_model returns a new estimator refit on the full dataset; it does not
# modify its argument in place. Rebind the name so that the subsequent
# save_model(et_model, ...) call persists the finalized model rather than the
# pre-finalization one.
et_model = finalize_model(et_model)
et_model

ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)

### 保存模型到本地pickle文件

In [10]:
# Persist the transformation pipeline together with the model bound to
# `et_model` under the name 'extra_tree_model', so it can be reloaded later
# with load_model('extra_tree_model').
save_model(et_model, model_name = 'extra_tree_model')

Transformation Pipeline and Model Succesfully Saved


## 测试是否能够上线

In [1]:
from pycaret.classification import load_model, predict_model
import streamlit as st
import pandas as pd
import numpy as np


2022-04-03 13:38:08.004 INFO    numexpr.utils: Note: NumExpr detected 12 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-04-03 13:38:08.005 INFO    numexpr.utils: NumExpr defaulting to 8 threads.


In [2]:
def predict_quality(model, df):
    """Return the predicted quality label for the first row of ``df``.

    Args:
        model: Trained pipeline/estimator, e.g. the object returned by
            ``load_model``.
        df: DataFrame with one row per sample, containing the feature
            columns the model was trained on.

    Returns:
        The value of the ``'Label'`` column for the first row of the frame
        returned by ``predict_model``.

    Raises:
        IndexError: If the prediction frame has no rows.
    """
    predictions_data = predict_model(estimator = model, data = df)

    # Select by position rather than by index label: ``['Label'][0]`` looks up
    # the label 0 and fails whenever the frame's index does not contain 0.
    return predictions_data['Label'].iloc[0]
    
# Load the previously saved transformation pipeline and model by name.
# NOTE(review): presumably this unpickles a file from disk — only load model
# files that come from a trusted source.
model = load_model('extra_tree_model')

Transformation Pipeline and Model Successfully Loaded


In [3]:
# A single hand-written sample used to smoke-test the loaded model.
# Keys must match the feature column names of the training data.
features = {
    'fixed acidity': 10,
    'volatile acidity': 1,
    'citric acid': 0.38,
    'residual sugar': 5.9,
    'chlorides': 0.638,
    'free sulfur dioxide': 51,
    'total sulfur dioxide': 144,
    'density': 0.99,
    'pH': 3,
    'sulphates': 0.5,
    'alcohol': 10.5,
}

# Wrap the dict in a list so it becomes a one-row DataFrame.
features_df = pd.DataFrame(data=[features])
features_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,10,1,0.38,5.9,0.638,51,144,0.99,3,0.5,10.5


In [4]:
# Run the loaded model on the single-row sample and keep its predicted label.
prediction = predict_quality(model, features_df)

In [5]:
# Show the predicted label as a plain string.
str(prediction)

'Good'

In [1]:
# Launch the Streamlit app from a shell command.
# NOTE(review): this appears to block the cell until interrupted (the recorded
# output is ^C) — consider running it from a terminal instead.
!streamlit run streamlit_app.py

^C
