# Deployment und Abschluss - pycaret

## Install

ref: https://pycaret.gitbook.io/docs/get-started/installation
* You can install PyCaret with Python's pip package manager: `pip install pycaret`

## Load and prep Data

In [4]:
## load data
import numpy as np
import pandas as pd
from os import chdir

# Make the data folder the working directory so the CSV is addressed by its bare file name.
datapath = '../3_data'
chdir(datapath)

dataset = pd.read_csv('bank_data_prep.csv')
print(dataset.shape)

(9860, 30)


In [5]:
## remove duration
# `dataset` is rebound in place, so on a second execution of this cell the column is
# already gone; errors="ignore" keeps the cell re-runnable instead of raising KeyError.
dataset = dataset.drop(columns="duration", errors="ignore")

In [6]:
## train - test - split
from sklearn.model_selection import train_test_split

# Keep 90 % of the rows for modelling and set the rest aside as unseen data;
# the fixed random_state makes the split reproducible.
data, data_unseen = train_test_split(
    dataset,
    train_size=0.9,
    random_state=1234,
)

print(f'Data for Modeling: {data.shape}')
print(f'Unseen Data For Predictions: {data_unseen.shape}')

Data for Modeling: (8874, 29)
Unseen Data For Predictions: (986, 29)


## Run a Classification Experiment

### Init setup

In [9]:
from pycaret.classification import *

# Initialise the classification experiment on the modelling split.
s = setup(
    data=data,
    target='y',
    session_id=1234,  # random seed
    fold=5,           # number of folds (default = 10)
)

Unnamed: 0,Description,Value
0,Session id,1234
1,Target,y
2,Target type,Binary
3,Target mapping,"no: 0, yes: 1"
4,Original data shape,"(8874, 29)"
5,Transformed data shape,"(8874, 29)"
6,Transformed train set shape,"(6211, 29)"
7,Transformed test set shape,"(2663, 29)"
8,Numeric features,14
9,Preprocess,True


### Show available Models (for Classification)

In [11]:
# List the estimators offered by the classification module (ID, name, class reference, Turbo flag).
models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


### Model Training and Selection

In [13]:
# Run PyCaret's model comparison and keep the estimator it returns;
# the leaderboard below shows one row of metrics per candidate model.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7617,0.802,0.7617,0.773,0.7567,0.5144,0.529,0.464
catboost,CatBoost Classifier,0.7582,0.7996,0.7582,0.7681,0.7534,0.5074,0.5207,3.48
lightgbm,Light Gradient Boosting Machine,0.7543,0.7954,0.7543,0.7632,0.7498,0.4998,0.512,0.32
ada,Ada Boost Classifier,0.7492,0.7957,0.7492,0.7616,0.7432,0.4882,0.5044,0.218
rf,Random Forest Classifier,0.7421,0.7902,0.7421,0.7451,0.7394,0.4771,0.4826,0.416
lr,Logistic Regression,0.7393,0.7852,0.7393,0.7435,0.7361,0.4709,0.4778,2.806
ridge,Ridge Classifier,0.7326,0.7851,0.7326,0.7353,0.7298,0.4578,0.463,0.074
lda,Linear Discriminant Analysis,0.7324,0.7851,0.7324,0.7351,0.7296,0.4575,0.4627,0.078
et,Extra Trees Classifier,0.7224,0.7732,0.7224,0.7232,0.7206,0.4387,0.4414,0.414
qda,Quadratic Discriminant Analysis,0.7221,0.7713,0.7221,0.7379,0.7132,0.4313,0.452,0.09


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

MCC = Matthews correlation coefficient, see: https://en.wikipedia.org/wiki/Phi_coefficient

In [15]:
# Print the selected estimator together with its hyperparameters.
print(best_model)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=100, n_iter_no_change=None,
                           random_state=1234, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)


### Analyze best Model

In [17]:
# Open the interactive evaluation widget (plot-type toggle buttons) for the selected model.
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

### Predict on unseen Data

In [19]:
# Score the rows that were split off before `setup` was called on the modelling data.
predictions = predict_model(best_model, data=data_unseen)
# End the cell with the frame itself rather than print(): the notebook then renders it
# as a table instead of plain text wrapped across several column blocks.
predictions.head()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.7495,0.799,0.7495,0.7626,0.7457,0.4972,0.5109


       age  education  housing  contact_cellular  month  day_of_week  \
2809  36.0          6        0                 1      5            2   
4052  34.0          6        1                 1      8            3   
658   36.0          6        1                 0      5            2   
786   40.0          5        0                 0      5            3   
6675  36.0          6        0                 1     11            3   

      campaign  pdays  previous  emp_var_rate  ...  job_student  \
2809  0.477121      0         0          -1.8  ...        False   
4052  0.698970      0         0           1.4  ...        False   
658   0.903090      0         0          -1.8  ...        False   
786   0.698970      0         0           1.1  ...        False   
6675  0.477121      0         0          -0.1  ...        False   

      job_technician  job_unemployed  marital_married  marital_single  \
2809           False           False            False            True   
4052            Tr

### Save best Model Pipeline

## Tune a specific Model
ref: https://pycaret.gitbook.io/docs/get-started/functions/optimize

### rf:  Random Forest Classifier

In [23]:
## create model
# Build a random forest ('rf') within the current experiment; the table below
# shows its score per fold plus mean and standard deviation.
model_rf = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7409,0.7939,0.7409,0.7447,0.7379,0.4744,0.4808
1,0.7536,0.7961,0.7536,0.7583,0.7505,0.4998,0.5072
2,0.7351,0.771,0.7351,0.7392,0.7317,0.4621,0.4691
3,0.7311,0.7967,0.7311,0.7329,0.7287,0.4553,0.4594
4,0.7496,0.7931,0.7496,0.7506,0.7481,0.494,0.4966
Mean,0.7421,0.7902,0.7421,0.7451,0.7394,0.4771,0.4826
Std,0.0085,0.0097,0.0085,0.0088,0.0087,0.0174,0.0175


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

In [24]:
## tune model
# Search for better hyperparameters for the random forest; the log below reports
# 10 candidates evaluated on 5 folds each (50 fits in total).
model_rf_tuned = tune_model(model_rf)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7562,0.8053,0.7562,0.7654,0.7517,0.5037,0.5161
1,0.7689,0.8134,0.7689,0.7825,0.7635,0.5285,0.5456
2,0.7544,0.7897,0.7544,0.7672,0.7485,0.4988,0.5154
3,0.7617,0.8021,0.7617,0.7709,0.7572,0.5147,0.5272
4,0.7689,0.8071,0.7689,0.7759,0.7655,0.5305,0.5402
Mean,0.762,0.8035,0.762,0.7724,0.7573,0.5153,0.5289
Std,0.0061,0.0079,0.0061,0.0062,0.0066,0.0127,0.0123


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [25]:
## parameters of default model
# Print the untuned random forest so its hyperparameters can be compared with the tuned one.
print(model_rf)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       monotonic_cst=None, n_estimators=100, n_jobs=-1,
                       oob_score=False, random_state=1234, verbose=0,
                       warm_start=False)


In [26]:
## parameters of tuned model
# Print the tuned random forest to see which hyperparameters the search changed.
print(model_rf_tuned)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=10, max_features='sqrt', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0,
                       min_samples_leaf=5, min_samples_split=7,
                       min_weight_fraction_leaf=0.0, monotonic_cst=None,
                       n_estimators=160, n_jobs=-1, oob_score=False,
                       random_state=1234, verbose=0, warm_start=False)


Comparison of the default and the tuned parameters (tuning changed `class_weight`, `max_depth`, `min_samples_leaf`, `min_samples_split` and `n_estimators`):

| parameter | default | tuned |
| :--- | :--- | :--- |
| bootstrap | True | True |
| ccp_alpha | 0 | 0 |
| class_weight | None | 'balanced_subsample' |
| criterion | 'gini' | 'gini' |
| max_depth | None | 10 |
| max_features | 'sqrt' | 'sqrt' |
| max_leaf_nodes | None | None |
| max_samples | None | None |
| min_impurity_decrease | 0 | 0 |
| min_samples_leaf | 1 | 5 |
| min_samples_split | 2 | 7 |
| min_weight_fraction_leaf | 0 | 0 |
| monotonic_cst | None | None |
| n_estimators | 100 | 160 |
| n_jobs | -1 | -1 |
| oob_score | False | False |
| random_state | 1234 | 1234 |
| verbose | 0 | 0 |
| warm_start | False | False |