# Criando modelos de classificação com o dataset diabetes do pacote pycaret

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.utils.class_weight import compute_class_weight


In [2]:
from pycaret.datasets import get_data
data = get_data('diabetes')

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
data['Class variable'].value_counts()


0    500
1    268
Name: Class variable, dtype: int64

In [4]:
from pycaret.classification import *
s = setup(data, target = 'Class variable', session_id = 123)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class variable
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(537, 9)"
6,Transformed test set shape,"(231, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [5]:
best = s.compare_models(sort='Recall')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.7427,0.7955,0.5702,0.6543,0.6043,0.4156,0.4215,0.01
lr,Logistic Regression,0.7689,0.8047,0.5602,0.7208,0.6279,0.4641,0.4736,0.78
gbc,Gradient Boosting Classifier,0.7373,0.7914,0.555,0.6445,0.5931,0.4013,0.4059,0.054
lda,Linear Discriminant Analysis,0.767,0.8055,0.555,0.7202,0.6243,0.4594,0.4695,0.01
ridge,Ridge Classifier,0.767,0.0,0.5497,0.7235,0.6221,0.4581,0.469,0.008
lightgbm,Light Gradient Boosting Machine,0.7133,0.7645,0.5398,0.6036,0.565,0.3534,0.358,0.093
rf,Random Forest Classifier,0.7485,0.7911,0.5284,0.6811,0.5924,0.415,0.4238,0.068
qda,Quadratic Discriminant Analysis,0.7282,0.7894,0.5281,0.6558,0.5736,0.3785,0.391,0.009
ada,Ada Boost Classifier,0.7372,0.7799,0.5275,0.6585,0.5796,0.3926,0.4017,0.041
dt,Decision Tree Classifier,0.6928,0.6512,0.5137,0.5636,0.5328,0.307,0.3098,0.01


# Aumentando o desbalanceamento de classe e recriando os modelos de classificação

In [6]:
class0 = data.query('`Class variable`==0')
class1 = data.query('`Class variable`==1').sample(frac=0.5)


In [7]:
data2 = pd.concat([class0, class1])

In [8]:
data2['Class variable'].value_counts()


0    500
1    134
Name: Class variable, dtype: int64

In [9]:
from pycaret.classification import *
s2 = setup(data2, target = 'Class variable', session_id = 124)

Unnamed: 0,Description,Value
0,Session id,124
1,Target,Class variable
2,Target type,Binary
3,Original data shape,"(634, 9)"
4,Transformed data shape,"(634, 9)"
5,Transformed train set shape,"(443, 9)"
6,Transformed test set shape,"(191, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [15]:
best2 = s2.compare_models(sort='Recall')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
nb,Naive Bayes,0.8149,0.8049,0.4967,0.6049,0.5335,0.4204,0.4308,0.009
qda,Quadratic Discriminant Analysis,0.8151,0.799,0.4878,0.6125,0.5297,0.4172,0.4293,0.008
lda,Linear Discriminant Analysis,0.8353,0.8282,0.4444,0.696,0.5293,0.4364,0.4594,0.008
dt,Decision Tree Classifier,0.7316,0.6226,0.4344,0.3946,0.4025,0.2339,0.2399,0.009
lr,Logistic Regression,0.8331,0.8312,0.4244,0.7111,0.5161,0.4236,0.452,0.019
lightgbm,Light Gradient Boosting Machine,0.8038,0.7886,0.4167,0.5586,0.4635,0.3501,0.3618,0.069
rf,Random Forest Classifier,0.842,0.7931,0.4133,0.7458,0.5219,0.4376,0.4694,0.062
gbc,Gradient Boosting Classifier,0.8197,0.7953,0.4067,0.6245,0.4867,0.3847,0.4001,0.046
ada,Ada Boost Classifier,0.8018,0.7585,0.3978,0.5417,0.4487,0.3354,0.3454,0.035
ridge,Ridge Classifier,0.8285,0.0,0.37,0.7088,0.4721,0.3831,0.4182,0.007


# Recriando modelos com o parâmetro fix_imbalance=True

In [11]:
from pycaret.classification import *
s4 = setup(data2, target = 'Class variable', session_id = 125, fix_imbalance=True)

Unnamed: 0,Description,Value
0,Session id,125
1,Target,Class variable
2,Target type,Binary
3,Original data shape,"(634, 9)"
4,Transformed data shape,"(889, 9)"
5,Transformed train set shape,"(698, 9)"
6,Transformed test set shape,"(191, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [12]:
best4 = s4.compare_models(sort='Recall')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7453,0.814,0.7056,0.4521,0.5461,0.3846,0.4054,0.018
ridge,Ridge Classifier,0.7475,0.0,0.7056,0.455,0.5486,0.3886,0.4085,0.012
lda,Linear Discriminant Analysis,0.7475,0.8177,0.7056,0.455,0.5486,0.3886,0.4085,0.011
nb,Naive Bayes,0.7541,0.7802,0.6922,0.4553,0.5434,0.3871,0.4078,0.013
qda,Quadratic Discriminant Analysis,0.7564,0.7598,0.6189,0.4603,0.521,0.3654,0.3769,0.012
ada,Ada Boost Classifier,0.754,0.7662,0.6178,0.453,0.5141,0.3576,0.3713,0.044
knn,K Neighbors Classifier,0.6484,0.6773,0.5667,0.3256,0.405,0.1858,0.2043,0.015
gbc,Gradient Boosting Classifier,0.7584,0.7639,0.5544,0.4504,0.4907,0.3366,0.3438,0.069
rf,Random Forest Classifier,0.7701,0.7924,0.5378,0.4709,0.491,0.3466,0.3543,0.075
svm,SVM - Linear Kernel,0.6361,0.0,0.5189,0.2766,0.3459,0.1405,0.1657,0.01


Processing:   0%|          | 0/61 [00:00<?, ?it/s]

In [13]:
best4

# Criando um modelo LogisticRegression com o sklearn

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

X = data2.drop('Class variable', axis=1)
y = data2['Class variable']

pesos = compute_class_weight(class_weight="balanced", classes=(yu:= np.unique(y)), y=y)
classe_peso = dict(zip(yu, list(pesos)))
classe_peso

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression(class_weight=classe_peso)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print("Acurácia:", accuracy)
print("Recall:", recall)
print("AUC:", auc)

Acurácia: 0.7952755905511811
Recall: 0.7037037037037037
AUC: 0.7618518518518519
