### Data Collection and Analysis
Dataset collected from kagle
#### PIMA Diabetes Dataset

In [1]:
import pandas as pd
# loading the diabetes dataset to a pandas DataFrame
diabetes_dataset = pd.read_csv('diabetes.csv') 

In [2]:
# printing the first 5 rows of the dataset
diabetes_dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# number of rows and Columns in this dataset
diabetes_dataset.shape

(768, 9)

In [4]:
# getting the statistical measures of the data
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### Scaling

In [5]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScaler object
scaler = StandardScaler()

# Define the features to be scaled
features_to_scale = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']

# Fit the scaler to the data and transform the features
scaled_features = scaler.fit_transform(diabetes_dataset[features_to_scale])

# Update the original DataFrame with the scaled features
diabetes_dataset[features_to_scale] = scaled_features

print(diabetes_dataset.head())

   Pregnancies   Glucose  BloodPressure  SkinThickness   Insulin       BMI  \
0     0.639947  0.848324       0.149641       0.907270 -0.692891  0.204013   
1    -0.844885 -1.123396      -0.160546       0.530902 -0.692891 -0.684422   
2     1.233880  1.943724      -0.263941      -1.288212 -0.692891 -1.103255   
3    -0.844885 -0.998208      -0.160546       0.154533  0.123302 -0.494043   
4    -1.141852  0.504055      -1.504687       0.907270  0.765836  1.409746   

   DiabetesPedigreeFunction       Age  Outcome  
0                  0.468492  1.425995        1  
1                 -0.365061 -0.190672        0  
2                  0.604397 -0.105584        1  
3                 -0.920763 -1.041549        0  
4                  5.484909 -0.020496        1  


In [6]:
diabetes_dataset['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

0 --> Non-Diabetic

1 --> Diabetic

In [7]:
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,-0.162456,-0.341594,-0.047638,-0.054728,-0.095577,-0.214288,-0.127275,-0.174505
1,0.30309,0.637302,0.088877,0.102104,0.178315,0.39979,0.237453,0.325569


In [8]:
# separating the data and labels
X = diabetes_dataset.drop(columns = 'Outcome', axis=1)
Y = diabetes_dataset['Outcome']

### Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(X ,Y, test_size = 0.2, stratify=Y, random_state=2)

In [11]:
print(X.shape, X_train.shape, X_test.shape)

(768, 8) (614, 8) (154, 8)


### Evaluation

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluation(model):
    training_prediction = model.predict(X_train)
    test_prediction = model.predict(X_test)
    
    print("Accuracy Score of the model: ")
    
    training_data_accuracy = accuracy_score(Y_train, training_prediction)
    print("Accuracy score on training data: ", training_data_accuracy)

    test_data_accuracy = accuracy_score(Y_test, test_prediction)
    print("Accuracy score on test data: ", test_data_accuracy)
    
    print()
    print("Precision of the model: ")

    training_data_precision = precision_score(Y_train, training_prediction, average='binary')
    print("Precision score on training data: ", training_data_precision)

    test_data_precision = precision_score(Y_test, test_prediction, average='binary')
    print ("Precision score on test data: ", test_data_precision)
    print()
    print("Recall of the model: ")

    training_data_recall = recall_score(Y_train, training_prediction, average='binary')
    print("Recall score on training data: ", training_data_recall)

    test_data_recall = recall_score(Y_test, test_prediction, average='binary')
    print ("Recall score on test data: ", test_data_recall)
    print()
    print("f1 score of the model: ")

    training_data_f1 = f1_score(Y_train, training_prediction, average='binary')
    print("f1 score on training data: ", training_data_f1)

    test_data_f1 = f1_score(Y_test, test_prediction, average='binary')
    print ("f1 score on test data: ", test_data_f1)

In [13]:
from pycaret.classification import compare_models, setup

This function initializes the training environment and creates the transformation pipeline. Setup function must be called before executing any other function. It takes two mandatory parameters: data and target.

In [14]:
clf1 = setup(data = diabetes_dataset, 
             target = 'Outcome')

Unnamed: 0,Description,Value
0,Session id,3202
1,Target,Outcome
2,Target type,Binary
3,Original data shape,"(768, 9)"
4,Transformed data shape,"(768, 9)"
5,Transformed train set shape,"(537, 9)"
6,Transformed test set shape,"(231, 9)"
7,Numeric features,8
8,Preprocess,True
9,Imputation type,simple


In [15]:
top5 = compare_models(sort='AUC',
                      n_select=5,
                      exclude=['lightgbm', 'xgboost', 'dummy', 'svm', 'ridge', 'knn', 'dt', 'nb', 'qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.7636,0.8229,0.5573,0.7042,0.6173,0.4514,0.4605,0.229
lda,Linear Discriminant Analysis,0.7599,0.821,0.5515,0.6964,0.6116,0.4428,0.451,0.009
et,Extra Trees Classifier,0.7597,0.8113,0.5775,0.6837,0.6235,0.4497,0.4548,0.022
rf,Random Forest Classifier,0.7616,0.8073,0.5877,0.6947,0.6331,0.4582,0.4646,0.027
gbc,Gradient Boosting Classifier,0.7391,0.8012,0.581,0.6446,0.6063,0.4126,0.4175,0.021
ada,Ada Boost Classifier,0.7506,0.7953,0.5939,0.6685,0.6232,0.4387,0.4448,0.012


Processing:   0%|          | 0/33 [00:00<?, ?it/s]

### Soft Voting Ensemble

A soft voting ensemble is a type of ensemble learning method used in machine learning for classification tasks where multiple base models are trained on the same dataset, and they each produce probability estimates for all possible classes.

To make predictions with a soft voting ensemble, the predicted probabilities from each base model are averaged (or weighted averaged) for each class across all base models. The class with the highest average probability is then chosen as the final prediction.

In [16]:
from pycaret.classification import blend_models

In [17]:
blend_soft = blend_models(estimator_list = top5, optimize = 'AUC',method = 'soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8704,0.9038,0.6842,0.9286,0.7879,0.6976,0.7145
1,0.7037,0.788,0.4737,0.6,0.5294,0.3175,0.3223
2,0.7593,0.8376,0.6842,0.65,0.6667,0.4785,0.4788
3,0.6852,0.7519,0.4211,0.5714,0.4848,0.2656,0.272
4,0.7222,0.785,0.5789,0.6111,0.5946,0.3836,0.3839
5,0.8148,0.8632,0.6316,0.8,0.7059,0.5735,0.582
6,0.8148,0.9008,0.6842,0.7647,0.7222,0.584,0.586
7,0.8302,0.7921,0.6111,0.8462,0.7097,0.594,0.6098
8,0.7925,0.7921,0.6111,0.7333,0.6667,0.5178,0.5223
9,0.7547,0.8032,0.5556,0.6667,0.6061,0.4301,0.4339


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [18]:
print("Evaluation scores of Soft Voting: ")
print()
evaluation(blend_soft)

Evaluation scores of Soft Voting: 

Accuracy Score of the model: 
Accuracy score on training data:  0.9153094462540716
Accuracy score on test data:  0.9025974025974026

Precision of the model: 
Precision score on training data:  0.9090909090909091
Precision score on test data:  0.9333333333333333

Recall of the model: 
Recall score on training data:  0.8411214953271028
Recall score on test data:  0.7777777777777778

f1 score of the model: 
f1 score on training data:  0.8737864077669902
f1 score on test data:  0.8484848484848485


### Saving the trained model

In [19]:
import pickle

In [21]:
filename = 'diabetes_model.sav'
pickle.dump(blend_soft, open(filename, 'wb'))