In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import joblib

In [3]:
#loading the available dataset
df = pd.read_csv(".\data\diabetes.csv")

In [4]:
# checking the first five rows
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# To understand the number of rows and columns
df.shape

(768, 9)

In [6]:
# To understand the dataset more, there is need to see the statistical measures
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### Note:
These statistics are from a dataset with 768 observations, each representing an individual. The dataset appears to be related to diabetes, as indicated by the "Outcome" column, which likely indicates whether or not an individual has diabetes (1 for yes, 0 for no). Here's how to interpret the statistics for each column:

- **Pregnancies:** This column represents the number of pregnancies the individuals in the dataset have had. On average, the individuals have had approximately 3.85 pregnancies, with a minimum of 0 and a maximum of 17.

- **Glucose:** This column represents the glucose levels in the blood. The average glucose level is approximately 120.89 mg/dL. The values range from a minimum of 0 (which seems unrealistic and may indicate missing data) to a maximum of 199.

- **BloodPressure:** This column represents the blood pressure of the individuals (mm Hg). The average blood pressure is approximately 69.11 mm Hg. The values range from a minimum of 0 (which may indicate missing data or measurement errors) to a maximum of 122.

- **SkinThickness:** This column represents the thickness of the skinfold (mm). The average skin thickness is approximately 20.54 mm. The values range from a minimum of 0 (which could indicate missing or erroneous data) to a maximum of 99.

- **Insulin:** This column represents insulin levels (mu U/ml). The average insulin level is approximately 79.80 mu U/ml. The values range from 0 to a maximum of 846.

- **BMI (Body Mass Index):** This column represents the Body Mass Index, which is a measure of an individual's body fat based on their height and weight. The average BMI is approximately 31.99. The values range from a minimum of 0 (which may indicate missing or erroneous data) to a maximum of 67.10.

- **DiabetesPedigreeFunction:** This column represents a function that measures the genetic risk of diabetes. The average value of this function is approximately 0.47, with values ranging from 0.078 to 2.42.

- **Age:** This column represents the age of the individuals in years. The average age is approximately 33.24 years, with a minimum of 21 years and a maximum of 81 years.

- **Outcome:** This column likely indicates whether an individual has diabetes or not. The values are binary, with 1 typically indicating the presence of diabetes and 0 indicating the absence of diabetes.

In [9]:
# Separate features (X) and target variable (y)
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [10]:
# Split the dataset into 80% training and 20% test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Perform preprocessing: Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# Initialize classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

In [14]:
best_model = None
best_accuracy = 0

In [15]:
# Iterate through classifiers and select the best one
for name, classifier in classifiers.items():
    # Cross-validation to estimate model performance
    scores = cross_val_score(classifier, X_train, y_train, cv=5)
    avg_accuracy = np.mean(scores)
    print(f'{name} - Cross-Validation Accuracy: {avg_accuracy:.2f}')
    
    # Train the model on the full training set
    classifier.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = classifier.predict(X_test)
    
    # Calculate accuracy on the test set
    test_accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} - Test Accuracy: {test_accuracy:.2f}')
    
    # Check if this model has the highest accuracy
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        best_model = classifier

Logistic Regression - Cross-Validation Accuracy: 0.76
Logistic Regression - Test Accuracy: 0.75
Random Forest - Cross-Validation Accuracy: 0.77
Random Forest - Test Accuracy: 0.75
Support Vector Machine - Cross-Validation Accuracy: 0.77
Support Vector Machine - Test Accuracy: 0.73
K-Nearest Neighbors - Cross-Validation Accuracy: 0.74
K-Nearest Neighbors - Test Accuracy: 0.69
Gradient Boosting - Cross-Validation Accuracy: 0.77
Gradient Boosting - Test Accuracy: 0.75


In [17]:
# Save the best model to a file
if best_model:
    joblib.dump(best_model, './model/best_diabetes_model.pkl')
    print(f'Best model ({type(best_model).__name__}) saved to best_diabetes_model.pkl')
else:
    print('No best model found.')

Best model (LogisticRegression) saved to best_diabetes_model.pkl
