In [387]:
#Loading the libraries
import pandas as pd

In [389]:
#Loading the data
diabetes_pre= pd.read_csv(r"C:\Users\PC\Desktop\learn\dataset\diabetes.csv") 

In [391]:
#Dropping the fullname column 
diabetes_pre= diabetes_pre.drop(['fullname'],axis = 1)

In [393]:
#Assigning index to the data
diabetes_pre['Id'] = [f'{i:04}' for i in range(1, len(diabetes_pre) + 1)]

In [395]:
diabetes_pre.rename(columns={'age':'Age'},inplace = True)

In [403]:
#Looking at the data
diabetes_pre.head()

Unnamed: 0,Age,Gender,Sugar_Level,Weight,Height,Diabetes,Id
0,20,Male,186,56.61,195.29,No,1
1,8,Female,131,95.03,188.5,No,2
2,33,Male,92,84.56,177.83,No,3
3,25,Male,196,84.16,162.78,No,4
4,40,Male,198,91.13,165.1,Yes,5


The name column is removed to maintain anonymity for ethical, legal and privacy concerns followed by assigning Index to uniquely identify each record and enable efficient data access. The age column was changed to Age to ensure clear and consistent format accros the various columns.

# Data Exploration and Pre-processing

Duplicate Check

In [181]:
#Checking for duplicates in the data
duplicates_pre= diabetes_pre[diabetes_pre.duplicated()]

In [183]:
duplicates_pre

Unnamed: 0,Age,Gender,Sugar_Level,Weight,Height,Diabetes,Id


Overview of the data

In [186]:
# Overview of the data( checking for null, and the datatpes) 
diabetes_pre.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625 entries, 0 to 624
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          625 non-null    int64  
 1   Gender       625 non-null    object 
 2   Sugar_Level  625 non-null    int64  
 3   Weight       625 non-null    float64
 4   Height       625 non-null    float64
 5   Diabetes     625 non-null    object 
 6   Id           625 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 34.3+ KB


Descriptive statistics

In [189]:
#Descriptive statistics on the numerical features
diabetes_pre.describe()

Unnamed: 0,Age,Sugar_Level,Weight,Height
count,625.0,625.0,625.0,625.0
mean,21.1456,134.2128,75.374551,174.338701
std,11.963871,38.067179,14.461423,14.388471
min,1.0,70.0,50.011876,150.016625
25%,10.0,102.0,62.711229,161.44473
50%,22.0,134.0,76.154236,174.081742
75%,32.0,168.0,87.621388,187.085603
max,40.0,199.0,99.967675,199.853908


Looking for Outlier

In [194]:
# Columns to check
cols_to_check = ['Age', 'Sugar_Level', 'Weight', 'Height']

for col in cols_to_check:
    Q1 = diabetes_pre[col].quantile(0.25)
    Q3 = diabetes_pre[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = diabetes_pre[(diabetes_pre[col] < lower_bound) | (diabetes_pre[col] > upper_bound)]
    
    print(f"\nColumn: {col}")
    print(f"Total outliers: {len(outliers)}")
    print(outliers[[col]])


Column: Age
Total outliers: 0
Empty DataFrame
Columns: [Age]
Index: []

Column: Sugar_Level
Total outliers: 0
Empty DataFrame
Columns: [Sugar_Level]
Index: []

Column: Weight
Total outliers: 0
Empty DataFrame
Columns: [Weight]
Index: []

Column: Height
Total outliers: 0
Empty DataFrame
Columns: [Height]
Index: []


#everything looks great for the numerical features and would need to check for the object features

In [196]:
object_cols= ['Gender','Diabetes']

#finding unique values for the object columns
unique_values = {}
for col in object_cols :
    unique_values[col] = diabetes_pre[col].unique()

# printing the values for object_cols
for col, values in unique_values.items() :
    print(f"Unique values in '{col}' : ")
    print(values)
    print()


Unique values in 'Gender' : 
['Male' 'Female']

Unique values in 'Diabetes' : 
['No' 'Yes']



The object columns contains the correct unique records

Duplicate check was conducted with no duplicate in the data, for the overview of the data, each column was assigned the right data type with zero non-null count for the columns.
For the descriptive statistics on the numerical features;
 - The dataset includes 625 records with complete data for age, Sugar_Level, Weight, and Height.
 - Age ranges from 1 to 40 years, with a mean of 21.15 years, indicating a young population that includes both children and adults.
 - Sugar levels vary widely (70 to 199 mg/dL), with a mean of 134.21 mg/dL, suggesting that some individuals may be at risk of diabetes.
 - Weight ranges from 50 to 99.97 kg, averaging 75.37 kg, showing moderate variability across individuals.
 -Height spans from 150 to 199.85 cm, with a mean of 174.34 cm, indicating a diverse height range, possibly across different age and gender groups.

Using the Outlier detection check for the numerical columns, All age,Sugar_Level, Weight and Height entries fall within the expected range of variation.
For the objects columns contains the correct unique records of Male and Female for Gender and No and Yes for the Diabetes target variable.

# Feature Engeneering

Feature Creation

In [220]:

# Height is in cm and Weight in kg
diabetes_pre['BMI'] = diabetes_pre['Weight'] / ((diabetes_pre['Height'] / 100) ** 2)


In [222]:
def classify_bmi(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 25:
        return 'Normal'
    elif 25 <= bmi < 30:
        return 'Overweight'
    else:
        return 'Obese'

diabetes_pre['BMI_Category'] = diabetes_pre['BMI'].apply(classify_bmi)


Using feature engineering involves adding new features to a model's feature vector that are calculated from the existing features. These new properties could be mathematical transformations of pre-existing traits, such as ratios or differences (Heaton, 2016). The Added feature is BMI(Body Mass Index) which estimates body fat based on a person's weight and height. Calculated Using the formula: BMI= Weight(kg)/ [Height(m)]2. BMI greater or equal to 25 is associated with overweight or obesity which is a major risk factor for type 2 diabetes. 

Feature Selection

All the features available seems to be relevant for predicting diabetes and with small number of features,the model is not at much risk to overfitting. Therefore feature selection technique such as filter method(Correlation Analysis,F-Test and Chi-Square test), Wrapper method and embedded feature selection method will not be neccessary.

FEATURE ENCODING OF CATEGORICAL VARIABLES

In [267]:
#Making a copy of diabetes_pre as diabetes_eng 
diabetes_eng = diabetes_pre.copy()

In [271]:
#Encoding Categorical variables

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encode Gender and Diabetes Column
diabetes_eng['Gender'] = diabetes_eng['Gender'].map({'Male': 0, 'Female': 1}) 
diabetes_eng['Diabetes'] = diabetes_eng['Diabetes'].map({'No': 0, 'Yes': 1})

# Encode BMI Category
le = LabelEncoder()
diabetes_eng['BMI_Category'] = le.fit_transform(diabetes_eng['BMI_Category'])  

Encoding the Categorical values makes the data numerically meaningful and machine readable for the algorithms to process it effectively. 

FEATURE SCALING

In [274]:
from sklearn.preprocessing import StandardScaler

# List of columns to scale
columns_to_scale = ['Age', 'Sugar_Level', 'Weight', 'Height', 'BMI', 'Gender' , 'BMI_Category']

# Initialize the scaler
scaler = StandardScaler()

# Apply StandardScaler
df_scaled_values = scaler.fit_transform(diabetes_eng[columns_to_scale])

# Convert back to DataFrame for easy use
diabetes_scaled = pd.DataFrame(df_scaled_values, columns=columns_to_scale)


# View scaled data
print(diabetes_scaled.head())


        Age  Sugar_Level    Weight    Height       BMI    Gender  BMI_Category
0 -0.095832     1.361506 -1.298335  1.457600 -1.641728 -1.044175      1.671828
1 -1.099655    -0.084466  1.360157  0.984808  0.226512  0.957694      0.747144
2  0.991643    -1.109791  0.635753  0.242969  0.225614 -1.044175      0.747144
3  0.322428     1.624410  0.607940 -0.803818  1.013811 -1.044175     -0.177539
4  1.577207     1.676990  1.090298 -0.642394  1.276015 -1.044175     -0.177539


Scaling is important in machine learning because it brings all numerical features to the same scale, which improves model performance and training efficiency. Features like Height and Sugar_Level with higher magnitude will dominate the model due to their larger numeric ranges, even if they're not more important. For the tree based models scalling wont be neccesarry, despite that scalling doesnt hurt the model.

# Model Selection and training

LAZY PREDICT

In [None]:
!pip install lazypredict

In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
# Features and target
X = diabetes_scaled[['Age', 'Gender', 'Sugar_Level', 'Weight', 'Height', 'BMI', 'BMI_Category']]
y = diabetes_eng['Diabetes']  # 0 or 1

# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, random_state=42)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)


In [325]:
print(models)

                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
SGDClassifier                      0.55               0.54     0.54      0.54   
QuadraticDiscriminantAnalysis      0.54               0.54     0.54      0.54   
ExtraTreeClassifier                0.53               0.52     0.52      0.53   
DecisionTreeClassifier             0.51               0.51     0.51      0.51   
AdaBoostClassifier                 0.52               0.51     0.51      0.52   
RidgeClassifier                    0.53               0.51     0.51      0.50   
LogisticRegression                 0.53               0.51     0.51      0.50   
LinearDiscriminantAnalysis         0.52               0.51     0.51      0.50   
RidgeClassifierCV                  0.52               0.50     0.50      0.49   
DummyClassifier                    0.54               0.50     0.50      0.37   
LinearSVC                   

Upon using the lazy predict, which gives the performance summary of the various models, the models I will be using for the analysis are Support Vector Classifier, Decision Tree and the K-Nearest Neighbor.   

APPLYING SVC

In [343]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Define features and target
X = diabetes_scaled[['Age', 'Gender', 'Sugar_Level', 'Weight', 'Height', 'BMI', 'BMI_Category']]
y = diabetes_eng['Diabetes']  

# Train-Test Split 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize and train the SVM model
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train, y_train)
 

DECISION TREE

In [347]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Define features and target
X = diabetes_scaled[['Age', 'Gender', 'Sugar_Level', 'Weight', 'Height', 'BMI', 'BMI_Category']]
y = diabetes_eng['Diabetes']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced')  
dt_model.fit(X_train, y_train)


In [None]:
K NEAREST NEIGHBOR 

In [351]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Define features and target
X = diabetes_scaled[['Age', 'Gender', 'Sugar_Level', 'Weight', 'Height', 'BMI', 'BMI_Category']]
y = diabetes_eng['Diabetes']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize and train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)


# MODEL EVALUATION

SVC

In [357]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on test data
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("Accuracy Score:", accuracy_score(y_test, y_pred_svm))

Confusion Matrix:
 [[16 41]
 [17 51]]

Classification Report:
               precision    recall  f1-score   support

           0       0.48      0.28      0.36        57
           1       0.55      0.75      0.64        68

    accuracy                           0.54       125
   macro avg       0.52      0.52      0.50       125
weighted avg       0.52      0.54      0.51       125

Accuracy Score: 0.536


DECISION TREE

In [359]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on test data
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))
print("Accuracy Score:", accuracy_score(y_test, y_pred_dt))


Confusion Matrix:
 [[19 38]
 [26 42]]

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.33      0.37        57
           1       0.53      0.62      0.57        68

    accuracy                           0.49       125
   macro avg       0.47      0.48      0.47       125
weighted avg       0.48      0.49      0.48       125

Accuracy Score: 0.488


KNN

In [363]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on test data
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_knn))
print("\nClassification Report:\n", classification_report(y_test, y_pred_knn))
print("Accuracy Score:", accuracy_score(y_test, y_pred_knn))


Confusion Matrix:
 [[27 30]
 [30 38]]

Classification Report:
               precision    recall  f1-score   support

           0       0.47      0.47      0.47        57
           1       0.56      0.56      0.56        68

    accuracy                           0.52       125
   macro avg       0.52      0.52      0.52       125
weighted avg       0.52      0.52      0.52       125

Accuracy Score: 0.52


From the models, SVC does better with Accuracy Score of 0.536 and having a recall of 0.75 for predicting the positive diabetes with precision of 0.55

HYPERPARAMETER TUNING 

RANDOM SEARCH

In [373]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from scipy.stats import uniform

# Define features and target
X = diabetes_scaled[['Age', 'Gender', 'Sugar_Level', 'Weight', 'Height', 'BMI', 'BMI_Category']]
y = diabetes_eng['Diabetes']  

# Train-Test Split 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Define hyperparameter distribution
param_dist = {
    'C': uniform(0.1, 10),  # C from 0.1 to 10
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']  # only relevant for non-linear kernels
}

# Initialize base SVC model
svc = SVC(random_state=42)

# Randomized Search
random_search = RandomizedSearchCV(
    svc, param_distributions=param_dist,
    n_iter=20, cv=5, random_state=42, n_jobs=-1
)

# Fit model
random_search.fit(X_train, y_train)

# Best model
best_svm = random_search.best_estimator_

# Predict on test data
y_pred_svm = best_svm.predict(X_test)

# Evaluate the model
print("Best Parameters Found:", random_search.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("Accuracy Score:", accuracy_score(y_test, y_pred_svm))


Best Parameters Found: {'C': 2.0967378215835972, 'gamma': 'scale', 'kernel': 'sigmoid'}

Confusion Matrix:
 [[24 33]
 [31 37]]

Classification Report:
               precision    recall  f1-score   support

           0       0.44      0.42      0.43        57
           1       0.53      0.54      0.54        68

    accuracy                           0.49       125
   macro avg       0.48      0.48      0.48       125
weighted avg       0.49      0.49      0.49       125

Accuracy Score: 0.488


GRID SEARCH

In [382]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Define features and target
X = diabetes_scaled[['Age', 'Gender', 'Sugar_Level', 'Weight', 'Height', 'BMI', 'BMI_Category']]
y = diabetes_eng['Diabetes']  

# Train-Test Split 
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'gamma': ['scale', 'auto']  # Only used for rbf, poly, sigmoid
}

# Initialize base SVC model
svc = SVC(random_state=42)

# Grid Search
grid_search = GridSearchCV(
    svc, param_grid=param_grid,
    cv=5, n_jobs=-1, verbose=1
)

# Fit model
grid_search.fit(X_train, y_train)

# Best model
best_svm = grid_search.best_estimator_

# Predict on test data
y_pred_svm = best_svm.predict(X_test)

# Evaluate the model
print("Best Parameters Found:", grid_search.best_params_)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))
print("Accuracy Score:", accuracy_score(y_test, y_pred_svm))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Parameters Found: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}

Confusion Matrix:
 [[ 0 57]
 [ 0 68]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        57
           1       0.54      1.00      0.70        68

    accuracy                           0.54       125
   macro avg       0.27      0.50      0.35       125
weighted avg       0.30      0.54      0.38       125

Accuracy Score: 0.544


With the small size of the data, it doesnt do so well with hyperparameter tunning causing overfitting.