In [42]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [43]:
df = pd.read_csv('diabetes.csv')
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [44]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

##### 1. Create a column called “BMI_category” using the “BMI” column and the general medical guidelines for BMI types.

In [45]:
# under 18.5 – This is described as underweight. 
# between 18.5 and 24.9 – This is described as the ‘healthy range’.
# between 25 and 29.9 – This is described as overweight
# between 30 and 39.9 – This is described as obesity
# 40 or over – This is described as severe obesity

In [46]:
category = lambda x: 'underweight' if x >= 0 and x < 18.5 else 'healthy' if  x >=18.5 and x < 24.9 else 'overweight' if x >= 25 and x < 29.9 else 'obesity' if x >= 30 and x < 39.9 else 'severe obesity' if x > 40 else None
df['BMI_category'] = df['BMI'].apply(category)

In [47]:
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category
0,6,148,72,35,0,33.6,0.627,50,1,obesity
1,1,85,66,29,0,26.6,0.351,31,0,overweight


##### 2. Split the data into 2 parts: train and val. Use 20% of the data for val.

In [48]:
from sklearn.model_selection import train_test_split
X = df.drop('Outcome', axis = 1)
y = df['Outcome']
xtrain, xval, ytrain, yval = train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)

##### 3. Apply Standard Scaler on the numeric features. Fit and transform on train and only transform on val.

In [49]:
X.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'BMI_category'],
      dtype='object')

In [50]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
num_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
            'DiabetesPedigreeFunction', 'Age']

xtrain[num_cols] = sc.fit_transform(xtrain[num_cols])
xval[num_cols] = sc.transform(xval[num_cols])

##### 4. Apply One-hot Encoding to the categorical features. Fit and transform on train and only transform on val

In [51]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)
cat_cols = ['BMI_category']

xtrain_encoded = ohe.fit_transform(xtrain[cat_cols])
xtrain = xtrain.drop(cat_cols, axis=1)
xtrain[ohe.get_feature_names_out(cat_cols)] = xtrain_encoded

xval_encoded = ohe.transform(xval[cat_cols])
xval = xval.drop(cat_cols, axis=1)
xval[ohe.get_feature_names_out(cat_cols)] = xval_encoded



##### 5. Build a KNN classifier. Experiment with different values of k (3, 5, 7) and select the value with the highest f1 score

In [52]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

k_values = [3, 5, 7]
best_k = 0
highest_f1 = 0

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(xtrain, ytrain)
    yval_pred = knn.predict(xval)
    f1 = f1_score(yval, yval_pred)
    print(f'k={k}, f1_score={f1}')
    if f1 > highest_f1:
        highest_f1 = f1
        best_k = k

print(f'Best k: {best_k} with highest f1 score: {highest_f1}')

k=3, f1_score=0.5420560747663552
k=5, f1_score=0.6078431372549019
k=7, f1_score=0.6153846153846153
Best k: 7 with highest f1 score: 0.6153846153846153


##### 6. Build a Decision Tree classifier. Experiment with different values of max_depth (3, 5, 7) and choose the value with the highest f1 score.

In [53]:
from sklearn.tree import DecisionTreeClassifier

max_depth_values = [3, 5, 7]
best_depth = 0
highest_f1_dt = 0

for depth in max_depth_values:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(xtrain, ytrain)
    yval_pred_dt = dt.predict(xval)
    f1_dt = f1_score(yval, yval_pred_dt)
    print(f'max_depth={depth}, f1_score={f1_dt}')
    if f1_dt > highest_f1_dt:
        highest_f1_dt = f1_dt
        best_depth = depth

print(f'Best max_depth: {best_depth} with highest f1 score: {highest_f1_dt}')

max_depth=3, f1_score=0.37333333333333335
max_depth=5, f1_score=0.7037037037037037
max_depth=7, f1_score=0.5909090909090909
Best max_depth: 5 with highest f1 score: 0.7037037037037037


In [54]:
import joblib

# Save the StandardScaler
joblib.dump(sc, 'standard_scaler.pkl')

# Save the OneHotEncoder
joblib.dump(ohe, 'one_hot_encoder.pkl')

# Save the best model (Decision Tree in this case)
best_model = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
best_model.fit(xtrain, ytrain)
joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']

In [55]:
# Select 5 samples from the validation set
xval_sample = xval.iloc[:5]
yval_sample = yval.iloc[:5]

# Make predictions using the best model
yval_sample_pred = best_model.predict(xval_sample)

# Display the results
for i in range(5):
    print(f"Sample {i+1}:")
    print(f"Features: {xval_sample.iloc[i].values}")
    print(f"Actual Outcome: {yval_sample.iloc[i]}")
    print(f"Predicted Outcome: {yval_sample_pred[i]}")
    print()

Sample 1:
Features: [ 0.96054099  1.20788789 -0.29601471 -1.31138021 -0.73076636 -0.58221684
 -0.55579092  0.56103382  0.          0.          1.          0.
  0.          0.        ]
Actual Outcome: 0
Predicted Outcome: 1

Sample 2:
Features: [ 1.86648903 -1.67775979  1.98813468  0.14031779 -0.27558248  0.44897876
 -0.58306107  1.15306018  0.          1.          0.          0.
  0.          0.        ]
Actual Outcome: 0
Predicted Outcome: 0

Sample 3:
Features: [-0.5493724   0.03460257  0.3565994   0.392787    1.12712704  0.499902
  0.01688223 -0.6230189   0.          1.          0.          0.
  0.          0.        ]
Actual Outcome: 0
Predicted Outcome: 0

Sample 4:
Features: [ 0.96054099 -0.21908074 -0.29601471 -1.31138021 -0.73076636 -0.58221684
  0.77135637  0.05358266  0.          0.          1.          0.
  0.          0.        ]
Actual Outcome: 1
Predicted Outcome: 1

Sample 5:
Features: [-0.5493724  -1.4874973  -3.77662329 -1.31138021 -0.73076636 -4.07045874
 -1.13755411 