In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score
import joblib

In [19]:
# Load the raw diabetes table and preview the first two rows.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
csv_path = "C:/Users/ajkan/Downloads/diabetes.csv"
df = pd.read_csv(csv_path)
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


In [20]:
def bmi_categories(bmi):
    """Map a body-mass-index value to a weight-category label.

    Parameters
    ----------
    bmi : float
        The BMI value to classify.

    Returns
    -------
    str
        'Underweight' (bmi < 18.5), 'Healthy weight' (18.5 <= bmi < 25),
        'Overweight' (25 <= bmi < 30), 'Obese' (bmi >= 30), or 'Unknown'
        when the value is missing (None / NaN).
    """
    # Every comparison against NaN is False, so without this guard a missing
    # BMI would fall through all branches and be silently labelled 'Obese'.
    if pd.isna(bmi):
        return 'Unknown'
    # Guards are checked in ascending order, so each one only needs its upper bound.
    if bmi < 18.5:
        return 'Underweight'
    if bmi < 25:
        return 'Healthy weight'
    if bmi < 30:
        return 'Overweight'
    return 'Obese'

In [21]:
# Derive the categorical BMI feature from the numeric BMI column, then preview the result.
df["BMI_category"] = df["BMI"].map(bmi_categories)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,BMI_category
0,6,148,72,35,0,33.6,0.627,50,1,Obese
1,1,85,66,29,0,26.6,0.351,31,0,Overweight
2,8,183,64,0,0,23.3,0.672,32,1,Healthy weight
3,1,89,66,23,94,28.1,0.167,21,0,Overweight
4,0,137,40,35,168,43.1,2.288,33,1,Obese


In [22]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
train_data, val_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Columns that get standardized before modelling.
numeric_features = [
    "Pregnancies",
    "Glucose",
    "BloodPressure",
    "SkinThickness",
    "Insulin",
    "BMI",
    "DiabetesPedigreeFunction",
    "Age",
]
# Columns that get one-hot encoded before modelling.
categorical_features = ["BMI_category"]
# Label column the classifiers are trained to predict.
target_column = "Outcome"

In [24]:
# Separate the model inputs (numeric + categorical columns) from the target for both splits.
feature_columns = numeric_features + categorical_features
X_train, y_train = train_data[feature_columns], train_data[target_column]
X_val, y_val = val_data[feature_columns], val_data[target_column]

In [25]:
# Fit the scaler on the training rows only, then apply that same scaling to both splits
# so no validation statistics leak into preprocessing.
scaler = StandardScaler().fit(X_train[numeric_features])
X_train_scaled = scaler.transform(X_train[numeric_features])
X_val_scaled = scaler.transform(X_val[numeric_features])


In [26]:
# One-hot encode the categorical column(s) into a dense array; categories not seen
# during fit are encoded as all zeros instead of raising (handle_unknown="ignore").
try:
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # scikit-learn < 1.2 only accepts the older `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
# Fit on the training rows only, then reuse the learned categories for validation.
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_val_encoded = encoder.transform(X_val[categorical_features])




In [27]:
# Join the scaled numeric columns and the one-hot columns side by side
# to form the final feature matrices.
X_train_prepared = np.concatenate((X_train_scaled, X_train_encoded), axis=1)
X_val_prepared = np.concatenate((X_val_scaled, X_val_encoded), axis=1)


In [28]:
# Score a few neighbourhood sizes on the validation split and keep the best one.
knn_f1_by_k = {}
for k in [3, 5, 7]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_prepared, y_train)
    y_pred = knn.predict(X_val_prepared)
    f1 = f1_score(y_val, y_pred)
    print(f"KNN with k={k}: F1 Score={f1}")
    knn_f1_by_k[k] = f1

# max() returns the first best entry, so ties still go to the smaller k. Unlike a
# `best = 0` sentinel with a strict `>`, it also selects a valid k when every
# candidate scores 0 (previously best_knn_k stayed 0, an invalid n_neighbors).
best_knn_k = max(knn_f1_by_k, key=knn_f1_by_k.get)
best_knn_f1 = knn_f1_by_k[best_knn_k]

print(f"Best KNN k={best_knn_k} with F1 Score={best_knn_f1}")


KNN with k=3: F1 Score=0.6296296296296297
KNN with k=5: F1 Score=0.5607476635514018
KNN with k=7: F1 Score=0.5904761904761904
Best KNN k=3 with F1 Score=0.6296296296296297


In [30]:
# Score a few tree depths on the validation split and keep the best one.
tree_f1_by_depth = {}
for depth in [3, 5, 7]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=42)
    tree.fit(X_train_prepared, y_train)
    y_pred = tree.predict(X_val_prepared)
    f1 = f1_score(y_val, y_pred)
    print(f"Decision Tree with max_depth={depth}: F1 Score={f1}")
    tree_f1_by_depth[depth] = f1

# max() returns the first best entry, so ties still go to the shallower tree. Unlike a
# `best = 0` sentinel with a strict `>`, it also selects a valid depth when every
# candidate scores 0 (previously best_tree_depth stayed 0, an invalid max_depth).
best_tree_depth = max(tree_f1_by_depth, key=tree_f1_by_depth.get)
best_tree_f1 = tree_f1_by_depth[best_tree_depth]

print(f"Best Decision Tree depth={best_tree_depth} with F1 Score={best_tree_f1}")

Decision Tree with max_depth=3: F1 Score=0.6476190476190475
Decision Tree with max_depth=5: F1 Score=0.6862745098039216
Decision Tree with max_depth=7: F1 Score=0.6495726495726496
Best Decision Tree depth=5 with F1 Score=0.6862745098039216


In [31]:
# Choose the model family with the higher validation F1; a tie goes to the decision tree.
knn_wins = best_knn_f1 > best_tree_f1
best_model_name = "knn" if knn_wins else "decision_tree"
best_model = (
    KNeighborsClassifier(n_neighbors=best_knn_k)
    if knn_wins
    else DecisionTreeClassifier(max_depth=best_tree_depth, random_state=42)
)


In [32]:
# Train the freshly constructed winning estimator on the prepared training matrix.
best_model.fit(X=X_train_prepared, y=y_train)

In [33]:
# Persist the fitted preprocessing objects and the chosen model to the working directory
# so inference can reload them later.
model_path = f"{best_model_name}_model.pkl"
joblib.dump(scaler, "scaler.pkl")
joblib.dump(encoder, "encoder.pkl")
joblib.dump(best_model, model_path)


['decision_tree_model.pkl']

In [35]:
def inference_pipeline(sample_features):
    """Predict the label for one sample using the persisted artifacts.

    Loads ``scaler.pkl``, ``encoder.pkl`` and ``{best_model_name}_model.pkl``
    from the working directory, scales the numeric columns, one-hot encodes
    the categorical columns, and returns the model's prediction.

    Parameters
    ----------
    sample_features : pandas.Series or mapping
        A single record holding every column listed in the notebook-level
        ``numeric_features`` and ``categorical_features``.

    Returns
    -------
    The first (and only) element of ``model.predict`` for the sample.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only load artifacts this notebook wrote.
    scaler = joblib.load("scaler.pkl")
    encoder = joblib.load("encoder.pkl")
    model = joblib.load(f"{best_model_name}_model.pkl")

    # Build a one-row DataFrame so the column names survive. The transformers were
    # fitted on DataFrames, and passing a bare list drops the names, which triggers
    # scikit-learn's "X does not have valid feature names" warning.
    sample_frame = pd.DataFrame([sample_features])
    # A row taken from a mixed-dtype frame is object-typed; restore numeric dtype explicitly.
    numeric_sample = sample_frame[numeric_features].astype(float)
    categorical_sample = sample_frame[categorical_features]

    scaled_numeric = scaler.transform(numeric_sample)
    encoded_categorical = encoder.transform(categorical_sample)

    # Same column order as at training time: scaled numerics first, then one-hot columns.
    prepared_sample = np.hstack([scaled_numeric, encoded_categorical])

    prediction = model.predict(prepared_sample)
    return prediction[0]

In [36]:
# Spot-check the persisted pipeline on the first five validation rows.
for i in range(5):
    predicted_label = inference_pipeline(X_val.iloc[i])
    true_label = y_val.iloc[i]
    print(f"Sample {i+1}: True Label={true_label}, Predicted Label={predicted_label}")


Sample 1: True Label=0, Predicted Label=0
Sample 2: True Label=0, Predicted Label=0
Sample 3: True Label=0, Predicted Label=0
Sample 4: True Label=0, Predicted Label=0
Sample 5: True Label=0, Predicted Label=0


