In [1]:
import numpy as np
import pandas as pd


In [2]:
# Read the raw diabetes table from the working directory and preview the
# first five rows as a quick sanity check of the column layout.
df = pd.read_csv('diabetes.csv')
df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Define BMI categories
def categorize_bmi(bmi):
    """Return the weight-status label for a single BMI value.

    The bands are contiguous half-open intervals, so every value maps to
    exactly one label:

        bmi < 18.5          -> "Underweight"
        18.5 <= bmi < 25    -> "Normal weight"
        25   <= bmi < 30    -> "Overweight"
        bmi >= 30           -> "Obesity"

    Parameters
    ----------
    bmi : float
        Body-mass index.

    Returns
    -------
    str
        One of "Underweight", "Normal weight", "Overweight", "Obesity".

    Notes
    -----
    The upper bounds are 25 and 30 rather than 24.9 and 29.9: with the
    latter, values such as 24.9 or 29.95 matched no middle band and fell
    through to "Obesity".
    NaN compares False against every threshold and therefore ends up in
    the final "Obesity" branch; clean missing values before calling.
    """
    if bmi < 18.5:
        return "Underweight"
    if bmi < 25:
        return "Normal weight"
    if bmi < 30:
        return "Overweight"
    return "Obesity"

# Derive a categorical weight-status column by labelling each BMI value.
df['BMI_category'] = df['BMI'].map(categorize_bmi)

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the remaining 80% is used for
# fitting. The fixed seed keeps the shuffled split reproducible.
train_data, val_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)


In [5]:
from sklearn.preprocessing import StandardScaler

# Columns that get standardised (all numeric predictors; the target and the
# categorical BMI label are left untouched).
numeric_features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Learn the per-column mean/std from the training rows only, so no
# validation statistics leak into the preprocessing.
scaler = StandardScaler()
scaler.fit(train_data[numeric_features])

# Apply the training-set statistics to both splits, overwriting the columns.
train_data[numeric_features] = scaler.transform(train_data[numeric_features])
val_data[numeric_features] = scaler.transform(val_data[numeric_features])


In [6]:
from sklearn.preprocessing import OneHotEncoder

# Identify categorical columns
categorical_features = ['BMI_category']

# Initialize encoder: dense output, first level dropped to avoid a redundant
# indicator column.
# scikit-learn renamed the `sparse` argument to `sparse_output` in 1.2 and
# removed the old spelling in 1.4, so try the current name first and fall
# back to the legacy one on older installations.
try:
    encoder = OneHotEncoder(sparse_output=False, drop='first')
except TypeError:
    encoder = OneHotEncoder(sparse=False, drop='first')

# Fit on train, transform train and val (categories are learned from the
# training rows only).
encoded_train = pd.DataFrame(
    encoder.fit_transform(train_data[categorical_features]),
    columns=encoder.get_feature_names_out(),
)
encoded_val = pd.DataFrame(
    encoder.transform(val_data[categorical_features]),
    columns=encoder.get_feature_names_out(),
)

# Drop original categorical columns and concatenate one-hot encoded features.
# The encoded frames carry a fresh 0..n-1 index, so the splits are re-indexed
# the same way before the column-wise concat to keep rows aligned.
train_data = pd.concat([train_data.reset_index(drop=True), encoded_train], axis=1).drop(columns=categorical_features)
val_data = pd.concat([val_data.reset_index(drop=True), encoded_val], axis=1).drop(columns=categorical_features)




In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Separate the target column from the predictors for both splits.
y_train = train_data['Outcome']
X_train = train_data.drop(columns=['Outcome'])
y_val = val_data['Outcome']
X_val = val_data.drop(columns=['Outcome'])

# Track the neighbour count with the highest validation F1 seen so far.
best_k, best_f1 = 0, 0

for k in (3, 5, 7):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_val)
    score = f1_score(y_val, y_pred)
    print(f"K={k}, F1 Score={score}")
    # Strict comparison: on a tie the earlier (smaller) k is kept.
    if score > best_f1:
        best_k, best_f1 = k, score

print(f"Best K: {best_k} with F1 Score: {best_f1}")


K=3, F1 Score=0.5871559633027522
K=5, F1 Score=0.5420560747663552
K=7, F1 Score=0.5714285714285713
Best K: 3 with F1 Score: 0.5871559633027522


In [8]:
from sklearn.tree import DecisionTreeClassifier

# Track the tree depth with the highest validation F1 seen so far.
best_depth, best_f1_dt = 0, 0

for depth in (3, 5, 7):
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42).fit(X_train, y_train)
    y_pred = dt.predict(X_val)
    score = f1_score(y_val, y_pred)
    print(f"Max Depth={depth}, F1 Score={score}")
    # Strict comparison: on a tie the earlier (shallower) depth is kept.
    if score > best_f1_dt:
        best_depth, best_f1_dt = depth, score

print(f"Best Max Depth: {best_depth} with F1 Score: {best_f1_dt}")


Max Depth=3, F1 Score=0.6476190476190475
Max Depth=5, F1 Score=0.6862745098039216
Max Depth=7, F1 Score=0.6260869565217392
Best Max Depth: 5 with F1 Score: 0.6862745098039216


In [9]:
import joblib

# Save scaler, encoder, and best model
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(encoder, 'encoder.pkl')

# `knn` and `dt` are whatever the search loops fitted last (k=7 / depth=7),
# not necessarily the winning configurations. Rebuild the winner from the
# recorded best hyperparameter and refit it on the training data so the
# persisted model is the one whose validation F1 was reported as best.
# A tie between the two families goes to the decision tree.
if best_f1 > best_f1_dt:
    best_model = KNeighborsClassifier(n_neighbors=best_k)
else:
    best_model = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
best_model.fit(X_train, y_train)

joblib.dump(best_model, 'best_model.pkl')


['best_model.pkl']

In [10]:
# Write the validation split to disk without the row index. At this point
# `val_data` holds the scaled numeric columns and the one-hot BMI indicators,
# i.e. the exact matrix (plus `Outcome`) the models were evaluated on.
val_data.to_csv(path_or_buf='validation.csv', index=False)