In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
import sklearn

In [2]:
print(sklearn.__version__)

1.7.1


In [3]:
df = pd.read_excel("diabetes_1000.xlsx")
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity,Class
0,35,Female,Yes,No,No,No,Yes,Yes,No,No,No,No,Yes,No,No,No,Positive
1,52,Female,Yes,Yes,Yes,No,Yes,No,No,No,No,No,Yes,Yes,Yes,No,Positive
2,52,Female,Yes,Yes,Yes,No,Yes,No,Yes,Yes,No,No,No,Yes,Yes,Yes,Positive
3,41,Male,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,Yes,Yes,No,Yes,Positive
4,53,Male,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39,Female,Yes,No,Yes,Yes,Yes,No,Yes,Yes,No,No,No,No,No,No,Positive
9996,57,Female,Yes,Yes,No,Yes,Yes,Yes,Yes,No,No,Yes,Yes,No,No,No,Positive
9997,58,Female,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,Positive
9998,33,Male,No,No,No,No,Yes,No,No,Yes,No,No,No,No,No,No,Negative


In [4]:
y = df["Class"].copy()
x = df.drop("Class", axis=1).copy()

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [5]:
X_train

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity
9254,31,Male,No,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No
1561,57,Male,No,No,No,No,No,No,Yes,No,No,Yes,No,Yes,Yes,No
1670,67,Male,No,No,No,Yes,No,No,No,No,No,Yes,Yes,Yes,No,No
6087,50,Female,No,No,Yes,No,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes
6669,29,Male,Yes,No,Yes,No,Yes,Yes,Yes,Yes,No,No,No,Yes,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,46,Male,No,Yes,Yes,Yes,No,No,No,No,No,Yes,Yes,No,No,No
5191,48,Male,No,No,No,Yes,No,No,Yes,Yes,No,No,No,Yes,No,No
5390,47,Female,Yes,Yes,Yes,No,No,No,No,Yes,Yes,No,No,Yes,No,No
860,51,Female,Yes,No,Yes,Yes,Yes,No,Yes,No,No,No,Yes,No,No,No


In [6]:
symptoms_col = X_train.drop(["Age","Gender"], axis=1).columns.tolist()
age_col = ['Age']
gender_col =['Gender']

In [7]:
def build_pipeline(age_col, symptoms_col, gender_col):
    # 1. Scale Age
    age_pipeline = Pipeline([
        ("scaler", StandardScaler())
    ])

    # 2. Ordinal encode Yes/No symptoms (all use ['No', 'Yes'])
    symptoms_pipeline = Pipeline([
        ("encoder", OrdinalEncoder(categories=[['No', 'Yes']] * len(symptoms_col)))
    ])

    # 3. One-hot encode Gender (fixed typo: "enoder" → "encoder")
    gender_pipeline = Pipeline([
    ("encoder", OneHotEncoder(categories=[['Male', 'Female']], drop=None, handle_unknown='ignore'))
    ])
    
    # Combine all into a ColumnTransformer
    full_pipeline = ColumnTransformer([
        ("age", age_pipeline, age_col),
        ("gender", gender_pipeline, gender_col),
        ("symptoms", symptoms_pipeline, symptoms_col)
    ])

    return full_pipeline


In [8]:
pipeline = build_pipeline(age_col,symptoms_col, gender_col)
X_train_prepared = pipeline.fit_transform(X_train)

In [9]:
X_train_prepared.shape

(8000, 17)

In [10]:
pd.DataFrame(X_train_prepared, columns=pipeline.get_feature_names_out())


Unnamed: 0,age__Age,gender__Gender_Male,gender__Gender_Female,symptoms__Polyuria,symptoms__Polydipsia,symptoms__Sudden weight loss,symptoms__Weakness,symptoms__Polyphagia,symptoms__Genital thrush,symptoms__Visual blurring,symptoms__Itching,symptoms__Irritability,symptoms__Delayed healing,symptoms__Partial paresis,symptoms__Muscle stiffness,symptoms__Alopecia,symptoms__Obesity
0,-1.349032,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.774833,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.591704,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,0.203023,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
4,-1.512406,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-0.123725,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
7996,0.039649,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7997,-0.042038,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
7998,0.284710,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [11]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [12]:
model = Sequential([
    Input(shape=(X_train_prepared.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [13]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [14]:
model.fit(
    X_train_prepared, # Features
    Y_train,          # Labels (0 or 1)
    epochs=10,        # Full passes over training data
    batch_size=32,    # Update weights after 32 samples
    validation_split=0.2  # 20% of training data used for validation
)

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7610 - loss: 0.4614 - val_accuracy: 0.9356 - val_loss: 0.1722
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9475 - loss: 0.1484 - val_accuracy: 0.9444 - val_loss: 0.1470
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9504 - loss: 0.1297 - val_accuracy: 0.9456 - val_loss: 0.1460
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9482 - loss: 0.1371 - val_accuracy: 0.9475 - val_loss: 0.1427
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9495 - loss: 0.1278 - val_accuracy: 0.9463 - val_loss: 0.1457
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9490 - loss: 0.1244 - val_accuracy: 0.9475 - val_loss: 0.1427
Epoch 7/10
[1m200/200[0m 

<keras.src.callbacks.history.History at 0x15a0f79ab10>

In [15]:
import numpy as np
print(type(X_train), X_train.shape)
print(type(Y_train), Y_train.shape)
print(np.unique(Y_train)) 

<class 'pandas.core.frame.DataFrame'> (8000, 16)
<class 'numpy.ndarray'> (8000,)
[0 1]


In [16]:
X_test_prepared = pipeline.fit_transform(X_test) 

In [17]:
X_test

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity
6252,34,Male,Yes,Yes,Yes,No,No,No,Yes,No,Yes,No,Yes,No,No,No
4684,57,Male,No,No,No,No,No,No,No,Yes,No,No,No,No,Yes,Yes
1731,25,Male,No,No,No,Yes,Yes,No,Yes,No,No,No,Yes,No,Yes,No
4742,28,Male,Yes,Yes,Yes,No,Yes,No,No,Yes,No,No,Yes,Yes,No,No
4521,53,Female,No,No,Yes,Yes,No,No,No,Yes,No,Yes,Yes,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6412,34,Male,No,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No
8285,60,Female,Yes,No,No,Yes,No,No,No,No,No,Yes,No,Yes,No,No
7853,48,Male,No,No,No,No,No,Yes,No,No,No,No,No,No,Yes,Yes
1095,59,Female,No,Yes,No,No,Yes,Yes,Yes,Yes,No,Yes,No,No,No,No


In [18]:
test_loss, test_acc = model.evaluate(X_test_prepared, Y_test)
print(f"Test accuracy: {test_acc:.4f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9544 - loss: 0.1342
Test accuracy: 0.9520


In [19]:
import joblib
model.save("diabe_model.h5")
joblib.dump(pipeline, "diabetes_pipeline.pkl")



['diabetes_pipeline.pkl']

In [20]:
input_df =  X_test.iloc[[200]] 

In [21]:
pipe=joblib.load("diabetes_pipeline.pkl")

In [22]:
input_df = pd.DataFrame(input_df)

In [23]:
input_df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity
5198,37,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,Yes,Yes,No


In [24]:
input_dff = pipe.transform(input_df)

In [25]:
print(input_df.shape)
print(input_dff.shape)

(1, 16)
(1, 17)


In [26]:
p = model.predict(input_dff)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step


In [27]:
p

array([[0.07025111]], dtype=float32)