In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw dataset from an Excel workbook located next to the notebook.
# NOTE(review): despite the "1000" in the file name, the recorded outputs show
# index values in the 9000s and 8000 training rows after the 80/20 split, so the
# workbook appears to hold about 10,000 rows -- confirm.
df = pd.read_excel("diabetes_1000.xlsx")
# Bare last expression: renders the loaded frame as the cell output.
df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity,Class
0,35,Female,Yes,No,No,No,Yes,Yes,No,No,No,No,Yes,No,No,No,Positive
1,52,Female,Yes,Yes,Yes,No,Yes,No,No,No,No,No,Yes,Yes,Yes,No,Positive
2,52,Female,Yes,Yes,Yes,No,Yes,No,Yes,Yes,No,No,No,Yes,Yes,Yes,Positive
3,41,Male,Yes,Yes,Yes,Yes,Yes,No,No,No,No,Yes,Yes,Yes,No,Yes,Positive
4,53,Male,No,No,No,No,Yes,No,No,No,No,No,No,No,No,No,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39,Female,Yes,No,Yes,Yes,Yes,No,Yes,Yes,No,No,No,No,No,No,Positive
9996,57,Female,Yes,Yes,No,Yes,Yes,Yes,Yes,No,No,Yes,Yes,No,No,No,Positive
9997,58,Female,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,Positive
9998,33,Male,No,No,No,No,Yes,No,No,Yes,No,No,No,No,No,No,Negative


In [3]:
# Encode the "Class" target as integers. LabelEncoder assigns codes in sorted
# class order, so with the Negative/Positive values shown above this should
# give Negative -> 0 and Positive -> 1.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["Class"])

# Everything except the target column is a predictor.
x = df.drop(columns=["Class"])

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [4]:
# Display the training predictors to eyeball the split (row labels are the
# original, now shuffled, index values).
X_train

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity
9254,31,Male,No,Yes,Yes,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No
1561,57,Male,No,No,No,No,No,No,Yes,No,No,Yes,No,Yes,Yes,No
1670,67,Male,No,No,No,Yes,No,No,No,No,No,Yes,Yes,Yes,No,No
6087,50,Female,No,No,Yes,No,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes
6669,29,Male,Yes,No,Yes,No,Yes,Yes,Yes,Yes,No,No,No,Yes,No,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,46,Male,No,Yes,Yes,Yes,No,No,No,No,No,Yes,Yes,No,No,No
5191,48,Male,No,No,No,Yes,No,No,Yes,Yes,No,No,No,Yes,No,No
5390,47,Female,Yes,Yes,Yes,No,No,No,No,Yes,Yes,No,No,Yes,No,No
860,51,Female,Yes,No,Yes,Yes,Yes,No,Yes,No,No,No,Yes,No,No,No


In [5]:
# Column groups, one per preprocessing branch.
age_col = ["Age"]
gender_col = ["Gender"]
# Every remaining predictor is treated as a symptom column.
symptoms_col = X_train.drop(columns=["Age", "Gender"]).columns.tolist()

In [27]:
def build_pipeline(age_col, symptoms_col, gender_col):
    """Assemble the unfitted preprocessing transformer for the dataset.

    Parameters
    ----------
    age_col : list of str
        Column name(s) to standardise with ``StandardScaler``.
    symptoms_col : list of str
        Column names holding the strings 'No' / 'Yes'. Each one is
        ordinal-encoded with the fixed category order ['No', 'Yes'].
    gender_col : list of str
        Column name to one-hot encode with the fixed category order
        ['Male', 'Female']. Exactly one category list is supplied, so this
        is expected to contain a single column.

    Returns
    -------
    ColumnTransformer
        Transformer whose branches are named "age", "gender" and "symptoms",
        in that order. It is returned unfitted.
    """
    # One ['No', 'Yes'] category list per symptom column.
    yes_no_order = [['No', 'Yes']] * len(symptoms_col)
    gender_order = [['Male', 'Female']]

    # Each branch is a single-step Pipeline so more steps can be added later.
    scale_age = Pipeline(steps=[("scaler", StandardScaler())])
    encode_symptoms = Pipeline(steps=[
        ("encoder", OrdinalEncoder(categories=yes_no_order)),
    ])
    # handle_unknown='ignore': a value outside the listed categories does not
    # raise at transform time.
    encode_gender = Pipeline(steps=[
        ("encoder", OneHotEncoder(categories=gender_order, drop=None, handle_unknown='ignore')),
    ])

    return ColumnTransformer(transformers=[
        ("age", scale_age, age_col),
        ("gender", encode_gender, gender_col),
        ("symptoms", encode_symptoms, symptoms_col),
    ])


In [29]:
# Build the preprocessor, then learn its parameters from the training rows
# and transform those same rows in one call.
pipeline = build_pipeline(
    age_col=age_col,
    symptoms_col=symptoms_col,
    gender_col=gender_col,
)
X_train_prepared = pipeline.fit_transform(X_train)

In [25]:
# Check the transformed matrix size. The recorded run shows 17 columns:
# 1 scaled Age + 2 Gender indicators + 14 symptom codes.
X_train_prepared.shape

(8000, 17)

In [9]:
# Wrap the transformed array in a DataFrame labelled with the transformer's
# output feature names, purely for inspection.
# NOTE(review): the recorded output lists Gender_Female before Gender_Male,
# which does not match the ['Male', 'Female'] order given in build_pipeline --
# the output is presumably from an earlier version; re-run to refresh.
pd.DataFrame(X_train_prepared, columns=pipeline.get_feature_names_out())


Unnamed: 0,age__Age,gender__Gender_Female,gender__Gender_Male,symptoms__Polyuria,symptoms__Polydipsia,symptoms__Sudden weight loss,symptoms__Weakness,symptoms__Polyphagia,symptoms__Genital thrush,symptoms__Visual blurring,symptoms__Itching,symptoms__Irritability,symptoms__Delayed healing,symptoms__Partial paresis,symptoms__Muscle stiffness,symptoms__Alopecia,symptoms__Obesity
0,-1.349032,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.774833,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,1.591704,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
3,0.203023,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
4,-1.512406,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-0.123725,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
7996,0.039649,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
7997,-0.042038,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
7998,0.284710,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [10]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [11]:
# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation (and the shuffling performed during fit) is repeatable
# across kernel restarts. 42 matches the random_state used for the split.
tf.keras.utils.set_random_seed(42)

# Small fully connected binary classifier: the input width follows the
# preprocessed feature matrix, and the single sigmoid unit outputs a
# probability-like score in [0, 1].
model = Sequential([
    Input(shape=(X_train_prepared.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [12]:
# Binary-classification training setup: Adam optimiser, binary cross-entropy
# loss, and accuracy reported alongside the loss.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
# Train for 10 full passes in mini-batches of 32 samples. Keras holds out
# 20% of the supplied training rows for the per-epoch validation metrics.
# The returned History object is the cell's displayed value.
model.fit(
    x=X_train_prepared,   # preprocessed features
    y=Y_train,            # integer labels (0 or 1)
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7628 - loss: 0.4588 - val_accuracy: 0.9350 - val_loss: 0.1753
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9471 - loss: 0.1466 - val_accuracy: 0.9413 - val_loss: 0.1509
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9528 - loss: 0.1264 - val_accuracy: 0.9425 - val_loss: 0.1476
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9532 - loss: 0.1289 - val_accuracy: 0.9431 - val_loss: 0.1492
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9537 - loss: 0.1196 - val_accuracy: 0.9419 - val_loss: 0.1505
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9491 - loss: 0.1286 - val_accuracy: 0.9438 - val_loss: 0.1477
Epoch 7/10
[1m200/200[0m 

<keras.src.callbacks.history.History at 0x2c1bfd727e0>

In [14]:
import numpy as np
# Sanity check of the training inputs: container type and shape of the
# features and labels, then the distinct label values present.
print(f"{type(X_train)} {X_train.shape}")
print(f"{type(Y_train)} {Y_train.shape}")
print(f"{np.unique(Y_train)}")

<class 'pandas.core.frame.DataFrame'> (8000, 16)
<class 'numpy.ndarray'> (8000,)
[0 1]


In [15]:
# Apply the preprocessing that was fitted on the training data.
# Use transform(), never fit_transform(), on the test split: re-fitting here
# would recompute the Age scaling statistics from the test rows (leaking test
# information into the evaluation) and would leave `pipeline` fitted on the
# test set, so any later joblib.dump would persist the wrong transformer.
X_test_prepared = pipeline.transform(X_test)

In [16]:
# Display the raw (untransformed) held-out predictors.
X_test

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity
6252,34,Male,Yes,Yes,Yes,No,No,No,Yes,No,Yes,No,Yes,No,No,No
4684,57,Male,No,No,No,No,No,No,No,Yes,No,No,No,No,Yes,Yes
1731,25,Male,No,No,No,Yes,Yes,No,Yes,No,No,No,Yes,No,Yes,No
4742,28,Male,Yes,Yes,Yes,No,Yes,No,No,Yes,No,No,Yes,Yes,No,No
4521,53,Female,No,No,Yes,Yes,No,No,No,Yes,No,Yes,Yes,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6412,34,Male,No,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No
8285,60,Female,Yes,No,No,Yes,No,No,No,No,No,Yes,No,Yes,No,No
7853,48,Male,No,No,No,No,No,Yes,No,No,No,No,No,No,Yes,Yes
1095,59,Female,No,Yes,No,No,Yes,Yes,Yes,Yes,No,Yes,No,No,No,No


In [17]:
# Score the trained network on the held-out split. Because the model was
# compiled with a single 'accuracy' metric, evaluate() yields loss then accuracy.
test_loss, test_acc = model.evaluate(x=X_test_prepared, y=Y_test)
print(f"Test accuracy: {test_acc:.4f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9506 - loss: 0.1347
Test accuracy: 0.9505


In [18]:
import joblib
# Persist the trained network to disk (".h5" file name).
model.save("diabetes_model.h5")
# Persist the preprocessing object next to it; joblib.dump returns the list of
# written file names, which is what the cell displays.
# NOTE(review): `pipeline` is saved in whatever fitted state it has when this
# cell runs -- make sure no earlier cell has re-fitted it on anything other
# than the training data before saving.
joblib.dump(pipeline, "diabetes_pipeline.pkl")



['diabetes_pipeline.pkl']

In [31]:
# Take the row at position 200 of the test split; the double brackets keep
# the result a one-row DataFrame rather than a Series.
input_df =  X_test.iloc[[200]] 

In [45]:
# Reload the preprocessing object from disk, as a separate inference script
# would. joblib.load unpickles arbitrary Python objects, so only load files
# you created or trust.
pipe=joblib.load("diabetes_pipeline.pkl")

In [33]:
# Re-wrap as a DataFrame. NOTE(review): `input_df` comes from
# X_test.iloc[[200]] and is already a one-row DataFrame, so this looks
# redundant -- confirm and remove.
input_df = pd.DataFrame(input_df)

In [35]:
# Display the single raw sample that will be sent through the pipeline.
input_df

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,Sudden weight loss,Weakness,Polyphagia,Genital thrush,Visual blurring,Itching,Irritability,Delayed healing,Partial paresis,Muscle stiffness,Alopecia,Obesity
5198,37,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,Yes,Yes,No


In [53]:
# Preprocess the sample with the pipeline reloaded from disk (`pipe`), not the
# in-memory `pipeline`; transform() only applies already-fitted parameters.
input_dff = pipe.transform(input_df)

In [55]:
# Compare the raw row's shape with its shape after preprocessing, one per line.
print(input_df.shape, input_dff.shape, sep="\n")

(1, 16)
(1, 16)


In [51]:
# Predict on the preprocessed sample.
# NOTE(review): in the recorded run this call raised ValueError because the
# model expects 17 input features while the reloaded pipeline produced 16.
# The pickle on disk presumably predates the current build_pipeline definition
# (the execution counts show cells were run out of order) -- restart the
# kernel, run all cells in order so the pipeline is re-saved, and re-check.
model.predict(input_dff)


ValueError: Exception encountered when calling Sequential.call().

[1mInput 0 of layer "dense" is incompatible with the layer: expected axis -1 of input shape to have value 17, but received input with shape (1, 16)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(1, 16), dtype=float32)
  • training=False
  • mask=None
  • kwargs=<class 'inspect._empty'>

In [None]:
# Display the in-memory preprocessing object to inspect its configuration.
pipeline