In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import LearningRateScheduler


In [3]:
# Read the diabetes table from the CSV in the current working directory.
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [4]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(78094, 126)

In [5]:
# Separate the binary label from the feature table.
#   Y: the 'diabetes_mellitus' column
#        0 --> patient does not have diabetes mellitus
#        1 --> patient has diabetes mellitus
#   X: every other column of df
Y = df.loc[:, 'diabetes_mellitus']
X = df.drop('diabetes_mellitus', axis=1)

In [6]:
# Dimensions of the feature table as (rows, columns).
X.shape

(78094, 125)

In [7]:
# Length of the label vector; it is one-dimensional, so the tuple has a single entry.
Y.shape

(78094,)

In [8]:
# Preprocess the data: clean, impute, scale, and encode
def preprocess_data(train_data, test_data):
    """Turn the raw train/test frames into aligned, model-ready feature tables.

    Steps, in order: drop bookkeeping columns, split off the target, mean-impute
    and standardise the numeric columns, one-hot encode the object columns, and
    give the test table exactly the same columns as the training table.

    Neither input frame is modified; all work happens on copies, so the cell
    that calls this function can be re-run without reloading the CSV files.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Labelled rows. Must contain 'diabetes_mellitus' plus the columns
        'hospital_id', 'nan_counts' and 'icu_id'.
    test_data : pandas.DataFrame
        Unlabelled rows. Must contain 'hospital_id', 'nan_counts', 'icu_id'
        and every numeric feature column present in ``train_data``.

    Returns
    -------
    x_train_encoded : pandas.DataFrame
        Imputed, scaled and one-hot encoded training features.
    y_train : pandas.Series
        The 'diabetes_mellitus' column of ``train_data``.
    test_data_encoded : pandas.DataFrame
        Test features with the same columns, in the same order, as
        ``x_train_encoded``.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    # Columns that are not needed or irrelevant to the model
    remove_columns = ["hospital_id", "nan_counts", "icu_id"]
    # drop() without inplace returns new frames, leaving the caller's data intact.
    train_data = train_data.drop(columns=remove_columns)
    test_data = test_data.drop(columns=remove_columns)

    # Separate target variable
    y_train = train_data['diabetes_mellitus']
    x_train = train_data.drop(columns=['diabetes_mellitus'])

    # Identify numeric and categorical columns (decided on the training frame only)
    numeric_cols = x_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = x_train.select_dtypes(include=['object']).columns

    # Impute missing values in numeric columns.
    # NOTE: the statistics are fit on every row of train_data; split before
    # calling this function if a leakage-free validation set is required.
    imputer = SimpleImputer(strategy='mean')
    x_train[numeric_cols] = imputer.fit_transform(x_train[numeric_cols])
    test_data[numeric_cols] = imputer.transform(test_data[numeric_cols])

    # Scale numeric columns with the training mean/variance
    scaler = StandardScaler()
    x_train[numeric_cols] = scaler.fit_transform(x_train[numeric_cols])
    test_data[numeric_cols] = scaler.transform(test_data[numeric_cols])

    # One-hot encode categorical columns. The training frame defines the
    # feature space (baseline category dropped).
    x_train_encoded = pd.get_dummies(x_train, columns=categorical_cols, drop_first=True)

    # Encode the test frame WITHOUT drop_first: its own "first" category may
    # differ from the training baseline, which would mis-encode rows. Then
    # project onto the training columns: categories unseen in training are
    # discarded, categories absent from the test set become all-zero columns.
    test_data_encoded = pd.get_dummies(test_data, columns=categorical_cols)
    test_data_encoded = test_data_encoded.reindex(columns=x_train_encoded.columns, fill_value=0)

    return x_train_encoded, y_train, test_data_encoded



In [9]:
# Build the neural network model
def build_model(input_shape, hidden_units=(512, 256, 128), dropout_rate=0.3,
                l1=0.001, l2=0.001, learning_rate=0.0001):
    """Create and compile a fully connected binary classifier.

    The network is a stack of ReLU ``Dense`` layers, each followed by
    ``Dropout``, ending in a single sigmoid unit. It is compiled with Adam,
    binary cross-entropy loss and the accuracy metric.

    Parameters
    ----------
    input_shape : int
        Number of input features (the width of one sample).
    hidden_units : sequence of int, optional
        Width of each hidden ``Dense`` layer, in order.
    dropout_rate : float, optional
        Rate used by the ``Dropout`` layer placed after every hidden layer.
    l1, l2 : float, optional
        L1 / L2 penalty factors applied to every hidden layer's kernel.
    learning_rate : float, optional
        Learning rate handed to the Adam optimizer.

    Returns
    -------
    keras.Sequential
        The compiled, untrained model.
    """
    # Declare the input explicitly instead of passing input_shape= to the
    # first Dense layer, which newer Keras releases warn about.
    layer_stack = [keras.Input(shape=(input_shape,))]

    for units in hidden_units:
        # Using regularization and dropout to prevent overfitting
        layer_stack.append(
            keras.layers.Dense(
                units,
                activation='relu',
                kernel_regularizer=regularizers.l1_l2(l1=l1, l2=l2),
            )
        )
        layer_stack.append(keras.layers.Dropout(dropout_rate))

    # Single sigmoid unit: output is the predicted probability of class 1
    layer_stack.append(keras.layers.Dense(1, activation='sigmoid'))

    model = keras.Sequential(layer_stack)
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Train an ensemble of models for robust predictions
def ensemble_model(x_train, y_train, x_val, y_val, num_models=1, epochs=10, batch_size=64):
    """Train ``num_models`` independently built networks on the same data.

    Every member is created with ``build_model`` and fitted on the full
    training set, with the validation set passed to ``fit`` for per-epoch
    monitoring.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features (2-D, ``shape[1]`` is the feature count) and labels.
        Both must support ``.astype('float32')``.
    x_val, y_val : array-like
        Validation features and labels, same requirements as above.
    num_models : int, optional
        Number of models to train.
    epochs : int, optional
        Epochs passed to each ``fit`` call.
    batch_size : int, optional
        Batch size passed to each ``fit`` call.

    Returns
    -------
    list
        The fitted models, in training order.
    """
    # Convert data to a consistent type. The tensors are identical for every
    # ensemble member, so build them once rather than once per model.
    x_train_tensor = tf.convert_to_tensor(x_train.astype('float32'))
    y_train_tensor = tf.convert_to_tensor(y_train.astype('float32'))
    x_val_tensor = tf.convert_to_tensor(x_val.astype('float32'))
    y_val_tensor = tf.convert_to_tensor(y_val.astype('float32'))

    feature_count = x_train.shape[1]
    models = []
    for _ in range(num_models):
        model = build_model(feature_count)
        model.fit(
            x_train_tensor,
            y_train_tensor,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(x_val_tensor, y_val_tensor),
        )
        models.append(model)
    return models





In [10]:
# Predict on test data using the ensemble
def predict_test_data(models, test_data_encoded, threshold=0.5):
    """Average the ensemble's predicted probabilities and binarise them.

    Parameters
    ----------
    models : iterable
        Fitted models; each must expose ``predict(tensor)``.
    test_data_encoded : array-like
        Encoded test features; must support ``.astype('float32')``.
    threshold : float, optional
        A sample is labelled 1 when its averaged prediction is greater than
        or equal to this value, otherwise 0.

    Returns
    -------
    numpy.ndarray
        Flattened array of 0/1 labels.

    Raises
    ------
    ValueError
        If ``models`` is empty. Averaging nothing would otherwise yield NaN
        and a meaningless single-element result.
    """
    models = list(models)
    if not models:
        raise ValueError("predict_test_data requires at least one model")

    # Convert to TensorFlow tensor and ensure type consistency
    test_data_tensor = tf.convert_to_tensor(test_data_encoded.astype('float32'))

    # Averaging predictions from all models (axis 0 runs over the models)
    per_model_predictions = [model.predict(test_data_tensor) for model in models]
    test_predictions = np.mean(per_model_predictions, axis=0)

    # Convert to binary outcome
    test_predictions_binary = np.where(test_predictions >= threshold, 1, 0)
    return test_predictions_binary.flatten()

# Create a submission file for the predictions
def save_submission(predictions_binary, test_data, filename='Diabetes_Mellitus_Prediction_DeepLearning Approach Output.csv', encounter_ids=None):
    """Write the predictions to a two-column CSV ('ID', 'diabetes_mellitus').

    Parameters
    ----------
    predictions_binary : array-like
        One predicted label per test row.
    test_data : object
        Not read by this function; kept so existing calls keep working.
    filename : str, optional
        Path of the CSV file to write.
    encounter_ids : array-like, optional
        Identifier for each prediction, in the same order. When omitted, the
        notebook-level ``test_encounter_id['encounter_id']`` column is used,
        which requires that global to have been defined by an earlier cell.

    Raises
    ------
    ValueError
        If the number of IDs and the number of predictions differ.
    """
    if encounter_ids is None:
        # Backward-compatible fallback to the notebook-global frame.
        encounter_ids = test_encounter_id['encounter_id']

    # Reset both indexes so rows are paired by position, not by index label.
    id_column = pd.Series(encounter_ids).reset_index(drop=True)
    prediction_column = pd.Series(predictions_binary).reset_index(drop=True)
    if len(id_column) != len(prediction_column):
        raise ValueError(
            f"got {len(id_column)} IDs but {len(prediction_column)} predictions"
        )

    submission_df = pd.DataFrame({
        'ID': id_column,
        'diabetes_mellitus': prediction_column,
    })
    submission_df.to_csv(filename, index=False)
    print('done')


In [11]:
# Seed NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling repeat from run to run (as far as the backend allows).
np.random.seed(42)
tf.random.set_seed(42)

# Load and preprocess data
train_data = pd.read_csv("diabetes.csv")
test_data = pd.read_csv("diabetes_test_unlabeled.csv")
# Keep an untouched copy of the raw test rows for the submission IDs. It is
# taken before preprocessing and copied from memory rather than re-read
# from disk.
test_encounter_id = test_data.copy()

# Full label vector gets its own name so y_train below is not rebound from
# "all labels" to "training-split labels" within the same cell.
x_train_encoded, y_train_all, test_data_encoded = preprocess_data(train_data, test_data)

# Split data into training and validation sets (70% / 30%)
x_train, x_val, y_train, y_val = train_test_split(
    x_train_encoded, y_train_all, test_size=0.3, random_state=42
)

# Train models and predict
models = ensemble_model(x_train, y_train, x_val, y_val, num_models=1)
test_predictions_binary = predict_test_data(models, test_data_encoded)

# Save predictions to a file
save_submission(test_predictions_binary, test_data)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
done
