# Import required libaries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the data

In [2]:
filepath = '../alzheimer_masked_nn/alzheimers_disease_data.csv'
data = pd.read_csv(filepath)

# Display basic information about the dataset

In [3]:
print("Dataset Shape:", data.shape)
print("\nFirst few rows of the dataset:")
data.head()

Dataset Shape: (2149, 35)

First few rows of the dataset:


Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

## Data Preprocessing Class

In [5]:
class AlzheimersDataPreprocessor:
    def __init__(self):
        self.scalers = {}
        
    def preprocess(self, data, is_training=True):
        """
        Preprocess the Alzheimer's dataset
        
        Parameters:
        data (pd.DataFrame): Raw dataset
        is_training (bool): Whether this is training data or prediction data
        
        Returns:
        tuple: (processed_features, processed_target) for training
               or processed_features for prediction
        """
        # Create a copy to avoid modifying original data
        df = data.copy()
        
        # 1. Remove non-predictive columns
        df = df.drop(['PatientID', 'DoctorInCharge'], axis=1)
        
        # 2. Separate features and target
        if 'Diagnosis' in df.columns:
            y = df['Diagnosis']
            X = df.drop('Diagnosis', axis=1)
        else:
            X = df
            y = None
            
        # 3. Define column groups
        numerical_columns = [
            'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity',
            'DietQuality', 'SleepQuality', 'SystolicBP', 'DiastolicBP',
            'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
            'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'ADL'
        ]
        
        # Already encoded categorical columns (no processing needed)
        categorical_columns = ['Gender', 'Ethnicity', 'EducationLevel']
        
        # Binary columns (already 0/1)
        binary_columns = [
            'Smoking', 'FamilyHistoryAlzheimers', 'CardiovascularDisease',
            'Diabetes', 'Depression', 'HeadInjury', 'Hypertension',
            'MemoryComplaints', 'BehavioralProblems', 'Confusion',
            'Disorientation', 'PersonalityChanges', 'DifficultyCompletingTasks',
            'Forgetfulness'
        ]
        
        # 4. Scale numerical features
        for col in numerical_columns:
            if is_training:
                self.scalers[col] = StandardScaler()
                X[col] = self.scalers[col].fit_transform(X[col].values.reshape(-1, 1))
            else:
                X[col] = self.scalers[col].transform(X[col].values.reshape(-1, 1))
        
        # 5. Create mask for missing values
        mask = ~X.isna()
        
        # 6. Fill missing values with 0 (they'll be masked anyway)
        X = X.fillna(0)
        
        if y is not None:
            return X, mask, y
        return X, mask

## Prepare Data for Training

In [6]:
def prepare_data(data, test_size=0.2, random_state=42):
    """
    Prepare the dataset for training
    
    Parameters:
    data (pd.DataFrame): Input dataframe
    test_size (float): Proportion of data to use for testing
    random_state (int): Random seed for reproducibility
    
    Returns:
    tuple: (X_train, X_test, masks_train, masks_test, y_train, y_test, preprocessor)
    """
    # Create preprocessor
    preprocessor = AlzheimersDataPreprocessor()
    
    # Split data
    train_data, test_data = train_test_split(
        data, 
        test_size=test_size, 
        random_state=random_state,
        stratify=data['Diagnosis']
    )
    
    # Preprocess training data
    X_train, masks_train, y_train = preprocessor.preprocess(train_data, is_training=True)
    
    # Preprocess test data using fitted preprocessor
    X_test, masks_test, y_test = preprocessor.preprocess(test_data, is_training=False)
    
    return (X_train, X_test, masks_train, masks_test, 
            y_train, y_test, preprocessor)

## Process the Data

In [7]:
# Prepare the data
X_train, X_test, masks_train, masks_test, y_train, y_test, preprocessor = prepare_data(data)

# Print information about the processed data
print("Processed Data Shapes:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"masks_train shape: {masks_train.shape}")
print(f"masks_test shape: {masks_test.shape}")

Processed Data Shapes:
X_train shape: (1719, 32)
X_test shape: (430, 32)
masks_train shape: (1719, 32)
masks_test shape: (430, 32)


## Examine Processed Data

In [8]:
# Display sample of processed numerical features
numerical_columns = [
    'Age', 'BMI', 'AlcoholConsumption', 'PhysicalActivity',
    'DietQuality', 'SleepQuality', 'SystolicBP', 'DiastolicBP',
    'CholesterolTotal', 'CholesterolLDL', 'CholesterolHDL',
    'CholesterolTriglycerides', 'MMSE', 'FunctionalAssessment', 'ADL'
]
print("Sample of scaled numerical features:")
print(X_train[numerical_columns].head())

Sample of scaled numerical features:
           Age       BMI  AlcoholConsumption  PhysicalActivity  DietQuality  \
780   1.230721  0.065466            1.478214          0.571099    -1.276449   
1341 -1.430841  0.070065           -0.407432         -1.050032     0.340303   
820   0.010838 -0.316941           -0.288334          1.628134    -1.119152   
857   1.563416 -0.567289           -1.418146         -1.257326     1.279730   
593   0.121737  1.533521           -0.190409          1.188813     0.982253   

      SleepQuality  SystolicBP  DiastolicBP  CholesterolTotal  CholesterolLDL  \
780       1.457443    0.195308    -0.234958          0.059480       -1.656832   
1341     -1.135747    1.622491     1.200748          1.498922        1.190810   
820       0.512803   -0.498997     1.545317          0.019782        0.414247   
857       1.255038    1.391056    -1.096381         -0.159480       -0.970426   
593      -0.632116   -0.614714    -0.522099         -1.634594       -0.220092   

 

In [9]:
# Display sample of categorical features (already encoded)
categorical_columns = ['Gender', 'Ethnicity', 'EducationLevel']
print("Sample of categorical features (already encoded):")
print(X_train[categorical_columns].head())

Sample of categorical features (already encoded):
      Gender  Ethnicity  EducationLevel
780        0          2               1
1341       1          0               3
820        0          0               0
857        0          0               0
593        1          0               2


In [10]:
# Check class distribution
print("\nClass distribution:")
print("Training set:")
print(pd.Series(y_train).value_counts(normalize=True))
print("\nTest set:")
print(pd.Series(y_test).value_counts(normalize=True))


Class distribution:
Training set:
Diagnosis
0    0.646306
1    0.353694
Name: proportion, dtype: float64

Test set:
Diagnosis
0    0.646512
1    0.353488
Name: proportion, dtype: float64


## Neural Network Implementation

In [12]:
import tensorflow as tf
from tensorflow.keras import layers, Model
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from tqdm.notebook import tqdm
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### Define the Neural Network Model

In [15]:
class MissingValueNetwork:
    def __init__(self, input_dim, hidden_dims=[64, 32]):
        self.input_dim = input_dim
        self.hidden_dims = hidden_dims
        self.model = self._build_model()

    def _build_model(self):

        # Main input for features

        main_input = layers.Input(shape=(self.input_dim,), name='main_input')


        # Mask input (1 for present, 0 for missing)

        mask_input = layers.Input(shape=(self.input_dim,), name='mask_input')


        # Branch 1: Process available values

        masked_input = layers.Multiply()([main_input, mask_input])


        x1 = masked_input

        for dim in self.hidden_dims:

            x1 = layers.Dense(dim, activation='relu')(x1)

            x1 = layers.BatchNormalization()(x1)

            x1 = layers.Dropout(0.3)(x1)


        # Branch 2: Process missing patterns

        x2 = mask_input

        for dim in self.hidden_dims:

            x2 = layers.Dense(dim//2, activation='relu')(x2)

            x2 = layers.BatchNormalization()(x2)

            x2 = layers.Dropout(0.3)(x2)


        # Combine both branches

        combined = layers.Concatenate()([x1, x2])


        # Final processing

        x = layers.Dense(32, activation='relu')(combined)

        x = layers.BatchNormalization()(x)

        x = layers.Dropout(0.3)(x)


        # Output layer

        output = layers.Dense(1, activation='sigmoid')(x)


        # Create model

        model = Model(inputs=[main_input, mask_input], outputs=output)


        # Compile model
        model.compile(

            optimizer='adam',

            loss='binary_crossentropy',

            metrics=['accuracy', tf.keras.metrics.AUC(name='auc')]
        )

        return model


    def train(self, X_train, masks_train, y_train, validation_data, 

              epochs=50, batch_size=32, class_weights=None):

        """

        Train the model
        

        Parameters:

        X_train: Training features

        masks_train: Training masks

        y_train: Training labels

        validation_data: Tuple of (X_val, masks_val, y_val)

        """

        # Prepare validation data

        X_val, masks_val, y_val = validation_data


        # Add early stopping

        early_stopping = tf.keras.callbacks.EarlyStopping(

            monitor='val_auc',

            patience=10,

            mode='max',

            restore_best_weights=True
        )


        # Add model checkpoint

        checkpoint = tf.keras.callbacks.ModelCheckpoint(
            "best_model.keras", monitor="val_auc", mode="max", save_best_only=True
        )


        # Create TQDM callback for progress bar

        class TqdmCallback(tf.keras.callbacks.Callback):
            def __init__(self, epochs):
                super().__init__()
                self.epochs = epochs
                self.progress_bar = None

            def on_train_begin(self, logs=None):
                self.progress_bar = tqdm(total=self.epochs, desc="Training")

            def on_epoch_end(self, epoch, logs=None):
                self.progress_bar.update(1)
                self.progress_bar.set_postfix(
                    {
                        "loss": f"{logs['loss']:.4f}",
                        "acc": f"{logs['accuracy']:.4f}",
                        "val_loss": f"{logs['val_loss']:.4f}",
                        "val_acc": f"{logs['val_accuracy']:.4f}",
                        'auc': f"{logs.get('auc', 0):.4f}"
                    }
                )

            def on_train_end(self, logs=None):
                self.progress_bar.close()


        # Train model with TQDM progress bar

        history = self.model.fit(

            [X_train, masks_train],
            y_train,

            validation_data=([X_val, masks_val], y_val),

            epochs=epochs,

            batch_size=batch_size,

            callbacks=[early_stopping, checkpoint, TqdmCallback(epochs)],

            class_weight=class_weights,

            verbose=0  # Set to 0 since we're using TQDM
        )


        return history


    def evaluate(self, X_test, masks_test, y_test):

        """

        Evaluate the model

        """

        # Get predictions

        y_pred_proba = self.model.predict([X_test, masks_test])

        y_pred = (y_pred_proba > 0.5).astype(int)


        # Print classification report

        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))


        # Create confusion matrix

        cm = confusion_matrix(y_test, y_pred)


        # Plot confusion matrix

        plt.figure(figsize=(8, 6))

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')

        plt.title('Confusion Matrix')

        plt.ylabel('True Label')

        plt.xlabel('Predicted Label')

        plt.show()


        return y_pred_proba, y_pred


    def plot_training_history(self, history):

        """

        Plot training history

        """

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))


        # Plot accuracy

        ax1.plot(history.history['accuracy'])

        ax1.plot(history.history['val_accuracy'])

        ax1.set_title('Model Accuracy')

        ax1.set_ylabel('Accuracy')

        ax1.set_xlabel('Epoch')

        ax1.legend(['Train', 'Validation'])


        # Plot loss

        ax2.plot(history.history['loss'])

        ax2.plot(history.history['val_loss'])

        ax2.set_title('Model Loss')

        ax2.set_ylabel('Loss')

        ax2.set_xlabel('Epoch')

        ax2.legend(['Train', 'Validation'])


        plt.tight_layout()

        plt.show()

In [19]:
# Print class distribution
print("Unique classes in training data:", np.unique(y_train))
print("\nClass distribution in training data:")
print(pd.Series(y_train).value_counts())

# Convert y_train and y_test to numpy arrays if they're pandas Series
y_train = np.array(y_train)
y_test = np.array(y_test)

# Calculate class weights
unique_classes = np.unique(y_train)
n_samples = len(y_train)
class_weights = dict(
    enumerate(n_samples / (len(unique_classes) * np.bincount(y_train)))
)

print("\nClass weights:", class_weights)

# Train the model
history = model.train(
    X_train,
    masks_train,
    y_train,
    validation_data=(X_test, masks_test, y_test),
    epochs=50,
    batch_size=32,
    class_weights=class_weights,
)

Unique classes in training data: [0 1]

Class distribution in training data:
Diagnosis
0    1111
1     608
Name: count, dtype: int64

Class weights: {0: np.float64(0.7736273627362736), 1: np.float64(1.4136513157894737)}




Training:   0%|          | 0/50 [03:38<?, ?it/s]
Training:   0%|          | 0/50 [01:15<?, ?it/s]
  current = self.get_monitor_value(logs)
  self._save_model(epoch=epoch, batch=None, logs=logs)


[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

Training: 

### Train the Model

In [18]:
# Initialize model with the correct input dimension
input_dim = X_train.shape[1]
model = MissingValueNetwork(input_dim=input_dim)

# Calculate class weights if dataset is imbalanced
class_weights = None
if len(set(y_train)) > 1:
    unique_classes = np.unique(y_train)
    n_samples = len(y_train)
    class_weights = {
        cls: n_samples / (len(unique_classes) * sum(y_train == cls))
        for cls in unique_classes
    }

# Train the model
history = model.train(
    X_train,
    masks_train,
    y_train,
    validation_data=(X_test, masks_test, y_test),
    epochs=50,
    batch_size=32,
    class_weights=class_weights,
)


[A

KeyError: 1

### Evaluate the Model

In [None]:
# Plot training history
model.plot_training_history(history)

# Evaluate model
y_pred_proba, y_pred = model.evaluate(X_test, masks_test, y_test)

# %% [markdown]
# ### Make Predictions for New Data

# %%
def predict_new_cases(model, preprocessor, new_data):
    """
    Make predictions for new cases
    
    Parameters:
    model: Trained MissingValueNetwork
    preprocessor: Trained AlzheimersDataPreprocessor
    new_data: DataFrame with new cases
    """
    # Preprocess new data
    X_new, masks_new = preprocessor.preprocess(new_data, is_training=False)
    
    # Make predictions
    predictions_proba = model.model.predict([X_new, masks_new])
    predictions = (predictions_proba > 0.5).astype(int)
    
    # Create results DataFrame
    results = pd.DataFrame({
        'Predicted_Probability': predictions_proba.flatten(),
        'Predicted_Class': predictions.flatten()
    })
    
    return results