In [1]:
!pip install -q imbalanced-learn==0.12.4;

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.3/258.3 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# === Standard library ===
import os
import warnings

# === Data manipulation ===
import pandas as pd
import numpy as np
from collections import Counter

# === Visualization ===
import matplotlib.pyplot as plt
import seaborn as sns

# === Machine Learning / Preprocessing ===
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.impute import KNNImputer

# === Imbalanced learning ===
from imblearn.over_sampling import BorderlineSMOTE

# === Statistics ===
from scipy.stats import uniform, randint

# === Deep Learning (PyTorch) ===
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
import gc

# === Explainability ===
import shap

# === Notebook utilities ===
from IPython.display import FileLink, display, HTML

# === Joblib ===
import joblib

# === Settings ===
warnings.filterwarnings("ignore")
np.random.seed(123)
torch.manual_seed(123)     

RANDOM_STATE = 42
TARGET_COL = "label"
DROP_COLS  = ["patient_id", "date"]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EXPERIMENT_NAME = "binary_classification_blood_pressure_perceptron"     

torch.__version__, device

('2.6.0+cu124', device(type='cuda'))

In [3]:
train = pd.read_csv('../input/dementia-dataset-activity-and-physiology/train_data.csv')
val = pd.read_csv('../input/dementia-dataset-activity-and-physiology/val_data.csv')
test = pd.read_csv('../input/dementia-dataset-activity-and-physiology/test_data.csv')

display(train.shape, val.shape, test.shape)
display(train.isna().sum().sum(), val.isna().sum().sum(), test.isna().sum().sum())
display(train.head(10))

(2005, 39)

(336, 39)

(455, 39)

0

0

0

Unnamed: 0,patient_id,date,Bathroom_count_sum,Bedroom_count_sum,Fridge Door_count_sum,Hallway_count_sum,Kitchen_count_sum,Lounge_count_sum,Door_count_sum,Bathroom_count_std,...,Door_count_max,Body Temperature,Body weight,Diastolic blood pressure,Heart rate,O/E - muscle mass,Skin Temperature,Systolic blood pressure,Total body water,label
0,0697d,2019-06-28,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.666667,0.0,0.998452,0.0,0.7,0.988281,1
1,0697d,2019-06-29,0.571429,0.068966,0.347826,1.0,1.0,1.0,0.433333,0.022035,...,0.1,1.0,0.997683,0.0,1.0,1.0,0.0,1.0,1.0,1
2,0697d,2019-06-30,1.0,1.0,0.0,1.0,0.928571,0.621622,0.0,1.0,...,0.0,0.273556,0.0,1.0,0.65,0.0,0.0,0.0,0.0,0
3,0d5ef,2019-05-13,1.0,0.592593,0.493151,0.791667,0.613636,0.784091,0.638554,0.905569,...,1.0,0.989296,0.972468,0.621429,0.661017,0.953191,0.0,0.721591,0.979839,0
4,0d5ef,2019-05-14,0.416667,0.62963,0.643836,0.694444,0.651515,0.625,0.457831,0.357417,...,0.36,0.983286,0.970501,0.657143,0.635593,0.951773,0.0,0.875,0.97379,0
5,0d5ef,2019-05-15,0.5,0.703704,0.821918,0.736111,0.719697,0.676136,0.433735,0.608998,...,0.4,0.980845,0.981318,1.0,0.652542,0.957447,0.0,0.943182,0.975806,1
6,0d5ef,2019-05-16,0.541667,0.759259,0.684932,0.819444,0.75,1.0,0.481928,0.447111,...,0.4,0.978618,1.0,0.564286,0.762712,1.0,0.0,0.704545,0.991935,0
7,0d5ef,2019-05-17,0.5,0.462963,0.520548,0.5625,0.621212,0.630682,0.120482,0.332106,...,0.16,0.978779,0.0,0.628571,0.686441,0.0,0.0,0.857955,0.0,0
8,0d5ef,2019-05-18,0.5,0.648148,0.547945,0.736111,0.666667,0.784091,0.53012,0.468661,...,0.48,0.975077,0.981318,0.614286,0.677966,0.957447,0.0,0.892045,0.975806,0
9,0d5ef,2019-05-19,0.958333,0.666667,0.767123,0.680556,0.742424,0.875,0.216867,0.724653,...,0.16,0.986023,0.0,0.6,0.627119,0.0,0.0,0.806818,0.0,0


In [4]:
X_train = train.drop(columns=[TARGET_COL] + [c for c in DROP_COLS if c in train.columns])
y_train = train[TARGET_COL].copy()
X_val   = val.drop(columns=[TARGET_COL] + [c for c in DROP_COLS if c in val.columns])
y_val   = val[TARGET_COL].copy()
X_test  = test.drop(columns=[TARGET_COL] + [c for c in DROP_COLS if c in test.columns])
y_test  = test[TARGET_COL].copy()

In [5]:
class DEMENTIAANNClassifier(nn.Module):
    def __init__(self, input_dim, neurons, dropout=0.10):
        super(DEMENTIAANNClassifier, self).__init__()

        self.layer1 = nn.Linear(input_dim, neurons[0])
        self.layer2 = nn.Linear(neurons[0], neurons[1])
        self.layer3 = nn.Linear(neurons[1], neurons[2])
        self.layer4 = nn.Linear(neurons[2], neurons[3])
        self.output_layer = nn.Linear(neurons[3], 1)

        self.bn1 = nn.BatchNorm1d(neurons[0])
        self.bn2 = nn.BatchNorm1d(neurons[1])
        self.bn3 = nn.BatchNorm1d(neurons[2])
        self.bn4 = nn.BatchNorm1d(neurons[3])

        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        x = self.gelu(self.bn1(self.layer1(x))) 
        x = self.dropout(x)

        x = self.gelu(self.bn2(self.layer2(x)))
        x = self.dropout(x)

        x = self.gelu(self.bn3(self.layer3(x)))
        x = self.dropout(x)

        x = self.gelu(self.bn4(self.layer4(x)))
        x = self.output_layer(x)

        return x.squeeze(1)  # shape (batch,)

    def fit(self, train_loader, val_loader, epochs=200, lr=1e-3, weight_decay=1e-4, device='cpu', patience=20):
        self.to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, verbose=True)

        best_val_loss = float('inf')
        patience_counter = 0
        
        history = {
            'train_loss': [], 'train_acc': [],
            'val_loss': [], 'val_acc': [], 'val_auc': []
        }

        for epoch in range(epochs):
            # ========= TRAIN =========
            self.train()
            running_loss = 0.0
            correct, total = 0, 0

            for inputs, targets in train_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                optimizer.zero_grad()
                outputs = self(inputs)

                outputs = outputs.view(-1).float()
                targets = targets.view(-1).float() 

                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                preds = (torch.sigmoid(outputs) > 0.5).long()
                correct += (preds == targets.long()).sum().item()
                total += targets.size(0)
                running_loss += loss.item() * inputs.size(0)

            train_loss = running_loss / len(train_loader.dataset)
            train_acc = correct / total
            history['train_loss'].append(train_loss)
            history['train_acc'].append(train_acc)


            # ========= VALIDATION =========
            self.eval()
            val_loss = 0.0
            correct_val, total_val = 0, 0
            all_probas, all_true = [], []

            with torch.no_grad():
                for inputs, targets in val_loader:
                    inputs, targets = inputs.to(device), targets.to(device)

                    outputs = self(inputs)
                    outputs = outputs.view(-1).float()
                    targets = targets.view(-1).float()

                    loss = criterion(outputs, targets)
                    val_loss += loss.item() * inputs.size(0)

                    proba = torch.sigmoid(outputs)
                    pred = (proba > 0.5).long()

                    correct_val += (pred == targets.long()).sum().item()
                    total_val += targets.size(0)

                    all_probas.append(proba.cpu())
                    all_true.append(targets.cpu())

            val_loss /= len(val_loader.dataset)
            val_acc = correct_val / total_val

            y_true = torch.cat(all_true).numpy()
            y_proba = torch.cat(all_probas).numpy()

            try:
                val_auc = roc_auc_score(y_true, y_proba)
            except ValueError:
                val_auc = float('nan')

            history['val_loss'].append(val_loss)
            history['val_acc'].append(val_acc)
            history['val_auc'].append(val_auc)

            # log
            print(
                f"Epoch [{epoch+1}/{epochs}] | "
                f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | "
                f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Val AUC: {val_auc:.4f}"
            )

            # Early stopping based on validation loss
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                patience_counter = 0
                best_state = self.state_dict()  # save best model
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

            scheduler.step(val_loss)

        # restore best model
        self.load_state_dict(best_state)

        return history



    def test(self, test_loader, device='cpu'):
        self.to(device)
        self.eval()
        
        all_preds, all_probas, all_targets = [], [], []
        correct, total = 0, 0
        test_loss = 0.0
        criterion = nn.BCEWithLogitsLoss()
        history = {'test_loss': [], 'test_acc': [], 'test_auc': []}

        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs, targets = inputs.to(device), targets.to(device)

                outputs = self(inputs)
                
                outputs = outputs.view(-1).float()
                targets = targets.view(-1).float() 
                
                loss = criterion(outputs, targets)
                
                proba = torch.sigmoid(outputs)
                pred = (proba > 0.5).long()

                all_preds.append(pred.cpu())
                all_probas.append(proba.cpu())
                all_targets.append(targets.cpu())
                
                correct += (pred == targets.long()).sum().item()
                total += targets.size(0)
                test_loss += loss.item() * inputs.size(0)
        
        test_acc = correct / total
        test_loss /= len(test_loader.dataset)
        history['test_loss'].append(test_loss)
        history['test_acc'].append(test_acc)
        
        y_true = torch.cat(all_targets).numpy().astype(int)
        y_pred = torch.cat(all_preds).numpy().astype(int)
        y_proba = torch.cat(all_probas).numpy().astype(float)

        try:
            auc = roc_auc_score(y_true, y_proba)
        except ValueError:
            auc = float('nan')

        report = classification_report(y_true, y_pred, output_dict=False)
        history['test_auc'].append(auc)

        print(
            f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}, Test AUC: {auc:.4f}"
            )
        
        return test_acc, auc, report, history

In [6]:
borderSmote = BorderlineSMOTE(random_state=RANDOM_STATE, kind='borderline-2')
X_train, y_train = borderSmote.fit_resample(X_train, y_train)

In [7]:
Counter(y_train), Counter(y_val), Counter(y_test)

(Counter({1: 1844, 0: 1844}),
 Counter({0: 316, 1: 20}),
 Counter({0: 385, 1: 70}))

In [8]:
X_train_array = X_train.to_numpy()
X_val_array = X_val.to_numpy()
X_test_array = X_test.to_numpy()

y_train_array = y_train.to_numpy()
y_val_array = y_val.to_numpy()
y_test_array = y_test.to_numpy()

In [9]:
X_train_tensor = torch.tensor(X_train_array, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_array, dtype=torch.long).to(device)

X_val_tensor = torch.tensor(X_val_array, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val_array, dtype=torch.long).to(device)

X_test_tensor = torch.tensor(X_test_array, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test_array, dtype=torch.long).to(device)

In [10]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32) 
test_loader = DataLoader(test_dataset, batch_size=32)

In [11]:
X_train.shape[1]

36

In [12]:
model = DEMENTIAANNClassifier(
    input_dim=X_train.shape[1],
    neurons=[128, 64, 32, 16],
    dropout=0.30
).to(device)

In [13]:
fit_history = model.fit(
                train_loader=train_loader,
                val_loader=val_loader,
                epochs=200,
                lr=1e-3,
                weight_decay=1e-5,
                device=device,
                patience=20
            )

Epoch [1/200] | Train Loss: 0.5384, Train Acc: 0.7324 | Val Loss: 0.5359, Val Acc: 0.6131, Val AUC: 0.7623
Epoch [2/200] | Train Loss: 0.4291, Train Acc: 0.8040 | Val Loss: 0.4933, Val Acc: 0.6815, Val AUC: 0.7752
Epoch [3/200] | Train Loss: 0.3835, Train Acc: 0.8219 | Val Loss: 0.4794, Val Acc: 0.7083, Val AUC: 0.7597
Epoch [4/200] | Train Loss: 0.3486, Train Acc: 0.8501 | Val Loss: 0.4854, Val Acc: 0.7232, Val AUC: 0.7413
Epoch [5/200] | Train Loss: 0.3357, Train Acc: 0.8568 | Val Loss: 0.5053, Val Acc: 0.7202, Val AUC: 0.7511
Epoch [6/200] | Train Loss: 0.3173, Train Acc: 0.8658 | Val Loss: 0.3922, Val Acc: 0.7738, Val AUC: 0.7638
Epoch [7/200] | Train Loss: 0.3166, Train Acc: 0.8685 | Val Loss: 0.4797, Val Acc: 0.7470, Val AUC: 0.7883
Epoch [8/200] | Train Loss: 0.2902, Train Acc: 0.8804 | Val Loss: 0.3862, Val Acc: 0.7798, Val AUC: 0.7729
Epoch [9/200] | Train Loss: 0.2744, Train Acc: 0.8848 | Val Loss: 0.4815, Val Acc: 0.7560, Val AUC: 0.8051
Epoch [10/200] | Train Loss: 0.2730, 

In [14]:
acc, auc, report, test_history = model.test(
                test_loader=test_loader,
                device=device
            )

Test Loss: 0.8696, Test Accuracy: 0.7758, Test AUC: 0.5407


In [15]:
print(report)

              precision    recall  f1-score   support

           0       0.85      0.89      0.87       385
           1       0.20      0.16      0.18        70

    accuracy                           0.78       455
   macro avg       0.53      0.52      0.52       455
weighted avg       0.75      0.78      0.76       455

