In [1]:
# Step 1: Data Loading
# --------------------

import pandas as pd
import json

def load_data():
    """Load all relevant datasets."""
    discharge_diagnosis = pd.read_csv('../data/raw/discharge_diagnosis.csv')
    discharge_procedures = pd.read_csv('../data/raw/discharge_procedures.csv')
    history_of_present_illness = pd.read_csv('../data/raw/history_of_present_illness.csv')
    icd_diagnosis = pd.read_csv('../data/raw/icd_diagnosis.csv')
    icd_procedures = pd.read_csv('../data/raw/icd_procedures.csv')
    lab_test_mapping = pd.read_csv('../data/raw/lab_test_mapping.csv')
    laboratory_tests = pd.read_csv('../data/raw/laboratory_tests.csv')
    microbiology = pd.read_csv('../data/raw/microbiology.csv')
    physical_examination = pd.read_csv('../data/raw/physical_examination.csv')
    radiology_reports = pd.read_csv('../data/raw/radiology_reports.csv')

    with open('../data/raw/pathology_ids.json') as f:
        pathology_ids = json.load(f)

    return {
        "discharge_diagnosis": discharge_diagnosis,
        "discharge_procedures": discharge_procedures,
        "hpi": history_of_present_illness,
        "icd_diagnosis": icd_diagnosis,
        "icd_procedures": icd_procedures,
        "lab_test_mapping": lab_test_mapping,
        "lab_tests": laboratory_tests,
        "microbiology": microbiology,
        "physical_examination": physical_examination,
        "radiology_reports": radiology_reports,
        "pathology_ids": pathology_ids
}

# Load data and check
data = load_data()
for key, df in data.items():
    if isinstance(df, pd.DataFrame):
        print(f"{key}: {df.shape}")
    else:
        print(f"{key}: Loaded JSON data")



discharge_diagnosis: (2400, 2)
discharge_procedures: (2122, 2)
hpi: (2400, 2)
icd_diagnosis: (17357, 2)
icd_procedures: (2917, 4)
lab_test_mapping: (1209, 6)
lab_tests: (138788, 5)
microbiology: (4403, 4)
physical_examination: (2400, 2)
radiology_reports: (5960, 6)
pathology_ids: Loaded JSON data


In [2]:
# Step 2: Data Preprocessing
# --------------------------

def merge_data(data):
    """
    Merge all clinical data into a single DataFrame based on 'hadm_id'.
    """
    merged_data = data['discharge_diagnosis'].merge(data['hpi'], on='hadm_id', how='inner') \
                                             .merge(data['physical_examination'], on='hadm_id', how='inner') \
                                             .merge(data['lab_tests'], on='hadm_id', how='inner') \
                                             .merge(data['microbiology'], on='hadm_id', how='inner') \
                                             .merge(data['radiology_reports'], on='hadm_id', how='inner')
    return merged_data

def filter_by_pathologies(data, pathology_ids):
    """
    Filter data based on specific pathology IDs.
    """
    pathology_ids_list = [item for sublist in pathology_ids.values() for item in sublist]
    return data[data['hadm_id'].isin(pathology_ids_list)]

def handle_missing_values(data):
    """
    Handle missing data by filling forward.
    """
    return data.fillna(method='ffill')

def preprocess_data(data, pathology_ids):
    """
    Preprocess and combine data steps.
    """
    combined_data = merge_data(data)
    filtered_data = filter_by_pathologies(combined_data, pathology_ids)
    cleaned_data = handle_missing_values(filtered_data)
    return cleaned_data

# Preprocess data
processed_data = preprocess_data(data, data['pathology_ids'])
print(f"Processed Data Shape: {processed_data.shape}")
processed_data.head()


  return data.fillna(method='ffill')


Processed Data Shape: (1941697, 16)


Unnamed: 0,hadm_id,discharge_diagnosis,hpi,pe,itemid,valuestr_x,ref_range_lower,ref_range_upper,test_itemid,valuestr_y,spec_itemid,note_id,modality,region,exam_name,text
0,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51085,"NEGATIVE. FOR QUANTITATION OF POSITIVES, SEND...",,,90039,"<10,000 organisms/ml.",70079,10040626-RR-9,CT,Abdomen,CT ABD & PELVIS WITH CONTRAST,EXAMINATION:\nCT ABD AND PELVIS WITH CONTRAST:...
1,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51085,"NEGATIVE. FOR QUANTITATION OF POSITIVES, SEND...",,,90201,NO GROWTH.,70012,10040626-RR-9,CT,Abdomen,CT ABD & PELVIS WITH CONTRAST,EXAMINATION:\nCT ABD AND PELVIS WITH CONTRAST:...
2,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51514,NEG.,0.2,1.0,90039,"<10,000 organisms/ml.",70079,10040626-RR-9,CT,Abdomen,CT ABD & PELVIS WITH CONTRAST,EXAMINATION:\nCT ABD AND PELVIS WITH CONTRAST:...
3,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51514,NEG.,0.2,1.0,90201,NO GROWTH.,70012,10040626-RR-9,CT,Abdomen,CT ABD & PELVIS WITH CONTRAST,EXAMINATION:\nCT ABD AND PELVIS WITH CONTRAST:...
4,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51508,Straw.,0.2,1.0,90039,"<10,000 organisms/ml.",70079,10040626-RR-9,CT,Abdomen,CT ABD & PELVIS WITH CONTRAST,EXAMINATION:\nCT ABD AND PELVIS WITH CONTRAST:...


In [3]:
# Step 3: Feature Engineering
# ---------------------------

def create_text_features(data):
    """
    Create features based on text length of 'hpi'.
    """
    if 'hpi' in data.columns:
        data['hpi_length'] = data['hpi'].apply(lambda x: len(str(x).split()))
    return data

def encode_categorical_features(data):
    """
    One-hot encode radiology modality and ICD codes.
    """
    if 'modality' in data.columns:
        data = pd.get_dummies(data, columns=['modality'], prefix='modality')
    if 'icd_code' in data.columns:
        data = pd.get_dummies(data, columns=['icd_code'], prefix='icd')
    return data

def feature_engineering(data):
    """
    Apply feature engineering transformations.
    """
    data = create_text_features(data)
    data = encode_categorical_features(data)
    return data

# Apply feature engineering
feature_data = feature_engineering(processed_data)
feature_data.to_csv('../data/processed/feature_matrix.csv', index=False)
print(f"Feature Data Shape: {feature_data.shape}")
feature_data.head()


Feature Data Shape: (1941697, 28)


Unnamed: 0,hadm_id,discharge_diagnosis,hpi,pe,itemid,valuestr_x,ref_range_lower,ref_range_upper,test_itemid,valuestr_y,...,modality_Carotid ultrasound,modality_Drainage,modality_ERCP,modality_Fluoroscopy,modality_MRCP,modality_MRE,modality_MRI,modality_Radiograph,modality_Ultrasound,modality_Upper GI Series
0,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51085,"NEGATIVE. FOR QUANTITATION OF POSITIVES, SEND...",,,90039,"<10,000 organisms/ml.",...,False,False,False,False,False,False,False,False,False,False
1,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51085,"NEGATIVE. FOR QUANTITATION OF POSITIVES, SEND...",,,90201,NO GROWTH.,...,False,False,False,False,False,False,False,False,False,False
2,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51514,NEG.,0.2,1.0,90039,"<10,000 organisms/ml.",...,False,False,False,False,False,False,False,False,False,False
3,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51514,NEG.,0.2,1.0,90201,NO GROWTH.,...,False,False,False,False,False,False,False,False,False,False
4,20890008,acute appendicitis,___ with no significant PMH presenting with ac...,Temp: 97.6 HR: 46 BP: 106/65 RR: 18 100% Ra Ge...,51508,Straw.,0.2,1.0,90039,"<10,000 organisms/ml.",...,False,False,False,False,False,False,False,False,False,False


In [4]:
import time
import tensorflow as tf

def measure_time_cpu(matrix_a, matrix_b):
    start_time = time.time()
    result = tf.matmul(matrix_a, matrix_b)
    tf.experimental.numpy.copy(result)  # Ensure completion of the computation
    return time.time() - start_time

def measure_time_gpu(matrix_a, matrix_b):
    # Warm-up runs
    for _ in range(5):
        _ = tf.matmul(matrix_a, matrix_b)

    start_time = time.time()
    result = tf.matmul(matrix_a, matrix_b)
    tf.experimental.numpy.copy(result)  # Ensure completion of the computation
    return time.time() - start_time

# Generate random matrices
matrix_a = tf.random.normal(shape=(10000, 10000))
matrix_b = tf.random.normal(shape=(10000, 10000))

# Measure CPU time
cpu_time = measure_time_cpu(matrix_a, matrix_b)
print("Average time on CPU:", cpu_time)

# Check for GPU availability
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Ensure matrices are on the GPU
    matrix_a_gpu = tf.constant(matrix_a)
    matrix_b_gpu = tf.constant(matrix_b)

    # Measure GPU time
    gpu_time = measure_time_gpu(matrix_a_gpu, matrix_b_gpu)
    print("Average time on GPU:", gpu_time)

    # Calculate and print the performance difference
    performance_improvement = ((cpu_time - gpu_time) / cpu_time) * 100
    print(f"GPU is {performance_improvement:.2f}% faster than CPU.")
else:
    print("GPU not available.")


2024-11-14 01:49:56.475793: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-14 01:49:56.481813: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731538196.488063 3213788 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731538196.489895 3213788 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-14 01:49:56.496712: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

Average time on CPU: 0.016513824462890625
Average time on GPU: 5.1975250244140625e-05
GPU is 99.69% faster than CPU.


I0000 00:00:1731538197.251946 3213788 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 7457 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1070 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1


In [17]:
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load a sample of the data to reduce memory usage
print("Loading data...")
df = pd.read_csv('../data/processed/feature_matrix.csv').sample(frac=0.1, random_state=42)  # Load 10% of the data

# Step 2: Reduce data precision to save memory
print("Reducing data precision...")
for col in df.select_dtypes(include=['float64']).columns:
    df[col] = df[col].astype('float32')

# Step 3: Separate features and target
X = df.drop(columns=['discharge_diagnosis'])  # Replace 'discharge_diagnosis' with the actual target column name
y = df['discharge_diagnosis']

# Identify and drop non-numeric columns from X
non_numeric_columns = X.select_dtypes(include=['object']).columns
print(f"Dropping non-numeric columns: {non_numeric_columns}")
X = X.select_dtypes(exclude=['object'])

# Step 4: Feature selection to reduce dimensionality (remove low-variance features)
print("Applying feature selection...")
selector = VarianceThreshold(threshold=0.01)  # Remove features with low variance
X = selector.fit_transform(X)

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Standardize features
print("Standardizing features...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

Loading data...
Reducing data precision...
Dropping non-numeric columns: Index(['hpi', 'pe', 'valuestr_x', 'valuestr_y', 'note_id', 'region',
       'exam_name', 'text'],
      dtype='object')
Applying feature selection...
Standardizing features...


In [18]:
# Step 7: Encode target labels if they are strings
if y_train.dtype == 'object':
    print("Encoding target labels...")
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)


Encoding target labels...


In [27]:
import warnings
# Suppress warnings (optional)
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")

# Ensure no NaN or infinite values are present
X_train = np.nan_to_num(X_train)
y_train = np.nan_to_num(y_train)

# Create an XGBoost classifier with optimized parameters
model = xgb.XGBClassifier(
    tree_method='hist',  # Use histogram-based method
    gpu_id=0,  # Specify CUDA device
    max_bin=64,  # Further reduce bin count
    subsample=0.8,  # Subsample data for each tree
    colsample_bytree=0.8,  # Use a fraction of features for each tree
    n_estimators=30,  # Reduce number of boosting rounds
    max_depth=4  # Lower tree depth to reduce memory usage
)


In [33]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

# Convert data to PyTorch tensors
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)

# Ensure y_train and y_test are integers
if y_train.dtype == 'object' or y_test.dtype == 'object':
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

y_train = torch.tensor(y_train, dtype=torch.long).to(device)
y_test = torch.tensor(y_test, dtype=torch.long).to(device)

# Improved neural network with additional layers, dropout, and batch normalization
class ImprovedNN(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.3)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.3)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, len(torch.unique(y_train)))  # Adjust output size based on number of classes

    def forward(self, x):
        x = torch.relu(self.bn1(self.layer1(x)))
        x = self.dropout1(x)
        x = torch.relu(self.bn2(self.layer2(x)))
        x = self.dropout2(x)
        x = torch.relu(self.layer3(x))
        x = self.output(x)
        return x

# Initialize the model
model = ImprovedNN(X_train.shape[1]).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    predictions = model(X_test).argmax(dim=1)
    accuracy = (predictions == y_test).float().mean().item()
    print(f'Accuracy: {accuracy:.4f}')


  X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
  X_test = torch.tensor(X_test, dtype=torch.float32).to(device)
  y_train = torch.tensor(y_train, dtype=torch.long).to(device)
  y_test = torch.tensor(y_test, dtype=torch.long).to(device)


OutOfMemoryError: CUDA out of memory. Tried to allocate 76.00 MiB. GPU 0 has a total capacity of 7.92 GiB of which 47.25 MiB is free. Including non-PyTorch memory, this process has 7.82 GiB memory in use. Of the allocated memory 415.29 MiB is allocated by PyTorch, and 6.71 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [29]:
best_loss = float('inf')
patience = 10
trigger_times = 0

for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_outputs = model(X_test)
        val_loss = criterion(val_outputs, y_test)

    if val_loss < best_loss:
        best_loss = val_loss
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print('Early stopping!')
            break


NameError: name 'num_epochs' is not defined

In [28]:
# Train the model
model.fit(X_train, y_train)

XGBoostError: [02:03:37] /workspace/src/c_api/../common/device_helpers.cuh:393: Memory allocation error on worker 0: std::bad_alloc: cudaErrorMemoryAllocation: out of memory
- Free memory: 500432896
- Requested memory: 555481536

Stack trace:
  [bt] (0) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x22dcbc) [0x78c2ea22dcbc]
  [bt] (1) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x8a9966) [0x78c2ea8a9966]
  [bt] (2) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0xec59a) [0x78c2ea0ec59a]
  [bt] (3) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x927a4d) [0x78c2ea927a4d]
  [bt] (4) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x929058) [0x78c2ea929058]
  [bt] (5) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x737fb5) [0x78c2ea737fb5]
  [bt] (6) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x57bcd1) [0x78c2ea57bcd1]
  [bt] (7) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(+0x5cb03e) [0x78c2ea5cb03e]
  [bt] (8) /home/zinger/miniconda3/lib/python3.12/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x6f) [0x78c2ea13752f]



In [None]:
# Step 9: Make predictions and evaluate
print("Evaluating the model...")
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Step 4: Model Training
# ----------------------

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# def train_random_forest(X, y):
#     """
#     Train a RandomForest model.
#     """
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#     model = RandomForestClassifier(n_estimators=100, random_state=42)
#     model.fit(X_train, y_train)
#     y_pred = model.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
#     report = classification_report(y_test, y_pred)
#     return model, accuracy, report

# Load feature data
feature_data = pd.read_csv('../data/processed/feature_matrix.csv')
X = feature_data.drop(columns=['discharge_diagnosis', 'hadm_id'])  # Replace 'outcome' with actual target variable
y = feature_data['discharge_diagnosis']  # Replace 'outcome' with actual target variable

# Identify and drop non-numeric columns from X
non_numeric_columns = X.select_dtypes(include=['object']).columns
print(f"Dropping non-numeric columns: {non_numeric_columns}")
X = X.select_dtypes(exclude=['object'])

In [None]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load and split your data (assuming X and y are already defined)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode string labels to integers
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
import torch
torch.cuda.empty_cache()

import gc

# Delete variables you no longer need
del X_train, X_test, y_train, y_test  # Adjust according to your variable names
gc.collect()  # Force garbage collection

In [None]:


# Create an XGBoost classifier with GPU support and optimized parameters
model = xgb.XGBClassifier(
    tree_method='gpu_hist',  # Use GPU-accelerated training
    use_label_encoder=False,
    max_bin=256,  # Reduce bins to save memory
    subsample=0.8,  # Subsample 80% of the data
    colsample_bytree=0.8,  # Use 80% of features for each tree
    n_estimators=50,  # Reduce the number of trees
    max_depth=6  # Limit tree depth
)

# Train the model
model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = model.predict(X_test)

# Transform predictions back to original string labels (optional, for interpretation)
y_pred_labels = le.inverse_transform(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred_labels))

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import gc

# Load and split your data (ensure it's reduced or optimized)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reduce memory usage
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Remove constant columns if applicable
constant_columns = [col for col in X_train.columns if X_train[col].nunique() <= 1]
if constant_columns:
    print(f"Removing constant columns: {constant_columns}")
    X_train = X_train.drop(columns=constant_columns)
    X_test = X_test.drop(columns=constant_columns)

# Clear memory
gc.collect()

# Train LightGBM model with parameters to reduce memory consumption
model = lgb.LGBMClassifier(device='gpu', max_bin=256, n_estimators=100, max_depth=6)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert data to PyTorch tensors
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)

# Ensure y_train and y_test are integers
if y_train.dtype == 'object' or y_test.dtype == 'object':
    le = LabelEncoder()
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

y_train = torch.tensor(y_train, dtype=torch.long).to(device)
y_test = torch.tensor(y_test, dtype=torch.long).to(device)

# Improved neural network with additional layers, dropout, and batch normalization
class ImprovedNN(nn.Module):
    def __init__(self, input_dim):
        super(ImprovedNN, self).__init__()
        self.layer1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.3)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.3)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, len(torch.unique(y_train)))  # Adjust output size based on number of classes

    def forward(self, x):
        x = torch.relu(self.bn1(self.layer1(x)))
        x = self.dropout1(x)
        x = torch.relu(self.bn2(self.layer2(x)))
        x = self.dropout2(x)
        x = torch.relu(self.layer3(x))
        x = self.output(x)
        return x

# Initialize the model
model = ImprovedNN(X_train.shape[1]).to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation
model.eval()
with torch.no_grad():
    predictions = model(X_test).argmax(dim=1)
    accuracy = (predictions == y_test).float().mean().item()
    print(f'Accuracy: {accuracy:.4f}')


In [None]:
print(f"Model Accuracy: {accuracy:.4f}")

# Save the model
joblib.dump(model, '../models/random_forest_model.pkl')

In [None]:
# Train the model again
model, accuracy, report = train_random_forest(X, y)


# Save the model
joblib.dump(model, '../models/random_forest_model.pkl')

In [None]:
# Step 5: SHAP Analysis
# ---------------------

import shap
import matplotlib.pyplot as plt

def run_shap_analysis(model, X):
    """
    Compute SHAP values for feature importance analysis.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)
    return explainer, shap_values

# Load model and data
model = joblib.load('../models/random_forest_model.pkl')
X = pd.read_csv('../data/processed/feature_matrix.csv').drop(columns=['discharge_diagnosis', 'hadm_id'])  # Adjust as needed

explainer, shap_values = run_shap_analysis(model, X)

# Visualize SHAP values
plt.title("SHAP Summary Plot")
shap.summary_plot(shap_values[1], X)
