In [None]:
# Step 1: Data Loading
# --------------------

import pandas as pd
import json

def load_data():
    """Read every raw input file and return them in one dictionary.

    All paths are relative to ``../data/raw/``. The ten CSV tables are parsed
    with ``pd.read_csv``; ``pathology_ids.json`` is parsed with ``json.load``.

    Returns:
        dict: DataFrames under the keys ``discharge_diagnosis``,
        ``discharge_procedures``, ``hpi``, ``icd_diagnosis``,
        ``icd_procedures``, ``lab_test_mapping``, ``lab_tests``,
        ``microbiology``, ``physical_examination`` and ``radiology_reports``,
        followed by the parsed JSON object under ``pathology_ids``.
    """
    raw_dir = '../data/raw/'

    # (result key, CSV file stem) — two keys are shortened relative to
    # their file names ('hpi', 'lab_tests').
    csv_sources = (
        ("discharge_diagnosis", "discharge_diagnosis"),
        ("discharge_procedures", "discharge_procedures"),
        ("hpi", "history_of_present_illness"),
        ("icd_diagnosis", "icd_diagnosis"),
        ("icd_procedures", "icd_procedures"),
        ("lab_test_mapping", "lab_test_mapping"),
        ("lab_tests", "laboratory_tests"),
        ("microbiology", "microbiology"),
        ("physical_examination", "physical_examination"),
        ("radiology_reports", "radiology_reports"),
    )

    loaded = {}
    for result_key, file_stem in csv_sources:
        loaded[result_key] = pd.read_csv(f'{raw_dir}{file_stem}.csv')

    with open(f'{raw_dir}pathology_ids.json') as handle:
        loaded["pathology_ids"] = json.load(handle)

    return loaded

# Load everything once, then print a one-line summary per entry:
# the (rows, columns) shape for DataFrames, a fixed note for the JSON payload.
data = load_data()
for key, df in data.items():
    summary = df.shape if isinstance(df, pd.DataFrame) else "Loaded JSON data"
    print(f"{key}: {summary}")



In [None]:
# Step 2: Data Preprocessing
# --------------------------

def merge_data(data):
    """Inner-join the per-admission clinical tables on ``hadm_id``.

    Starts from ``data['discharge_diagnosis']`` and joins, in this order,
    ``hpi``, ``physical_examination``, ``lab_tests``, ``microbiology`` and
    ``radiology_reports``. Because every join is ``how='inner'``, only
    admissions present in all six tables survive. Column names shared by two
    tables receive pandas' default ``_x`` / ``_y`` suffixes.

    Args:
        data: mapping of table name to DataFrame, as returned by ``load_data``.

    Returns:
        pandas.DataFrame: the joined frame.
    """
    join_order = (
        'hpi',
        'physical_examination',
        'lab_tests',
        'microbiology',
        'radiology_reports',
    )
    combined = data['discharge_diagnosis']
    for table_key in join_order:
        combined = combined.merge(data[table_key], on='hadm_id', how='inner')
    return combined

def filter_by_pathologies(data, pathology_ids):
    """Keep only the rows whose ``hadm_id`` is listed for some pathology.

    Args:
        data: DataFrame with a ``hadm_id`` column.
        pathology_ids: mapping whose values are iterables of admission IDs;
            the keys are not used, all values are pooled together.

    Returns:
        pandas.DataFrame: the matching rows of ``data`` (original index kept).
    """
    wanted_ids = []
    for id_group in pathology_ids.values():
        wanted_ids.extend(id_group)

    keep_mask = data['hadm_id'].isin(wanted_ids)
    return data[keep_mask]

def handle_missing_values(data):
    """Forward-fill missing values down each column.

    Uses ``DataFrame.ffill()`` rather than ``fillna(method='ffill')``; the
    ``method`` argument of ``fillna`` is deprecated in recent pandas releases,
    and ``ffill`` is the documented replacement with the same result.

    The fill is purely positional: a gap takes the value from the row directly
    above it, whatever ``hadm_id`` that row belongs to. Gaps at the very top
    of a column have nothing to copy from and stay missing.

    Args:
        data: DataFrame to fill. It is not modified.

    Returns:
        pandas.DataFrame: a new frame with gaps forward-filled.
    """
    return data.ffill()

def preprocess_data(data, pathology_ids):
    """Run the full preprocessing pipeline and return the cleaned frame.

    The three stages run in a fixed order: join the tables (``merge_data``),
    restrict to the admissions of interest (``filter_by_pathologies``), then
    fill gaps (``handle_missing_values``).

    Args:
        data: mapping of table name to DataFrame, as returned by ``load_data``.
        pathology_ids: mapping whose values are iterables of admission IDs.

    Returns:
        pandas.DataFrame: merged, filtered and gap-filled data.
    """
    return handle_missing_values(
        filter_by_pathologies(merge_data(data), pathology_ids)
    )

# Preprocess data: merge the raw tables, keep only the admissions listed in the
# pathology ID mapping, and forward-fill gaps (the three stages of preprocess_data).
processed_data = preprocess_data(data, data['pathology_ids'])
print(f"Processed Data Shape: {processed_data.shape}")
# Last expression of the cell: the first rows are rendered as the cell output.
processed_data.head()


In [None]:
# Step 3: Feature Engineering
# ---------------------------

def create_text_features(data):
    """Add ``hpi_length``: the whitespace-separated word count of ``hpi``.

    Two behaviours are deliberate:

    * The input frame is left untouched and a new frame is returned
      (``DataFrame.assign``). Writing the column in place would mutate the
      caller's object, which in this notebook is a filtered slice and would
      trigger pandas' SettingWithCopy warning.
    * A missing note counts as 0 words. Passing NaN through ``str()`` would
      produce the text ``'nan'`` and report a length of 1.

    Args:
        data: DataFrame, optionally containing an ``hpi`` column.

    Returns:
        pandas.DataFrame: a copy of ``data`` with ``hpi_length`` added, or
        ``data`` itself (unchanged) when there is no ``hpi`` column.
    """
    if 'hpi' not in data.columns:
        return data

    word_counts = data['hpi'].apply(
        lambda text: 0 if pd.isna(text) else len(str(text).split())
    )
    return data.assign(hpi_length=word_counts)

def encode_categorical_features(data):
    """One-hot encode the ``modality`` and ``icd_code`` columns when present.

    Each column is expanded with ``pd.get_dummies`` in turn (``modality``
    first, then ``icd_code``); the source column is replaced by indicator
    columns prefixed ``modality_`` and ``icd_`` respectively. A column that
    is absent is skipped, and if neither exists ``data`` is returned as is.

    Args:
        data: DataFrame to encode.

    Returns:
        pandas.DataFrame: the encoded frame.
    """
    # (source column, prefix for the generated indicator columns)
    encodings = (
        ('modality', 'modality'),
        ('icd_code', 'icd'),
    )
    for source_column, dummy_prefix in encodings:
        if source_column in data.columns:
            data = pd.get_dummies(data, columns=[source_column], prefix=dummy_prefix)
    return data

def feature_engineering(data):
    """Derive model features from the preprocessed frame.

    Text features are created first (``create_text_features``), then the
    categorical columns are one-hot encoded (``encode_categorical_features``).

    Args:
        data: preprocessed DataFrame.

    Returns:
        pandas.DataFrame: the frame with engineered features.
    """
    return encode_categorical_features(create_text_features(data))

# Apply feature engineering
# feature_engineering adds the 'hpi_length' column and one-hot encodes
# 'modality' / 'icd_code' when those columns are present.
feature_data = feature_engineering(processed_data)
# Persist the feature matrix; the modelling cells further down reload it from this path.
# NOTE(review): to_csv does not create '../data/processed/' — presumably the
# directory already exists; confirm before a fresh-checkout run.
feature_data.to_csv('../data/processed/feature_matrix.csv', index=False)
print(f"Feature Data Shape: {feature_data.shape}")
# Last expression of the cell: the first rows are rendered as the cell output.
feature_data.head()


In [None]:
import time
import tensorflow as tf

def measure_time_cpu(matrix_a, matrix_b):
    """Time one ``tf.matmul`` of the two matrices, pinned to the CPU.

    Without an explicit device scope TensorFlow chooses the device itself,
    so on a machine with a GPU the "CPU" timing would not be a CPU timing.
    The work is therefore wrapped in ``tf.device('/CPU:0')``, and the operands
    are copied into that scope before the clock starts so that any transfer
    is not counted.

    Args:
        matrix_a: left operand accepted by ``tf.matmul``.
        matrix_b: right operand accepted by ``tf.matmul``.

    Returns:
        float: elapsed seconds for a single multiplication (one sample,
        no warm-up run).
    """
    with tf.device('/CPU:0'):
        # Copy the operands under the CPU scope ahead of the timed region.
        a_cpu = tf.identity(matrix_a)
        b_cpu = tf.identity(matrix_b)

        # perf_counter is monotonic and intended for interval timing.
        start_time = time.perf_counter()
        result = tf.matmul(a_cpu, b_cpu)
        tf.experimental.numpy.copy(result)  # Ensure completion of the computation
        return time.perf_counter() - start_time

def measure_time_gpu(matrix_a, matrix_b):
    """Time one ``tf.matmul`` of the two matrices after five warm-up runs.

    The warm-up multiplications are executed and discarded before the clock
    starts. No device is selected inside this function; placement is left to
    TensorFlow and to how the caller created the operands.

    Args:
        matrix_a: left operand accepted by ``tf.matmul``.
        matrix_b: right operand accepted by ``tf.matmul``.

    Returns:
        float: elapsed wall-clock seconds for the single timed multiplication.
    """
    warmup_runs = 5
    for _ in range(warmup_runs):
        tf.matmul(matrix_a, matrix_b)

    began = time.time()
    product = tf.matmul(matrix_a, matrix_b)
    # Copy the product so the multiplication has been consumed before the clock stops.
    tf.experimental.numpy.copy(product)
    return time.time() - began

# Generate random matrices
# Two 10000 x 10000 tensors from tf.random.normal, shared by both timings below.
matrix_a = tf.random.normal(shape=(10000, 10000))
matrix_b = tf.random.normal(shape=(10000, 10000))

# Measure CPU time
# NOTE(review): the printed label says "Average", but measure_time_cpu times a
# single multiplication, so this is one sample rather than a mean.
cpu_time = measure_time_cpu(matrix_a, matrix_b)
print("Average time on CPU:", cpu_time)

# Check for GPU availability
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Ensure matrices are on the GPU
    # NOTE(review): tf.constant is called without a tf.device scope here, so
    # placement is whatever TensorFlow picks by default — confirm these
    # tensors actually end up on the GPU.
    matrix_a_gpu = tf.constant(matrix_a)
    matrix_b_gpu = tf.constant(matrix_b)

    # Measure GPU time
    # measure_time_gpu performs warm-up runs first; the CPU measurement above
    # has none, so the two numbers are not taken under identical conditions.
    gpu_time = measure_time_gpu(matrix_a_gpu, matrix_b_gpu)
    print("Average time on GPU:", gpu_time)

    # Calculate and print the performance difference
    # Relative reduction in elapsed time, as a percentage of the CPU time;
    # negative when the GPU run was slower.
    performance_improvement = ((cpu_time - gpu_time) / cpu_time) * 100
    print(f"GPU is {performance_improvement:.2f}% faster than CPU.")
else:
    print("GPU not available.")


In [None]:
# Step 4: Model Training
# ----------------------

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

def train_random_forest(X, y):
    """Train a RandomForest classifier and score it on a held-out split.

    This definition must be active: a later cell calls
    ``train_random_forest(X, y)``, and with the function commented out a
    Restart & Run All stops there with ``NameError``.

    Args:
        X: feature matrix accepted by scikit-learn (numeric columns only).
        y: target labels aligned with ``X``.

    Returns:
        tuple: ``(model, accuracy, report)`` — the fitted
        ``RandomForestClassifier``, its accuracy on the 20% test split, and
        the text produced by ``classification_report`` for that split.
    """
    # Fixed random_state on both the split and the forest keeps runs repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return model, accuracy, report


In [20]:
# Reload the feature matrix written by the feature-engineering step
feature_data = pd.read_csv('../data/processed/feature_matrix.csv')

# Target: the discharge diagnosis. Features: everything else except the
# target itself and the admission identifier.
y = feature_data['discharge_diagnosis']
X = feature_data.drop(columns=['discharge_diagnosis', 'hadm_id'])

# Report which object-typed (free-text) columns are being discarded, then
# keep only the remaining columns for modelling.
non_numeric_columns = X.select_dtypes(include=['object']).columns
print(f"Dropping non-numeric columns: {non_numeric_columns}")
X = X.drop(columns=non_numeric_columns)

Dropping non-numeric columns: Index(['hpi', 'pe', 'valuestr_x', 'valuestr_y', 'note_id', 'region',
       'exam_name', 'text'],
      dtype='object')


In [None]:
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation (X and y come from the feature-loading cell)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Encode string labels to integers
# NOTE(review): the encoder is fitted on the training labels only, so
# le.transform raises ValueError if the test split contains a diagnosis that
# never appears in training — confirm this cannot happen for this dataset.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Create an XGBoost classifier with GPU support and optimized parameters
# NOTE(review): recent XGBoost releases deprecate tree_method='gpu_hist' and
# the use_label_encoder argument — check against the installed version.
model = xgb.XGBClassifier(
    tree_method='gpu_hist',  # Use GPU-accelerated training
    use_label_encoder=False,
    max_bin=256,  # Reduce bins to save memory
    subsample=0.8,  # Subsample 80% of the data
    colsample_bytree=0.8,  # Use 80% of features for each tree
    n_estimators=50,  # Reduce the number of trees
    max_depth=6  # Limit tree depth
)

# Train the model
model.fit(X_train, y_train)

# Make predictions and evaluate
y_pred = model.predict(X_test)

# Decode BOTH predictions and ground truth back to the original string labels.
# The report previously compared integer-encoded y_test with string
# predictions, i.e. two different label spaces.
y_pred_labels = le.inverse_transform(y_pred)
y_test_labels = le.inverse_transform(y_test)

# Accuracy is computed on the encoded integers (both sides share that space).
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test_labels, y_pred_labels))

In [40]:
import gc

import torch

# Ask PyTorch to release the CUDA memory it is caching but not using
torch.cuda.empty_cache()

# Drop the previous experiment's train/test split so it can be reclaimed;
# the next training cell creates its own split.
del X_train, X_test, y_train, y_test  # Adjust according to your variable names

# Force a collection pass; as the last expression, its return value is the cell output.
gc.collect()


855

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import gc

# Fresh 80/20 split of the feature matrix (same seed as the other experiments)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Downcast both feature frames to 32-bit floats to cut memory use
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Columns with at most one distinct value in the training split carry no
# signal; drop them from both splits so the column sets stay aligned.
constant_columns = [col for col in X_train.columns if X_train[col].nunique() <= 1]
if constant_columns:
    print(f"Removing constant columns: {constant_columns}")
    X_train = X_train.drop(columns=constant_columns)
    X_test = X_test.drop(columns=constant_columns)

# Reclaim the frames that were just replaced
gc.collect()

# LightGBM on the GPU with a bounded histogram size and tree depth to limit memory
lgbm_params = dict(device='gpu', max_bin=256, n_estimators=100, max_depth=6)
model = lgb.LGBMClassifier(**lgbm_params)
model.fit(X_train, y_train)

# Score on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np

# Split data into training and test sets (only 1% is held out here)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=42)

# Standardize features; the scaler is fitted on the training split only so no
# test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert data to PyTorch tensors on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train = torch.tensor(X_train, dtype=torch.float32).to(device)
X_test = torch.tensor(X_test, dtype=torch.float32).to(device)

# Always map labels to contiguous integer class ids 0..k-1.
# Encoding used to be skipped unless the dtype was 'object'. That left two
# problems for non-object labels: a pandas Series with a shuffled index was
# handed straight to torch.tensor, and nothing guaranteed the ids were
# contiguous, which the network's output layer (sized by the number of
# distinct training labels) and CrossEntropyLoss both rely on.
# NOTE(review): the encoder is fitted on the training labels only, so
# le.transform raises ValueError for a test label unseen in training.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

y_train = torch.tensor(y_train, dtype=torch.long).to(device)
y_test = torch.tensor(y_test, dtype=torch.long).to(device)

# Improved neural network with additional layers, dropout, and batch normalization
class ImprovedNN(nn.Module):
    """Fully connected classifier: input -> 128 -> 64 -> 32 -> class logits.

    The first two hidden layers are each followed by batch normalisation,
    ReLU and dropout (p=0.3); the third hidden layer uses ReLU only. The
    forward pass returns raw logits (no softmax), as expected by
    ``nn.CrossEntropyLoss``.

    Args:
        input_dim: number of input features.
        num_classes: width of the output layer. When omitted, it falls back
            to the number of distinct values in the notebook-level ``y_train``
            tensor, which must then already exist. Passing it explicitly lets
            the class be built without that global.
    """

    def __init__(self, input_dim, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Backward-compatible default: infer the class count from the
            # global training labels prepared earlier in the notebook.
            num_classes = len(torch.unique(y_train))

        self.layer1 = nn.Linear(input_dim, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.dropout1 = nn.Dropout(0.3)
        self.layer2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(0.3)
        self.layer3 = nn.Linear(64, 32)
        self.output = nn.Linear(32, num_classes)

    def forward(self, x):
        """Return class logits for a batch ``x`` with ``input_dim`` features per row."""
        x = torch.relu(self.bn1(self.layer1(x)))
        x = self.dropout1(x)
        x = torch.relu(self.bn2(self.layer2(x)))
        x = self.dropout2(x)
        x = torch.relu(self.layer3(x))
        x = self.output(x)
        return x

# Build the network for the current feature width and move it to the chosen device
model = ImprovedNN(X_train.shape[1]).to(device)

# Cross-entropy over the class logits, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Full-batch training: every epoch is a single gradient step over all of X_train
num_epochs = 100
model.train()  # training mode (dropout / batch-norm updates active) for the whole loop
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_train)
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Progress line every tenth epoch
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluation: inference mode, no gradient tracking
model.eval()
with torch.no_grad():
    test_logits = model(X_test)
    predictions = test_logits.argmax(dim=1)
    accuracy = (predictions == y_test).float().mean().item()
    print(f'Accuracy: {accuracy:.4f}')


In [None]:
print(f"Model Accuracy: {accuracy:.4f}")

# Save the model
# `model` is whatever the most recently executed training cell bound to that
# name. On a top-to-bottom run that is the PyTorch network, which must not be
# pickled as 'random_forest_model.pkl': the SHAP cell loads that file and
# hands it to a tree explainer. Persist a network's weights separately and
# keep the joblib path for non-PyTorch estimators.
if isinstance(model, nn.Module):
    torch.save(model.state_dict(), '../models/improved_nn.pt')
else:
    joblib.dump(model, '../models/random_forest_model.pkl')

In [None]:
# Train the model again
# Relies on train_random_forest having been defined by an earlier cell; on a
# fresh kernel this line raises NameError if no such definition has run.
# Rebinds `model` and `accuracy` (overwriting the values left by the previous
# experiment); `report` holds the third element of the returned tuple.
model, accuracy, report = train_random_forest(X, y)


# Save the model
# Written with joblib to the path that the SHAP analysis cell loads from.
joblib.dump(model, '../models/random_forest_model.pkl')

In [None]:
# Step 5: SHAP Analysis
# ---------------------

import shap
import matplotlib.pyplot as plt

def run_shap_analysis(model, X):
    """Build a SHAP tree explainer for ``model`` and evaluate it on ``X``.

    Args:
        model: fitted model to hand to ``shap.TreeExplainer``.
        X: feature matrix to compute SHAP values for.

    Returns:
        tuple: ``(explainer, shap_values)`` — the ``shap.TreeExplainer``
        instance and whatever its ``shap_values(X)`` call returns.
    """
    tree_explainer = shap.TreeExplainer(model)
    return tree_explainer, tree_explainer.shap_values(X)

# Load model and data
model = joblib.load('../models/random_forest_model.pkl')

# Rebuild the feature matrix exactly as it was prepared for training: drop the
# target and the admission id, then drop the object-typed (free-text) columns.
# Without the dtype filter the explainer receives text columns the model was
# never fitted on.
X = (
    pd.read_csv('../data/processed/feature_matrix.csv')
    .drop(columns=['discharge_diagnosis', 'hadm_id'])
    .select_dtypes(exclude=['object'])
)

explainer, shap_values = run_shap_analysis(model, X)

# Visualize SHAP values
# NOTE(review): shap_values[1] presumably selects the values for the class at
# index 1 of a per-class list; the return layout differs between shap
# versions — confirm against the installed one.
plt.title("SHAP Summary Plot")
shap.summary_plot(shap_values[1], X)
