In [28]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [29]:
data_ready = pd.read_csv("data_ready.csv")

In [30]:
data_ready[data_ready.select_dtypes(include=['float64']).columns] = data_ready.select_dtypes(include=['float64']).astype("float32")

In [31]:
# 70% training data, 15% validation, 15% test
train_dev, test = train_test_split(data_ready, test_size=0.15, random_state=42)
train, dev = train_test_split(train_dev, test_size=0.176, random_state=42)

In [32]:
# convert to tensors and extract labels

train_label = train.pop("5YR_SURV")
dev_label = dev.pop("5YR_SURV")
test_label = test.pop("5YR_SURV")

train_tf = tf.convert_to_tensor(train)
dev_tf = tf.convert_to_tensor(dev)
test_tf = tf.convert_to_tensor(test)

train_label_tf = tf.convert_to_tensor(train_label)
dev_label_tf = tf.convert_to_tensor(dev_label)
test_label_tf = tf.convert_to_tensor(test_label)

In [158]:
from tensorflow.keras.initializers import GlorotUniform

In [159]:
model = tf.keras.Sequential(
    [
       tf.keras.Input(shape=(214,)),
       tf.keras.layers.Dense(10, activation="relu", kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.03, l2=0.0003), name="L1", kernel_initializer=GlorotUniform()),
       tf.keras.layers.Dropout(0.2),
       tf.keras.layers.Dense(10, activation="relu", kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.03, l2=0.0003), name="L3", kernel_initializer=GlorotUniform()),
       tf.keras.layers.Dropout(0.2),
       tf.keras.layers.Dense(5, activation="relu", kernel_regularizer=tf.keras.regularizers.l1_l2(l1=0.03, l2=0.0003), name="L5", kernel_initializer=GlorotUniform()),
       tf.keras.layers.Dropout(0.2),
       tf.keras.layers.Dense(1, activation="sigmoid", name="L9") 
    ]
)

In [160]:
from tensorflow.keras import backend as K

# Sensitivity (Recall)
import tensorflow as tf
from tensorflow.keras import backend as K

def sensitivity(y_true, y_pred):
    """
    param:
    y_pred - Predicted labels
    y_true - True labels 
    Returns:
    Specificity score
    """
    pos_y_true = y_true
    pos_y_pred = y_pred
    fn = K.sum(pos_y_true * y_pred)
    tp = K.sum(pos_y_true * pos_y_pred)
    specificity = tp / (tp + fn + K.epsilon())
    
    return sensitivity
# Specificity
def specificity(y_true, y_pred):
    """
    param:
    y_pred - Predicted labels
    y_true - True labels 
    Returns:
    Specificity score
    """
    neg_y_true = 1 - y_true
    neg_y_pred = 1 - y_pred
    fp = K.sum(neg_y_true * y_pred)
    tn = K.sum(neg_y_true * neg_y_pred)
    specificity = tn / (tn + fp + K.epsilon())
    spec_val = specificity
    return specificity

In [161]:
model.compile(
    optimizer=tf.keras.optimizers.Adamax(learning_rate=1e-3),
    loss=tf.keras.losses.BinaryCrossentropy(), 
    metrics=["accuracy", specificity]
)

In [162]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(train_tf, train_label_tf)

In [163]:
history = model.fit(X_resampled, y_resampled, epochs=100, validation_data = (dev_tf, dev_label_tf))

Epoch 1/100
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.5075 - loss: 3.2047 - specificity: 0.5187 - val_accuracy: 0.7119 - val_loss: 0.7361 - val_specificity: 0.4893
Epoch 2/100
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.4957 - loss: 0.7238 - specificity: 0.4961 - val_accuracy: 0.7119 - val_loss: 0.7001 - val_specificity: 0.4939
Epoch 3/100
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5004 - loss: 0.7008 - specificity: 0.4991 - val_accuracy: 0.7119 - val_loss: 0.7005 - val_specificity: 0.4955
Epoch 4/100
[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.4970 - loss: 0.7007 - specificity: 0.4994 - val_accuracy: 0.7119 - val_loss: 0.7006 - val_specificity: 0.4954
Epoch 5/100
[1m541/761[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.5063 - loss: 0.7006 - specificity: 0.4991

KeyboardInterrupt: 

In [164]:
from sklearn.metrics import classification_report, roc_auc_score

y_pred = model.predict(X_resampled)
y_pred = y_pred.round()
print("Classification Report:")
print(classification_report(dev_label_tf, y_pred))
print("ROC AUC Score:", roc_auc_score(dev_label_tf, y_pred))

[1m761/761[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step
Classification Report:


ValueError: Found input variables with inconsistent numbers of samples: [3585, 24350]

In [None]:
y_pred = model.predict(train_tf)
y_pred = y_pred.round()

precision = precision_score(train_label_tf, y_pred)
recall = recall_score(train_label_tf, y_pred)
f1 = f1_score(train_label_tf, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
recall_score(train_label_tf, y_pred, pos_label=0)

In [None]:
loss_values = history.history['loss']
val_loss_values = history.history.get('val_loss')

# Plot the loss values
plt.figure(figsize=(10, 6))
plt.plot(loss_values, label='Training Loss')
if val_loss_values:
    plt.plot(val_loss_values, label='Validation Loss', linestyle='--')

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
model.evaluate(dev_tf, dev_label_tf)

In [None]:
%pip install xgboost

In [75]:
from xgboost import XGBClassifier

In [None]:
model = XGBClassifier(n_estimators=500, max_depth=6, subsample=0.7, colsample_bytree=0.7, learning_rate=0.1, reg_alpha=0.1, reg_lambda=1.0, scale_pos_weight=0.33, objective='binary:logistic')
model.fit(X_resampled, y_resampled)
print(model.score(train_tf, train_label_tf))
print(model.score(dev_tf, dev_label_tf))
y_pred = model.predict(dev_tf)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

precision = precision_score(dev_label_tf, y_pred)
recall = recall_score(dev_label_tf, y_pred)
f1 = f1_score(dev_label_tf, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

In [None]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'n_estimators': [100, 300, 500],
#     'max_depth': [3, 6, 9],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'subsample': [0.7, 0.8, 1.0],
#     'colsample_bytree': [0.7, 0.8, 1.0],
#     'reg_alpha': [0.01, 0.1, 1],
#     'reg_lambda': [0.01, 0.1, 1]
# }

# grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', cv=5)
# grid_search.fit(np.array(train_tf), np.array(train_label_tf))

# print(f"Best Hyperparameters: {grid_search.best_params_}")
# print(f"Best Cross-Validation Accuracy: {grid_search.best_score_ * 100:.2f}%")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

model = LogisticRegression(max_iter=1000)
model.fit(train_tf, train_label_tf)
print(f"Logistic Regression Accuracy: {model.score(dev_tf, dev_label_tf) * 100:.2f}%")

model = RandomForestClassifier(n_estimators=100)
model.fit(train_tf, train_label_tf)
print(f"Random Forest Accuracy: {model.score(dev_tf, dev_label_tf) * 100:.2f}%")
model.predict(dev_tf)


precision = precision_score(dev_label_tf, y_pred)
recall = recall_score(dev_label_tf, y_pred)
f1 = f1_score(dev_label_tf, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
from sklearn.metrics import recall_score
recall_score(dev_label_tf, y_pred, pos_label=0)