# Model Logistic Regression

In [373]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, f1_score, matthews_corrcoef

In [433]:
dataset_dead_inputs_train = pd.read_csv('dataset_dead_inputs_train.csv', index_col = 0)
dataset_dead_targets_train = pd.read_csv('dataset_dead_targets_train.csv', index_col = 0)#, header = None)
dataset_deadicu_inputs_train = pd.read_csv('dataset_deadicu_inputs_train.csv', index_col = 0)
dataset_deadicu_targets_train = pd.read_csv('dataset_deadicu_targets_train.csv', index_col = 0)#, header = None)

In [434]:
dataset_deadicu_inputs_train

Unnamed: 0,reason:6,reason:9,cronic_ops_lung_disease,conscious:2,conscious:3,conscious:4,age:61-70,age:71-80,age:81-90,age:91-100,puls:60-70or90-100,puls:40-60or100-120,puls:0-40or120-200
0,1,0,0,0,0,0,0,0,1,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,1,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,0,1,0
4,0,0,1,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,0,0,1,0,0,0,0,1,0,0,1,0,0
274,0,0,0,0,0,0,0,1,0,0,0,0,0
275,1,0,0,0,0,0,0,0,0,0,0,0,1
276,0,1,0,0,0,0,0,0,1,0,0,1,0


In [410]:
# Initialize Logistic Regression model
model = LogisticRegression(fit_intercept=True)

In [411]:
X = dataset_dead_inputs_train
y = dataset_dead_targets_train['dead']

In [412]:
y.shape

(275,)

In [415]:
def bootstraped(X, y, n_bootstraps):
    bootstrap_auc = []
    bootstrap_accuracy = []
    bootstrap_specificity = []
    bootstrap_sensitivity = []
    bootstrap_f1 = []
    bootstrap_odds_ratio = []
    bootstrap_ci_lower = []
    bootstrap_ci_upper = []
    bootstrap_mcc = []
    conf_matrices = []
    bootstrap_ppp = []
    bootstrap_npp = []
    
    # Bootstrap process
    for i in range(n_bootstraps):
        # Sample with replacement to create a bootstrapped dataset
        bootstrap_indices = np.random.choice(range(len(X)), size=len(X), replace=True)
        X_bootstrap = X.iloc[bootstrap_indices]
        y_bootstrap = y.iloc[bootstrap_indices]

        # Get the out-of-bag (OOB) indices - the samples that were not included in the bootstrap sample
        oob_indices = list(set(range(len(X))) - set(bootstrap_indices))

        if len(oob_indices) > 0:  # Ensure there are OOB samples to test on
            X_oob = X.iloc[oob_indices]
            y_oob = y.iloc[oob_indices]

            # Train the model on the bootstrapped dataset
            model.fit(X_bootstrap, y_bootstrap)

            # Make predictions on the OOB data
            y_oob_pred = model.predict(X_oob)
            y_oob_pred_prob = model.predict_proba(X_oob)[:, 1]
            y_oob_pred = np.where(y_oob_pred_prob >= 0.3, 1, 0)
            
            # Confusion matrix to get TP, TN, FP, FN
            tn, fp, fn, tp = confusion_matrix(y_oob, y_oob_pred).ravel()


            # Store the confusion matrix for each bootstrap
            conf_matrices.append(confusion_matrix(y_oob, y_oob_pred))
            
            # Calculate metrics
            # Accuracy
            accuracy = accuracy_score(y_oob, y_oob_pred)
            bootstrap_accuracy.append(accuracy)

            # Specificity (TN / (TN + FP))
            specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
            bootstrap_specificity.append(specificity)

            # Sensitivity (Recall or TP / (TP + FN))
            sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
            bootstrap_sensitivity.append(sensitivity)
            
            # Positive predictive power
            pos_predictive_power = tp / (tp + fp) if tp+fp > 0 else np.nan
            bootstrap_ppp.append(pos_predictive_power)
            
            # Negative predictive power
            neg_predictive_power = tn / (tn + fn) if tn+fn > 0 else np.nan
            bootstrap_npp.append(neg_predictive_power)

            # AUC (Area Under the Curve)
            auc = roc_auc_score(y_oob, y_oob_pred_prob)
            bootstrap_auc.append(auc)

            # F1-score (Harmonic mean of precision and recall)
            f1 = f1_score(y_oob, y_oob_pred)
            bootstrap_f1.append(f1)

            # Odds Ratio: (TP / FP) / (FN / TN), but we avoid division by zero.
            odds_ratio = (tp / fp) / (fn / tn) if fp > 0 and tn > 0 else np.nan
            bootstrap_odds_ratio.append(odds_ratio)
            
            # 95% confidence interval for odds ratio
            ci_lower = np.exp(np.log(odds_ratio) - 1.96*np.sqrt(1/tp + 1/tn + 1/fp + 1/fn)) if tp*tn*fp*fn>0 else np.nan
            ci_upper = np.exp(np.log(odds_ratio) + 1.96*np.sqrt(1/tp + 1/tn + 1/fp + 1/fn)) if tp*tn*fp*fn>0 else np.nan
            bootstrap_ci_lower.append(ci_lower)
            bootstrap_ci_upper.append(ci_upper)

            # Matthews Correlation Coefficient (MCC)
            mcc = matthews_corrcoef(y_oob, y_oob_pred)
            bootstrap_mcc.append(mcc)
            
    # Final evaluation: Average accuracy from all bootstraps
    final_accuracy = np.mean(bootstrap_accuracy)
    final_auc = np.nanmean(bootstrap_auc)
    final_sensitivity = np.nanmean(bootstrap_sensitivity)
    final_specificity = np.nanmean(bootstrap_specificity)
    final_ppp = np.nanmean(bootstrap_ppp)
    final_npp = np.nanmean(bootstrap_npp)
    final_odds_ratio = np.nanmean(bootstrap_odds_ratio)
    final_ci_lower = np.nanmean(bootstrap_ci_lower)
    final_ci_upper = np.nanmean(bootstrap_ci_upper)
    final_f1_score = np.nanmean(bootstrap_f1)
    final_mcc = np.nanmean(bootstrap_mcc)
    
    # Average confusion matrix (optional)
    avg_conf_matrix = np.mean(conf_matrices, axis=0)
    
    return {'n_bootstraps': n_bootstraps,
            'average conf matrix': avg_conf_matrix,
           'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn,
            'average sensitivity': final_sensitivity,
            'average specificity': final_specificity,
            'average ppp': final_ppp,
            'average npp': final_npp,
            'average odds ratio': final_odds_ratio,
            'average ci lower': final_ci_lower,
            'average ci upper': final_ci_upper,
            'average f1-score': final_f1_score,
           'average accuracy': final_accuracy,
            'average mcc': final_mcc,
            'average auc': final_auc
            }

In [417]:
bootstraped(X, y, 1000)

{'n_bootstraps': 1000,
 'average conf matrix': array([[80.017,  6.721],
        [10.571,  3.826]]),
 'tp': 4,
 'fp': 3,
 'tn': 83,
 'fn': 10,
 'average sensitivity': 0.2737366499134419,
 'average specificity': 0.9231584917230139,
 'average ppp': 0.4038114033565236,
 'average npp': 0.8841215438687975,
 'average odds ratio': 6.903533820912786,
 'average ci lower': 1.3682307584317162,
 'average ci upper': 40.66286101377693,
 'average f1-score': 0.3013833187048007,
 'average accuracy': 0.8290825108746172,
 'average mcc': 0.22982138978922892,
 'average auc': 0.7229720197068235}

In [278]:
X = dataset_deadicu_inputs_train
y = dataset_deadicu_targets_train['dead_icu']

In [279]:
bootstraped(X, y, 1000)

{'n_bootstraps': 1000,
 'average conf matrix': array([[65.765,  6.146],
        [24.037,  4.96 ]]),
 'tp': 7,
 'fp': 3,
 'tn': 65,
 'fn': 28,
 'average sensitivity': 0.1756967878923952,
 'average specificity': 0.9155591794011197,
 'average ppp': 0.4711743264067536,
 'average npp': 0.7336503636910595,
 'average odds ratio': 2.9088667942170905,
 'average ci lower': 0.7099492999945702,
 'average ci upper': 14.330088694577505,
 'average f1-score': 0.23993334910605388,
 'average accuracy': 0.7009522487746774,
 'average mcc': 0.13176471756397798,
 'average auc': 0.6585348660791059}

# different approach

In [280]:
import statsmodels.api as sm

In [419]:
X = dataset_dead_inputs_train[["reason:6", "reason:9"]]
y = dataset_dead_targets_train

In [420]:
model = LogisticRegression(fit_intercept=True)

In [421]:
model.fit(X, y)

  y = column_or_1d(y, warn=True)


In [422]:
print("Coefficients for reason variables:")
for feature, coef in zip(X.columns, model.coef_.flatten()):
    print(f"{feature}: {coef:.4f}")
print("Odds for reason variables (exp(coef)):")
for feature, coef in zip(X.columns, model.coef_.flatten()):
    print(f"{feature}: {np.exp(coef):.4f}")

Coefficients for reason variables:
reason:6: -0.4503
reason:9: -1.1029
Odds for reason variables (exp(coef)):
reason:6: 0.6375
reason:9: 0.3319


In [423]:
X = sm.add_constant(X)

In [424]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.387364
         Iterations 7


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,275.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.04469
Time:,22:44:22,Log-Likelihood:,-107.69
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.006488

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.3471,0.220,-6.119,0.000,-1.779,-0.916
reason:6,-0.5988,0.460,-1.301,0.193,-1.501,0.303
reason:9,-1.3610,0.476,-2.861,0.004,-2.293,-0.429


In [425]:
X = dataset_dead_inputs_train[["conscious:2", "conscious:3", "conscious:4"]]
X = sm.add_constant(X)

In [426]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.371055
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,274.0
Method:,MLE,Df Model:,3.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.08491
Time:,22:44:51,Log-Likelihood:,-103.15
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.0002553

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.0708,0.197,-10.509,0.000,-2.457,-1.685
conscious:2,0.9722,1.171,0.830,0.407,-1.324,3.268
conscious:3,2.4762,0.934,2.652,0.008,0.646,4.307
conscious:4,2.4762,0.675,3.669,0.000,1.153,3.799


In [435]:
X = dataset_dead_inputs_train["oxygen"]
X = sm.add_constant(X)

In [436]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.357531
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.1183
Time:,22:47:02,Log-Likelihood:,-99.394
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,2.423e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.5055,0.260,-9.636,0.000,-3.015,-1.996
oxygen,1.8798,0.367,5.129,0.000,1.161,2.598


In [437]:
X = dataset_dead_inputs_train[["reason:6", "reason:9", "conscious:2", "conscious:3", "conscious:4"]]
X = sm.add_constant(X)

In [438]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.354431
         Iterations 7


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,272.0
Method:,MLE,Df Model:,5.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.1259
Time:,22:47:14,Log-Likelihood:,-98.532
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,3.059e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.5632,0.240,-6.518,0.000,-2.033,-1.093
reason:6,-0.9038,0.517,-1.748,0.080,-1.917,0.110
reason:9,-1.2999,0.500,-2.597,0.009,-2.281,-0.319
conscious:2,0.6515,1.188,0.548,0.583,-1.677,2.980
conscious:3,2.7082,0.981,2.762,0.006,0.786,4.630
conscious:4,2.4412,0.723,3.377,0.001,1.024,3.858


In [439]:
X = dataset_dead_inputs_train[["age:61-70", "age:71-80", "age:81-90", "age:91-100"]]
X = sm.add_constant(X)

In [440]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.371180
         Iterations 7


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,273.0
Method:,MLE,Df Model:,4.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.0846
Time:,22:47:14,Log-Likelihood:,-103.19
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.0007602

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.4012,0.719,-4.732,0.000,-4.810,-1.992
age:61-70,0.8362,0.887,0.943,0.346,-0.901,2.574
age:71-80,2.0431,0.766,2.668,0.008,0.542,3.544
age:81-90,1.8478,0.792,2.334,0.020,0.296,3.400
age:91-100,3.1781,0.983,3.232,0.001,1.251,5.105


In [441]:
X = dataset_dead_inputs_train[["reason:6", "reason:9", "conscious:2", "conscious:3", "conscious:4",
                               "age:61-70", "age:71-80", "age:81-90", "age:91-100"]]
X = sm.add_constant(X)

In [442]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.321108
         Iterations 8


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,268.0
Method:,MLE,Df Model:,9.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.2081
Time:,22:47:15,Log-Likelihood:,-89.268
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,4.072e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.1677,0.765,-4.139,0.000,-4.668,-1.668
reason:6,-1.0320,0.564,-1.829,0.067,-2.138,0.074
reason:9,-1.5139,0.526,-2.879,0.004,-2.544,-0.483
conscious:2,0.2376,1.474,0.161,0.872,-2.652,3.127
conscious:3,2.6820,1.059,2.533,0.011,0.607,4.757
conscious:4,2.4638,0.799,3.084,0.002,0.898,4.030
age:61-70,1.0326,0.922,1.120,0.263,-0.774,2.839
age:71-80,1.9577,0.810,2.416,0.016,0.370,3.546
age:81-90,2.1445,0.829,2.587,0.010,0.520,3.770


In [443]:
X = dataset_dead_inputs_train[["respiration:>=21"]]
X = sm.add_constant(X)

In [444]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.394530
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.02702
Time:,22:47:15,Log-Likelihood:,-109.68
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.01358

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.9042,0.181,-10.510,0.000,-2.259,-1.549
respiration:>=21,1.9042,0.730,2.609,0.009,0.474,3.335


In [445]:
X = dataset_dead_inputs_train[["reason:6", "reason:9", "conscious:2", "conscious:3", "conscious:4",
                               "age:61-70", "age:71-80", "age:81-90", "age:91-100", "respiration:>=21"]]
X = sm.add_constant(X)

In [446]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.316019
         Iterations 8


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,267.0
Method:,MLE,Df Model:,10.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.2206
Time:,22:47:15,Log-Likelihood:,-87.853
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,2.976e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.2026,0.765,-4.185,0.000,-4.702,-1.703
reason:6,-0.9200,0.567,-1.623,0.105,-2.031,0.191
reason:9,-1.4713,0.534,-2.753,0.006,-2.519,-0.424
conscious:2,0.3607,1.437,0.251,0.802,-2.455,3.176
conscious:3,2.7539,1.057,2.604,0.009,0.681,4.826
conscious:4,2.4117,0.810,2.979,0.003,0.825,3.999
age:61-70,0.9713,0.922,1.054,0.292,-0.836,2.778
age:71-80,1.8694,0.812,2.303,0.021,0.278,3.460
age:81-90,2.1063,0.827,2.546,0.011,0.485,3.728


In [447]:
X = dataset_dead_inputs_train[["blood_pressure:121-140", "blood_pressure:141-160", "blood_pressure:161-250"]]
X = sm.add_constant(X)

In [489]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.358278
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.1164
Time:,22:49:15,Log-Likelihood:,-99.601
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,3.003e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,7.2256,1.963,3.681,0.000,3.379,11.072
spo2,-0.0981,0.021,-4.587,0.000,-0.140,-0.056


In [449]:
X = dataset_dead_inputs_train[["reason:6", "reason:9", "conscious:2", "conscious:3", "conscious:4",
                               "age:61-70", "age:71-80", "age:81-90", "age:91-100", 
                               "blood_pressure:121-140", "blood_pressure:141-160", "blood_pressure:161-250", "oxygen"]]
X = sm.add_constant(X)

In [450]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.297923
         Iterations 8


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,264.0
Method:,MLE,Df Model:,13.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.2653
Time:,22:47:15,Log-Likelihood:,-82.823
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,5.693e-08

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.0965,0.794,-3.902,0.000,-4.652,-1.541
reason:6,-0.4270,0.601,-0.711,0.477,-1.604,0.750
reason:9,-1.3399,0.554,-2.420,0.016,-2.425,-0.255
conscious:2,0.5459,1.552,0.352,0.725,-2.496,3.588
conscious:3,2.1642,1.116,1.939,0.053,-0.024,4.352
conscious:4,1.7629,0.843,2.092,0.036,0.111,3.414
age:61-70,0.8693,0.943,0.922,0.356,-0.978,2.717
age:71-80,1.6941,0.825,2.054,0.040,0.078,3.311
age:81-90,1.9376,0.836,2.319,0.020,0.300,3.575


When I add blood_pressure to dummy variables of {reason, conscious, age}, blood_preasure becomes insignificant (level 0.05).

In [451]:
X = dataset_dead_inputs_train[["puls:40-60or100-120", "puls:0-40or120-200"]]
X = sm.add_constant(X)

In [452]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.397621
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,275.0
Method:,MLE,Df Model:,2.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.01939
Time:,22:47:15,Log-Likelihood:,-110.54
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.1123

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.0980,0.237,-8.855,0.000,-2.562,-1.634
puls:40-60or100-120,0.6041,0.409,1.476,0.140,-0.198,1.406
puls:0-40or120-200,0.8816,0.467,1.887,0.059,-0.034,1.797


In [453]:
X = dataset_dead_inputs_train[["reason:6", "reason:9", "conscious:2", "conscious:3", "conscious:4",
                               "age:61-70", "age:71-80", "age:81-90", "age:91-100", 
                               "puls:40-60or100-120", "puls:0-40or120-200", "oxygen"]]
X = sm.add_constant(X)

In [454]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.292997
         Iterations 8


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,265.0
Method:,MLE,Df Model:,12.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.2774
Time:,22:47:15,Log-Likelihood:,-81.453
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,7.729e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.9315,0.849,-4.631,0.000,-5.595,-2.268
reason:6,-0.5279,0.583,-0.906,0.365,-1.671,0.615
reason:9,-1.9266,0.604,-3.189,0.001,-3.111,-0.742
conscious:2,0.5971,1.557,0.383,0.701,-2.455,3.649
conscious:3,2.0590,1.101,1.871,0.061,-0.098,4.216
conscious:4,2.0196,0.826,2.444,0.015,0.400,3.639
age:61-70,1.0666,0.958,1.114,0.265,-0.811,2.944
age:71-80,1.9859,0.858,2.313,0.021,0.303,3.668
age:81-90,2.1822,0.864,2.527,0.012,0.490,3.875


In [455]:
model = LogisticRegression(fit_intercept=True)

In [456]:
X = dataset_dead_inputs_train[["reason:6", "reason:9", "conscious:2", "conscious:3", "conscious:4",
                               "age:61-70", "age:71-80", "age:81-90", "age:91-100", 
                               "puls:40-60or100-120", "puls:0-40or120-200"]]
y = dataset_dead_targets_train['dead']

In [457]:
bootstraped(X, y, 1000)

{'n_bootstraps': 1000,
 'average conf matrix': array([[83.408,  4.471],
        [10.733,  3.562]]),
 'tp': 2,
 'fp': 1,
 'tn': 85,
 'fn': 11,
 'average sensitivity': 0.2571212154845349,
 'average specificity': 0.9497099284919124,
 'average ppp': 0.4902914691388698,
 'average npp': 0.8866605904139874,
 'average odds ratio': 10.593304386608704,
 'average ci lower': 1.7977033833628362,
 'average ci upper': 74.65853842163125,
 'average f1-score': 0.31348834676327725,
 'average accuracy': 0.8513808363059682,
 'average mcc': 0.26966539692424524,
 'average auc': 0.7279584210975036}

In [458]:
dataset = pd.read_csv('dataset_all.csv', index_col = 0)
target_d = dataset['dead']
target_di = dataset['dead_icu']

In [459]:
dataset = dataset.drop(columns=['dead', 'dead_icu'], axis=1)

In [460]:
target_di

0      0
1      0
2      1
3      0
4      0
      ..
273    0
274    0
275    0
276    0
277    1
Name: dead_icu, Length: 278, dtype: int64

In [461]:
dataset

Unnamed: 0,RB,SPOL,MJESTO,DOB,RAZLOG/dg,OTPUST,2 DANA,30 DANA,ICU,SMRT,...,reason:6,reason:9,cronic_ops_lung_disease:0,cronic_ops_lung_disease:1,oxygen:0,oxygen:1,conscious:1,conscious:2,conscious:3,conscious:4
0,1,1,1,1936,6,1,1.0,1.0,0,0,...,1,0,1,0,1,0,1,0,0,0
1,2,2,1,1972,10,1,3.0,4.0,0,0,...,0,0,0,1,0,1,1,0,0,0
2,3,2,1,1946,9,4,5.0,1.0,1,0,...,0,1,1,0,1,0,1,0,0,0
3,4,2,1,1927,11,6,1.0,2.0,0,0,...,0,0,1,0,0,1,1,0,0,0
4,5,2,1,1934,10,3,3.0,1.0,0,0,...,0,0,0,1,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,274,1,1,1946,10,1,1.0,1.0,0,0,...,0,0,0,1,1,0,1,0,0,0
274,275,2,2,1945,11,6,1.0,3.0,0,0,...,0,0,1,0,1,0,1,0,0,0
275,276,2,2,1964,6,3,3.0,1.0,0,0,...,1,0,1,0,1,0,1,0,0,0
276,277,2,2,1937,9,3,3.0,1.0,0,0,...,0,1,1,0,1,0,1,0,0,0


# Not using preprocessing:

In [462]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, matthews_corrcoef, roc_auc_score

In [463]:
X = dataset[['reason', 'conscious', 'age', 'respiration', 'blood_pressure', 'puls_abs']]
y = target_d

In [464]:
categorical_features = ['reason', 'conscious']
continuous_features = ['age', 'respiration', 'blood_pressure', 'puls_abs']

In [465]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features),  
        ('num', 'passthrough', continuous_features)  
    ])


In [466]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Preprocess the data
    ('classifier', LogisticRegression())  # Logistic regression model
])

In [467]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [468]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [469]:
y_pred = model.predict(X_test)

In [470]:
print("Model Coefficients:", model.named_steps['classifier'].coef_)
print("Intercept:", model.named_steps['classifier'].intercept_)

Model Coefficients: [[ 0.34983935 -0.59053813  0.91208029  1.06347223  1.65575203  0.04472662
   0.19396044 -0.02527746  0.02081649]]
Intercept: [-5.39771149]


In [471]:
conf_matrix = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
conf_matrix

array([[63,  0],
       [ 6,  1]])

In [472]:
y_test.index

Int64Index([ 30, 126, 199, 142, 253, 237,  97, 206, 263, 144,  79, 209, 231,
            101, 195, 114, 194,  60, 203, 242,  45,  73, 182, 258, 275, 213,
             84,  42,   9, 118,  22, 155,  24, 168, 226,  75, 234, 236,   6,
             68,  46, 177,  66,  25, 125, 164,  19,  82, 172, 276,  90, 152,
            261,  15, 159,  10, 260, 124,  33,  86, 264, 221,  37, 140,  16,
            193, 243,  67, 132, 154],
           dtype='int64')

In [473]:
y_pred_prob = model.predict_proba(X_test)[:, 1]
y_pred_prob

array([0.37116401, 0.0218355 , 0.02918975, 0.10622286, 0.05140774,
       0.04992484, 0.03257954, 0.31417355, 0.22514429, 0.00579951,
       0.01517459, 0.08964257, 0.10708487, 0.31768823, 0.39729327,
       0.04126072, 0.13867197, 0.08551364, 0.14895211, 0.09728472,
       0.15794146, 0.05236338, 0.5809508 , 0.20998136, 0.02047172,
       0.10000998, 0.1723132 , 0.22997209, 0.14553208, 0.03800212,
       0.08970567, 0.11149537, 0.03652773, 0.11641672, 0.01325235,
       0.13344152, 0.07896143, 0.02460039, 0.06752726, 0.19761515,
       0.13519889, 0.03098377, 0.04247974, 0.44143931, 0.26442612,
       0.08703349, 0.04764071, 0.07441706, 0.01804258, 0.05760177,
       0.13508518, 0.39690644, 0.01967145, 0.4522586 , 0.08703807,
       0.11656696, 0.1503877 , 0.01908434, 0.12770512, 0.12698776,
       0.01658547, 0.02182101, 0.02834254, 0.12271515, 0.01017451,
       0.18294785, 0.0406016 , 0.05387811, 0.0783855 , 0.03030041])

In [474]:
accuracy = accuracy_score(y_test, y_pred)
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
pos_predictive_power = tp / (tp + fp) if tp+fp > 0 else np.nan
neg_predictive_power = tn / (tn + fn) if tn+fn > 0 else np.nan
auc = roc_auc_score(y_test, y_pred_prob)
f1 = f1_score(y_test, y_pred)
odds_ratio = (tp / fp) / (fn / tn) if fp > 0 and tn > 0 else np.nan
ci_lower = np.exp(np.log(odds_ratio) - 1.96*np.sqrt(1/tp + 1/tn + 1/fp + 1/fn)) if tp*tn*fp*fn>0 else np.nan
ci_upper = np.exp(np.log(odds_ratio) + 1.96*np.sqrt(1/tp + 1/tn + 1/fp + 1/fn)) if tp*tn*fp*fn>0 else np.nan
            
mcc = matthews_corrcoef(y_test, y_pred)

print(accuracy, specificity, sensitivity, pos_predictive_power, neg_predictive_power, auc, f1, odds_ratio, ci_lower, ci_upper, mcc)

0.9142857142857143 1.0 0.14285714285714285 1.0 0.9130434782608695 0.8117913832199546 0.25 nan nan nan 0.3611575592573076


In [475]:
print('tp:', tp, 'fp:', fp, 'tn:', tn, 'fn:', fn)

tp: 1 fp: 0 tn: 63 fn: 6


In [476]:
dataset_all = pd.read_csv('dataset_all.csv', index_col = 0)

In [477]:
X = dataset_all[["age"]]
X = sm.add_constant(X)
y = dataset_all["dead"]

In [478]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.375335
         Iterations 7


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.07436
Time:,22:47:19,Log-Likelihood:,-104.34
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,4.234e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-5.9307,1.249,-4.747,0.000,-8.379,-3.482
age,0.0558,0.016,3.487,0.000,0.024,0.087


In [479]:
X = dataset_all[["temperature"]]
X = sm.add_constant(X)
y = dataset_all["dead"]

In [480]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.399854
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.01389
Time:,22:47:19,Log-Likelihood:,-111.16
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.07681

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-22.3852,11.003,-2.034,0.042,-43.952,-0.819
temperature,0.5666,0.303,1.872,0.061,-0.027,1.160


In [481]:
X = dataset_all[["blood_pressure"]]
X = sm.add_constant(X)
y = dataset_all["dead"]

In [482]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.371324
         Iterations 7


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.08425
Time:,22:47:19,Log-Likelihood:,-103.23
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,1.312e-05

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.5838,0.812,1.950,0.051,-0.008,3.176
blood_pressure,-0.0271,0.007,-4.047,0.000,-0.040,-0.014


In [483]:
X = dataset_all[["puls_abs"]]
X = sm.add_constant(X)
y = dataset_all["dead"]

In [484]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.401943
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.008736
Time:,22:47:19,Log-Likelihood:,-111.74
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.1605

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-2.0714,0.258,-8.013,0.000,-2.578,-1.565
puls_abs,0.0121,0.008,1.447,0.148,-0.004,0.028


In [485]:
X = dataset_all[["respiration"]]
X = sm.add_constant(X)
y = dataset_all["dead"]

In [486]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.385593
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.04906
Time:,22:47:19,Log-Likelihood:,-107.19
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,0.000882

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-5.8318,1.256,-4.644,0.000,-8.293,-3.371
respiration,0.2543,0.077,3.311,0.001,0.104,0.405


In [487]:
X = dataset_all[["spo2"]]
X = sm.add_constant(X)
y = dataset_all["dead"]

In [488]:
model = sm.Logit(y, X)
result = model.fit()
result.summary()

Optimization terminated successfully.
         Current function value: 0.358278
         Iterations 6


0,1,2,3
Dep. Variable:,dead,No. Observations:,278.0
Model:,Logit,Df Residuals:,276.0
Method:,MLE,Df Model:,1.0
Date:,"Mon, 09 Sep 2024",Pseudo R-squ.:,0.1164
Time:,22:47:19,Log-Likelihood:,-99.601
converged:,True,LL-Null:,-112.72
Covariance Type:,nonrobust,LLR p-value:,3.003e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,7.2256,1.963,3.681,0.000,3.379,11.072
spo2,-0.0981,0.021,-4.587,0.000,-0.140,-0.056
