In [25]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

In [26]:
# Load the raw workbook into a DataFrame.
# Despite the `_csv` suffix of the variable name, the file is an .xlsx workbook
# and is therefore parsed with `pd.read_excel`.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
path_csv = '/home/paul/workspace/postdoc-year1/projects/calprotectin/data/Sepsis_Validation_FINAL DATASET_110925_SH.xlsx'
df = pd.read_excel(io=path_csv)

In [27]:
# Columns that must be present (non-null) in a row; the list is used as the
# `subset` of the `dropna` call that builds `df_cleaned`.
# Order is kept exactly as originally written.
keys_cleaned = [
    'birth_weight', 'gest_weeks', 'age_mother', 'gravidity', 'parity',
    'umbilical_cord_ph',
    'o2_demand', 'breath_aid', 'heart_rate',
    'respiration_rate', 'rr_systolic', 'rr_diastolic',
    'base_excess', 'ph_value',
    'antibiotic_therapy', 'diagnosis_infection',
    'gestation_diabetes', 'diabetes_type_1_2', 'adiposity',
    'early_membrane_rupture', 'early_labor_pain', 'green_amniotic_liquor',
    'b_streptococcus', 'fever_sub_partu', 'antibiotics_prepartal',
    'crp_value_1', 'IL6_time_1', 'IL6_value_1', 'Cal_value_1',
]

In [28]:
# Complete-case filter: drop every row that has a missing value in any of the
# columns listed in `keys_cleaned`. `.copy()` detaches the result from `df`, so
# later edits to `df_cleaned` neither touch the raw frame nor trigger
# SettingWithCopyWarning.
df_cleaned = (
    df
    .dropna(axis=0, how='any', subset=keys_cleaned)
    .copy()
)
# Displayed as the cell output: (rows kept, columns).
df_cleaned.shape

(324, 58)

In [29]:
# Feature groups used to slice `df_cleaned` into model inputs.

# Non-biomarker columns (same order as originally written).
features_clinical = [
    'birth_weight', 'gest_weeks', 'age_mother', 'gravidity', 'parity',
    'umbilical_cord_ph',
    'o2_demand', 'breath_aid', 'heart_rate',
    'respiration_rate', 'rr_systolic', 'rr_diastolic',
    'base_excess', 'ph_value',
    'gestation_diabetes', 'diabetes_type_1_2', 'adiposity',
    'early_membrane_rupture', 'early_labor_pain', 'green_amniotic_liquor',
    'b_streptococcus', 'fever_sub_partu', 'antibiotics_prepartal',
]

# The two "classic" biomarker columns, and the calprotectin column kept apart.
features_biomarkers_classic = ['crp_value_1', 'IL6_value_1']
feature_cal = ['Cal_value_1']

# Label column.
target = 'diagnosis_infection'

# Clinical + classic biomarkers; `feature_cal` is NOT part of this list.
all_features = [*features_clinical, *features_biomarkers_classic]

In [30]:
# Slice the cleaned frame into the design matrices and the label vector.
# List-based column selection yields DataFrames (even for the single-column
# `feature_cal`); selecting by the scalar `target` yields a Series.
X_all = df_cleaned.loc[:, all_features]
X_bio_classic = df_cleaned.loc[:, features_biomarkers_classic]
X_cal = df_cleaned.loc[:, feature_cal]
y = df_cleaned.loc[:, target]

In [36]:
# Standardize the classic biomarker columns (zero mean, unit variance per
# column). The fitted `scaler` is kept so the same transform can be reused.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_bio_classic)

In [37]:


# 2-D t-SNE embedding of the standardized biomarkers. `random_state` is fixed
# so the stochastic layout is reproducible between runs.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X_scaled)

# A bare expression in the middle of a cell is evaluated but never displayed
# (only the last expression of a cell is echoed), so report the final
# KL divergence of the fit explicitly.
print(f"t-SNE KL divergence: {tsne.kl_divergence_:.4f}")

# Scatter of the embedding, colored by the label `y`.
fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], color=y)
fig.update_layout(
    title="t-SNE visualization of Custom Classification dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()

In [38]:
# Project the standardized biomarkers onto their first two principal components.
# NOTE(review): `X_scaled` has only two columns, so keeping two components
# re-expresses the same points in rotated axes rather than reducing dimension.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X=X_scaled)

In [39]:
# Scatter of the two principal-component scores, colored by the label `y`.
pc_first, pc_second = X_pca[:, 0], X_pca[:, 1]
fig = px.scatter(x=pc_first, y=pc_second, color=y)
fig.update_layout(
    dict(
        title="PCA visualization of Custom Classification dataset",
        xaxis_title="First Principal Component",
        yaxis_title="Second Principal Component",
    )
)
fig.show()

In [42]:
# Scatter of the two standardized input columns themselves (no projection),
# colored by the label `y`. The columns of `X_scaled` follow the order of
# `features_biomarkers_classic`, so the axis labels are taken from that list
# instead of the PCA labels this cell was originally copied with.
fig = px.scatter(x=X_scaled[:, 0], y=X_scaled[:, 1], color=y)
fig.update_layout(
    title="Standardized classic biomarkers",
    xaxis_title=f"{features_biomarkers_classic[0]} (standardized)",
    yaxis_title=f"{features_biomarkers_classic[1]} (standardized)",
)
fig.show()

In [41]:
# Sanity check: (n_samples, n_features) of the standardized biomarker matrix.
X_scaled.shape

(324, 2)