In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

In [4]:
path_csv = '/home/paul/Documents/datasets/files_neonates_final/Sepsis_Validation_FINAL DATASET_131025_SH.sav'
df = pd.read_spss(path_csv)

In [5]:
df.shape

(355, 72)

In [6]:
print(df.columns)

Index(['ID', 'Birth_time', 'birth_weight', 'gest_weeks', 'gest_days',
       'age_mother', 'gravidity', 'parity', 'birth_mode', 'umbilical_cord_ph',
       'capillary_time', 'o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess',
       'ph_value', 'antibiotic_therapy', 'antibiotic_therapy_duration',
       'diagnosis_infection', 'result_blood_culture', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'membrane_rupture_hours', 'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus', 'Präeklampsie', 'crp_max_prepartal',
       'leukocyte_max_prepartal', 'fever_sub_partu', 'antibiotics_prepartal',
       'prepartal_antibiotics_count', 'Labtime1', 'SCP1', 'CRP1', 'IL61',
       'WBC1', 'Labtime2', 'SCP2', 'CRP2', 'IL62', 'WBC2', 'Labtime3', 'SCP3',
       'CRP3', 'IL63', 'WBC3', 'Labtime4', 'SCP4', 'CRP4', 'IL64', 'WBC4',
       'Labtime5', 'SCP5', 'CRP5', 'IL65', 'WBC5',

In [33]:
# select only the relevant features
# select features for classification later
features_relevant_total = ['birth_weight', 'gest_weeks', 'gest_days',
       'age_mother', 'gravidity', 'parity', 'umbilical_cord_ph','o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess', 'diagnosis_infection', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'membrane_rupture_hours', 'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus', 'fever_sub_partu', 'antibiotics_prepartal', 'SCP1', 'CRP1', 'IL61', 'WBC1']

df_subset = df[features_relevant_total].copy()

In [35]:
# look at null counts 
null_counts = df_subset.isnull().sum()

# Filter to show only columns with missing values
# null_counts = null_counts[null_counts > 0]

print("Columns with missing values:")
print(null_counts)

Columns with missing values:
birth_weight                0
gest_weeks                  1
gest_days                   1
age_mother                  0
gravidity                   3
parity                      2
umbilical_cord_ph           3
o2_demand                   0
breath_aid                  0
heart_rate                  1
respiration_rate            5
rr_systolic                 4
rr_diastolic                4
base_excess                17
diagnosis_infection         0
gestation_diabetes          2
diabetes_type_1_2           2
adiposity                   2
early_membrane_rupture      2
membrane_rupture_hours    227
early_labor_pain            3
green_amniotic_liquor       3
b_streptococcus             3
fever_sub_partu             2
antibiotics_prepartal       2
SCP1                       52
CRP1                       10
IL61                        1
WBC1                       10
dtype: int64


In [36]:
# insert 0 for membrane rupture hours
initial_rows = len(df_subset)
df_subset.dropna(subset=['early_membrane_rupture'], inplace=True)
print(f"Dropped {initial_rows - len(df_subset)} rows.")
print(f"Current shape: {df_subset.shape}")

df_subset['membrane_rupture_hours'].fillna(0, inplace=True)
print("Filled NaN values in 'membrane_rupture_hours' with 0.")
print(df_subset[['early_membrane_rupture', 'membrane_rupture_hours']].head())

Dropped 2 rows.
Current shape: (353, 29)
Filled NaN values in 'membrane_rupture_hours' with 0.
   early_membrane_rupture  membrane_rupture_hours
0                     0.0                     0.0
1                     1.0                     0.0
2                     0.0                     0.0
3                     1.0                    57.0
4                     0.0                     0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_subset['membrane_rupture_hours'].fillna(0, inplace=True)


In [37]:
# look at null counts again
null_counts = df_subset.isnull().sum()

# Filter to show only columns with missing values
# null_counts = null_counts[null_counts > 0]

print("Columns with missing values:")
print(null_counts)

Columns with missing values:
birth_weight               0
gest_weeks                 1
gest_days                  1
age_mother                 0
gravidity                  2
parity                     1
umbilical_cord_ph          3
o2_demand                  0
breath_aid                 0
heart_rate                 1
respiration_rate           5
rr_systolic                4
rr_diastolic               4
base_excess               17
diagnosis_infection        0
gestation_diabetes         0
diabetes_type_1_2          0
adiposity                  0
early_membrane_rupture     0
membrane_rupture_hours     0
early_labor_pain           1
green_amniotic_liquor      1
b_streptococcus            1
fever_sub_partu            0
antibiotics_prepartal      0
SCP1                      50
CRP1                      10
IL61                       1
WBC1                       9
dtype: int64


In [38]:
# drop where there are null values
print(f"Shape before dropping remaining NaNs: {df_subset.shape}")
# Drop any row that still has a missing value in any of the selected columns
df_subset.dropna(inplace=True)
print(f"Shape after dropping remaining NaNs: {df_subset.shape}")

print("\nFinal DataFrame Info:")
df_subset.info()
print("\nFinal DataFrame Head:")
print(df_subset.head())

Shape before dropping remaining NaNs: (353, 29)
Shape after dropping remaining NaNs: (273, 29)

Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 273 entries, 1 to 354
Data columns (total 29 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   birth_weight            273 non-null    float64
 1   gest_weeks              273 non-null    float64
 2   gest_days               273 non-null    float64
 3   age_mother              273 non-null    float64
 4   gravidity               273 non-null    float64
 5   parity                  273 non-null    float64
 6   umbilical_cord_ph       273 non-null    float64
 7   o2_demand               273 non-null    float64
 8   breath_aid              273 non-null    float64
 9   heart_rate              273 non-null    float64
 10  respiration_rate        273 non-null    float64
 11  rr_systolic             273 non-null    float64
 12  rr_diastolic            273 non-nul

In [39]:
# compute total gest days
df_subset['total_gest_days'] = (df_subset['gest_weeks'] * 7) + df_subset['gest_days']
# We can now drop the original columns if they are no longer needed
df_subset.drop(columns=['gest_weeks', 'gest_days'], inplace=True)
print("Created 'total_gest_days' column and dropped original columns.")
print(df_subset[['total_gest_days']].head())

Created 'total_gest_days' column and dropped original columns.
   total_gest_days
1            232.0
2            253.0
3            241.0
4            269.0
6            269.0


In [None]:
# select features and labels 
features = ['birth_weight', 'total_gest_days',
       'age_mother', 'gravidity', 'parity', 'umbilical_cord_ph',
       'o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'membrane_rupture_hours', 'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus', 'fever_sub_partu', 'antibiotics_prepartal', 'SCP1', 'CRP1', 'IL61', 'WBC1']
label = ['diagnosis_infection']

In [44]:
# visualize 
X_all = df_subset[features]
y = df_subset[label]

In [45]:
# scale the data
scaler = StandardScaler()
scaler.fit(X_all)
X_scaled = scaler.transform(X_all)

In [55]:
import plotly.express as px
tsne = TSNE(n_components=2, random_state=42, perplexity=25)
X_tsne = tsne.fit_transform(X_all)
tsne.kl_divergence_


fig = px.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], color=y)
fig.update_layout(
    title="t-SNE visualization of Custom Classification dataset",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()

In [56]:
# make train and test split and save
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X_all, 
    y, 
    test_size=0.3,          
    random_state=42,  
    stratify=y
)

In [57]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, 
    y_temp, 
    test_size=0.66,          
    random_state=42,        
    stratify=y_temp 
)


In [58]:
print(f"Original dataset shape: {X_all.shape}")
print("-" * 30)
print(f"Training set shape: {X_train.shape} features, {y_train.shape} labels")
print(f"Validation set shape: {X_val.shape} features, {y_val.shape} labels")
print(f"Test set shape: {X_test.shape} features, {y_test.shape} labels")

Original dataset shape: (273, 27)
------------------------------
Training set shape: (191, 27) features, (191,) labels
Validation set shape: (27, 27) features, (27,) labels
Test set shape: (55, 27) features, (55,) labels


In [59]:
train_set = pd.concat([X_train, y_train], axis=1)
validation_set = pd.concat([X_val, y_val], axis=1)
test_set = pd.concat([X_test, y_test], axis=1)

# Save to CSV files
# index=False prevents pandas from writing the dataframe index as a column
train_set.to_csv('../data/cleaned_train_val_test/train_set.csv', index=False)
validation_set.to_csv('../data/cleaned_train_val_test/validation_set.csv', index=False)
test_set.to_csv('../data/cleaned_train_val_test/test_set.csv', index=False)
