In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [13]:
path_csv = '/home/paul/Documents/datasets/files_neonates_final/Sepsis_Validation_FINAL DATASET_131025_SH.sav'
df = pd.read_spss(path_csv)

In [14]:
df.shape

(355, 72)

In [15]:
print(df.columns)

Index(['ID', 'Birth_time', 'birth_weight', 'gest_weeks', 'gest_days',
       'age_mother', 'gravidity', 'parity', 'birth_mode', 'umbilical_cord_ph',
       'capillary_time', 'o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess',
       'ph_value', 'antibiotic_therapy', 'antibiotic_therapy_duration',
       'diagnosis_infection', 'result_blood_culture', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'membrane_rupture_hours', 'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus', 'Präeklampsie', 'crp_max_prepartal',
       'leukocyte_max_prepartal', 'fever_sub_partu', 'antibiotics_prepartal',
       'prepartal_antibiotics_count', 'Labtime1', 'SCP1', 'CRP1', 'IL61',
       'WBC1', 'Labtime2', 'SCP2', 'CRP2', 'IL62', 'WBC2', 'Labtime3', 'SCP3',
       'CRP3', 'IL63', 'WBC3', 'Labtime4', 'SCP4', 'CRP4', 'IL64', 'WBC4',
       'Labtime5', 'SCP5', 'CRP5', 'IL65', 'WBC5',

In [16]:
df['early_membrane_rupture'].isnull().sum()

np.int64(2)

In [17]:
# drop where gest_days/weeks is null 
print("\nStep 1: Dropping rows with null gestational age...")
initial_rows = len(df)
df.dropna(subset=['gest_weeks', 'gest_days'], inplace=True)
print(f"Dropped {initial_rows - len(df)} rows.")
print(f"Current shape: {df.shape}")


Step 1: Dropping rows with null gestational age...
Dropped 1 rows.
Current shape: (354, 72)


In [18]:
# add gest_total in days
print("\nStep 2: Calculating total gestation time in days...")
df['total_gest_days'] = (df['gest_weeks'] * 7) + df['gest_days']
# We can now drop the original columns if they are no longer needed
df.drop(columns=['gest_weeks', 'gest_days'], inplace=True)
print("Created 'total_gest_days' column and dropped original columns.")
print(df[['total_gest_days']].head())



Step 2: Calculating total gestation time in days...
Created 'total_gest_days' column and dropped original columns.
   total_gest_days
0            271.0
1            232.0
2            253.0
3            241.0
4            269.0


In [19]:
# insert 0 for membrane rupture hours
initial_rows = len(df)
df.dropna(subset=['early_membrane_rupture'], inplace=True)
print(f"Dropped {initial_rows - len(df)} rows.")
print(f"Current shape: {df.shape}")

df['membrane_rupture_hours'].fillna(0, inplace=True)
print("Filled NaN values in 'membrane_rupture_hours' with 0.")
print(df[['early_membrane_rupture', 'membrane_rupture_hours']].head())

Dropped 2 rows.
Current shape: (352, 71)
Filled NaN values in 'membrane_rupture_hours' with 0.
   early_membrane_rupture  membrane_rupture_hours
0                     0.0                     0.0
1                     1.0                     0.0
2                     0.0                     0.0
3                     1.0                    57.0
4                     0.0                     0.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['membrane_rupture_hours'].fillna(0, inplace=True)


In [36]:
# select features for classification later
features_clinical = ['birth_weight', 'total_gest_days',
       'age_mother', 'gravidity', 'parity', 'umbilical_cord_ph',
       'capillary_time', 'o2_demand', 'breath_aid', 'heart_rate',
       'respiration_rate', 'rr_systolic', 'rr_diastolic', 'base_excess',
       'ph_value', 'antibiotic_therapy', 'diagnosis_infection', 'gestation_diabetes',
       'diabetes_type_1_2', 'adiposity', 'early_membrane_rupture',
       'membrane_rupture_hours', 'early_labor_pain', 'green_amniotic_liquor',
       'b_streptococcus', 'fever_sub_partu', 'antibiotics_prepartal']

features_biomarkers = ['SCP1', 'CRP1', 'IL61']

In [39]:
null_counts = df[features_clinical + features_biomarkers].isnull().sum()

# Filter to show only columns with missing values
# null_counts = null_counts[null_counts > 0]

print("Columns with missing values:")
print(null_counts)

Columns with missing values:
birth_weight               0
total_gest_days            0
age_mother                 0
gravidity                  2
parity                     1
umbilical_cord_ph          3
capillary_time             0
o2_demand                  0
breath_aid                 0
heart_rate                 1
respiration_rate           5
rr_systolic                4
rr_diastolic               4
base_excess               17
ph_value                  13
antibiotic_therapy         0
diagnosis_infection        0
gestation_diabetes         0
diabetes_type_1_2          0
adiposity                  0
early_membrane_rupture     0
membrane_rupture_hours     0
early_labor_pain           1
green_amniotic_liquor      1
b_streptococcus            1
fever_sub_partu            0
antibiotics_prepartal      0
SCP1                      50
CRP1                      10
IL61                       1
dtype: int64


In [38]:
# drop where there are null values
df_final = df[features_clinical + features_biomarkers].copy()
print(f"Shape before dropping remaining NaNs: {df_final.shape}")
# Drop any row that still has a missing value in any of the selected columns
df_final.dropna(inplace=True)
print(f"Shape after dropping remaining NaNs: {df_final.shape}")

print("\nFinal DataFrame Info:")
df_final.info()
print("\nFinal DataFrame Head:")
print(df_final.head())

Shape before dropping remaining NaNs: (352, 30)
Shape after dropping remaining NaNs: (279, 30)

Final DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 279 entries, 1 to 354
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   birth_weight            279 non-null    float64
 1   total_gest_days         279 non-null    float64
 2   age_mother              279 non-null    float64
 3   gravidity               279 non-null    float64
 4   parity                  279 non-null    float64
 5   umbilical_cord_ph       279 non-null    float64
 6   capillary_time          279 non-null    object 
 7   o2_demand               279 non-null    float64
 8   breath_aid              279 non-null    float64
 9   heart_rate              279 non-null    float64
 10  respiration_rate        279 non-null    float64
 11  rr_systolic             279 non-null    float64
 12  rr_diastolic            279 non-nul

In [None]:
# select features and labels 

In [None]:
# visualize 


In [None]:
# make train and test split and save
