In [1]:
import splits_generation
import numpy as np

import pandas as pd
import numpy as np
 
import warnings
warnings.filterwarnings("ignore")
 
import pyten
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
 
import utils


# E2

In [2]:
# SEPSIS DEFINITION
# Question 1) How many hours do we propagate a True antibiotic (atb) flag
#             backwards and forwards? Controlled by Nts_pre and Nts_post.
# Question 2) How many hours do we propagate from sepsis_onset?
#             Controlled by N_prog_sep.

sep_def = {
    # Parameters for antibiotic propagation
    'Nts_pre': 24, 'Nts_post': 24,
    # Parameter for sepsis propagation
    'N_prog_sep': 12,
    # Parameter used to determine sepsis onset
    'increm_sofa': 2
}

# Name of the propagated-sepsis label column. It is built once so that every
# list below agrees: a '%2s' format pads to width 2 and would produce
# 'sep_ 6' (with a space) for a single-digit N_prog_sep, while plain string
# concatenation produces 'sep_6'.
sep_label = 'sep_' + str(sep_def['N_prog_sep'])

# Columns kept from the raw dataset: identifiers, features and labels.
keys_to_select = ['stay_id', 'stay_time',
'hr_raw','o2sat_raw','temp_raw','sbp_raw','map_raw','dbp_raw','resp_raw','etco2_raw','fio2_raw',
'be_raw', 'bicar_raw','ph_raw','pco2_raw','cl_raw','mg_raw','phos_raw','k_raw','ast_raw','bun_raw',
'alp_raw','ca_raw','crea_raw','bildir_raw','glu_raw','lact_raw', 'bili_raw','tri_raw','hct_raw',
'hgb_raw','ptt_raw','wbc_raw','fgn_raw','plt_raw','age_static','female_static','cai_raw','na_raw',
'po2_raw','alb_raw','alt_raw','ck_raw','ckmb_raw','crp_raw','tnt_raw','urine_raw','basos_raw',
'bnd_raw','eos_raw','esr_raw','hbco_raw','inrpt_raw','lymph_raw','mch_raw','mchc_raw',
'mcv_raw','methb_raw','neut_raw','pt_raw','rbc_raw','rdw_raw','tco2_raw','weight_static','height_static',
'SI','sep_onset', sep_label]


params_to_configure = {
    # File to load
    'path': '../datasets/eicu_demo_0.5.6.parquet',
    'w_pre_onset':  3,  # Number of windows pre sep_onset = 1
    'w_post_onset':  3,  # Number of windows post sep_onset = 1
    'keys': keys_to_select,
    'label':  ['SI', 'sep_onset', sep_label],
    'f_tr_te':  ['stay_id', 'stay_time', 'w_id', 'SI', 'sep_onset', sep_label],
    # Step of the sliding window
    'moving_span': 1,
    # Window length; 0 means "compute it from the data" (see preprocessing)
    'min_length_pat': 0,
    # Type of imputation
    'imputationType': "LVCF", #LRTC (to implement)
    # Threshold (in % of missing values) used to flag features without data
    'thr_nan': 0,
}

seeds = [34, 56, 78]
folders = ["s1", "s2", "s3"]

In [3]:
def preprocessing(params, sep_def, debug=False):
    """Load the dataset, attach sepsis labels, select columns and impute.

    Parameters
    ----------
    params : dict
        Reads 'path' (parquet file), 'keys' (columns to keep), 'thr_nan'
        (missing-data threshold in %), 'imputationType' and 'min_length_pat'.
        ``params['min_length_pat']`` is overwritten with the value that is
        actually used (side effect kept for existing callers).
    sep_def : dict
        Reads 'Nts_pre', 'Nts_post', 'N_prog_sep' and 'increm_sofa', which
        are forwarded to ``utils.get_SI`` and ``utils.get_sep``.
    debug : bool, default False
        When True, print extra diagnostics about missing data and labels.

    Returns
    -------
    tuple
        ``(df_final, min_length_pat)``: the processed frame and the minimum
        number of rows per stay (computed when ``params['min_length_pat']``
        is 0, otherwise taken from ``params`` as given).
    """

    def _pct_missing(frame):
        # Percentage of NaN cells in `frame`, rounded to 4 decimals.
        return np.round((frame.isnull().sum().sum() /
                         (frame.shape[0] * frame.shape[1]) * 100), 4)

    # Load data and derive the labels through the project helpers.
    df = pd.read_parquet(params['path'])
    print("# of patients:", len(df.stay_id.unique()))
    df = utils.get_SI(df, sep_def['Nts_pre'], sep_def['Nts_post'])
    df = utils.get_sep(df, sep_def['N_prog_sep'],  sep_def['increm_sofa'])
    df = df.drop(['diff'], axis=1)

    # Step -1. Keep only rows at or after stay_time 0.
    df_icu_entry = df[df.stay_time >= 0].reset_index(drop=True)
    print("# of icu-patients:", len(df_icu_entry.stay_id.unique()))

    if debug:
        print("% of missing values (pre-filter):", _pct_missing(df))
        print("% of missing values (post-filter):", _pct_missing(df_icu_entry))
        print("# of patients labels:", len(df_icu_entry.stay_id.unique()))
        print("% of missing values post remove patients with nan label:", _pct_missing(df_icu_entry))

    # Step 1. Select relevant features. An explicit copy is taken so the
    # in-place column assignment below never writes into a view of
    # df_icu_entry.
    df_final = df_icu_entry[params['keys']].copy()
    missing_percentage = df_final.isnull().mean() * 100
    feats = missing_percentage[missing_percentage > params['thr_nan']].index.tolist()

    if debug:
        print("Features with more than " + str(params['thr_nan']) + "% missing data:" +  str(len(feats)))
        print("# of patients:", len(df_final.stay_id.unique()))
        print("Dimensiones of dataset:", df_final.shape)

    # NOTE: `feats` is only reported; dropping those columns is currently
    # disabled:
#     df_filter = df_final.drop(feats, axis=1)
#     df_final = df_filter.astype(float)

    print("Dimensions post remove some feautures:", df_final.shape)

    if params['imputationType'] == "LVCF":
        df_final = utils.LVCF(df_final)
        # Whatever the helper leaves missing is filled with 0.
        df_final = df_final.fillna(0)
        print("# of patients post imputation:", len(df_final.stay_id.unique()))

    if params['min_length_pat'] == 0:
        # 0 means "derive it": the smallest number of rows among all stays.
        min_length_pat = df_final.groupby("stay_id").size().min()
        print("min_length_pat:", min_length_pat)
    else:
        # A caller-supplied length is used as is. Without this branch the
        # name was unbound and the function raised UnboundLocalError.
        min_length_pat = params['min_length_pat']

    params['min_length_pat'] = min_length_pat
    df_final['stay_id'] = df_final['stay_id'].astype(int)

    if debug:
        print("# of patients:", len(df_final.stay_id.unique()))
        print("# of pacientes labeled with 1", len(df_final[df_final.sep_onset == 1][['stay_id']].stay_id.unique()))
        print("# of pacientes labeled with 0", len(df_final[df_final.sep_onset == 0][['stay_id']].stay_id.unique()))
        print("Dimensiones pre-sliding window:", df_final.shape)
    return df_final, min_length_pat
    
# Run the preprocessing step with the configuration defined above (quiet mode).
df, min_length_pat = preprocessing(params_to_configure, sep_def, debug=False)

# of patients: 1647
# of icu-patients: 1647
Dimensions post remove some feautures: (88613, 68)
# of patients post imputation: 1647
min_length_pat: 6


In [9]:
# Stays that contain at least one row flagged as sepsis onset.
pats = df.loc[df['sep_onset'] == 1, 'stay_id'].unique()
# Label columns of the third such stay, rows labelled 0 through 30.
df.loc[df['stay_id'] == pats[2], ['stay_id', 'sep_onset', 'sep_12']].reset_index(drop=True).loc[0:30]

Unnamed: 0,stay_id,sep_onset,sep_12
0,152954,0,0
1,152954,0,0
2,152954,0,0
3,152954,0,0
4,152954,0,0
5,152954,0,0
6,152954,0,0
7,152954,0,0
8,152954,0,0
9,152954,0,0


In [5]:
 def slidingWindow(df, moving_span, window_length):
    """Cut every stay into fixed-length windows of consecutive rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'stay_id' column; rows of a stay are windowed in the
        order in which they appear in `df`.
    moving_span : int
        Number of rows the window start advances between two windows.
    window_length : int
        Number of rows in each window.

    Returns
    -------
    pandas.DataFrame
        All windows stacked with a fresh 0..n-1 index, plus a 'w_id' column
        formatted as "<stay_id>_<window number>". Stays shorter than
        `window_length` produce no window. `df` itself is not modified.

    Raises
    ------
    ValueError
        If `moving_span` or `window_length` is smaller than 1.
    """
    if moving_span < 1 or window_length < 1:
        raise ValueError("moving_span and window_length must be >= 1")

    # Windows are collected in a list and concatenated once at the end;
    # growing a DataFrame with pd.concat inside the loops is quadratic.
    windows = []
    # groupby (sorted keys) visits the stays in the same order as np.unique.
    for id_pat, df_patient in df.groupby('stay_id', sort=True):
        last_start = df_patient.shape[0] - window_length
        starts = range(0, last_start + 1, moving_span)
        for j, start in enumerate(starts):
            # assign() returns a new frame, so no slice of `df` is written to.
            windows.append(
                df_patient.iloc[start:start + window_length]
                .assign(w_id=str(id_pat) + "_" + str(j))
            )

    if not windows:
        # Keep the expected columns even when no stay is long enough.
        return df.iloc[0:0].assign(w_id=pd.Series(dtype=object))

    return pd.concat(windows, ignore_index=True)

In [6]:
# Window every stay: windows are min_length_pat rows long and the start
# advances by the configured moving_span.
df_final = slidingWindow(
    df,
    moving_span=params_to_configure['moving_span'],
    window_length=min_length_pat,
)

In [18]:
# Number of distinct windows produced for the first stay in `pats`.
print(len(df_final.loc[df_final['stay_id'] == pats[0], 'w_id'].unique()))
# Row count of that stay after windowing, divided by 36.
# NOTE(review): 36 is a hard-coded divisor whose origin is not visible in this
# notebook — confirm what it is meant to represent.
len(df_final.loc[df_final['stay_id'] == pats[0]]) / 36

39


6.5

In [22]:
debug = False

if debug:
    print("Dimensiones post-sliding window:", df_final.shape)

# Window selection is delegated to utils.filter_windows, driven by the
# configured number of windows before/after sepsis onset.
df_filter = utils.filter_windows(
    df_final,
    params_to_configure['w_pre_onset'],
    params_to_configure['w_post_onset'],
)

if debug:
    print("Dimensiones post-filtering window:", df_filter.shape)

In [28]:
# TODO: feed the 666 mask value into the model.
#
# Display the filtered windows.
df_filter

Unnamed: 0,stay_id,stay_time,hr_raw,o2sat_raw,temp_raw,sbp_raw,map_raw,dbp_raw,resp_raw,etco2_raw,...,pt_raw,rbc_raw,rdw_raw,tco2_raw,weight_static,height_static,SI,sep_onset,sep_12,w_id
0,147784,0.0,82.5,95.0,0.0,0.0,98.0,0.0,18.0,0.0,...,10.1,4.02,13.8,0.0,95.6,154.9,1,1,1,147784_0
1,147784,1.0,90.5,96.5,0.0,0.0,106.0,0.0,18.0,0.0,...,10.1,4.02,13.8,0.0,95.6,154.9,1,0,1,147784_0
2,147784,2.0,90.0,98.0,0.0,0.0,91.0,0.0,16.0,0.0,...,10.1,4.02,13.8,0.0,95.6,154.9,1,0,1,147784_0
3,147784,3.0,92.0,97.0,0.0,0.0,90.5,0.0,20.0,0.0,...,10.1,4.02,13.8,0.0,95.6,154.9,1,0,1,147784_0
4,147784,4.0,87.0,96.5,0.0,0.0,86.5,0.0,24.0,0.0,...,10.1,4.02,13.8,0.0,95.6,154.9,1,0,1,147784_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27595,3347502,4.0,69.0,97.0,0.0,0.0,73.0,0.0,36.0,0.0,...,13.2,3.73,17.1,0.0,75.0,172.7,1,0,1,3347502_3
27596,3347502,5.0,69.0,97.0,0.0,0.0,79.0,0.0,38.5,0.0,...,13.2,3.73,17.1,0.0,75.0,172.7,1,0,1,3347502_3
27597,3347502,6.0,69.0,98.0,0.0,0.0,79.0,0.0,38.5,0.0,...,13.2,3.73,17.1,0.0,75.0,172.7,1,0,1,3347502_3
27598,3347502,7.0,69.0,97.0,0.0,0.0,75.0,0.0,41.5,0.0,...,13.2,3.73,17.1,0.0,75.0,172.7,1,0,1,3347502_3


In [56]:
# Drop the propagated label, the window id, the stay id and the time column;
# what remains is compared row by row.
df_filter_aux = df_filter.drop(columns=['sep_12', 'w_id', 'stay_id', 'stay_time'])
# Every row that has at least one exact duplicate elsewhere in the frame.
df_filter_aux.loc[df_filter_aux.duplicated(keep=False)]

Unnamed: 0,hr_raw,o2sat_raw,temp_raw,sbp_raw,map_raw,dbp_raw,resp_raw,etco2_raw,fio2_raw,be_raw,...,methb_raw,neut_raw,pt_raw,rbc_raw,rdw_raw,tco2_raw,weight_static,height_static,SI,sep_onset
1,90.5,96.5,0.0,0.0,106.0,0.0,18.0,0.0,60.0,10.0,...,0.0,77.0,10.1,4.02,13.8,0.0,95.6,154.9,1,0
2,90.0,98.0,0.0,0.0,91.0,0.0,16.0,0.0,60.0,6.0,...,0.0,77.0,10.1,4.02,13.8,0.0,95.6,154.9,1,0
3,92.0,97.0,0.0,0.0,90.5,0.0,20.0,0.0,60.0,10.0,...,0.0,77.0,10.1,4.02,13.8,0.0,95.6,154.9,1,0
4,87.0,96.5,0.0,0.0,86.5,0.0,24.0,0.0,60.0,18.0,...,0.0,77.0,10.1,4.02,13.8,0.0,95.6,154.9,1,0
5,83.0,96.0,0.0,0.0,75.0,0.0,24.0,0.0,60.0,10.0,...,0.0,77.0,10.1,4.02,13.8,0.0,95.6,154.9,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27594,69.0,93.0,0.0,0.0,73.0,0.0,33.0,0.0,50.0,-12.0,...,0.0,0.0,13.2,3.73,17.1,0.0,75.0,172.7,1,0
27595,69.0,97.0,0.0,0.0,73.0,0.0,36.0,0.0,50.0,-14.0,...,0.0,0.0,13.2,3.73,17.1,0.0,75.0,172.7,1,0
27596,69.0,97.0,0.0,0.0,79.0,0.0,38.5,0.0,50.0,-14.0,...,0.0,0.0,13.2,3.73,17.1,0.0,75.0,172.7,1,0
27597,69.0,98.0,0.0,0.0,79.0,0.0,38.5,0.0,50.0,-14.0,...,0.0,0.0,13.2,3.73,17.1,0.0,75.0,172.7,1,0


1

In [69]:
import pandas as pd
import numpy as np
from joblib import Parallel, delayed

# Example DataFrames
# df_filter_aux and df_filter should be already defined

def check_condition(i, df_filter_aux, df_filter):
    """Return `i` when row `i` is duplicated elsewhere and its label is missing.

    Parameters
    ----------
    i : int
        Positional row number, valid for both frames.
    df_filter_aux : pandas.DataFrame
        Columns that are compared for exact equality.
    df_filter : pandas.DataFrame
        Frame with the same row order holding the 'sep_12' label column.

    Returns
    -------
    int or None
        `i` if some *other* row of `df_filter_aux` equals row `i` in every
        column and ``df_filter['sep_12']`` is NaN at position `i`;
        otherwise None.
    """
    # Cheap test first; the row is addressed by position in both frames.
    if not pd.isna(df_filter['sep_12'].iloc[i]):
        return None

    values = df_filter_aux.to_numpy()
    same_as_i = (values == values[i]).all(axis=1)
    # A row always equals itself; ignoring it is what makes this a real
    # duplicate test.
    same_as_i[i] = False
    return i if same_as_i.any() else None

def parallel_process(df_filter_aux, df_filter):
    """Run check_condition on every row position using joblib workers.

    Returns the list of row positions for which check_condition did not
    return None, in row order.
    """
    n_rows = df_filter_aux.shape[0]

    # One delayed call per row position; n_jobs=-1 is passed to joblib.
    tasks = (
        delayed(check_condition)(row, df_filter_aux, df_filter)
        for row in range(n_rows)
    )
    outcomes = Parallel(n_jobs=-1)(tasks)

    # Keep only the positions that were flagged.
    return [hit for hit in outcomes if hit is not None]

# Requires df_filter_aux and df_filter from the earlier cells.
# `arr` holds the row positions flagged by check_condition.
arr = parallel_process(df_filter_aux, df_filter)
print(arr)


[]


In [43]:
# Same features but a different label: look for contradictory samples.
# Group the filtered windows by every column of df_filter_aux.
feature_cols = list(df_filter_aux.columns)
grouped = df_filter.groupby(feature_cols)['sep_12']

# A group is inconsistent when it holds more than one distinct label.
inconsistent_groups = grouped.nunique() > 1

# Report the offending rows. They are selected from df_filter (the frame that
# was grouped) using names defined in this cell only, so the cell no longer
# depends on `feature_columns` from a later cell or on the unrelated `df`.
if inconsistent_groups.any():
    inconsistent_features = inconsistent_groups[inconsistent_groups].index
    print("Hay filas con las mismas características pero diferentes etiquetas:")
    # transform('nunique') broadcasts each group's label count back to its rows.
    print(df_filter[grouped.transform('nunique') > 1])
else:
    print("No hay inconsistencias en las etiquetas.")

No hay inconsistencias en las etiquetas.


In [48]:
# One boolean per feature group: True when the group holds more than one
# distinct 'sep_12' value.
# NOTE(review): execution counts in this notebook are non-sequential, so the
# saved output may not match the current definition of `grouped` — re-run.
grouped.nunique() > 1

stay_id  stay_time  hr_raw  o2sat_raw  temp_raw  sbp_raw  map_raw  dbp_raw  resp_raw  etco2_raw  fio2_raw  be_raw  bicar_raw  ph_raw  pco2_raw  cl_raw  mg_raw  phos_raw  k_raw  ast_raw  bun_raw  alp_raw  ca_raw  crea_raw  bildir_raw  glu_raw  lact_raw  bili_raw  tri_raw  hct_raw  hgb_raw  ptt_raw  wbc_raw  fgn_raw  plt_raw  age_static  female_static  cai_raw  na_raw  po2_raw  alb_raw  alt_raw  ck_raw  ckmb_raw  crp_raw  tnt_raw  urine_raw  basos_raw  bnd_raw  eos_raw  esr_raw  hbco_raw  inrpt_raw  lymph_raw  mch_raw  mchc_raw  mcv_raw  methb_raw  neut_raw  pt_raw  rbc_raw  rdw_raw  tco2_raw  weight_static  height_static  SI  sep_onset  w_id     
147784   0.0        82.5    95.0       0.0       0.0      98.0     0.0      18.0      0.0        60.0       5.0    39.0       7.120   67.0      102.0   2.0     0.0       4.3    20.0     17.0     83.0     7.9     0.64      0.0         139.5    1.2       0.3       0.03     41.4     12.8     28.0     10.7     0.0      195.0    0.0         True    

In [49]:
import pandas as pd

# Build a small example frame to illustrate the consistency check.
data = {
    'feature1': [1, 1, 2, 2, 3, 3],
    'feature2': [10, 10, 20, 20, 30, 30],
    'label': ['A', 'B', 'A', 'A', 'B', 'B']
}

# Named example_df so the notebook-wide `df` (the preprocessed dataset) is
# not overwritten by toy data.
example_df = pd.DataFrame(data)

# Feature columns and label column of the example.
feature_columns = ['feature1', 'feature2']
label_column = 'label'

# Group by the features.
grouped = example_df.groupby(feature_columns)[label_column]

# Check whether any group holds more than one distinct label.
inconsistent_groups = grouped.nunique() > 1

# Select and show the rows that belong to an inconsistent group.
if inconsistent_groups.any():
    inconsistent_features = inconsistent_groups[inconsistent_groups].index
    inconsistent_rows = example_df[example_df[feature_columns].apply(tuple, axis=1).isin(inconsistent_features)]
    print("Hay filas con las mismas características pero diferentes etiquetas:")
    print(inconsistent_rows)
else:
    print("No hay inconsistencias en las etiquetas.")


Hay filas con las mismas características pero diferentes etiquetas:
   feature1  feature2 label
0         1        10     A
1         1        10     B


In [51]:
# Same check on the toy example: only the (1, 10) group mixes two labels.
grouped.nunique() > 1

feature1  feature2
1         10           True
2         20          False
3         30          False
Name: label, dtype: bool