# Building on Question 2.1.1: adding min, max, std, skew, mean and tsfresh features

In [None]:
# Core Libraries
import os
import glob
import itertools

# Data Handling
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Progress Bar
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, auc

# Statistics
from scipy.stats import skew, kurtosis, randint

# Modeling
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    average_precision_score,
    recall_score,
    confusion_matrix,
    roc_auc_score,
)
from sklearn import linear_model

In [None]:

# Per-patient descriptors. ICUType is filtered out right away, so the final
# list is RecordID, Age, Gender, Height and Weight.
static_variables = [
    name
    for name in ('RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight')
    if name != 'ICUType'
]

# Static descriptors that are actually kept as model features.
static_variables_we_want = ['Age', 'Gender', 'Height', 'Weight']

# Every column name known to this notebook (static and time-dependent).
all_variables = [
    'Weight', 'Age', 'TroponinI', 'DiasABP', 'MechVent', 'HCO3',
    'Cholesterol', 'HCT', 'SaO2', 'WBC', 'SysABP', 'Urine', 'ICUType',
    'Gender', 'ALP', 'Creatinine', 'K', 'AST', 'Glucose', 'RespRate',
    'MAP', 'FiO2', 'BUN', 'Na', 'Bilirubin', 'TroponinT', 'PaCO2',
    'Height', 'GCS', 'HR', 'pH', 'PaO2', 'Lactate', 'ALT', 'NISysABP',
    'RecordID', 'Platelets', 'Temp', 'Mg', 'NIDiasABP', 'Albumin', 'NIMAP',
]

# Dynamic variables: everything that is neither static nor ICUType, in the
# order of `all_variables`, followed by Weight_VAR. The order matters because
# later cells select batches of variables by position.
# NOTE(review): Weight_VAR is presumably a derived column present in the
# loaded tables - confirm.
dyn_variables = [
    name
    for name in all_variables
    if name not in static_variables and name != 'ICUType'
]
dyn_variables += ['Weight_VAR']
len(dyn_variables), len(static_variables_we_want)

# Static features first, then the dynamic variables.
initial_column_lists = [*static_variables_we_want, *dyn_variables]

In [None]:

# Load the processed time-series tables of sets a, b and c from parquet.
df_a = pd.read_parquet('data/processed_raw_data_set-a_1.parquet', engine='pyarrow')
df_b = pd.read_parquet('data/processed_raw_data_set-b_1.parquet', engine='pyarrow')
df_c = pd.read_parquet('data/processed_raw_data_set-c_1.parquet', engine='pyarrow')

# When enabled, ICUType is removed from all three tables.
drop_ICUType = True
if drop_ICUType:
    df_a = df_a.drop(columns=['ICUType'])
    df_b = df_b.drop(columns=['ICUType'])
    df_c = df_c.drop(columns=['ICUType'])

# Drop the Time column from every table that has one. Each table is checked
# on its own (errors='ignore'): previously the check looked at df_a only, so
# a table without 'Time' raised a KeyError and a table with 'Time' could keep
# it whenever df_a did not have the column.
df_a = df_a.drop(columns=['Time'], errors='ignore')
df_b = df_b.drop(columns=['Time'], errors='ignore')
df_c = df_c.drop(columns=['Time'], errors='ignore')




## Computing the feature vectors of our patients

Instead of working on the table where the missing values had already been filled, I prefer working on the unfilled table, because otherwise the filled values would be included in the statistics and might flatten the profile of patients with many missing values. I then compute summary statistics (mean, std, max, min, skew) of the variables for each patient over the 49 timestamps.

Finally, I compute the per-column median of the resulting table and use it to fill the remaining missing values.

In [None]:
# Quick check: number of static features, number of dynamic variables, and their total.
len(static_variables_we_want) , len(dyn_variables), len(static_variables_we_want) + len(dyn_variables) 

In [None]:
# Aggregation plan for set a, in column order:
#   1. five summary statistics for every dynamic variable,
#   2. the RecordID itself,
#   3. each wanted static variable that exists in df_a (first value per patient).
agg_funcs = {
    **{name: ['mean', 'std', 'max', 'min', 'skew'] for name in dyn_variables},
    'RecordID': 'first',
    **{name: 'first' for name in static_variables_we_want if name in df_a.columns},
}

# One row per patient, all statistics computed in a single groupby pass.
df_a_agg = df_a.groupby('RecordID').agg(agg_funcs)

# Flatten the (variable, statistic) column pairs into "variable_statistic" names.
df_a_agg.columns = ['_'.join(levels).strip() for levels in df_a_agg.columns.tolist()]


In [None]:
# Aggregation plan for set b, in column order:
#   1. five summary statistics for every dynamic variable,
#   2. the RecordID itself,
#   3. each wanted static variable that exists in df_b (first value per patient).
agg_funcs = {
    **{name: ['mean', 'std', 'max', 'min', 'skew'] for name in dyn_variables},
    'RecordID': 'first',
    **{name: 'first' for name in static_variables_we_want if name in df_b.columns},
}

# One row per patient, all statistics computed in a single groupby pass.
df_b_agg = df_b.groupby('RecordID').agg(agg_funcs)

# Flatten the (variable, statistic) column pairs into "variable_statistic" names.
df_b_agg.columns = ['_'.join(levels).strip() for levels in df_b_agg.columns.tolist()]


In [None]:
# Aggregation plan for set c, in column order:
#   1. five summary statistics for every dynamic variable,
#   2. the RecordID itself,
#   3. each wanted static variable that exists in df_c (first value per patient).
agg_funcs = {
    **{name: ['mean', 'std', 'max', 'min', 'skew'] for name in dyn_variables},
    'RecordID': 'first',
    **{name: 'first' for name in static_variables_we_want if name in df_c.columns},
}

# One row per patient, all statistics computed in a single groupby pass.
df_c_agg = df_c.groupby('RecordID').agg(agg_funcs)

# Flatten the (variable, statistic) column pairs into "variable_statistic" names.
df_c_agg.columns = ['_'.join(levels).strip() for levels in df_c_agg.columns.tolist()]

In [None]:
# Column-wise medians of the aggregated set a. They are computed before any
# filling and are the only statistics used to impute all three sets.
# NOTE(review): sets b and c are presumably filled with set-a medians to keep
# their own statistics out of the imputation - confirm.
df_a_agg_median = df_a_agg.median()

# Replace the remaining NaNs of each table with the set-a medians.
df_a_agg = df_a_agg.fillna(df_a_agg_median)
df_b_agg = df_b_agg.fillna(df_a_agg_median)
df_c_agg = df_c_agg.fillna(df_a_agg_median)

In [None]:
# List the columns of df_a_agg that still contain at least one missing value.
missing_values_a = [
    column
    for column, n_missing in df_a_agg.isnull().sum().items()
    if n_missing != 0
]
print("Missing values in df_a_agg:", missing_values_a)

In [None]:
# List the columns of df_b_agg that still contain at least one missing value.
missing_values_b = [
    column
    for column, n_missing in df_b_agg.isnull().sum().items()
    if n_missing != 0
]
print("Missing values in df_b_agg:", missing_values_b)

In [None]:
# List the columns of df_c_agg that still contain at least one missing value.
missing_values_c = [
    column
    for column, n_missing in df_c_agg.isnull().sum().items()
    if n_missing != 0
]
print("Missing values in df_c_agg:", missing_values_c)

In [None]:
# Remove the Cholesterol_skew feature from the three aggregated tables.
# NOTE(review): presumably dropped because it is still NaN after the median
# fill (see the missing-value printouts) - confirm.
df_a_agg, df_b_agg, df_c_agg = (
    table.drop(columns=['Cholesterol_skew'])
    for table in (df_a_agg, df_b_agg, df_c_agg)
)

In [None]:
# Number of distinct values (NaN counted as a value) of the two Weight_VAR statistics.
tuple(
    df_a_agg[column].nunique(dropna=False)
    for column in ('Weight_VAR_mean', 'Weight_VAR_std')
)

In [None]:
# The aggregated tables must be completely free of missing values from here on.
assert not df_a_agg.isna().to_numpy().any()
assert not df_b_agg.isna().to_numpy().any()
assert not df_c_agg.isna().to_numpy().any()

# Tsfresh

In [None]:
# Load the "no_nan" parquet tables of sets a, b and c (same order as the names).
df_a_no_nan, df_b_no_nan, df_c_no_nan = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in (
        'data/set-a_no_nan.parquet',
        'data/set-b_no_nan.parquet',
        'data/set-c_no_nan.parquet',
    )
)


In [None]:
# Display the column names of the set-a "no_nan" table.
df_a_no_nan.columns

In [None]:
from tsfresh import extract_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_selection.relevance import calculate_relevance_table


In [None]:
# Outcome table for all patients.
outcomes = pd.read_csv('data/outcomes.csv')

# Keep the rows whose RecordID appears in set a and index them by RecordID.
outcomes_a = outcomes[
    outcomes['RecordID'].isin(df_a_no_nan['RecordID'])
].set_index('RecordID')
outcomes_a

## Computing tsfresh features on the training set

In [None]:
from tsfresh import extract_features
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.utilities.dataframe_functions import impute
import pandas as pd
from functools import reduce
from pandas import DataFrame
from tqdm import tqdm

# Switch for the whole training-set extraction below; nothing in this cell
# runs when it is False.
we_compute_training_features = True

if we_compute_training_features:
    # 1. Load the set-a time series and the outcomes of its patients,
    #    indexed by RecordID.
    df_a_no_nan = pd.read_parquet('data/set-a_no_nan.parquet', engine='pyarrow')
    outcomes = pd.read_csv('data/outcomes.csv')
    outcomes_a = (
        outcomes.loc[outcomes['RecordID'].isin(df_a_no_nan['RecordID'])]
        .set_index('RecordID')
    )

    # 2. The dynamic variables come from `dyn_variables`, defined earlier in
    #    the notebook. Only variables from this position on are processed in
    #    this run; it has to match the batch suffix of the output file below.
    first_variable_index = 26

    # 3. One frame of selected features per processed variable.
    all_feature_sets = []

    # 4. Loop through each variable.
    #    enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then
    #    read the length of dyn_variables and show a real progress total.
    for i, var in enumerate(tqdm(dyn_variables, desc="Processing variables")):
        print(f"Processing {i}, {var}...")
        if i < first_variable_index:
            continue

        # Long-format frame for a single variable. The column is renamed to
        # 'value' so the extracted feature names start with 'value__'
        # whatever the variable is.
        df_long = df_a_no_nan[['RecordID', 'Time', var]].rename(columns={var: 'value'})

        # Extract tsfresh features, one row per RecordID.
        features = extract_features(df_long, column_id='RecordID', column_sort='Time', n_jobs=8)

        # Keep only the features that could be computed for every patient.
        features = features.dropna(axis=1, how='any')
        if features.empty:
            continue  # nothing left for this variable

        # Labels aligned on the patients present in the feature table.
        labels = outcomes_a.loc[features.index, 'In-hospital_death']

        # Keep the (at most) five relevant features with the smallest p-values.
        relevance_table = calculate_relevance_table(features, labels, ml_task='classification')
        top_features = (
            relevance_table[relevance_table.relevant]
            .sort_values("p_value")["feature"][:5]
            .tolist()
        )
        if not top_features:
            continue  # no relevant feature: do not collect a column-less frame

        # Prefix with the variable name: "<var>__value__<feature>".
        selected = features[top_features].add_prefix(f"{var}__")
        all_feature_sets.append(selected)

    # 5. Combine all variables on RecordID. reduce() without an initial value
    #    fails on an empty list, so fail early with an explicit message.
    if not all_feature_sets:
        raise ValueError(
            "No tsfresh feature was selected for any variable; nothing to combine."
        )
    tsfresh_final_features = reduce(
        lambda left, right: left.join(right, how='outer'), all_feature_sets
    )

    # 6. Final cleanup: fill the gaps left by the outer join with column medians.
    tsfresh_final_features = tsfresh_final_features.fillna(tsfresh_final_features.median())

    print("Final shape:", tsfresh_final_features.shape)

    # Save the selected features of this batch.
    tsfresh_final_features.to_parquet('data/tsfresh_final_features_26.parquet', engine='pyarrow', index=True)


## Retrieving, for the test set, the same tsfresh features as in the training set

In [None]:
from collections import defaultdict


In [None]:
# Load the tsfresh features saved for the training set and show their names.
# NOTE(review): the features appear to be stored in several batch files
# (_10, _11_25, _26); the lines marked CHANGE presumably have to be switched
# by hand to the batch being processed - confirm.
# tsfresh_final_features_10 = pd.read_parquet('data/tsfresh_final_features_10.parquet', engine='pyarrow') #CHANGE
# tsfresh_final_features_10.columns
# tsfresh_final_features_11_25 = pd.read_parquet('data/tsfresh_final_features_11_25.parquet', engine='pyarrow')
# tsfresh_final_features_11_25.columns
tsfresh_final_features_26 = pd.read_parquet('data/tsfresh_final_features_26.parquet', engine='pyarrow')
tsfresh_final_features_26.columns


In [None]:
def group_by_first_double_underscore(feature_list):
    """Group names by the text found before their first '__'.

    Each name is cut at its first double underscore: the left part becomes
    the dictionary key and the right part is appended to that key's list.
    Names without any '__' are ignored.

    Args:
        feature_list: iterable of strings such as "HR__value__mean".

    Returns:
        A plain dict mapping each prefix to the list of its remainders, with
        keys and list items in order of first appearance.
    """
    buckets = {}
    for name in feature_list:
        prefix, separator, remainder = name.partition('__')
        if not separator:
            # No double underscore in this name: nothing to group.
            continue
        buckets.setdefault(prefix, []).append(remainder)
    return buckets


In [None]:
# Map each variable name (text before the first '__') to the list of tsfresh
# feature names that were saved for it in the loaded training batch.
dico_selected = group_by_first_double_underscore(tsfresh_final_features_26.columns.tolist()) #CHANGE
dico_selected

In [None]:
from tsfresh import extract_features
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.utilities.dataframe_functions import impute
import pandas as pd
from functools import reduce
from pandas import DataFrame
from tqdm import tqdm


# 1. Load data
# Set-c time series used for the test-set feature extraction.
df_c_no_nan = pd.read_parquet('data/set-c_no_nan.parquet', engine='pyarrow')
# Outcome table; the extraction loop in this section does not read it.
outcomes = pd.read_csv('data/outcomes.csv')

In [None]:
# df_c_no_nan = df_c_no_nan.iloc[:490, :]

In [None]:

# 2. The dynamic variables come from `dyn_variables`, defined earlier in the
#    notebook; `dico_selected` maps a variable to the tsfresh features that
#    were kept for it on the training set.
#    Only variables from this position on are processed in this run; it has
#    to match the training batch loaded into `dico_selected`.
first_variable_index = 26

# 3. One frame of selected features per processed variable.
all_feature_sets = []

# 4. Loop through each variable.
#    enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can then read
#    the length of dyn_variables and show a real progress total.
for i, var in enumerate(tqdm(dyn_variables, desc="Processing variables")):
    print(f"Processing {i}, {var}...")
    if var not in dico_selected:
        # No feature was kept for this variable in the loaded training batch.
        print(var)
        print('not there')
        continue

    if i < first_variable_index:
        continue

    # Long-format frame for a single variable. The column is renamed to
    # 'value' so the extracted feature names start with 'value__', matching
    # the names stored in `dico_selected`.
    df_long = df_c_no_nan[['RecordID', 'Time', var]].rename(columns={var: 'value'})

    # Extract tsfresh features, one row per RecordID.
    features = extract_features(df_long, column_id='RecordID', column_sort='Time', n_jobs=8)

    # Columns containing NaNs are kept on purpose here: dropping them could
    # remove a feature that was selected on the training set.
    if features.empty:
        continue  # nothing computed for this variable

    # Keep exactly the features selected on the training set and prefix them
    # with the variable name: "<var>__value__<feature>".
    top_features = dico_selected[var]
    selected = features[top_features].add_prefix(f"{var}__")

    all_feature_sets.append(selected)

# 5. Combine all variables on RecordID. reduce() without an initial value
#    fails on an empty list, so fail early with an explicit message.
if not all_feature_sets:
    raise ValueError(
        "No tsfresh feature was extracted for any variable; nothing to combine."
    )
tsfresh_final_features = reduce(
    lambda left, right: left.join(right, how='outer'), all_feature_sets
)

print("Final shape:", tsfresh_final_features.shape)
tsfresh_final_features.head()


In [None]:
# save tsfresh_final_features CHANGE
# The extraction loop above skips every variable before index 26 and uses the
# features read from tsfresh_final_features_26.parquet, so the result belongs
# in the _C_26 file. Writing it to _C_10 would overwrite another batch.
# Switch the active line together with the batch settings above.
# tsfresh_final_features.to_parquet('data/tsfresh_final_features_C_10.parquet', engine='pyarrow', index=True)
# tsfresh_final_features.to_parquet('data/tsfresh_final_features_C_11_25.parquet', engine='pyarrow', index=True)
tsfresh_final_features.to_parquet('data/tsfresh_final_features_C_26.parquet', engine='pyarrow', index=True)