In [None]:
import pandas as pd
import numpy as np

import datetime
from datetime import timedelta

import matplotlib.pyplot as plt

import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket "ignore" filter also hides pandas/NumPy deprecation and
# chained-assignment warnings; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
def dataframe_to_tensor(df, y, eliminateColumn, columns, timeStepLength):
    """Stack the rows of each patient in ``df`` into a 3-D array.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with an ``Admissiondboid`` column identifying the
        patient each row belongs to.
    y : pandas.DataFrame or pandas.Series
        Labels; re-indexed to ``df.index`` and returned.
    eliminateColumn : bool
        When true, ``columns`` are removed from ``df`` before stacking.
    columns : list
        Column names to remove when ``eliminateColumn`` is true.
    timeStepLength : int
        Number of rows every patient must contribute.

    Returns
    -------
    X : numpy.ndarray
        Array of shape ``(n_patients, timeStepLength, n_features)``; patients
        appear in order of first occurrence in ``df``.
    y : same type as the ``y`` argument
        ``y`` re-indexed to ``df.index``.
    keys : list
        Names of the feature columns that make up the last axis of ``X``.

    Raises
    ------
    ValueError
        If a patient does not have exactly ``timeStepLength`` rows.
    """
    # Unique patient ids, kept in order of first appearance (np.unique sorts,
    # so the first-occurrence indices are re-sorted to restore row order).
    _, first_idx = np.unique(df.Admissiondboid, return_index=True)
    listPatients = np.array(df.Admissiondboid)[np.sort(first_idx)]

    y = y.reindex(df.index)

    # Drop the excluded columns once instead of once per patient. The feature
    # count is taken from what is actually left, so eliminateColumn=False
    # no longer produces a mismatching reshape.
    features = df.drop(columns=columns) if eliminateColumn else df
    keys = list(features.keys())
    n_features = len(keys)

    patient_ids = df.Admissiondboid
    blocks = []
    for patient in listPatients:
        rows = np.array(features[(patient_ids == patient).to_numpy()])
        if rows.shape[0] != timeStepLength:
            raise ValueError(
                f"Patient {patient!r} has {rows.shape[0]} rows, "
                f"expected timeStepLength={timeStepLength}"
            )
        blocks.append(rows.reshape(timeStepLength, n_features))

    # Collect then stack once: avoids the quadratic cost of np.append in a loop
    # and gives a well-defined (empty) result when df has no patients.
    if blocks:
        X = np.stack(blocks, axis=0)
    else:
        X = np.empty((0, timeStepLength, n_features))

    return X, y, keys

In [None]:
# #########################
# # ROBUST NORM #
# #########################

def _robust_scale_column(column, min_value, max_value, sentinel=666):
    """Min-max scale one column into [0, 1], leaving sentinel entries untouched."""
    values = column.astype(float)
    is_sentinel = values == sentinel
    span = max_value - min_value
    if span > 0:
        # Linear map min_value -> 0, max_value -> 1, saturating outside the limits.
        scaled = ((values - min_value) / span).clip(lower=0.0, upper=1.0)
    else:
        # Degenerate limits (min == max): nothing to scale, so values at or
        # below the limit become 0 and values above it become 1; NaN stays NaN.
        scaled = (values > max_value).astype(float).where(values.notna())
    # Put the sentinel back where it was so downstream code can still detect it.
    return scaled.mask(is_sentinel, values)


def robustNorm(df, all_keys, binary_features, trim=0.05, sentinel=666):
    """Fit and apply a trimmed min-max normalisation, column by column.

    For every key in ``all_keys`` that is not in ``binary_features`` the
    sentinel and NaN entries are set aside, the lowest and highest ``trim``
    fraction of the remaining values is discarded, and the smallest/largest
    surviving values become the scaling limits. The column is then rescaled so
    that the lower limit maps to 0 and the upper limit to 1, with values
    outside the limits saturated. Sentinel entries are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to normalise. Modified in place (columns are replaced) and returned.
    all_keys : list
        Candidate column names.
    binary_features : list
        Column names to skip.
    trim : float, default 0.05
        Fraction discarded at each tail before taking the limits; must satisfy
        ``0 <= trim < 0.5``.
    sentinel : scalar, default 666
        Placeholder value that is excluded from the fit and never rescaled.

    Returns
    -------
    df : pandas.DataFrame
        The same object that was passed in, normalised.
    dicc_params : dict
        ``{column: [min_value, max_value]}`` for every normalised column, to be
        passed to ``apply_robustNorm``.

    Raises
    ------
    ValueError
        If ``trim`` is out of range or a column has no usable values.
    """
    if not 0 <= trim < 0.5:
        raise ValueError(f"trim must satisfy 0 <= trim < 0.5, got {trim!r}")

    dicc_params = {}  # column -> [min_value, max_value]

    for key in all_keys:
        if key in binary_features:
            continue

        column = df[key]

        # Step 0: ignore sentinel and missing entries when fitting the limits.
        observed = column[(column != sentinel) & column.notna()].to_numpy()
        if observed.size == 0:
            raise ValueError(f"Column {key!r} has no values to fit the normalisation on")

        # Step 1: ascending order.
        values_sorted = np.sort(observed)

        # Step 2: number of values trimmed from each tail.
        num_values = int(np.floor(len(values_sorted) * trim))

        # Step 3: first and last values that survive the trimming. The upper
        # index is len - num_values - 1: the previous len - num_values pointed
        # at a discarded value and was out of range whenever num_values == 0.
        min_value = values_sorted[num_values]
        max_value = values_sorted[len(values_sorted) - num_values - 1]

        dicc_params[key] = [min_value, max_value]

        # Step 4: rescale. The whole column is replaced in one assignment so the
        # result does not rely on chained indexing writing through to df.
        df[key] = _robust_scale_column(column, min_value, max_value, sentinel)

    return df, dicc_params


def apply_robustNorm(df, all_keys, binary_features, dicc_params, sentinel=666):
    """Apply limits previously fitted by ``robustNorm`` to another DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to normalise. Modified in place (columns are replaced) and returned.
    all_keys : list
        Candidate column names.
    binary_features : list
        Column names to skip.
    dicc_params : dict
        ``{column: [min_value, max_value]}`` as returned by ``robustNorm``.
    sentinel : scalar, default 666
        Placeholder value that is never rescaled.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, normalised.

    Raises
    ------
    KeyError
        If a non-binary key has no entry in ``dicc_params``.
    """
    for key in all_keys:
        if key in binary_features:
            continue

        min_value, max_value = dicc_params[key][0], dicc_params[key][1]
        df[key] = _robust_scale_column(df[key], min_value, max_value, sentinel)

    return df


In [None]:
# Feature columns handed to the normalisation functions.
all_keys = [
    'year',
    'Current_Assets',
    'COGS',
    'Depreciation_Amortization',
    'EBITDA',
    'Inventory',
    'Net_Income',
    'Receivables',
    'Market_Value',
    'Net_Sales',
    'Total_Assets',
    'Long-term_Debt',
    'EBIT',
    'Gross_Profit',
    'Current_Liabilities',
    'Retained_Earnings',
    'Total_Revenue',
    'Total_Liabilities',
    'Operating_Expenses',
]

# Columns listed here are skipped by the normalisation (none for this dataset).
binary_features = []

# Quick summary of how many features fall in each group.
print(f"Continuous variables: {len(all_keys) - len(binary_features)}")
print(f"Binary variables: {len(binary_features)}")
print(len(all_keys))

In [None]:
import os

from sklearn.utils import resample
from sklearn.model_selection import StratifiedKFold, train_test_split
import numpy as np
import pandas as pd

# One random seed per output folder; each (seed, folder) pair is an independent
# repetition of the split / balance / cross-validation procedure.
seeds = [395, 273, 159]
folders = ["s1", "s2", "s3"]

norm = "robustNorm"
numberOfTimeStep = 10

# Identifier / label columns: returned as y and removed from the feature tensor.
id_label_columns = ['Admissiondboid', 'dayToDone', 'individualMRGerm']
# Columns dropped before building the tensors.
aux_columns = ["DaysOfStay", "DaysToPositive", 'is_positive']

# Load the preprocessed data
df_final_ir = pd.read_csv("bank_data_preprocessed.csv")

# Drop the index column written by a previous to_csv (tolerated if absent)
df_final_ir = df_final_ir.drop(['Unnamed: 0'], axis=1, errors="ignore")

# Flag every row of an `Admissiondboid` group with 1 if any row of that group
# has `individualMRGerm == 1`, else 0
df_final_ir['is_positive'] = df_final_ir.groupby('Admissiondboid')['individualMRGerm'].transform(lambda x: 1 if (x == 1).any() else 0)

df_final_aux = df_final_ir.copy()

# One row per `Admissiondboid`: the unit on which splitting and balancing operate
df_patients = df_final_aux.drop_duplicates(subset='Admissiondboid')

for i, folder in enumerate(folders):
    print(f"=========================================== {folder} ==================================")

    # Make sure the destination exists before anything is written to it.
    out_dir = f"../../BANK/{folder}"
    os.makedirs(out_dir, exist_ok=True)

    # Stratified 70/30 split at the group level
    dev_data, test_data = train_test_split(df_patients, test_size=0.3, random_state=seeds[i], stratify=df_patients['is_positive'], shuffle=True)

    # Undersample the negative groups of the development set
    train_data = dev_data.copy()
    positive_train = train_data[train_data['is_positive'] == 1]
    negative_train = train_data[train_data['is_positive'] == 0]

    # Target a 25/75 positive/negative balance, i.e. three negatives per positive
    desired_negative_count = int((len(positive_train) * 0.75 / 0.25))

    # Sampling without replacement cannot return more rows than are available,
    # so cap the request at the number of negatives actually present.
    negative_train_undersampled = resample(negative_train,
                                           replace=False,
                                           n_samples=min(desired_negative_count, len(negative_train)),
                                           random_state=seeds[i])

    # Combine positives with the undersampled negatives and shuffle
    balanced_train = pd.concat([positive_train, negative_train_undersampled])
    train_data = balanced_train.sample(frac=1, random_state=seeds[i]).reset_index(drop=True)

    # 5-fold stratified cross-validation on the balanced development set
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seeds[i])

    for fold, (train_index, val_index) in enumerate(skf.split(train_data, train_data['is_positive'])):

        train_split = train_data.iloc[train_index]
        val_split = train_data.iloc[val_index]

        # Pull all rows belonging to the groups of each split. The test frame is
        # re-extracted on every fold because the normalisation below modifies
        # the frame it is given.
        X_train = df_final_aux[df_final_aux.Admissiondboid.isin(train_split.Admissiondboid)].reset_index(drop=True)
        X_val = df_final_aux[df_final_aux.Admissiondboid.isin(val_split.Admissiondboid)].reset_index(drop=True)
        X_test = df_final_aux[df_final_aux.Admissiondboid.isin(test_data.Admissiondboid)].reset_index(drop=True)

        print("======>>>> Normalization type...... robustNorm <<<<<========")

        # Fit the normalisation on the training rows only, then reuse its
        # parameters for validation and test.
        X_train, parameters = robustNorm(X_train, all_keys, binary_features)
        X_val = apply_robustNorm(X_val, all_keys, binary_features, parameters)
        X_test = apply_robustNorm(X_test, all_keys, binary_features, parameters)

        X_train = X_train.drop(aux_columns, axis=1)
        X_val = X_val.drop(aux_columns, axis=1)
        X_test = X_test.drop(aux_columns, axis=1)

        print("---")
        X_train_tensor, y_train_tensor, keys = dataframe_to_tensor(
            X_train.copy(), X_train[id_label_columns],
            eliminateColumn=True, columns=id_label_columns,
            timeStepLength=numberOfTimeStep)

        X_val_tensor, y_val_tensor, keys = dataframe_to_tensor(
            X_val.copy(), X_val[id_label_columns],
            eliminateColumn=True, columns=id_label_columns,
            timeStepLength=numberOfTimeStep)

        X_test_tensor, y_test_tensor, keys = dataframe_to_tensor(
            X_test.copy(), X_test[id_label_columns],
            eliminateColumn=True, columns=id_label_columns,
            timeStepLength=numberOfTimeStep)

        np.save(f"{out_dir}/X_train_tensor_{fold}{norm}", X_train_tensor)
        y_train_tensor.to_csv(f"{out_dir}/y_train_tensor_{fold}{norm}.csv", index=False)

        np.save(f"{out_dir}/X_val_tensor_{fold}{norm}", X_val_tensor)
        y_val_tensor.to_csv(f"{out_dir}/y_val_tensor_{fold}{norm}.csv", index=False)

        # NOTE(review): the test set is written only for fold 0, so the saved
        # test tensor carries fold 0's normalisation parameters - confirm this
        # is intended.
        if fold == 0:  # Save the test set only once
            np.save(f"{out_dir}/X_test_tensor_{norm}", X_test_tensor)
            y_test_tensor.to_csv(f"{out_dir}/y_test_tensor_{norm}.csv", index=False)

        # Save the feature names matching the last tensor axis
        df = pd.DataFrame(keys, columns=['keys'])
        df.to_csv(f"{out_dir}/keys_{fold}{norm}.csv", index=False)

        print(f"Fold {fold} completed for {folder}")