In [1]:
import pandas as pd
import os
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import sys
import pyten

In [31]:
def load_and_concatenate_files(directory):
    """Load every ``.psv`` file found in ``directory`` into one DataFrame.

    Each file is read as a pipe-separated table and receives an extra
    ``id_pat`` column holding the part of its file name before the first dot.
    Files are processed in sorted name order so that the row order of the
    result does not depend on the order in which the operating system lists
    the directory.

    Parameters
    ----------
    directory : str or path-like
        Folder to scan (not recursive). Entries whose name does not end in
        ``.psv`` are ignored.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all loaded files with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``directory`` contains no ``.psv`` files.
    """
    psv_names = sorted(name for name in os.listdir(directory) if name.endswith('.psv'))
    if not psv_names:
        # pd.concat would also raise ValueError here, but with a message that
        # does not mention the directory.
        raise ValueError(f"No .psv files found in {directory!r}")

    frames = []
    for name in psv_names:
        frame = pd.read_csv(os.path.join(directory, name), sep='|')
        # Tag every row with the file it came from.
        frame['id_pat'] = name.split(".")[0]
        frames.append(frame)

    return pd.concat(frames, ignore_index=True)


def getlabel(x):
    """Turn an event indicator into a sticky 0/1 label.

    Positions from the first element equal to ``True`` onwards are labelled 1;
    every position before it is labelled 0.
    """
    # The running count of True entries becomes positive at the first True
    # and can never drop back to zero afterwards.
    event_seen = (x == True).cumsum() > 0
    return event_seen.astype(int)

def group(df_grupo, fre):
    """Collapse consecutive blocks of ``fre`` rows into their median.

    Rows are bucketed by integer-dividing the frame's index by ``fre``, so the
    index is expected to be numeric; rows with index 0..fre-1 form bucket 0,
    fre..2*fre-1 form bucket 1, and so on.

    Parameters
    ----------
    df_grupo : pandas.DataFrame
        Frame to aggregate.
    fre : int
        Number of index units per bucket.

    Returns
    -------
    pandas.DataFrame
        One row per bucket (indexed by bucket number) holding the medians.
    """
    # Use the argument itself rather than a module-level ``freq`` variable,
    # so the function works regardless of what globals happen to exist.
    return df_grupo.groupby(df_grupo.index // fre).median()

def temporalDataset(df_grupo, maxLength=15, flag=666):
    """Force one patient's rows to exactly ``maxLength`` time steps.

    Frames with ``maxLength`` rows or more are truncated to their first
    ``maxLength`` rows. Shorter frames are padded at the end with rows whose
    ``id_pat`` repeats the patient's identifier and whose every other column
    is set to ``flag``.

    Parameters
    ----------
    df_grupo : pandas.DataFrame
        Rows of a single patient; must contain an ``id_pat`` column and at
        least one row.
    maxLength : int, default 15
        Number of rows in the returned frame.
    flag : scalar, default 666
        Value written into every non-``id_pat`` column of the padding rows.

    Returns
    -------
    pandas.DataFrame
        Frame with ``maxLength`` rows and the same columns as ``df_grupo``.
    """
    missing = maxLength - len(df_grupo)
    if missing <= 0:
        return df_grupo.head(maxLength)

    # Build the padding over *all* columns and then overwrite id_pat by name,
    # so the result does not depend on id_pat being the first column.
    padding = pd.DataFrame(flag, index=range(missing), columns=df_grupo.columns)
    padding['id_pat'] = df_grupo['id_pat'].iloc[0]

    # DataFrame.append was removed in pandas 2.0; concat is the supported way.
    return pd.concat([df_grupo, padding], ignore_index=True)


def dataframeToTensor(df, timeStepLength):
    """Stack per-patient rows into a 3-D float array.

    Patients are taken in order of first appearance in ``df.id_pat``. The rows
    of each patient (all columns, ``id_pat`` included) become one
    ``(timeStepLength, n_columns)`` slab, and the slabs are stacked along a new
    leading axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``id_pat`` column in which every patient has exactly
        ``timeStepLength`` rows. All values must be convertible to float.
    timeStepLength : int
        Number of rows per patient.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_patients, timeStepLength, df.shape[1])``;
        the leading dimension is 0 when ``df`` has no rows.

    Raises
    ------
    ValueError
        If a patient's rows cannot be reshaped to
        ``(timeStepLength, df.shape[1])``, or a value cannot be cast to float.
    """
    ids = np.array(df.id_pat)
    # np.unique sorts its output; sorting the first-occurrence positions
    # instead restores the order in which patients appear in the frame.
    _, first_rows = np.unique(ids, return_index=True)
    patients = ids[np.sort(first_rows)]
    n_cols = df.shape[1]

    if len(patients) == 0:
        return np.empty((0, timeStepLength, n_cols), dtype=float)

    # Collect all slabs first and stack once, rather than re-allocating the
    # whole tensor with np.append for every patient.
    slabs = [
        np.array(df[df.id_pat == patient]).reshape(timeStepLength, n_cols)
        for patient in patients
    ]
    return np.array(np.stack(slabs, axis=0), dtype=float)

In [39]:
# Load patient information
directory = '../../../mimic_demo'
df = load_and_concatenate_files(directory)
# Per-patient sticky label derived from 'sep3'. transform() returns a result
# aligned to df's own index, so the assignment lines up row by row regardless
# of how groupby.apply treats group keys in the installed pandas version.
df['label'] = df.groupby('id_pat')['sep3'].transform(getlabel)
# Keep only rows with a non-negative 'startdate' (labels were computed on the full history first)
df = df[df['startdate'] >= 0].reset_index(drop=True)

# Define a frequency value: number of consecutive rows merged into one time step
freq = 24
df_grouped = df.groupby('id_pat').apply(group, freq).reset_index()
# 1-based position of each aggregated row within its patient
df_grouped['timeStep'] = df_grouped.groupby('id_pat').cumcount() + 1
df_grouped = df_grouped.drop(['startdate', 'level_1'], axis=1)
print("Dimensiones:", df_grouped.shape)
print("max # of days icu stay:", df_grouped.timeStep.max())
print("average # of days icu stay:", df_grouped.timeStep.mean())
print("median # of days icu stay:", df_grouped.timeStep.median())
# The median of 0/1 labels inside a block can come out as 0.5; count those blocks as positive
df_grouped.loc[df_grouped['label'] == 0.5, 'label'] = 1

# Generate a temporal dataset with a maximum number of time steps per patients
maxLength = 15
flag = 666
print("Pre:", df_grouped.shape)
# Pass flag explicitly so the padding value really is the one configured above
df_temp = df_grouped.groupby('id_pat').apply(temporalDataset, maxLength, flag).reset_index(drop=True)
print("Post:", df_temp.shape)

# Create a tensor dataset based on previous temporal data
X = dataframeToTensor(df_temp, maxLength)
# Drop the first column and the last two (expected to be id_pat, and label / timeStep -- confirm column order)
Xf = X[:,:,1:-2]
print(Xf.shape)

# LRTC for imputation of values
n1, n2, n3 = Xf.shape

# Long-format table with one row per tensor entry: index triple (x1, x2, x3) and its value r
coords = np.array(np.meshgrid(range(n1), range(n2), range(n3))).reshape(3, -1).T
df_lrtc = pd.DataFrame(coords, columns=["x1", "x2", "x3"])
df_lrtc["r"] = Xf[coords[:, 0], coords[:, 1], coords[:, 2]]
# Shift the indices to start at 1 (presumably what pyten expects -- confirm)
df_lrtc["x1"] += 1
df_lrtc["x2"] += 1
df_lrtc["x3"] += 1

Dimensiones: (1432, 108)
max # of days icu stay: 124
average # of days icu stay: 15.349162011173185
median # of days icu stay: 7.0
Pre: (1432, 108)
Post: (2040, 108)
(136, 15, 105)


In [50]:
import time

# Execute tensor completion for a single value of i
# (the loop over range(50, 500, 50) is left disabled).
i = 350
inicio = time.time()
OriTensor, DeTensor, TenClass, RecTensor, RecTensor_hat, mask = pyten.UI.helios(df_lrtc, i)

# Relative error between the real tensor and the reconstructed tensor.
# Only values different from NaN are meant to count: the reconstruction is
# multiplied by mask and NaNs in Xf are replaced by 0
# (mask presumably marks the observed entries -- confirm against pyten).
real = np.nan_to_num(Xf)
rec = RecTensor_hat * mask
error = np.linalg.norm(rec - real) / np.linalg.norm(real)
print(error)

# Print the number of negative reconstructed entries, or i when there are none
es_negativo = RecTensor_hat < 0
if not np.any(es_negativo):
    print(i)
else:
    num_negativos = np.count_nonzero(es_negativo)
    print(num_negativos)

# Replace negative reconstructed values by their absolute value
RecTensor_hat = np.where(es_negativo, np.abs(RecTensor_hat), RecTensor_hat)
fin = time.time()

# Compute the elapsed time (seconds)
tiempo_transcurrido = fin - inicio

0.0010410913967129189
30244


In [54]:
# Elapsed time of the tensor-completion cell, converted from seconds to hours
tiempo_transcurrido/3600

0.4958687196175257

In [52]:
# Total number of entries in the feature tensor, taken from Xf itself
# instead of re-typing its dimensions by hand
Xf.size

214200