In [None]:
# Mount Google Drive at /content/drive; the CSV paths used further down live
# under that mount point. Only works inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


MLP

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam # Add this line to import the Adam
# Split data into training and test sets
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import mean_absolute_error # Import the mean_absolute_error function


%load_ext tensorboard

import os
log_dir = os.path.join("logs") # Puedes cambiar "logs" por el nombre que quieras
if not os.path.exists(log_dir):
  os.makedirs(log_dir)

from tensorflow.keras.callbacks import TensorBoard

tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)


# Load the observation records (assumed to be stored as a CSV file on Drive).
data = pd.read_csv('/content/drive/My Drive/PREDICTIVE MODEL IC/DATA/DATOS_OBSERVACION_VAR_NOTNULL2010-2022_TRANSFORM.csv')

# Patch missing CALC_IMC values: within each ID, gaps are filled with that
# ID's own mean and the result is rounded to zero decimals.
data['CALC_IMC'] = data.groupby('ID')['CALC_IMC'].transform(
    lambda serie: serie.fillna(serie.mean()).round(0)
)
# Rows whose CALC_IMC is still null (IDs without any CALC_IMC value) are removed.
data = data.dropna(subset=['CALC_IMC'])

# Count records per ID, ignoring NaN
#conteo_registros_por_persona = data.groupby('ID')['ID'].count()

# People with exactly 4 records (NaN not counted)
#personas_con_4_registros = conteo_registros_por_persona[conteo_registros_por_persona == 4]
#print(personas_con_4_registros)

# Save the cleaned table and read it back; the reloaded frame gets a fresh
# default index after the row drop above.
data.to_csv(
    '/content/drive/My Drive/PREDICTIVE MODEL IC/DATA/DATOS_OBSERVACION_VAR_NOTNULL2010-2022_TRANSFORM_1.csv',
    index=False,
)
data = pd.read_csv('/content/drive/My Drive/PREDICTIVE MODEL IC/DATA/DATOS_OBSERVACION_VAR_NOTNULL2010-2022_TRANSFORM_1.csv')

# Normalize numerical variables to the [0, 1] range.
# NOTE(review): the scaler is fitted on the complete dataset, before the
# train/test split performed further down, so test rows influence the min/max
# used for scaling. Consider fitting on the training portion only.
columns_to_scale = ['LOCOMOTION','SENSORY','VITALITY','PSICHOLOGICAL','COGNITION','CIGARRETTES','DRINK','CALC_IMC', 'AGE']
scaler = MinMaxScaler(feature_range=(0, 1))
#scaler = StandardScaler()
scaled_data = scaler.fit_transform(data[columns_to_scale])
# Replace the columns outright instead of writing through .loc: the scaled
# values are floats, and an in-place .loc write into a column that was read
# with an integer dtype triggers pandas' incompatible-dtype FutureWarning
# (presumably the warning echoed in this notebook's captured output).
data[columns_to_scale] = scaled_data

# Convert categorical variables to one-hot encoding.
columns_to_transform = ['SEX','ALONE','PHYSICAL_ACTIVITY']
# sparse_output=False makes the encoder return a dense array.
encoder = OneHotEncoder(sparse_output=False)
encoded_data = encoder.fit_transform(data[columns_to_transform])
# Reuse data's index so the axis=1 concat below pairs each encoded row with
# its source row even when data does not carry a default 0..n-1 index
# (pd.concat aligns on index labels, not on position).
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(columns_to_transform),
    index=data.index,
)
data = pd.concat([data, encoded_df], axis=1)

#data = data.drop(columns=['SEX','ALONE','PHYSICAL_ACTIVITY'])

# Agrupar por ID y ordenar por 'Evaluación'
#data_grouped = data.sort_values('evaluacion').groupby('ID')

#data = data.drop(columns=['ID','año','evaluación'])

#data.head()
#print(data.columns)

# Persist the scaled and one-hot-encoded table (index column not written).
data.to_csv('/content/drive/My Drive/PREDICTIVE MODEL IC/DATA/DATOS_OBSERVACION_VAR_NOTNULL2010-2022_TRANSFORM_ENCODED.csv', index=False)

# Columns of the variables to predict (targets).
target_cols = ['LOCOMOTION','SENSORY','VITALITY','PSICHOLOGICAL','COGNITION']

# Prepare sequences and targets
def create_dataset(data, look_back=3, target_columns=None):
    """Build one (history, next-evaluation) sample per person.

    Rows are grouped by ``ID`` and ordered by ``evaluacion``. For each person
    the first ``look_back`` rows form the input sequence, and row number
    ``look_back`` (0-based) supplies the targets and the auxiliary features.
    People with fewer than ``look_back + 1`` rows are skipped.

    Args:
        data: DataFrame containing ``ID``, ``evaluacion``, the 14 relevant
            feature columns and the 3 auxiliary columns listed in the body.
        look_back: Number of evaluations used as history.
        target_columns: Names of the columns to predict; each must be one of
            the relevant columns. Defaults to the module-level ``target_cols``.

    Returns:
        A tuple of eight NumPy arrays, in this order: sequences, targets,
        locomotions, sensories, vitalities, psichologicals, cognitions,
        auxiliares. With at least one qualifying person their shapes are
        ``(n, look_back, 14)``, ``(n, len(target_columns))``, five times
        ``(n,)`` and ``(n, 3)``.

    Raises:
        ValueError: If a target column is not one of the relevant columns.
    """
    if target_columns is None:
        target_columns = target_cols

    relevant_columns = ['LOCOMOTION', 'SENSORY', 'VITALITY','PSICHOLOGICAL', 'COGNITION',
                        'ALONE_PARTNERED', 'ALONE_UNCOUPLED',
                        'PHYSICAL_ACTIVITY_SEDENTARY', 'PHYSICAL_ACTIVITY_MILD','PHYSICAL_ACTIVITY_MODERATE', 'PHYSICAL_ACTIVITY_VIGOROUS',
                        'CIGARRETTES','DRINK', 'CALC_IMC']
    auxiliar_columns = ['AGE','SEX_FEMALE', 'SEX_MALE']

    # Target positions must be resolved against relevant_columns, because that
    # is the column layout of the per-person value matrix indexed below.
    # Positions taken from data.columns refer to the full frame and would pick
    # the wrong columns (or run past the 14 available ones).
    target_positions = [relevant_columns.index(col) for col in target_columns]

    sequences = []
    targets = []
    locomotions = []
    sensories = []
    vitalities = []
    psichologicals = []
    cognitions = []
    auxiliares = []

    # sort=False keeps people in order of first appearance; a single groupby
    # pass replaces a full boolean scan of the frame for every ID.
    for _, person_rows in data.groupby('ID', sort=False):
        if len(person_rows) < look_back + 1:
            continue
        person_rows = person_rows.sort_values('evaluacion')
        feature_matrix = person_rows[relevant_columns].values
        auxiliar_matrix = person_rows[auxiliar_columns].values

        # Row `look_back` is the evaluation right after the history window.
        next_row = feature_matrix[look_back]

        sequences.append(feature_matrix[:look_back, :])
        targets.append(next_row[target_positions])
        locomotions.append(next_row[0])
        sensories.append(next_row[1])
        vitalities.append(next_row[2])
        psichologicals.append(next_row[3])
        cognitions.append(next_row[4])
        auxiliares.append(auxiliar_matrix[look_back, :])

    return (np.array(sequences), np.array(targets), np.array(locomotions),
            np.array(sensories), np.array(vitalities), np.array(psichologicals),
            np.array(cognitions), np.array(auxiliares))

# Create sequences and targets: three evaluations of history per person, with
# the following evaluation supplying the targets and auxiliary features.
look_back = 3
sequences, targets, locomotions, sensories, vitalities, psichologicals, cognitions, auxiliares = create_dataset(data, look_back)

# The bare expressions below have no effect on program state; presumably they
# were left for notebook display while inspecting shapes.
pd.DataFrame(sequences[0]).head()
sequences.shape
sequences.shape[2]

sequences.shape[2]
#pd.DataFrame(targets[0]).head()

auxiliares.shape
auxiliares.shape[1]

targets.shape

# TRAINING AND TEST
#
# Inputs are `sequences` and `auxiliares`; the five per-variable arrays
# (`locomotions`, `sensories`, ...) are the targets. All seven arrays are
# split together, 80% for training and 20% for testing, so every train/test
# pair keeps the same rows.
(
    sequences_train, sequences_test,
    auxiliares_train, auxiliares_test,
    locomotions_train, locomotions_test,
    sensories_train, sensories_test,
    vitalities_train, vitalities_test,
    psichologicals_train, psichologicals_test,
    cognitions_train, cognitions_test,
) = train_test_split(
    sequences, auxiliares,
    locomotions, sensories, vitalities, psichologicals, cognitions,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
)


# Input shapes: (time steps, features per step) for the sequential input and
# (number of auxiliary features,) for the auxiliary input.
sequential_input_shape = (sequences.shape[1], sequences.shape[2])
auxiliary_input_shape = (auxiliares.shape[1],)

# Create the model
def create_model_mlp():
    """Build the two-input, five-output MLP.

    The flattened sequence input goes through Dense(64) and Dense(32), each
    followed by Dropout(0.2). The result is concatenated with the auxiliary
    input and passed through shared Dense(32) and Dense(16) layers, again with
    Dropout(0.2) after each. Five single-unit linear heads named ``l_output``,
    ``s_output``, ``v_output``, ``p_output`` and ``c_output`` sit on top.

    Input widths are read from the module-level ``sequences`` and
    ``auxiliares`` arrays.

    Returns:
        An uncompiled Keras ``Model`` with inputs
        ``[sequential_input, auxiliary_input]`` and the five outputs above.
    """
    steps, features_per_step = sequences.shape[1], sequences.shape[2]
    # The sequence is fed already flattened: one vector of steps * features.
    flat_sequence_in = Input(shape=(steps * features_per_step,), name='sequential_input')
    aux_in = Input(shape=(auxiliares.shape[1],), name='auxiliary_input')

    def relu_dropout_stack(tensor, widths):
        # Apply Dense(relu) followed by Dropout(0.2) once per requested width.
        for units in widths:
            tensor = Dense(units, activation='relu')(tensor)
            tensor = Dropout(0.2)(tensor)
        return tensor

    sequence_branch = relu_dropout_stack(flat_sequence_in, (64, 32))

    # Join the processed sequence with the auxiliary features.
    merged = Concatenate()([sequence_branch, aux_in])
    shared_trunk = relu_dropout_stack(merged, (32, 16))

    # One linear regression head per target variable.
    head_names = ('l_output', 's_output', 'v_output', 'p_output', 'c_output')
    heads = [Dense(1, activation='linear', name=head)(shared_trunk) for head in head_names]

    return Model(inputs=[flat_sequence_in, aux_in], outputs=heads)

# Build and compile the model: Adam optimizer, with one 'mse' loss and one
# ['mse', 'mae'] metric pair per output head (five heads).
model = create_model_mlp()
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss=['mse', 'mse', 'mse', 'mse', 'mse'],
              metrics=[['mse','mae'], ['mse','mae'], ['mse','mae'], ['mse','mae'], ['mse','mae']])  # metrics tracked per output, in head order l, s, v, p, c

# Model summary, plus a diagram of the architecture written to 'modelo.png'.
model.summary()
keras.utils.plot_model(model, 'modelo.png', show_shapes=True)

# The model's sequential input is a flat vector, so collapse each
# (time steps, features) sequence into one row before fitting/evaluating.
sequences_train_flattened = sequences_train.reshape(sequences_train.shape[0], -1)
sequences_test_flattened = sequences_test.reshape(sequences_test.shape[0], -1)

# Train the model; targets are listed in the same order as the model outputs.
# No validation data is passed, so only training metrics are reported.
model.fit(x=[sequences_train_flattened, auxiliares_train],
          y=[locomotions_train, sensories_train, vitalities_train, psichologicals_train, cognitions_train],
          epochs=50, batch_size=256, verbose=1, callbacks=[tensorboard_callback])

# Evaluate the model on the held-out test set.
# return_dict=True yields a {metric_name: value} mapping, so every value is
# looked up by name rather than by list position. With ['mse', 'mae'] compiled
# for each output, the positional result list interleaves MSE and MAE entries,
# so positions 6..10 are not "the five MAEs".
results = model.evaluate([sequences_test_flattened, auxiliares_test],
                        {'l_output': locomotions_test, 's_output': sensories_test, 'v_output': vitalities_test, 'p_output': psichologicals_test, 'c_output': cognitions_test},
                        return_dict=True)


# Total loss and per-output losses (names follow '<output name>_loss').
loss = results['loss']  # Total loss
l_loss = results['l_output_loss']  # Loss for 'l_output'
s_loss = results['s_output_loss']  # Loss for 's_output'
v_loss = results['v_output_loss']  # Loss for 'v_output'
p_loss = results['p_output_loss']  # Loss for 'p_output'
c_loss = results['c_output_loss']  # Loss for 'c_output'

# Per-output mean absolute error (names follow '<output name>_mae').
l_mae = results['l_output_mae']  # MAE for 'l_output'
s_mae = results['s_output_mae']  # MAE for 's_output'
v_mae = results['v_output_mae']  # MAE for 'v_output'
p_mae = results['p_output_mae']  # MAE for 'p_output'
c_mae = results['c_output_mae']  # MAE for 'c_output'


print(f"\nResultados de la evaluación ORIGINAL:\n"
      f"Pérdida total: {loss:.4f}\n"
      f"Pérdida para l (Regresión): {l_loss:.4f}\n"
      f"Pérdida para s (Regresión): {s_loss:.4f}\n"
      f"Pérdida para v (Regresión): {v_loss:.4f}\n"
      f"Pérdida para p (Regresión): {p_loss:.4f}\n"
      f"Pérdida para c (Regresión): {c_loss:.4f}\n"
      f"MAE para l (Regresión): {l_mae:.4f}\n"
      f"MAE para s (Regresión): {s_mae:.4f}\n"
      f"MAE para v (Regresión): {v_mae:.4f}\n"
      f"MAE para p (Regresión): {p_mae:.4f}\n"
      f"MAE para c (Regresión): {c_mae:.4f}"
      )


# Función para calcular MAE con intervalo de confianza usando Bootstrap (CÓDIGO NUEVO)
def calcular_mae_intervalo_confianza(
    sequences_train, auxiliares_train, locomotions_train, sensories_train, vitalities_train, psichologicals_train, cognitions_train,
    sequences_test, auxiliares_test, locomotions_test, sensories_test, vitalities_test, psichologicals_test, cognitions_test,
    num_ejecuciones=1000,
    modelo=None,
    random_state=None,
):
    """Bootstrap the test-set MAE of each output and return 95% intervals.

    The trained model is not refitted. It predicts the test set once, the
    absolute error of every test sample is cached, and each bootstrap
    iteration resamples those errors with replacement (same row indices for
    all five outputs) and averages them.

    Args:
        sequences_train ... cognitions_train: Unused; kept so existing callers
            keep working.
        sequences_test: Test sequences; reshaped to ``(n, -1)`` before
            prediction.
        auxiliares_test: Auxiliary test features, passed as second model input.
        locomotions_test, sensories_test, vitalities_test,
        psichologicals_test, cognitions_test: True values, matched in this
            order against the model's five outputs.
        num_ejecuciones: Number of bootstrap resamples.
        modelo: Object exposing ``predict(inputs, verbose=0)`` that returns the
            five output arrays in order l, s, v, p, c. Defaults to the
            module-level ``model``.
        random_state: Seed (or ``None``) for ``numpy.random.default_rng``.

    Returns:
        Dict with keys ``'l'``, ``'s'``, ``'v'``, ``'p'``, ``'c'``; each value
        is ``(mean_mae, (lower, upper))`` where the bounds are the 2.5th and
        97.5th percentiles of the bootstrap MAE distribution.

    Raises:
        ValueError: If ``num_ejecuciones`` is below 1 or the test set is empty.
    """
    if num_ejecuciones < 1:
        raise ValueError("num_ejecuciones must be at least 1")
    n_test = len(sequences_test)
    if n_test == 0:
        raise ValueError("the test set is empty")
    if modelo is None:
        modelo = model  # trained model defined at module level

    # The model expects flat sequence vectors. Predicting once is enough
    # because the model does not change between bootstrap iterations.
    sequences_flat = np.asarray(sequences_test).reshape(n_test, -1)
    predictions = modelo.predict([sequences_flat, auxiliares_test], verbose=0)

    observed = {
        'l': locomotions_test,
        's': sensories_test,
        'v': vitalities_test,
        'p': psichologicals_test,
        'c': cognitions_test,
    }
    # Flatten both sides so (n,) targets and (n, 1) predictions subtract
    # element-wise instead of broadcasting to (n, n).
    abs_errors = {
        key: np.abs(
            np.asarray(y_true, dtype=float).reshape(-1)
            - np.asarray(y_pred, dtype=float).reshape(-1)
        )
        for (key, y_true), y_pred in zip(observed.items(), predictions)
    }

    rng = np.random.default_rng(random_state)
    maes = {key: np.empty(num_ejecuciones) for key in abs_errors}
    for i in range(num_ejecuciones):
        indices_bootstrap = rng.integers(0, n_test, size=n_test)
        for key, errors in abs_errors.items():
            maes[key][i] = errors[indices_bootstrap].mean()

    def calculate_ci(valores):
        # Percentile bootstrap interval. The spread of the bootstrap MAEs
        # already is the sampling uncertainty, so it must not be divided by
        # sqrt(num_ejecuciones); doing so makes the interval shrink toward
        # zero width as more resamples are drawn.
        limite_inferior, limite_superior = np.percentile(valores, [2.5, 97.5])
        return float(np.mean(valores)), (float(limite_inferior), float(limite_superior))

    return {key: calculate_ci(valores) for key, valores in maes.items()}

# Compute the bootstrap confidence intervals for the test-set MAE.
# The training arrays are passed only because the function signature asks
# for them.
resultados_bootstrap = calcular_mae_intervalo_confianza(
    sequences_train, auxiliares_train,
    locomotions_train, sensories_train, vitalities_train, psichologicals_train, cognitions_train,
    sequences_test, auxiliares_test,
    locomotions_test, sensories_test, vitalities_test, psichologicals_test, cognitions_test,
    num_ejecuciones=1000,
)

# Print one line per target: mean MAE and its 95% interval.
for target in resultados_bootstrap:
    mae_promedio, intervalo_confianza = resultados_bootstrap[target]
    print(f"MAE para {target} (Regresión): {mae_promedio:.4f} (IC 95%: {intervalo_confianza})")


  data.loc[:, columns_to_scale] = scaled_data
  data.loc[:, columns_to_scale] = scaled_data


Epoch 1/50
9/9 ━━━━━━━━━━━━━━━━━━━━ 6s 41ms/step - c_output_loss: 1.1846 - c_output_mae: 0.9786 - c_output_mse: 1.1926 - l_output_loss: 0.6016 - l_output_mae: 0.6808 - l_output_mse: 0.6033 - loss: 3.2052 - p_output_loss: 0.3236 - p_output_mae: 0.4603 - p_output_mse: 0.3243 - s_output_loss: 0.5218 - s_output_mae: 0.6357 - s_output_mse: 0.5225 - v_output_loss: 0.5607 - v_output_mae: 0.6438 - v_output_mse: 0.5624
Epoch 2/50
9/9 ━━━━━━━━━━━━━━━━━━━━ 1s 28ms/step - c_output_loss: 0.3419 - c_output_mae: 0.4827 - c_output_mse: 0.3431 - l_output_loss: 0.3874 - l_output_mae: 0.5372 - l_output_mse: 0.3872 - loss: 1.6846 - p_output_loss: 0.3092 - p_output_mae: 0.4460 - p_output_mse: 0.3103 - s_output_loss: 0.3700 - s_output_mae: 0.5195 - s_output_mse: 0.3715 - v_output_loss: 0.2717 - v_output_mae: 0.4315 - v_output_mse: 0.2725
Epoch 3/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - c_output_loss: 0.26