In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
import os
from sklearn.impute import SimpleImputer


import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, clone_model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.random import set_seed

### Importación de los datos

In [2]:
# Leer el archivo CSV
df = pd.read_csv('diabetic_data.csv')

# Mostrar las primeras filas del dataset
print(df.head())

# Resumen de la información del dataset
print(df.info())

# Mostrar la cantidad de valores nulos por columna
print(df.isnull().sum())

   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No                   No

In [3]:
# Leer el archivo CSV, tratando '?' como valores nulos
df = pd.read_csv('diabetic_data.csv', na_values='?')

# Mostrar la cantidad de valores nulos por columna
print(df.isnull().sum())

# Guardar el DataFrame modificado en un nuevo archivo CSV
df.to_csv('diabetic_data_modified.csv', index=False)

print("El archivo modificado ha sido guardado como 'diabetic_data_modified.csv'.")


  df = pd.read_csv('diabetic_data.csv', na_values='?')


encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

### Procesamiento de los datos

In [4]:
# Cargar el dataset
df = pd.read_csv('diabetic_data_modified.csv')

# Asegurarse de manejar correctamente los valores nulos
# Identificar las columnas con valores nulos según la descripción proporcionada
cols_with_missing = ['race', 'weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3']

# Imputar los valores nulos con la moda para variables categóricas
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[cols_with_missing] = categorical_imputer.fit_transform(df[cols_with_missing])

# Limpiar y convertir 'weight' a valores numéricos y luego categorizarlo
def clean_weight(weight_str):
    if weight_str == "?":
        return None
    elif weight_str.startswith(">"):
        return float(weight_str[1:]) + 1  # Incrementar en 1 para asegurar que los límites sean correctos
    elif weight_str.startswith("["):
        return float(weight_str.strip("[]").split("-")[0])
    elif weight_str == "Unknown":
        return None
    else:
        return float(weight_str)

df['weight'] = df['weight'].apply(clean_weight)

# Definir los rangos de peso y codificar 'weight'
weight_ranges = ['[0-25)', '[25-50)', '[50-75)', '[75-100)', '[100-125)', '[125-150)', '[150-175)', '[175-200)', '>200']
df['weight_category'] = pd.cut(df['weight'], bins=[0, 25, 50, 75, 100, 125, 150, 175, 200, float('inf')], labels=weight_ranges, right=False)

# Eliminar la columna original 'weight'
df.drop(columns=['weight'], inplace=True)

# Función para asignar categorías a los códigos ICD-9
def assign_icd_category(icd_code):
    if pd.isnull(icd_code):
        return 'Unknown'
    if icd_code.startswith(('E', 'V')):
        return 'E-V codes'
    else:
        code_number = int(icd_code.split('.')[0])  # Tomar solo el número de código ICD-9
        if 1 <= code_number <= 139:
            return '001-139'
        elif 140 <= code_number <= 239:
            return '140-239'
        elif 240 <= code_number <= 279:
            return '240-279'
        elif 280 <= code_number <= 289:
            return '280-289'
        elif 290 <= code_number <= 319:
            return '290-319'
        elif 320 <= code_number <= 389:
            return '320-389'
        elif 390 <= code_number <= 459:
            return '390-459'
        elif 460 <= code_number <= 519:
            return '460-519'
        elif 520 <= code_number <= 579:
            return '520-579'
        elif 580 <= code_number <= 629:
            return '580-629'
        elif 630 <= code_number <= 679:
            return '630-679'
        elif 680 <= code_number <= 709:
            return '680-709'
        elif 710 <= code_number <= 739:
            return '710-739'
        elif 740 <= code_number <= 759:
            return '740-759'
        elif 760 <= code_number <= 779:
            return '760-779'
        elif 780 <= code_number <= 799:
            return '780-799'
        elif 800 <= code_number <= 999:
            return '800-999'
        else:
            return 'Other'  # En caso de no encontrar una categoría válida

# Aplicar la función a cada columna de diagnóstico
for col in ['diag_1', 'diag_2', 'diag_3']:
    df[col + '_category'] = df[col].apply(assign_icd_category)

# Eliminar las columnas originales de diagnóstico
df.drop(columns=['diag_1', 'diag_2', 'diag_3'], inplace=True)

# Función para asignar valores únicos a las franjas de edad
def age_to_value(age_str):
    age_mapping = {
        '[0-10)': 5,
        '[10-20)': 15,
        '[20-30)': 25,
        '[30-40)': 35,
        '[40-50)': 45,
        '[50-60)': 55,
        '[60-70)': 65,
        '[70-80)': 75,
        '[80-90)': 85,
        '[90-100)': 95
    }
    return age_mapping.get(age_str, None)

# Aplicar la función de agrupamiento de edades
df['age'] = df['age'].apply(age_to_value)

# Aplicar codificación one-hot a las variables categóricas, excluyendo 'age' ya que está mapeada a valores únicos
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'age' in categorical_cols:
    categorical_cols.remove('age')
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cols = pd.DataFrame(encoder.fit_transform(df[categorical_cols]))

# Sustituir las columnas originales con las nuevas codificadas
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
df.drop(columns=categorical_cols, inplace=True)
df = pd.concat([df, encoded_cols], axis=1)

# Guardar el resultado en un nuevo archivo CSV
# df.to_csv('processed_data.csv', index=False)

print("Datos procesados y guardados en 'processed_data.csv'.")

  df = pd.read_csv('diabetic_data_modified.csv')


Datos procesados y guardados en 'processed_data.csv'.


### Creación del modelo LSTM

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import pandas as pd

# Función para cargar los datos
def load_data(filepath):
    return pd.read_csv(filepath)

# Función para crear el modelo LSTM
def crear_modelo_lstm(input_shape, learning_rate=0.001):
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(64, input_shape=input_shape, return_sequences=True),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Función de entrenamiento en cada dispositivo
def train_on_device(X_train, y_train, input_shape, epochs, learning_rate):
    model = crear_modelo_lstm(input_shape, learning_rate)
    model.fit(X_train, y_train, epochs=epochs, validation_split=0.2, verbose=1)
    return model

# Función para dividir los datos
def split_data(X, y, num_parts):
    X_splits = np.array_split(X, num_parts)
    y_splits = np.array_split(y, num_parts)
    return X_splits, y_splits

# Función para preprocesar datos una sola vez
def preprocess_data(X, y, num_parts):
    X_splits, y_splits = split_data(X, y, num_parts)
    scalers = [StandardScaler().fit(X_split) for X_split in X_splits]
    X_scaled_splits = [scaler.transform(X_split) for scaler, X_split in zip(scalers, X_splits)]
    X_scaled_reshaped = [X_scaled.reshape(-1, X.shape[1], 1) for X_scaled in X_scaled_splits]
    return X_scaled_reshaped, y_splits, scalers

# Función principal para la simulación de Federated Learning
def federated_learning(X, y, num_parts, num_iterations, epochs=5, learning_rate=0.001):
    all_accuracies = []

    X_scaled_splits, y_splits, scalers = preprocess_data(X, y, num_parts)

    for iteration in range(num_iterations):
        print(f'Iteración {iteration + 1}/{num_iterations}')
        local_models = []

        for i in range(num_parts):
            print(f'Entrenando modelo local {i + 1}/{num_parts}')
            X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(X_scaled_splits[i], y_splits[i], test_size=0.4, random_state=iteration, stratify=y_splits[i])
            
            local_model = train_on_device(X_train_scaled, y_train, input_shape=(X.shape[1], 1), epochs=epochs, learning_rate=learning_rate)
            local_models.append(local_model)
        
        # Crear el modelo global combinando los modelos locales
        print('Creando modelo global')
        global_model = crear_modelo_lstm(input_shape=(X.shape[1], 1))
        
        # Promediar los pesos capa por capa (Federated Averaging)
        global_weights = [np.mean([model.get_weights()[layer] for model in local_models], axis=0) for layer in range(len(local_models[0].get_weights()))]
        global_model.set_weights(global_weights)
        
        # Evaluar el modelo global en datos de prueba combinados
        X_test_combined = np.vstack([X_scaled_splits[i] for i in range(num_parts)])
        y_test_combined = np.hstack([y_splits[i] for i in range(num_parts)])
        
        loss, accuracy = global_model.evaluate(X_test_combined, y_test_combined, verbose=0)
        
        all_accuracies.append(accuracy)
        print(f'Accuracy del modelo global después de {iteration + 1} iteración: {accuracy:.5f}\n')
    
    return all_accuracies


# Cargar datos desde el archivo CSV
df = load_data('processed_data.csv')

# Seleccionar características y etiquetas
X = df.drop(columns=['diabetesMed_Yes'])
y = df['diabetesMed_Yes']

# Ejecutar la simulación de Federated Learning
accuracies = federated_learning(X, y, num_parts=5, num_iterations=1, epochs=5, learning_rate=0.001)


Iteración 1/1
Entrenando modelo local 1/5
Epoch 1/5


  super().__init__(**kwargs)


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 181ms/step - accuracy: 0.7344 - loss: 0.5954 - val_accuracy: 0.7266 - val_loss: 0.5869
Epoch 2/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 173ms/step - accuracy: 0.7286 - loss: 0.5873 - val_accuracy: 0.7266 - val_loss: 0.5897
Epoch 3/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 192ms/step - accuracy: 0.7380 - loss: 0.5759 - val_accuracy: 0.7266 - val_loss: 0.5867
Epoch 4/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 206ms/step - accuracy: 0.7358 - loss: 0.5783 - val_accuracy: 0.7266 - val_loss: 0.5866
Epoch 5/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 204ms/step - accuracy: 0.7347 - loss: 0.5798 - val_accuracy: 0.7266 - val_loss: 0.5873
Entrenando modelo local 2/5
Epoch 1/5


  super().__init__(**kwargs)


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 219ms/step - accuracy: 0.7515 - loss: 0.5758 - val_accuracy: 0.7503 - val_loss: 0.5596
Epoch 2/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 180ms/step - accuracy: 0.7646 - loss: 0.5466 - val_accuracy: 0.7503 - val_loss: 0.5684
Epoch 3/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 173ms/step - accuracy: 0.7568 - loss: 0.5580 - val_accuracy: 0.7503 - val_loss: 0.5625
Epoch 4/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 154ms/step - accuracy: 0.7559 - loss: 0.5563 - val_accuracy: 0.7503 - val_loss: 0.5624
Epoch 5/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 181ms/step - accuracy: 0.7657 - loss: 0.5453 - val_accuracy: 0.7503 - val_loss: 0.5622
Entrenando modelo local 3/5
Epoch 1/5


  super().__init__(**kwargs)


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 152ms/step - accuracy: 0.7560 - loss: 0.5695 - val_accuracy: 0.7544 - val_loss: 0.5740
Epoch 2/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 165ms/step - accuracy: 0.7637 - loss: 0.5478 - val_accuracy: 0.7544 - val_loss: 0.5589
Epoch 3/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 188ms/step - accuracy: 0.7599 - loss: 0.5507 - val_accuracy: 0.7544 - val_loss: 0.5589
Epoch 4/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 200ms/step - accuracy: 0.7582 - loss: 0.5544 - val_accuracy: 0.7544 - val_loss: 0.5577
Epoch 5/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 201ms/step - accuracy: 0.7639 - loss: 0.5468 - val_accuracy: 0.7544 - val_loss: 0.5589
Entrenando modelo local 4/5
Epoch 1/5


  super().__init__(**kwargs)


[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 211ms/step - accuracy: 0.8037 - loss: 0.5215 - val_accuracy: 0.8019 - val_loss: 0.4980
Epoch 2/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 211ms/step - accuracy: 0.7991 - loss: 0.5021 - val_accuracy: 0.8019 - val_loss: 0.4979
Epoch 3/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 232ms/step - accuracy: 0.8082 - loss: 0.4891 - val_accuracy: 0.8019 - val_loss: 0.4975
Epoch 4/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 221ms/step - accuracy: 0.8040 - loss: 0.4951 - val_accuracy: 0.8019 - val_loss: 0.4979
Epoch 5/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 226ms/step - accuracy: 0.8055 - loss: 0.4930 - val_accuracy: 0.8019 - val_loss: 0.5004
Entrenando modelo local 5/5


  super().__init__(**kwargs)


Epoch 1/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 232ms/step - accuracy: 0.7984 - loss: 0.5299 - val_accuracy: 0.7904 - val_loss: 0.5135
Epoch 2/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 188ms/step - accuracy: 0.7912 - loss: 0.5122 - val_accuracy: 0.7904 - val_loss: 0.5056
Epoch 3/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 155ms/step - accuracy: 0.7974 - loss: 0.5030 - val_accuracy: 0.7904 - val_loss: 0.5136
Epoch 4/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 163ms/step - accuracy: 0.7977 - loss: 0.5041 - val_accuracy: 0.7904 - val_loss: 0.5132
Epoch 5/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 149ms/step - accuracy: 0.8030 - loss: 0.4964 - val_accuracy: 0.7904 - val_loss: 0.5136
Creando modelo global


NameError: name 'X_splits' is not defined