In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix
import os
from sklearn.impute import SimpleImputer

### Importación de los datos

In [4]:
# Leer el archivo CSV
df = pd.read_csv('diabetic_data.csv')

# Mostrar las primeras filas del dataset
print(df.head())

# Resumen de la información del dataset
print(df.info())

# Mostrar la cantidad de valores nulos por columna
print(df.isnull().sum())

   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No                   No

In [5]:
# Leer el archivo CSV, tratando '?' como valores nulos
df = pd.read_csv('diabetic_data.csv', na_values='?')

# Mostrar la cantidad de valores nulos por columna
print(df.isnull().sum())

# Guardar el DataFrame modificado en un nuevo archivo CSV
df.to_csv('diabetic_data_modified.csv', index=False)

print("El archivo modificado ha sido guardado como 'diabetic_data_modified.csv'.")


  df = pd.read_csv('diabetic_data.csv', na_values='?')


encounter_id                    0
patient_nbr                     0
race                         2273
gender                          0
age                             0
weight                      98569
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                  40256
medical_specialty           49949
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                         21
diag_2                        358
diag_3                       1423
number_diagnoses                0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide 

### Procesamiento de los datos

In [6]:
# Cargar el dataset
df = pd.read_csv('diabetic_data_modified.csv')

# Asegurarse de manejar correctamente los valores nulos
# Identificar las columnas con valores nulos según la descripción proporcionada
cols_with_missing = ['race', 'weight', 'payer_code', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3']

# Imputar los valores nulos con la moda para variables categóricas
categorical_imputer = SimpleImputer(strategy='most_frequent')
df[cols_with_missing] = categorical_imputer.fit_transform(df[cols_with_missing])

# Limpiar y convertir 'weight' a valores numéricos y luego categorizarlo
def clean_weight(weight_str):
    if weight_str == "?":
        return None
    elif weight_str.startswith(">"):
        return float(weight_str[1:]) + 1  # Incrementar en 1 para asegurar que los límites sean correctos
    elif weight_str.startswith("["):
        return float(weight_str.strip("[]").split("-")[0])
    elif weight_str == "Unknown":
        return None
    else:
        return float(weight_str)

df['weight'] = df['weight'].apply(clean_weight)

# Definir los rangos de peso y codificar 'weight'
weight_ranges = ['[0-25)', '[25-50)', '[50-75)', '[75-100)', '[100-125)', '[125-150)', '[150-175)', '[175-200)', '>200']
df['weight_category'] = pd.cut(df['weight'], bins=[0, 25, 50, 75, 100, 125, 150, 175, 200, float('inf')], labels=weight_ranges, right=False)

# Eliminar la columna original 'weight'
df.drop(columns=['weight'], inplace=True)

# Función para asignar categorías a los códigos ICD-9
def assign_icd_category(icd_code):
    if pd.isnull(icd_code):
        return 'Unknown'
    if icd_code.startswith(('E', 'V')):
        return 'E-V codes'
    else:
        code_number = int(icd_code.split('.')[0])  # Tomar solo el número de código ICD-9
        if 1 <= code_number <= 139:
            return '001-139'
        elif 140 <= code_number <= 239:
            return '140-239'
        elif 240 <= code_number <= 279:
            return '240-279'
        elif 280 <= code_number <= 289:
            return '280-289'
        elif 290 <= code_number <= 319:
            return '290-319'
        elif 320 <= code_number <= 389:
            return '320-389'
        elif 390 <= code_number <= 459:
            return '390-459'
        elif 460 <= code_number <= 519:
            return '460-519'
        elif 520 <= code_number <= 579:
            return '520-579'
        elif 580 <= code_number <= 629:
            return '580-629'
        elif 630 <= code_number <= 679:
            return '630-679'
        elif 680 <= code_number <= 709:
            return '680-709'
        elif 710 <= code_number <= 739:
            return '710-739'
        elif 740 <= code_number <= 759:
            return '740-759'
        elif 760 <= code_number <= 779:
            return '760-779'
        elif 780 <= code_number <= 799:
            return '780-799'
        elif 800 <= code_number <= 999:
            return '800-999'
        else:
            return 'Other'  # En caso de no encontrar una categoría válida

# Aplicar la función a cada columna de diagnóstico
for col in ['diag_1', 'diag_2', 'diag_3']:
    df[col + '_category'] = df[col].apply(assign_icd_category)

# Eliminar las columnas originales de diagnóstico
df.drop(columns=['diag_1', 'diag_2', 'diag_3'], inplace=True)

# Función para asignar valores únicos a las franjas de edad
def age_to_value(age_str):
    age_mapping = {
        '[0-10)': 5,
        '[10-20)': 15,
        '[20-30)': 25,
        '[30-40)': 35,
        '[40-50)': 45,
        '[50-60)': 55,
        '[60-70)': 65,
        '[70-80)': 75,
        '[80-90)': 85,
        '[90-100)': 95
    }
    return age_mapping.get(age_str, None)

# Aplicar la función de agrupamiento de edades
df['age'] = df['age'].apply(age_to_value)

# Aplicar codificación one-hot a las variables categóricas, excluyendo 'age' ya que está mapeada a valores únicos
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'age' in categorical_cols:
    categorical_cols.remove('age')
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cols = pd.DataFrame(encoder.fit_transform(df[categorical_cols]))

# Sustituir las columnas originales con las nuevas codificadas
encoded_cols.columns = encoder.get_feature_names_out(categorical_cols)
df.drop(columns=categorical_cols, inplace=True)
df = pd.concat([df, encoded_cols], axis=1)

# Guardar el resultado en un nuevo archivo CSV
# df.to_csv('processed_data.csv', index=False)

print("Datos procesados y guardados en 'processed_data.csv'.")

  df = pd.read_csv('diabetic_data_modified.csv')


Datos procesados y guardados en 'processed_data.csv'.


### Creación del modelo CNN

In [12]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf
import pandas as pd

# Función para cargar los datos
def load_data(filepath):
    return pd.read_csv(filepath)

def crear_modelo_cnn(input_shape, learning_rate=0.001):
    model = tf.keras.Sequential([
        tf.keras.layers.Conv1D(32, 3, activation='relu', input_shape=input_shape),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Conv1D(64, 3, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling1D(2),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(0.01))
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def train_on_device(X_train, y_train, input_shape, epochs, learning_rate):
    model = crear_modelo_cnn(input_shape, learning_rate)
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    history = model.fit(X_train, y_train, epochs=epochs, validation_split=0.2, verbose=1, callbacks=[early_stopping])
    return model, history

# Función para dividir los datos
def split_data(X, y, num_parts):
    X_splits = np.array_split(X, num_parts)
    y_splits = np.array_split(y, num_parts)
    return X_splits, y_splits

# Función principal para la simulación de Federated Learning
def federated_learning(X, y, num_parts, num_iterations, epochs=5, learning_rate=0.001):
    all_accuracies = []

    for iteration in range(num_iterations):
        print(f'Iteración {iteration + 1}/{num_iterations}')
        X_splits, y_splits = split_data(X, y, num_parts)
        local_models = []
        scalers = []

        for i in range(num_parts):
            print(f'Entrenando modelo local {i + 1}/{num_parts}')
            X_train, X_test, y_train, y_test = train_test_split(X_splits[i], y_splits[i], test_size=0.2, random_state=iteration, stratify=y_splits[i])
            
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train).reshape(-1, X_train.shape[1], 1)
            X_test_scaled = scaler.transform(X_test).reshape(-1, X_test.shape[1], 1)
            
            print(f'Tamaño de X_train_scaled: {X_train_scaled.shape}')
            print(f'Tamaño de y_train: {y_train.shape}')
            
            local_model, _ = train_on_device(X_train_scaled, y_train, input_shape=(X_train.shape[1], 1), epochs=epochs, learning_rate=learning_rate)
            local_models.append(local_model)
            scalers.append(scaler)
        
        # Crear el modelo global combinando los modelos locales
        print('Creando modelo global')
        global_model = crear_modelo_cnn(input_shape=(X.shape[1], 1), learning_rate=learning_rate)
        
        # Promediar los pesos capa por capa (Federated Averaging)
        global_weights = [np.mean([model.get_weights()[layer] for model in local_models], axis=0) for layer in range(len(local_models[0].get_weights()))]
        global_model.set_weights(global_weights)
        
        # Evaluar el modelo global en datos de prueba combinados
        X_test_combined = np.vstack([scalers[i].transform(X_splits[i]).reshape(-1, X_splits[i].shape[1], 1) for i in range(num_parts)])
        y_test_combined = np.hstack([y_splits[i] for i in range(num_parts)])
        
        loss, accuracy = global_model.evaluate(X_test_combined, y_test_combined, verbose=0)
        
        all_accuracies.append(accuracy)
        print(f'Accuracy del modelo global después de {iteration + 1} iteración: {accuracy:.5f}\n')
    
    return all_accuracies

# Cargar datos desde el archivo CSV
df = load_data('processed_data.csv')

# Seleccionar características y etiquetas
X = df.drop(columns=['diabetesMed_Yes'])
y = df['diabetesMed_Yes']

# Ejecutar la simulación de Federated Learning
accuracies = federated_learning(X, y, num_parts=5, num_iterations=1, epochs=10, learning_rate=0.001)
print(f'Accuracies: {accuracies}')


Iteración 1/1
Entrenando modelo local 1/5
Tamaño de X_train_scaled: (16283, 213, 1)
Tamaño de y_train: (16283,)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 16ms/step - accuracy: 0.8329 - loss: 1.2353 - val_accuracy: 0.9997 - val_loss: 0.3131
Epoch 2/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9954 - loss: 0.2345 - val_accuracy: 0.9975 - val_loss: 0.1392
Epoch 3/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 14ms/step - accuracy: 0.9972 - loss: 0.1394 - val_accuracy: 1.0000 - val_loss: 0.0800
Epoch 4/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - accuracy: 0.9984 - loss: 0.0811 - val_accuracy: 1.0000 - val_loss: 0.0847
Epoch 5/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 29ms/step - accuracy: 0.9975 - loss: 0.1125 - val_accuracy: 0.9997 - val_loss: 0.0870
Epoch 6/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.9982 - loss: 0.0968 - val_accuracy: 0.9975 - val_loss: 0.0993
Entrenando modelo lo

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 15ms/step - accuracy: 0.8500 - loss: 1.2147 - val_accuracy: 0.9988 - val_loss: 0.2720
Epoch 2/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9972 - loss: 0.2069 - val_accuracy: 0.9991 - val_loss: 0.1159
Epoch 3/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.9976 - loss: 0.1131 - val_accuracy: 1.0000 - val_loss: 0.0952
Epoch 4/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.9970 - loss: 0.1015 - val_accuracy: 0.9994 - val_loss: 0.0787
Epoch 5/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9983 - loss: 0.0870 - val_accuracy: 0.9997 - val_loss: 0.0689
Epoch 6/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9994 - loss: 0.0701 - val_accuracy: 0.9942 - val_loss: 0.0889
Epoch 7/10
[1m408/40

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 17ms/step - accuracy: 0.8507 - loss: 1.2238 - val_accuracy: 0.9982 - val_loss: 0.3059
Epoch 2/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9943 - loss: 0.2298 - val_accuracy: 1.0000 - val_loss: 0.1241
Epoch 3/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9953 - loss: 0.1362 - val_accuracy: 0.9979 - val_loss: 0.1034
Epoch 4/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 16ms/step - accuracy: 0.9959 - loss: 0.1129 - val_accuracy: 1.0000 - val_loss: 0.1096
Epoch 5/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9959 - loss: 0.1115 - val_accuracy: 0.9985 - val_loss: 0.0706
Epoch 6/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9976 - loss: 0.0801 - val_accuracy: 1.0000 - val_loss: 0.0619
Epoch 7/10
[1m408/40

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 18ms/step - accuracy: 0.8216 - loss: 1.3078 - val_accuracy: 1.0000 - val_loss: 0.3291
Epoch 2/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9950 - loss: 0.2538 - val_accuracy: 0.9979 - val_loss: 0.1547
Epoch 3/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9950 - loss: 0.1525 - val_accuracy: 0.9988 - val_loss: 0.1146
Epoch 4/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9969 - loss: 0.1163 - val_accuracy: 0.9997 - val_loss: 0.0912
Epoch 5/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9967 - loss: 0.0992 - val_accuracy: 1.0000 - val_loss: 0.0921
Epoch 6/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9981 - loss: 0.0893 - val_accuracy: 0.9969 - val_loss: 0.0768
Epoch 7/10
[1m408/40

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 19ms/step - accuracy: 0.8041 - loss: 1.3133 - val_accuracy: 0.9988 - val_loss: 0.3592
Epoch 2/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 18ms/step - accuracy: 0.9914 - loss: 0.2776 - val_accuracy: 0.9966 - val_loss: 0.1844
Epoch 3/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.9938 - loss: 0.1547 - val_accuracy: 0.9975 - val_loss: 0.1224
Epoch 4/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9961 - loss: 0.1135 - val_accuracy: 0.9957 - val_loss: 0.1425
Epoch 5/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.9972 - loss: 0.1120 - val_accuracy: 0.9988 - val_loss: 0.1168
Epoch 6/10
[1m408/408[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.9963 - loss: 0.1065 - val_accuracy: 1.0000 - val_loss: 0.0681
Epoch 7/10
[1m408/40