In [165]:
# Importar pandas para manejar el dataset
import pandas as pd
# Cargar el dataset DatosLimpiadosBasicos.csv
dataset = pd.read_csv('DataSets/DatosLimpiadosBasicos.csv')
# Mostrar las primeras filas del dataset
dataset.head()

Unnamed: 0,Diag01,Diag02,Diag03,Diag04,Diag05,Diag06,Diag07,Diag08,Diag09,Diag10,...,Proced24,Proced25,Proced26,Proced27,Proced28,Proced29,Proced30,Edad,Sexo_bin,GRD
0,A41.8,B37.6,I39.8,N10,B96.1,L89.9,L08.9,B96.2,A41.5,J86.9,...,99.84,88.72,90.42,90.52,91.39,91.33,87.03,40,1,184103
1,U07.1,J12.8,R06.0,R05,R50.9,Z29.0,Z01.7,J96.00,J94.2,J92.9,...,91.62,90.43,91.39,90.52,91.32,96.59,90.99,53,1,41013
2,K56.5,R57.2,R57.1,J80,Y95,J15.0,U82.2,B95.6,B96.8,B37.1,...,99.84,91.73,90.53,99.26,89.39,89.66,89.65,65,1,41013
3,K76.8,K66.1,N18.5,D64.9,E87.5,E87.2,J81,N17.8,J44.9,R41.0,...,57.94,0.13,0.17,99.04,99.18,99.21,99.23,61,1,41023
4,T81.0,Y83.2,S31.1,S36.80,W31.62,J96.09,J15.0,U82.2,U07.1,N39.0,...,90.52,91.39,91.32,93.9,99.15,96.59,45.13,30,1,41023


In [164]:
# Crear una nueva columna Diag_Principal con el formato deseado, manejando errores de formato
def parse_diag(diag):
    try:
        if isinstance(diag, float) or diag is None or diag == '':
            return [0, 0, 0, 0]  # Manejar valores NaN o vacíos
        if '.' in diag:
            parts = diag.split('.')
            if len(parts) == 2:
                if len(parts[1]) == 2:
                    return [parts[0][0], parts[0][1:], parts[1][0], parts[1][1]]
                else:
                    return [parts[0][0], parts[0][1:], parts[1], 0]
        else:
            # Manejar casos como Y95
            return [diag[0], diag[1:], 0, 0]
        return None
    except (IndexError, ValueError):
        return None

# Aplicar la función corregida
dataset['Diag_Principal'] = dataset['Diag01'].dropna().apply(parse_diag)

# Mostrar las primeras filas del dataset con la nueva columna
dataset[['Diag01', 'Diag_Principal']].head()

Unnamed: 0,Diag01,Diag_Principal
0,A41.8,"[A, 41, 8]"
1,U07.1,"[U, 07, 1]"
2,K56.5,"[K, 56, 5]"
3,K76.8,"[K, 76, 8]"
4,T81.0,"[T, 81, 0]"


In [154]:
# Crear una nueva columna Diag_Secundario que combine los resultados de Diag02 a Diag35
columns_to_process = [f'Diag{str(i).zfill(2)}' for i in range(2, 36)]

# Modificar la función combine_secondary_diagnoses para reemplazar None por 0
def combine_secondary_diagnoses(row):
    combined = []
    for col in columns_to_process:
        if col in row:
            parsed = parse_diag(row[col])  # parse_diag ya maneja casos vacíos
            combined.extend(parsed)
    return combined

# Actualizar la columna Diag_Secundario con la nueva lógica
dataset['Diag_Secundario'] = dataset.apply(combine_secondary_diagnoses, axis=1)

# Mostrar las primeras filas del dataset con la nueva columna
dataset[['Diag_Secundario']].head()

Unnamed: 0,Diag_Secundario
0,"[B, 37, 6, 0, I, 39, 8, 0, N, 10, 0, 0, B, 96,..."
1,"[J, 12, 8, 0, R, 06, 0, 0, R, 05, 0, 0, R, 50,..."
2,"[R, 57, 2, 0, R, 57, 1, 0, J, 80, 0, 0, Y, 95,..."
3,"[K, 66, 1, 0, N, 18, 5, 0, D, 64, 9, 0, E, 87,..."
4,"[Y, 83, 2, 0, S, 31, 1, 0, S, 36, 8, 0, W, 31,..."


In [155]:
# Crear una nueva columna Diag_Principal con el formato deseado, manejando errores de formato
def parse_proced(proced):
    try:
        if proced == None or proced == '':
            return [0, 0, 0, 0]  # Manejar valores NaN o vacíos
        proced = str(proced)  # Convertir a cadena para asegurar el procesamiento
        if '.' in proced:
            parts = proced.split('.')
            if len(parts) == 2:
                part0 = parts[0]
                part1 = parts[1]

                # Manejar el caso en el que part0 tiene longitud 1
                if len(part0) == 1:
                    part0_main = 0
                    part0_sub = part0
                else:
                    part0_main = part0[0]
                    part0_sub = part0[1:]

                if len(part1) == 2:
                    return [part0_main, part0_sub, part1[0], part1[1]]
                else:
                    return [part0_main, part0_sub, part1, 0]
        return None
    except (IndexError, ValueError):
        return None

# Aplicar la función parse_proced correctamente a la columna Proced01
dataset['Proced_Principal'] = dataset['Proced01'].dropna().apply(parse_proced)
# Mostrar las primeras filas del dataset con la nueva columna
resultado = dataset[['Proced01', 'Proced_Principal']].head()
resultado

Unnamed: 0,Proced01,Proced_Principal
0,86.28,"[8, 6, 2, 8]"
1,31.1,"[3, 1, 1, 0]"
2,86.22,"[8, 6, 2, 2]"
3,54.19,"[5, 4, 1, 9]"
4,54.11,"[5, 4, 1, 1]"


In [156]:
columns_to_process = [f'Proced{str(i).zfill(2)}' for i in range(2, 36)]

# Modificar la función combine_secondary_diagnoses para reemplazar None por 0
def combine_secondary_proced(row):
    combined = []
    for col in columns_to_process:
        if col in row:
            parsed = parse_proced(row[col])  # parse_proced ya maneja casos vacíos
            if parsed is not None:  # Verificar si parsed no es None
                combined.extend(parsed)
            else:
                combined.extend([0, 0, 0, 0])
    return combined

# Actualizar la columna Diag_Secundario con la nueva lógica
dataset['Proced_Secundario'] = dataset.apply(combine_secondary_proced, axis=1)

# Mostrar las primeras filas del dataset con la nueva columna
dataset[['Proced_Secundario']].head()

Unnamed: 0,Proced_Secundario
0,"[3, 4, 9, 1, 8, 8, 3, 8, 8, 7, 4, 1, 8, 8, 7, ..."
1,"[8, 6, 2, 8, 9, 6, 7, 2, 9, 3, 9, 0, 8, 9, 7, ..."
2,"[5, 4, 5, 9, 9, 6, 7, 2, 3, 1, 1, 0, 4, 5, 1, ..."
3,"[3, 9, 9, 8, 8, 6, 2, 8, 5, 4, 6, 2, 9, 6, 7, ..."
4,"[4, 5, 9, 0, 4, 6, 7, 3, 4, 6, 2, 0, 8, 6, 2, ..."


In [157]:
# Descomponer los datos de GRD en las columnas correspondientes
def descomponer_grd(grd):
    try:
        grd_str = str(grd).zfill(6)  # Asegurar que el GRD tenga 7 caracteres
        return [
            int(grd_str[0:2]),  # CDM
            int(grd_str[2]),    # Tipo_GRD
            int(grd_str[3:5]),  # GRD
            int(grd_str[5:7])   # Severidad
        ]
    except ValueError:
        return [None, None, None, None]

# Aplicar la función para descomponer la columna GRD
dataset[['CDM', 'Tipo_GRD', 'GRD_', 'Severidad']] = dataset['GRD'].apply(descomponer_grd).apply(pd.Series)

# Mostrar las primeras filas del dataset con las nuevas columnas
dataset[['GRD', 'CDM', 'Tipo_GRD', 'GRD_', 'Severidad']].head()

Unnamed: 0,GRD,CDM,Tipo_GRD,GRD_,Severidad
0,184103,18,4,10,3
1,41013,4,1,1,3
2,41013,4,1,1,3
3,41023,4,1,2,3
4,41023,4,1,2,3


In [158]:
dataset.head()


Unnamed: 0,Diag01,Diag02,Diag03,Diag04,Diag05,Diag06,Diag07,Diag08,Diag09,Diag10,...,Sexo_bin,GRD,Diag_Principal,Diag_Secundario,Proced_Principal,Proced_Secundario,CDM,Tipo_GRD,GRD_,Severidad
0,A41.8,B37.6,I39.8,N10,B96.1,L89.9,L08.9,B96.2,A41.5,J86.9,...,1,184103,"[A, 41, 8, 0]","[B, 37, 6, 0, I, 39, 8, 0, N, 10, 0, 0, B, 96,...","[8, 6, 2, 8]","[3, 4, 9, 1, 8, 8, 3, 8, 8, 7, 4, 1, 8, 8, 7, ...",18,4,10,3
1,U07.1,J12.8,R06.0,R05,R50.9,Z29.0,Z01.7,J96.00,J94.2,J92.9,...,1,41013,"[U, 07, 1, 0]","[J, 12, 8, 0, R, 06, 0, 0, R, 05, 0, 0, R, 50,...","[3, 1, 1, 0]","[8, 6, 2, 8, 9, 6, 7, 2, 9, 3, 9, 0, 8, 9, 7, ...",4,1,1,3
2,K56.5,R57.2,R57.1,J80,Y95,J15.0,U82.2,B95.6,B96.8,B37.1,...,1,41013,"[K, 56, 5, 0]","[R, 57, 2, 0, R, 57, 1, 0, J, 80, 0, 0, Y, 95,...","[8, 6, 2, 2]","[5, 4, 5, 9, 9, 6, 7, 2, 3, 1, 1, 0, 4, 5, 1, ...",4,1,1,3
3,K76.8,K66.1,N18.5,D64.9,E87.5,E87.2,J81,N17.8,J44.9,R41.0,...,1,41023,"[K, 76, 8, 0]","[K, 66, 1, 0, N, 18, 5, 0, D, 64, 9, 0, E, 87,...","[5, 4, 1, 9]","[3, 9, 9, 8, 8, 6, 2, 8, 5, 4, 6, 2, 9, 6, 7, ...",4,1,2,3
4,T81.0,Y83.2,S31.1,S36.80,W31.62,J96.09,J15.0,U82.2,U07.1,N39.0,...,1,41023,"[T, 81, 0, 0]","[Y, 83, 2, 0, S, 31, 1, 0, S, 36, 8, 0, W, 31,...","[5, 4, 1, 1]","[4, 5, 9, 0, 4, 6, 7, 3, 4, 6, 2, 0, 8, 6, 2, ...",4,1,2,3


In [160]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

diag_as_text = [' '.join(seq) for seq in dataset['Diag_Secundario'].astype(str)]

tokenizer_diag = Tokenizer(oov_token='<OOV>')
tokenizer_diag.fit_on_texts(diag_as_text)

sequences_diag = tokenizer_diag.texts_to_sequences(diag_as_text)
padded_sequences_diag = pad_sequences(sequences_diag, padding='post')

print(padded_sequences_diag)
print(len(padded_sequences_diag))
print(padded_sequences_diag[0])

[[ 3 23  3 ...  0  0  0]
 [ 3 18  3 ...  0  0  0]
 [ 3 20  3 ...  0  0  0]
 ...
 [ 3 16  3 ...  0  0  0]
 [ 3 16  3 ...  0  0  0]
 [ 3 34  3 ...  0  0  0]]
14561
[ 3 23  3  3  8 11  3  3 10  3  2  3 16  3  3  8  4  3  3  9  3  2  3 19
  3  3  5  2  3  2  2  3 23  3  3  4 10  3  3  5  3  2  3 26  3  3  9  4
  3  3  4  3  2  3 26  3  3  2  9  3  3  4  3  2  3 23  3  3  4 10  3  3
  6  3  2  3 32  3  3 12  5  3  3 13  3  2  3 18  3  3  9 10  3  3  4  3
  2  3 29  3  3  2 11  3  3  5  3  2  3 28  3  3  4 13  3  2  2  3 19  3
  3  5  9  3  3 13  3  2  3 14  3  3  9 11  3  3  5  3  2  3 22  3  3 10
 12  3  3  4  3  2  3 27  3  3  9  6  3  3  6  3  2  3 19  3  3  8  5  3
  3  4  3  2  3 16  3  3  5  2  3  2  2  3 21  3  3 11  2  3  3 12  3  2
  3 17  3  3  5  4  3  3  6  3  2  3 17  3  3  5  2  3  3  6  3  2  3 14
  3  3 12 10  3  2  2  3 27  3  3 12  2  3  3  4  3  2  3 26  3  3  6  5
  3  3  4  3  2  3  7  3  3  4  8  3  3 13  3  2  3  7  3  3  4  8  3  3
  8  3  2  3  7  3  3  4  4  3  3  