In [40]:
# Importar pandas para manejar el dataset
import pandas as pd
# Cargar el dataset DatosLimpiadosBasicos.csv
dataset = pd.read_csv('DataSets/DatosLimpiadosBasicos.csv')
# Mostrar las primeras filas del dataset
dataset.head()

Unnamed: 0,Diag01,Diag02,Diag03,Diag04,Diag05,Diag06,Diag07,Diag08,Diag09,Diag10,...,Proced24,Proced25,Proced26,Proced27,Proced28,Proced29,Proced30,Edad,Sexo_bin,GRD
0,A41.8,B37.6,I39.8,N10,B96.1,L89.9,L08.9,B96.2,A41.5,J86.9,...,99.84,88.72,90.42,90.52,91.39,91.33,87.03,40,1,184103
1,U07.1,J12.8,R06.0,R05,R50.9,Z29.0,Z01.7,J96.00,J94.2,J92.9,...,91.62,90.43,91.39,90.52,91.32,96.59,90.99,53,1,41013
2,K56.5,R57.2,R57.1,J80,Y95,J15.0,U82.2,B95.6,B96.8,B37.1,...,99.84,91.73,90.53,99.26,89.39,89.66,89.65,65,1,41013
3,K76.8,K66.1,N18.5,D64.9,E87.5,E87.2,J81,N17.8,J44.9,R41.0,...,57.94,0.13,0.17,99.04,99.18,99.21,99.23,61,1,41023
4,T81.0,Y83.2,S31.1,S36.80,W31.62,J96.09,J15.0,U82.2,U07.1,N39.0,...,90.52,91.39,91.32,93.9,99.15,96.59,45.13,30,1,41023


In [41]:
# Crear una nueva columna Diag_Principal con el formato deseado, manejando errores de formato
def parse_diag(diag):
    try:
        if isinstance(diag, float) or diag is None or diag == '':
            return [0, 0, 0, 0]  # Manejar valores NaN o vacíos
        if '.' in diag:
            parts = diag.split('.')
            if len(parts) == 2:
                if len(parts[1]) == 2:
                    return [(parts[0][0]), parts[0][1:], '.' + parts[1]]
                elif len(parts[1]) == 1:  # Manejar casos como "A41.8"
                    return [parts[0][0], parts[0][1:], '.' + parts[1]]

        else:
            # Manejar casos como Y95
            return [diag[0], diag[1:]]
        return None
    except (IndexError, ValueError):
        return None

# Aplicar la función corregida
dataset['Diag_Principal'] = dataset['Diag01'].dropna().apply(parse_diag)

# Mostrar las primeras filas del dataset con la nueva columna
dataset[['Diag01', 'Diag_Principal']].head()

Unnamed: 0,Diag01,Diag_Principal
0,A41.8,"[A, 41, .8]"
1,U07.1,"[U, 07, .1]"
2,K56.5,"[K, 56, .5]"
3,K76.8,"[K, 76, .8]"
4,T81.0,"[T, 81, .0]"


In [42]:
# Crear una nueva columna Diag_Secundario que combine los resultados de Diag02 a Diag35
columns_to_process = [f'Diag{str(i).zfill(2)}' for i in range(2, 36)]

# Modificar la función combine_secondary_diagnoses para reemplazar None por 0
def combine_secondary_diagnoses(row):
    combined = []
    for col in columns_to_process:
        if col in row:
            parsed = parse_diag(row[col])  # parse_diag ya maneja casos vacíos
            combined.extend(parsed)
    return combined

# Actualizar la columna Diag_Secundario con la nueva lógica
dataset['Diag_Secundario'] = dataset.apply(combine_secondary_diagnoses, axis=1)

# Mostrar las primeras filas del dataset con la nueva columna
dataset[['Diag_Secundario']].head()

Unnamed: 0,Diag_Secundario
0,"[B, 37, .6, I, 39, .8, N, 10, B, 96, .1, L, 89..."
1,"[J, 12, .8, R, 06, .0, R, 05, R, 50, .9, Z, 29..."
2,"[R, 57, .2, R, 57, .1, J, 80, Y, 95, J, 15, .0..."
3,"[K, 66, .1, N, 18, .5, D, 64, .9, E, 87, .5, E..."
4,"[Y, 83, .2, S, 31, .1, S, 36, .80, W, 31, .62,..."


In [43]:
# Crear una nueva columna Diag_Principal con el formato deseado, manejando errores de formato
def parse_proced(proced):
    try:
        if proced == None or proced == '':
            return [0, 0, 0, 0]  # Manejar valores NaN o vacíos
        proced = str(proced)  # Convertir a cadena para asegurar el procesamiento
        if '.' in proced:  # Corregir el error tipográfico
            parts = proced.split('.')
            if len(parts) == 2:
                return [parts[0], '.' + parts[1]]

        return None
    except (IndexError, ValueError):
        return None

# Aplicar la función parse_proced correctamente a la columna Proced01
dataset['Proced_Principal'] = dataset['Proced01'].dropna().apply(parse_proced)
# Mostrar las primeras filas del dataset con la nueva columna
resultado = dataset[['Proced01', 'Proced_Principal']].head()
resultado

Unnamed: 0,Proced01,Proced_Principal
0,86.28,"[86, .28]"
1,31.1,"[31, .1]"
2,86.22,"[86, .22]"
3,54.19,"[54, .19]"
4,54.11,"[54, .11]"


In [44]:
columns_to_process = [f'Proced{str(i).zfill(2)}' for i in range(2, 36)]

# Modificar la función combine_secondary_diagnoses para reemplazar None por 0
def combine_secondary_proced(row):
    combined = []
    for col in columns_to_process:
        if col in row:
            parsed = parse_proced(row[col])  # parse_proced ya maneja casos vacíos
            if parsed is not None:  # Verificar si parsed no es None
                combined.extend(parsed)
    return combined

# Actualizar la columna Diag_Secundario con la nueva lógica
dataset['Proced_Secundario'] = dataset.apply(combine_secondary_proced, axis=1)

# Mostrar las primeras filas del dataset con la nueva columna
dataset[['Proced_Secundario']].head()

Unnamed: 0,Proced_Secundario
0,"[34, .91, 88, .38, 87, .41, 88, .75, 88, .01, ..."
1,"[86, .28, 96, .72, 93, .9, 89, .7, 88, .01, 87..."
2,"[54, .59, 96, .72, 31, .1, 45, .13, 87, .41, 8..."
3,"[39, .98, 86, .28, 54, .62, 96, .72, 96, .04, ..."
4,"[45, .9, 46, .73, 46, .2, 86, .28, 54, .19, 89..."


In [45]:
# Descomponer los datos de GRD en las columnas correspondientes
def descomponer_grd(grd):
    try:
        grd_str = str(grd).zfill(6)  # Asegurar que el GRD tenga 7 caracteres
        return [
            int(grd_str[0:2]),  # CDM
            int(grd_str[2]),    # Tipo_GRD
            int(grd_str[3:5]),  # GRD
            int(grd_str[5:7])   # Severidad
        ]
    except ValueError:
        return [None, None, None, None]

# Aplicar la función para descomponer la columna GRD
dataset[['CDM', 'Tipo_GRD', 'GRD_', 'Severidad']] = dataset['GRD'].apply(descomponer_grd).apply(pd.Series)

# Mostrar las primeras filas del dataset con las nuevas columnas
dataset[['GRD', 'CDM', 'Tipo_GRD', 'GRD_', 'Severidad']].head()

Unnamed: 0,GRD,CDM,Tipo_GRD,GRD_,Severidad
0,184103,18,4,10,3
1,41013,4,1,1,3
2,41013,4,1,1,3
3,41023,4,1,2,3
4,41023,4,1,2,3


In [46]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


# Tokenize Diag_Principal
principal_as_text = [' '.join(map(str, seq)) for seq in dataset['Diag_Principal']]
tokenizer_principal = Tokenizer(oov_token='<OOV>')
tokenizer_principal.fit_on_texts(principal_as_text)
sequences_principal = tokenizer_principal.texts_to_sequences(principal_as_text)
padded_principal = pad_sequences(sequences_principal, padding='post')
dataset['Diag_Principal_Token'] = padded_principal.tolist()

# Tokenize Diag_Secundario
secundario_as_text = [' '.join(map(str, seq)) for seq in dataset['Diag_Secundario']]
tokenizer_secundario = Tokenizer(oov_token='<OOV>')
tokenizer_secundario.fit_on_texts(secundario_as_text)
sequences_secundario = tokenizer_secundario.texts_to_sequences(secundario_as_text)
padded_secundario = pad_sequences(sequences_secundario, padding='post')
dataset['Diag_Secundario_Token'] = padded_secundario.tolist()

# Tokenize Proced_Principal
proced_principal_as_text = [' '.join(map(str, seq)) for seq in dataset['Proced_Principal']]
tokenizer_proced_principal = Tokenizer(oov_token='<OOV>')
tokenizer_proced_principal.fit_on_texts(proced_principal_as_text)
sequences_proced_principal = tokenizer_proced_principal.texts_to_sequences(proced_principal_as_text)
padded_proced_principal = pad_sequences(sequences_proced_principal, padding='post')
dataset['Proced_Principal_Token'] = padded_proced_principal.tolist()

# Tokenize Proced_Secundario
proced_secundario_as_text = [' '.join(map(str, seq)) for seq in dataset['Proced_Secundario']]
tokenizer_proced_secundario = Tokenizer(oov_token='<OOV>')
tokenizer_proced_secundario.fit_on_texts(proced_secundario_as_text)
sequences_proced_secundario = tokenizer_proced_secundario.texts_to_sequences(proced_secundario_as_text)
padded_proced_secundario = pad_sequences(sequences_proced_secundario, padding='post')
dataset['Proced_Secundario_Token'] = padded_proced_secundario.tolist()

# Ensure tokenized columns are stored as lists instead of strings
columns_to_convert = ['Diag_Principal_Token', 'Diag_Secundario_Token', 'Proced_Principal_Token', 'Proced_Secundario_Token']
for column in columns_to_convert:
    dataset[column] = dataset[column].apply(lambda x: list(map(int, x.split())) if isinstance(x, str) else x)

print(padded_sequences_diag)
print(len(padded_sequences_diag))
print(padded_sequences_diag[0])
print(tokenizer_diag.word_index)


[[22  7 10 ...  0  0  0]
 [17  4  5 ...  0  0  0]
 [19 12 10 ...  0  0  0]
 ...
 [15  4  2 ...  0  0  0]
 [15  4  2 ...  0  0  0]
 [33  2  9 ...  0  0  0]]
14561
[22  7 10  9 15  7  3  8 18  4  2 22  3  9  4 25  8  3  3 25  2  8  3 22
  3  9  5 31 11  4 12 17  8  9  3 28  2 10  4 27  3 12 18  4  8 12 13  8
 10  4 21  9 11  3 26  8  5  5 18  7  4  3 15  4  2 20 10  2 11 16  4  3
  5 16  4  2  5 13 11  9 26 11  2  3 25  5  4  3  6  3  7 12  6  3  7  7
  6  3  3  5  6  3  4  9  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  0  0  0  0  0  0  0  0  0  0  0  0
  0]
{'<OOV>': 1, '0': 2, '9': 3, '1': 4, '2': 5, 'z': 6, '3': 7, '8': 8, '6': 9, '7': 10, '4': 11, '5': 12, 'e': 13, 'o': 14, 'i': 15, 'f': 16, 'j': 17, 'n': 18, 'r': 19, 'k': 20, 'd': 21, 'b': 22, 'p': 23, 'm': 24, 'l': 25, 'g': 26, 'y': 27, 'u': 28, 'h': 29, 'x': 30, 'a': 31, 't': 32, 's': 33, 'w': 34, 'q': 35, 'c': 36, 'v': 37}


In [47]:
dataset.head()

Unnamed: 0,Diag01,Diag02,Diag03,Diag04,Diag05,Diag06,Diag07,Diag08,Diag09,Diag10,...,Proced_Principal,Proced_Secundario,CDM,Tipo_GRD,GRD_,Severidad,Diag_Principal_Token,Diag_Secundario_Token,Proced_Principal_Token,Proced_Secundario_Token
0,A41.8,B37.6,I39.8,N10,B96.1,L89.9,L08.9,B96.2,A41.5,J86.9,...,"[86, .28]","[34, .91, 88, .38, 87, .41, 88, .75, 88, .01, ...",18,4,10,3,"[32, 51, 9]","[34, 22, 30, 10, 8, 11, 19, 12, 34, 27, 6, 48,...","[19, 58]","[29, 11, 13, 27, 19, 43, 13, 24, 13, 18, 13, 5..."
1,U07.1,J12.8,R06.0,R05,R50.9,Z29.0,Z01.7,J96.00,J94.2,J92.9,...,"[31, .1]","[86, .28, 96, .72, 93, .9, 89, .7, 88, .01, 87...",4,1,1,3,"[25, 21, 5]","[16, 63, 11, 23, 83, 2, 23, 70, 23, 40, 5, 3, ...","[60, 6]","[72, 73, 15, 50, 3, 46, 4, 9, 13, 18, 19, 43, ..."
2,K56.5,R57.2,R57.1,J80,Y95,J15.0,U82.2,B95.6,B96.8,B37.1,...,"[86, .22]","[54, .59, 96, .72, 31, .1, 45, .13, 87, .41, 8...",4,1,1,3,"[6, 86, 14]","[23, 80, 4, 23, 80, 6, 16, 33, 53, 54, 16, 119...","[19, 41]","[74, 7, 15, 50, 68, 82, 69, 32, 19, 43, 19, 51..."
3,K76.8,K66.1,N18.5,D64.9,E87.5,E87.2,J81,N17.8,J44.9,R41.0,...,"[54, .19]","[39, .98, 86, .28, 54, .62, 96, .72, 96, .04, ...",4,1,2,3,"[6, 49, 9]","[26, 29, 6, 19, 39, 21, 32, 45, 5, 7, 25, 21, ...","[44, 51]","[6, 98, 72, 73, 74, 89, 15, 50, 15, 40, 19, 43..."
4,T81.0,Y83.2,S31.1,S36.80,W31.62,J96.09,J15.0,U82.2,U07.1,N39.0,...,"[54, .11]","[45, .9, 46, .73, 46, .2, 86, .28, 54, .19, 89...",4,1,2,3,"[20, 31, 3]","[53, 50, 4, 88, 109, 6, 88, 91, 33, 95, 109, 1...","[44, 23]","[69, 46, 86, 53, 86, 101, 72, 73, 74, 12, 4, 9..."


In [48]:
# Save the tokenized dataset to a CSV file in the DataSets folder
dataset.to_csv('DataSets/DataSetTokenizado.csv', index=False)