In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Load both source tables: patient diagnoses (Excel) and procedures (CSV).
diagnose_df = pd.read_excel("datos/Datos_diagnosticos.xlsx")

# Rename the procedure columns in one step so both tables share the CASE key
# and the length-of-stay column has a short name.
process_df = pd.read_csv("datos/grouped_procedures.csv", sep=",").rename(
    columns={"Case": "CASE", "LOS (days)": "LOS"}
)

process_df

Unnamed: 0,CASE,Procedure,LOS
0,13872110,0A,3
1,14035188,0A,3
2,14085514,0A,1
3,14111667,0A,3
4,14111831,0A,2
...,...,...,...
11946,15056499,4-5,1
11947,15056507,4,1
11948,15056517,4,1
11949,15057405,0A,1


In [None]:
# Reduce every diagnosis code to its first character.
diagnose = list(map(lambda codigo: codigo[0], diagnose_df["Diagnosis"]))

diagnose_df["Diagnosis"] = diagnose

In [None]:
# Join the procedure and diagnosis tables on their shared CASE column.
data = process_df.merge(diagnose_df, on="CASE")

In [None]:
# Print the number of rows in the merged table, then display it.
print(len(data.index))
data

110060


Unnamed: 0,CASE,Procedure,LOS,Seq,PrincSec,Diagnosis
0,13872110,0A,3,1,P,E
1,13872110,0A,3,1,S,Z
2,13872110,0A,3,1,S,D
3,14035188,0A,3,1,P,J
4,14035188,0A,3,1,S,I
...,...,...,...,...,...,...
110055,15057405,0A,1,2,S,D
110056,15057706,1,0,1,P,O
110057,15057706,1,0,2,P,O
110058,15057706,1,0,2,S,Z


In [None]:
# Keep only the rows flagged 'P' in PrincSec (the principal diagnosis).
es_principal = data['PrincSec'] == 'P'
data = data[es_principal]

In [None]:
# Display the current contents of `data`.
data

Unnamed: 0,CASE,Procedure,LOS,Seq,PrincSec,Diagnosis
0,13872110,0A,3,1,P,E
3,14035188,0A,3,1,P,J
8,14085514,0A,1,1,P,S
11,14111667,0A,3,1,P,E
14,14111831,0A,2,1,P,E
...,...,...,...,...,...,...
110050,15056517,4,1,1,P,G
110052,15057405,0A,1,1,P,U
110053,15057405,0A,1,2,P,N
110056,15057706,1,0,1,P,O


## El archivo categorizacion_filtro.ipynb genera una base de datos aplicando todo el código anterior de este cuaderno y, además, filtrando por cantidad de procedimientos

In [13]:
# Load the pre-built dataset from disk; from here on `data` refers to this file.
# NOTE(review): the saved output of this cell records a TypeError - confirm the
# cell runs cleanly on a fresh kernel.
data = pd.read_csv('data_redes.csv', sep=",")

TypeError: ignored

In [4]:
# Minimum prior probability a diagnosis must reach to be kept.
umbral = 0.0

# Prior probability of each diagnosis: the number of rows of that diagnosis
# with a non-null 'Procedure' value, divided by the total number of rows.
total = len(data)
diagnosis_count = data.groupby(['Diagnosis']).count()

probs_diagnosis = {
    letra: frecuencia / total
    for letra, frecuencia in diagnosis_count['Procedure'].items()
    if frecuencia / total >= umbral
}

In [5]:
# Display the prior probability stored for each diagnosis.
probs_diagnosis

{'A': 0.011461142436871782,
 'B': 0.0020838440794312334,
 'C': 0.04615101740622702,
 'D': 0.023351311595979408,
 'E': 0.03321892620740378,
 'F': 0.00312576611914685,
 'G': 0.06177984800196126,
 'H': 0.003922530031870556,
 'I': 0.10756312821770042,
 'J': 0.04547683255699927,
 'K': 0.08525373866143662,
 'L': 0.007845060063741112,
 'M': 0.08053444471684236,
 'N': 0.040022064231429275,
 'O': 0.06931846040696249,
 'P': 0.003309634714390782,
 'Q': 0.0033709242461387595,
 'R': 0.020348124540328513,
 'S': 0.05129933807305712,
 'T': 0.023780338318215247,
 'U': 0.20630056386369208,
 'Z': 0.07048296151017407}

In [6]:
# Helper that renders a list of strings as a single dash-separated string
def formatter(lista):
    """Join the strings in `lista` with '-' and return the resulting string."""
    separador = "-"
    return separador.join(lista)

# Sorted list of every value in the CASE column (duplicates included)
cases = sorted(list(data['CASE'].values))

# Unique case ids, still in ascending order. A set removes the duplicates in
# one pass; the previous `case not in patients` check rescanned the growing
# list for every row, which is quadratic in the number of rows.
patients = sorted(set(cases))

In [7]:
# Build one record per patient: (case id, diagnoses, procedure, LOS).
# The rows are grouped by CASE once up front. Re-filtering the whole frame with
# a boolean mask for every patient made this cell quadratic in the row count.
grupos_por_caso = data.groupby('CASE', sort=False)

filas = list()

for case in patients:
    # Rows of this patient, in their original order
    filtro = grupos_por_caso.get_group(case)

    # Distinct diagnoses of the patient, as a sorted tuple
    diagnosticos = tuple(sorted(set(filtro['Diagnosis'].values)))

    # Procedure value taken from the patient's first row
    procedures = filtro['Procedure'].values[0]

    # Largest LOS value among the patient's rows
    los = max(list(filtro['LOS'].values))

    row = (case, diagnosticos, procedures, los)
    filas.append(row)

In [8]:
# Turn the per-patient tuples collected in `filas` into a DataFrame, one column
# per tuple field.
new_df = pd.DataFrame(filas, columns=["Case", "Diagnosis", "Procedure", "LOS"])

In [9]:
# Distinct case ids present in new_df.
new_df['Case'].unique()

array([13872110, 14035188, 14085514, ..., 15056517, 15057405, 15057706])

In [10]:
# Distinct procedure strings present in new_df.
new_df['Procedure'].unique()

array(['0C', '0A', '0B', '0D', '0F-F', '0F-3', '0F-3-F', '0F',
       '0F-3-4-5-B', '0F-3-5-C', '1', '0D-1-3-4', '0C-1', '0C-1-3-4',
       '0C-1-3', 'F', '1-3-4', '0F-3-B-F', '0B-1-3-4', '1-3', '0C-1-4',
       '0B-1-3', '1-4', '0B-1', '0B-1-4', '0D-1', '0F-3-5', '5-B', '3-4',
       '0E', '0B-B', '0C-5', '0E-5', '0E-3', '0D-1-4', '0F-D', '4',
       '0F-5', '0A-3-4-B', '0C-B', 'C', '0D-1-3', 'B', '0F-3-4-F',
       '0E-5-B', '0A-4-B', '0C-3', '0E-B', '0E-3-F', '0D-3-5', '0F-B',
       '0F-3-5-B', '0A-4', '0F-3-B', '0A-3-4', '0A-1-3-4', '4-5-B', '4-B',
       '0D-3', '0F-4-5-B', '0D-5-B', '3', '0F-3-D', '0D-5', '0F-B-F',
       '0C-5-B', '0B-3', '3-5-B', '3-5', '0A-4-5-B', '0F-5-B', '5',
       '0E-3-5', '0A-B', '3-4-B', '0D-B', '0E-3-4-5', '0E-4-5-B',
       '0F-3-4-B', '0E-4-B', '0E-3-B', '0E-1-3-4', 'D', '0D-4-B', '0E-4',
       '0C-4-B', '0C-4-5-B', '0E-4-5', '4-5', '0B-4-B', '0B-5', '0E-F',
       '0A-3', '3-D', '0C-3-B-F', '0C-4', '0A-C', '3-4-F', '0A-5',
       '0D-3-4-5-B', '0

In [12]:
# Show the first 50 rows of new_df.
new_df.head(50)

Unnamed: 0,Case,Diagnosis,Procedure,LOS
0,13872110,"(E,)",0C,3
1,14035188,"(J,)",0C,3
2,14085514,"(S,)",0A,1
3,14111667,"(E,)",0C,3
4,14111831,"(E,)",0B,2
5,14114821,"(Z,)",0B,2
6,14131491,"(Z,)",0D,6
7,14160672,"(K,)",0A,1
8,14178755,"(M, Z)",0F-F,42
9,14178769,"(D,)",0C,3


In [None]:
# Number of distinct procedure strings in new_df.
len(new_df['Procedure'].unique())

103

In [None]:
# Collects the diagnosis combinations present in the data that only use known codes

def powerSet(lista, df):
    """Return the distinct diagnosis combinations of `df` built only from `lista`.

    Despite its name, this does not enumerate the mathematical power set of
    `lista`: it only keeps combinations that actually occur in
    ``df['Diagnosis']``.

    Parameters
    ----------
    lista : container
        Diagnosis codes a combination is allowed to contain.
    df : pandas.DataFrame
        Frame with a 'Diagnosis' column whose entries are hashable iterables
        of codes (for example tuples).

    Returns
    -------
    list
        Every qualifying combination exactly once, in order of first
        appearance in `df`. Building the result through ``set`` made the order
        depend on string hash randomization, so it could change between runs.
    """
    vistos = set()
    conjunto = list()
    for diag in df['Diagnosis']:
        if diag in vistos:
            continue
        # A combination qualifies only if every one of its codes is allowed
        if all(c in lista for c in diag):
            vistos.add(diag)
            conjunto.append(diag)
    return conjunto

In [None]:
# Estimate, for every diagnosis combination, the probability that the
# procedure is exactly '0A' and the probability that it is not.
diagnoses = powerSet(list(probs_diagnosis.keys()), new_df)
probs_0A_diagnoses = {'0A': dict(), '-0A': dict()}

# Single pass over the patients: count, per diagnosis combination, all patients
# and those whose procedure is '0A'. This replaces one full-frame scan per
# combination, no longer relies on comparing a Series against a tuple, and no
# longer overwrites the global `total`.
pacientes_por_comb = dict()
pacientes_0A_por_comb = dict()
for comb_paciente, procedimiento in zip(new_df['Diagnosis'], new_df['Procedure']):
    pacientes_por_comb[comb_paciente] = pacientes_por_comb.get(comb_paciente, 0) + 1
    if procedimiento == '0A':
        pacientes_0A_por_comb[comb_paciente] = pacientes_0A_por_comb.get(comb_paciente, 0) + 1

for comb in diagnoses:
    n_comb = pacientes_por_comb.get(comb, 0)
    if n_comb > 0:
        prob = pacientes_0A_por_comb.get(comb, 0) / n_comb

        # Store the probability and its complement
        probs_0A_diagnoses['0A'][comb] = prob
        probs_0A_diagnoses['-0A'][comb] = 1 - prob

In [None]:
# For each procedure string, count the non-null values in every other column.
# NOTE(review): procedure_count is not referenced again in the visible cells -
# confirm it is still needed.
procedure_count = new_df.groupby(['Procedure']).count()

In [None]:
# Display new_df.
new_df

Unnamed: 0,Case,Diagnosis,Procedure,LOS
0,13872110,"(E,)",0A,3
1,14035188,"(J,)",0A,3
2,14085514,"(S,)",0A,1
3,14111667,"(E,)",0A,3
4,14111831,"(E,)",0A,2
...,...,...,...,...
11946,15056499,"(G,)",4-5,1
11947,15056507,"(G,)",4,1
11948,15056517,"(G,)",4,1
11949,15057405,"(N, U)",0A,1


In [None]:
# Helper that looks up the interval a single value belongs to
def getInterval(dato, intervalos):
    """Return the first interval in `intervalos` that contains `dato`.

    Each interval is a (lower, upper) pair and both ends are inclusive, so a
    value sitting exactly on a shared edge is assigned to the earlier interval.
    Returns None when no interval contains `dato`.
    """
    candidatos = (par for par in intervalos if par[0] <= dato <= par[1])
    return next(candidatos, None)
        
def makeIntervals(bins):
    """Pair consecutive edges of `bins` into (lower, upper) tuples.

    N edges produce N-1 intervals; fewer than two edges produce an empty list.
    """
    return list(zip(bins[:-1], bins[1:]))

In [None]:
# Discretize LOS into intervals with a k-means based discretizer.
# (KBinsDiscretizer is imported in the notebook's import cell.)
normLOS = np.asarray(new_df['LOS'].astype(float)).reshape(-1, 1)
model = KBinsDiscretizer(n_bins=19, encode='ordinal', strategy='kmeans')

# Fit the discretizer on the LOS column (a single feature)
print(model.fit(normLOS))

# Edges learned for that single feature
bin_edges = model.bin_edges_[0]

# Turn consecutive edges into (lower, upper) intervals. `bins` now only ever
# holds the interval list, instead of first holding the raw edge array.
bins = makeIntervals(bin_edges)

# Label every patient with the interval that contains their LOS
binned_LOS = [getInterval(dato, bins) for dato in new_df['LOS']]

new_df['binnedLOS'] = binned_LOS

KBinsDiscretizer(encode='ordinal', n_bins=19, strategy='kmeans')


In [None]:
# Display new_df, which now includes the binnedLOS column.
new_df

Unnamed: 0,Case,Diagnosis,Procedure,LOS,binnedLOS
0,13872110,"(E,)",0A,3,"(0.8226329899812317, 3.724320317804913)"
1,14035188,"(J,)",0A,3,"(0.8226329899812317, 3.724320317804913)"
2,14085514,"(S,)",0A,1,"(0.8226329899812317, 3.724320317804913)"
3,14111667,"(E,)",0A,3,"(0.8226329899812317, 3.724320317804913)"
4,14111831,"(E,)",0A,2,"(0.8226329899812317, 3.724320317804913)"
...,...,...,...,...,...
11946,15056499,"(G,)",4-5,1,"(0.8226329899812317, 3.724320317804913)"
11947,15056507,"(G,)",4,1,"(0.8226329899812317, 3.724320317804913)"
11948,15056517,"(G,)",4,1,"(0.8226329899812317, 3.724320317804913)"
11949,15057405,"(N, U)",0A,1,"(0.8226329899812317, 3.724320317804913)"


In [None]:
# For each diagnosis combination, count the non-null values in every other column.
new_df.groupby(['Diagnosis']).count()

Unnamed: 0_level_0,Case,Procedure,LOS,binnedLOS
Diagnosis,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(A,)",25,25,25,25
"(A, B, K, U)",1,1,1,1
"(A, C, K, U)",1,1,1,1
"(A, C, U)",3,3,3,3
"(A, D)",1,1,1,1
...,...,...,...,...
"(T, U)",120,120,120,120
"(T, U, Z)",12,12,12,12
"(T, Z)",15,15,15,15
"(U, Z)",10,10,10,10


In [None]:
# Estimate the probability of each LOS interval given that procedure '0A' was
# performed, and given that it was not.

def _los_bin_distribution(subset, intervals):
    """Return {interval: share of rows of `subset` whose 'binnedLOS' equals it}.

    Every interval in `intervals` gets an entry (0.0 when it never occurs).
    An empty `subset` yields 0.0 for every interval instead of raising
    ZeroDivisionError.
    """
    n_rows = len(subset)
    counts = dict()
    for assigned in subset['binnedLOS']:
        counts[assigned] = counts.get(assigned, 0) + 1
    return {
        interval: (counts.get(interval, 0) / n_rows if n_rows else 0.0)
        for interval in intervals
    }


# Patients whose procedure is exactly '0A', and all remaining patients
OA_filter = new_df[new_df['Procedure'] == '0A']
notOA_filter = new_df[new_df['Procedure'] != '0A']
total_OA = len(OA_filter)
total_notOA = len(notOA_filter)

# Both tables come from the same helper; the two loops were copies of each
# other and the second one carried a comment describing the wrong condition.
probs_LOS_OA = {
    '0A': _los_bin_distribution(OA_filter, bins),
    '-0A': _los_bin_distribution(notOA_filter, bins),
}
    


In [None]:
# Bundle every estimated probability table under a descriptive key.
probs_data = {
    'diagnosis': probs_diagnosis,
    '0A|diagnosis': probs_0A_diagnoses,
    'LOS|0A': probs_LOS_OA,
}

In [None]:
def multiplyItems(lista):
    """Return the product of the elements of `lista`; an empty input gives 1."""
    producto = 1
    for factor in lista:
        producto *= factor
    return producto

In [None]:
def predictLOSBin(diagnosis, procedures, bins, probs_data):
    """Pick the LOS interval with the highest score for one patient.

    Parameters
    ----------
    diagnosis : iterable of str
        Diagnosis codes of the patient.
    procedures : iterable of str
        Procedure codes. They are joined with '-' and the result is compared
        with '0A', so only a patient whose joined procedures equal '0A' uses
        the '0A' tables; everything else uses the '-0A' tables.
    bins : object
        Not used by this function; kept so existing callers keep working.
    probs_data : dict
        Probability tables under the keys 'diagnosis', '0A|diagnosis' and
        'LOS|0A'.

    Returns
    -------
    interval or str
        The key of the selected interval in the 'LOS|0A' table. When a
        diagnosis code or the diagnosis combination has no entry in the
        tables, a message string describing the problem is returned instead,
        so callers must tell the two cases apart by type.

    Raises
    ------
    ValueError
        If the selected 'LOS|0A' table is empty.
    """
    # Running product of the prior of every diagnosis and, further down, the
    # procedure probability given the diagnosis combination.
    path = 1
    for d in diagnosis:
        if d not in probs_data['diagnosis']:
            return f"Diagnóstico {d} no encontrado"
        path *= probs_data['diagnosis'][d]

    # Choose the tables that match whether the procedure is exactly '0A'
    rama = '0A' if '-'.join(procedures) == '0A' else '-0A'
    probs_0A = probs_data['0A|diagnosis'][rama]
    probs_LOS = probs_data['LOS|0A'][rama]

    # Combinations are stored as sorted tuples
    _diagnosis = tuple(sorted(diagnosis))
    if _diagnosis not in probs_0A:
        return f"Para los diagnósticos {_diagnosis} no hay una probabilidad asociada"
    path *= probs_0A[_diagnosis]

    # Score of an interval = its conditional probability times `path`. `path`
    # is the same factor for every interval, so when it is 0 all scores tie;
    # ties are broken by the conditional probability itself (and then by table
    # order) rather than by always returning the first interval.
    def puntaje(intervalo):
        p = probs_LOS[intervalo]
        return (p * path, p)

    return max(probs_LOS, key=puntaje)
    

In [None]:
def getPrediction(df, bins, probs_data):
    """Run predictLOSBin on every row of `df` and return the results as a list.

    Reads the 'Diagnosis' and 'Procedure' columns of `df`; each procedure
    string is split on '-' before being handed to predictLOSBin. The returned
    list has one entry per row, in row order.
    """
    return [
        predictLOSBin(diagnosticos_fila, procedimiento_fila.split('-'), bins, probs_data)
        for diagnosticos_fila, procedimiento_fila in zip(df['Diagnosis'], df['Procedure'])
    ]
        
        
    
# Predict an interval for every row of new_df
prediction = getPrediction(new_df, bins, probs_data)

# To try the prediction on an arbitrary patient, uncomment the lines below.
# (This note is a comment rather than a bare triple-quoted string: as the last
# expression of the cell, the string was echoed as the cell's output.)
#diagnosticos = ['E', 'Z']
#procedures = ['0A']

#predictLOSBin(diagnosticos, procedures, bins, probs_data)

In [None]:
# Score the predictions: 1 for a hit, 0 for a miss. Rows for which the
# predictor returned a message string (no prediction available) are left out.
desempeño = list()
for intervalo_real, intervalo_predicho in zip(new_df['binnedLOS'], prediction):
    # Failed predictions are strings; checking the type is sturdier than
    # searching for the substring 'no' inside the value.
    if isinstance(intervalo_predicho, str):
        continue
    desempeño.append(1 if intervalo_real == intervalo_predicho else 0)

aciertos = sum(desempeño)
total = len(desempeño)

# Without any scored row the percentage is undefined; report NaN instead of
# raising ZeroDivisionError.
porcentaje = round((aciertos/total) * 100, 2) if total else float('nan')

print(f'Porcentaje de aciertos: {porcentaje} %')

Porcentaje de aciertos: 59.3 %


In [None]:
# Store the predictions as a new column of new_df (modifies new_df in place),
# then display the frame.
new_df['predicted_LOS_bin'] = prediction

new_df

Unnamed: 0,Case,Diagnosis,Procedure,LOS,binnedLOS,predicted_LOS_bin
0,13872110,"(E,)",0A,3,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
1,14035188,"(J,)",0A,3,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
2,14085514,"(S,)",0A,1,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
3,14111667,"(E,)",0A,3,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
4,14111831,"(E,)",0A,2,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
...,...,...,...,...,...,...
11946,15056499,"(G,)",4-5,1,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
11947,15056507,"(G,)",4,1,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
11948,15056517,"(G,)",4,1,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
11949,15057405,"(N, U)",0A,1,"(0.8226329899812317, 3.724320317804913)","(0.8226329899812317, 3.724320317804913)"
