In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency, norm
import math

from pathlib import Path

#from scipy.spatial.distance import yule

In [2]:
# Definição de diretórios e Disco de leitura dos arquivos (Desktop ou Notebook) 
config_dirs = open("prefixo_dados.txt").readlines() 
DIR_PREFIXO = Path(config_dirs[0].strip())

DIRETORIO_PRINCIPAL =  DIR_PREFIXO / "Projeto_PA" / "Projeto_PA_validado"

DIR_BASE = DIRETORIO_PRINCIPAL / "analise_abordagens"

### Funções de medidas de associações: Q de Yule, V Cramer e Kruskal Gamma

In [3]:
def comb(n,k):
    return int(math.factorial(n) / (math.factorial(k)*math.factorial(n - k)))

def yule(m):
    m = np.array(m)
    y = ((m[0][0]*m[1][1]) - (m[0][1]*m[1][0]))   /  ((m[0][0]*m[1][1]) + (m[0][1]*m[1][0]))
    if y < 0:
        return y, 'Associação inversa'
    else:
        if y >= 0 and y <= 0.2:
             return y, 'Associação direta - ausente'
        elif y > 0.2 and y <= 0.5:
             return y, 'Associação direta - pequena'
        elif y > 0.5 and y <= 0.8:
             return y, 'Associação direta - moderada'
        elif y > 0.8:
            return y, 'Associação direta - grande'

#Referência: ausente: [0-0,2]; pequena: (0,2-0,5]; moderada: (0,5-0,8]; grande: > 0,8.","\n",
def cramer_v(m):
    if min(m.shape) < 2:
        print('\n\nDados com dimensão menor que 2!\n\n')
        return 0
    
    chi2, p, dof, ex = chi2_contingency(m)
    v = (chi2/(m.sum().sum() * (min(m.shape) - 1))) ** (0.5)
    
    if v <= 0.2:
        return v, 'Associação Ausente'
    elif v <= 0.5:
        return v, 'Associação Pequena'
    elif v <= 0.8:
        return v, 'Associação Moderada'
    elif v > 0.8:
        return v, 'Associação Grande'
    
def kruskal_g(data):#, ordinal1, ordinal2, orderLabels1, orderLabels2=None):
# myCrosstable = pd.crosstab(data[ordinal1], data[ordinal2])
# 
# myCrosstable = myCrosstable.reindex(orderLabels1)
#         
# if orderLabels2 == None:
#     myCrosstable = myCrosstable[orderLabels1]
# else:
#     myCrosstable = myCrosstable[orderLabels2]
    myCrosstable = data
    nRows = myCrosstable.shape[0]
    nCols = myCrosstable.shape[1]
    
    
    C = [[0 for x in range(nCols)] for y in range(nRows)] 

    # top left part
    for i in range(nRows):
        for j in range(nCols):
            h = i-1
            k = j-1        
            if h>=0 and k>=0:            
                for p in range(h+1):
                    for q in range(k+1):
                        C[i][j] = C[i][j] + list(myCrosstable.iloc[p])[q]

    # bottom right part                    
    for i in range(nRows):
        for j in range(nCols):
            h = i+1
            k = j+1        
            if h<nRows and k<nCols:            
                for p in range(h, nRows):
                    for q in range(k, nCols):
                        C[i][j] = C[i][j] + list(myCrosstable.iloc[p])[q]
                        
    D = [[0 for x in range(nCols)] for y in range(nRows)] 

    # bottom left part
    for i in range(nRows):
        for j in range(nCols):
            h = i+1
            k = j-1        
            if h<nRows and k>=0:            
                for p in range(h, nRows):
                    for q in range(k+1):
                        D[i][j] = D[i][j] + list(myCrosstable.iloc[p])[q]

    # top right part                    
    for i in range(nRows):
        for j in range(nCols):
            h = i-1
            k = j+1        
            if h>=0 and k<nCols:            
                for p in range(h+1):
                    for q in range(k, nCols):
                        D[i][j] = D[i][j] + list(myCrosstable.iloc[p])[q]

    P = 0
    Q = 0
    for i in range(nRows):
        for j in range(nCols):
            P = P + C[i][j] * list(myCrosstable.iloc[i])[j]
            Q = Q + D[i][j] * list(myCrosstable.iloc[i])[j]
               
    GKgamma = (P - Q) / (P + Q)
    
#    if abs(GKgamma) < .10:
#        qual = 'Negligible'
#    elif abs(GKgamma) < .20:
#        qual = 'Weak'
#    elif abs(GKgamma) < .40:
#        qual = 'Moderate'
#    elif abs(GKgamma) < .60:
#        qual = 'Relatively strong'
#    elif abs(GKgamma) < .80:
#        qual = 'Strong'        
#    else:
#        qual = 'Very strong'

    if GKgamma < 0:
        qual = 'Associação Inversa'
    elif GKgamma <= .20:
        qual = 'Associação Direta - Ausente'
    elif GKgamma <= .50:
        qual = 'Associação Direta - Pequena'
    elif GKgamma <= .80:
        qual = 'Associação Direta - Moderada'
    elif GKgamma > .80:
        qual = 'Associação Direta - Grande'        

#    n = myCrosstable.sum().sum()
#    
#    Z1 = GKgamma * ((P + Q) / (n * (1 - GKgamma**2)))**0.5
#    
#    forASE0 = 0
#    forASE1 = 0
#    for i in range(nRows):
#        for j in range(nCols):
#            forASE0 = forASE0 + list(myCrosstable.iloc[i])[j] * (Q * C[i][j] - P * D[i][j])**2
#            forASE1 = forASE1 + list(myCrosstable.iloc[i])[j] * (C[i][j] - D[i][j])**2
#
#    ASE0 = 4 * (forASE0)**0.5 / (P + Q)**2
#    ASE1 = 2 * (forASE1 - (P - Q)**2 / n)**0.5 / (P + Q)        
#    Z2 = GKgamma / ASE0
#    Z3 = GKgamma / ASE1
#    
#    p1 = norm.sf(Z1)
#    p2 = norm.sf(Z2)
#    p3 = norm.sf(Z3)
#    
#    zvalues = [Z1] + [Z2] + [Z3]
#    pvalues = [p1] + [p2] + [p3]
            
    return (GKgamma,qual), 0 #zvalues, pvalues

In [4]:
def aplica_cramer(df, df_saida):
    n_linhas = comb(df.shape[1],2) + 1
    n_colunas = df.shape[1]
    
    lista_cramer_quant = list()
    lista_cramer_quali = list()
    
    resultado = cramer_v(df)
    
    lista_cramer_quant.append(np.round (resultado[0], 4))
    lista_cramer_quali.append(np.round (resultado[1], 4))
    
    for i in range(n_linhas - 1):
        lista_cramer_quant.append(np.nan)
        lista_cramer_quali.append(np.nan)
    
    df_saida['V de Cramer (Quant)'] = lista_cramer_quant
    df_saida['V de Cramer - Quali'] = lista_cramer_quali
    
    return df_saida

def aplica_yule(df, df_saida):
    #n_linhas = comb(df.shape[1],2) + 1
    n_colunas = df.shape[1]
    
    lista_yule_quant = list()
    lista_yule_quali = list()

    lista_yule_quant.append(np.nan)
    lista_yule_quali.append(np.nan)

    for cluster1 in range(1, n_colunas + 1):
        for cluster2 in range(cluster1 + 1, n_colunas + 1):
            resultado = yule(df[[f'cluster {cluster1}', f'cluster {cluster2}']])
            lista_yule_quant.append(resultado[0])
            lista_yule_quali.append(resultado[1])

    df_saida['Q de Yule - Quantitativo'] = lista_yule_quant
    df_saida['Q de Yule - Qualitativo'] = lista_yule_quali
    
    return df_saida

def aplica_kruskal(df, df_saida):
    #n_linhas = comb(df.shape[1],2) + 1
    n_colunas = df.shape[1]
    
    lista_kruskal_quant = list()
    lista_kruskal_quali = list()

    resultado = kruskal_g(df)[0]

    lista_kruskal_quant.append(resultado[0])
    lista_kruskal_quali.append(resultado[1])

    for cluster1 in range(1, n_colunas + 1):
        for cluster2 in range(cluster1 + 1, n_colunas + 1):
            resultado = kruskal_g(df[[f'cluster {cluster1}', f'cluster {cluster2}']])[0]
            lista_kruskal_quant.append(resultado[0])
            lista_kruskal_quali.append(resultado[1])

    df_saida['Gamma de Kruskal - Quantitativo'] = lista_kruskal_quant
    df_saida['Gamma de Kruskal - Qualitativo'] = lista_kruskal_quali
    
    return df_saida

def analisa_genero(df):
    n_colunas = df.shape[1]
    
    df_saida = pd.DataFrame()

    lista = ['Todos os Clusters']
    for cluster1 in range(1, n_colunas + 1):
        for cluster2 in range(cluster1 + 1, n_colunas + 1):
            lista.append(f'Cluster {cluster1} e {cluster2}')
    df_saida['SEXO'] = lista
    
    df_saida = aplica_cramer(df, df_saida)
    
    df_saida = aplica_yule(df, df_saida)
    
    df_saida = aplica_kruskal(df, df_saida)
    
    return df_saida

def analisa_idade(df):
    n_colunas = df.shape[1]
    
    df_saida = pd.DataFrame()

    lista = ['Todos os Clusters']
    for cluster1 in range(1, n_colunas + 1):
        for cluster2 in range(cluster1 + 1, n_colunas + 1):
            lista.append(f'Cluster {cluster1} e {cluster2}')
    
    df_saida['IDADE'] = lista
    
    df_saida = aplica_kruskal(df, df_saida)
    
    return df_saida


def analisa_imc(df):
    n_colunas = df.shape[1]
    
    df_saida = pd.DataFrame()

    lista = ['Todos os Clusters']
    for cluster1 in range(1, n_colunas + 1):
        for cluster2 in range(cluster1 + 1, n_colunas + 1):
            lista.append(f'Cluster {cluster1} e {cluster2}')
    
    df_saida['IMC'] = lista
    
    df_saida = aplica_kruskal(df, df_saida)
    
    return df_saida

def analisa_doencas(df):
    n_colunas = df.shape[1]
    
    df_saida = pd.DataFrame()

    lista = ['Todos os Clusters']
    for cluster1 in range(1, n_colunas + 1):
        for cluster2 in range(cluster1 + 1, n_colunas + 1):
            lista.append(f'Cluster {cluster1} e {cluster2}')
    
    df_saida['PADRÕES'] = lista
    
    df_saida = aplica_cramer(df, df_saida)
    
    return df_saida

In [5]:
#caminho = DIRETORIO_PRINCIPAL / DIR_KMEANS
#caminho
#diretorios = {'dict':DIR_DICT,'kmeans':DIR_KMEANS, 'nmf':DIR_NMF}

nome_abordagem = {'cadastro':"_qtd_approach_cadastro", 'doenca':"_qtd_approach_doenca",
                  'medidas':"_qtd_approach_medidas",'estatistica':"_qtd_approach_estatistica",
                 'escolho_doenca':"_qtd_approach_escolho_doenca", 'categorias':"_qtd_approach_categorias"}

In [6]:
#DIR_DICT / f"Dict_approach_doenca
#_grupo_{grupo}.csv", 

# Análise é uma opção do dicionário criado na célula acima
algoritmo = input("Informe a análise ")

#KMEANS_qtd_approach_categorias.csv
caminho_medidas = DIR_BASE / f"{algoritmo.upper()}{nome_abordagem['medidas']}.csv"
caminho_estatistica = DIR_BASE / f"{algoritmo.upper()}{nome_abordagem['estatistica']}.csv"
caminho_doenca = DIR_BASE / f"{algoritmo.upper( )}{nome_abordagem['doenca']}.csv"
caminho_cadastro = DIR_BASE / f"{algoritmo.upper()}{nome_abordagem['cadastro']}.csv"
caminho_escolho_doenca = DIR_BASE / f"{algoritmo.upper()}{nome_abordagem['escolho_doenca']}.csv"
caminho_categorias = DIR_BASE / f"{algoritmo.upper()}{nome_abordagem['categorias']}.csv"


Informe a análise kmeans


In [7]:

df_categorias = pd.read_csv(caminho_categorias, sep=";", encoding='latin1')
df_categorias.index = df_categorias['Cluster'].values
df_categorias = df_categorias.drop(['Cluster'], axis = 1)
df_categorias

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
QTD Pacientes,2229.00,1734.00,3077.00,4480.00,2601.00,2697.00,2424.00,1810.00
QTD Pacientes %,10.59,8.24,14.62,21.28,12.36,12.81,11.51,8.60
Masculino,834.00,693.00,1577.00,2127.00,1516.00,1057.00,956.00,624.00
Masculino %,37.42,39.97,51.25,47.48,58.29,39.19,39.44,34.48
Feminino,1395.00,1041.00,1500.00,2353.00,1085.00,1640.00,1468.00,1186.00
...,...,...,...,...,...,...,...,...
Masked %,0.22,0.00,0.00,0.25,0.12,0.22,0.25,0.11
Whitecoat,1590.00,1234.00,2014.00,2888.00,1892.00,1838.00,1723.00,1272.00
Whitecoat %,71.33,71.16,65.45,64.46,72.74,68.15,71.08,70.28
Normotenso,2.00,0.00,0.00,2.00,0.00,2.00,1.00,1.00


In [8]:
df_genero = df_categorias.loc[['Masculino','Feminino'], :]
df_genero


Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
Masculino,834.0,693.0,1577.0,2127.0,1516.0,1057.0,956.0,624.0
Feminino,1395.0,1041.0,1500.0,2353.0,1085.0,1640.0,1468.0,1186.0


In [9]:
df_idade = df_categorias.loc[['Jovens','Adultos','Meia-idade','Idoso','Ancião','Velhice extrema'], :]
df_idade

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
Jovens,0.0,0.0,29.0,92.0,37.0,0.0,0.0,0.0
Adultos,0.0,0.0,1513.0,2170.0,1795.0,0.0,0.0,0.0
Meia-idade,430.0,0.0,1535.0,2218.0,769.0,0.0,1045.0,0.0
Idoso,1076.0,1277.0,0.0,0.0,0.0,1919.0,1155.0,1131.0
Ancião,682.0,437.0,0.0,0.0,0.0,743.0,223.0,632.0
Velhice extrema,41.0,20.0,0.0,0.0,0.0,35.0,1.0,47.0


In [10]:
df_doencas = df_categorias.loc[['Sistolica Isolada','Diastolica Isolada','Dipping','Non Dipping','Extreme Dipping','Reverse Dipping','Morning Surge','Masked','Whitecoat','Normotenso'], :]
df_doencas


Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
Sistolica Isolada,5.0,0.0,1.0,5.0,1.0,7.0,4.0,5.0
Diastolica Isolada,5.0,0.0,4.0,40.0,12.0,9.0,11.0,3.0
Dipping,0.0,1734.0,3077.0,0.0,1312.0,0.0,198.0,11.0
Non Dipping,0.0,0.0,0.0,3530.0,876.0,2694.0,1627.0,1070.0
Extreme Dipping,314.0,0.0,0.0,657.0,222.0,3.0,123.0,163.0
Reverse Dipping,1915.0,0.0,0.0,292.0,191.0,0.0,476.0,566.0
Morning Surge,0.0,1199.0,1432.0,0.0,642.0,0.0,54.0,1.0
Masked,5.0,0.0,0.0,11.0,3.0,6.0,6.0,2.0
Whitecoat,1590.0,1234.0,2014.0,2888.0,1892.0,1838.0,1723.0,1272.0
Normotenso,2.0,0.0,0.0,2.0,0.0,2.0,1.0,1.0


In [11]:
df_imc = df_categorias.loc[['Abaixo do peso','Peso Normal','Sobrepeso','Obesidade grau 1','Obesidade grau 2','Obesidade Grau 3'], :]
df_imc

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
Abaixo do peso,27.0,25.0,30.0,44.0,0.0,31.0,0.0,61.0
Peso Normal,884.0,636.0,1145.0,1682.0,0.0,1085.0,0.0,1749.0
Sobrepeso,1217.0,847.0,1902.0,2754.0,56.0,1576.0,54.0,0.0
Obesidade grau 1,101.0,226.0,0.0,0.0,1666.0,5.0,1804.0,0.0
Obesidade grau 2,0.0,0.0,0.0,0.0,603.0,0.0,475.0,0.0
Obesidade Grau 3,0.0,0.0,0.0,0.0,276.0,0.0,91.0,0.0


In [12]:
df_genero

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
Masculino,834.0,693.0,1577.0,2127.0,1516.0,1057.0,956.0,624.0
Feminino,1395.0,1041.0,1500.0,2353.0,1085.0,1640.0,1468.0,1186.0


In [13]:
df_imc

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
Abaixo do peso,27.0,25.0,30.0,44.0,0.0,31.0,0.0,61.0
Peso Normal,884.0,636.0,1145.0,1682.0,0.0,1085.0,0.0,1749.0
Sobrepeso,1217.0,847.0,1902.0,2754.0,56.0,1576.0,54.0,0.0
Obesidade grau 1,101.0,226.0,0.0,0.0,1666.0,5.0,1804.0,0.0
Obesidade grau 2,0.0,0.0,0.0,0.0,603.0,0.0,475.0,0.0
Obesidade Grau 3,0.0,0.0,0.0,0.0,276.0,0.0,91.0,0.0


In [14]:
df_idade

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5,cluster 6,cluster 7,cluster 8
Jovens,0.0,0.0,29.0,92.0,37.0,0.0,0.0,0.0
Adultos,0.0,0.0,1513.0,2170.0,1795.0,0.0,0.0,0.0
Meia-idade,430.0,0.0,1535.0,2218.0,769.0,0.0,1045.0,0.0
Idoso,1076.0,1277.0,0.0,0.0,0.0,1919.0,1155.0,1131.0
Ancião,682.0,437.0,0.0,0.0,0.0,743.0,223.0,632.0
Velhice extrema,41.0,20.0,0.0,0.0,0.0,35.0,1.0,47.0


### Análise por GÊNERO

In [15]:
saida_genero = analisa_genero(df_genero)
saida_genero

#  grava arquivo da abordagem por GÊNERO

saida_genero.to_csv(DIR_BASE / "correlacao" / f"{algoritmo.upper()}_analise_sexo.csv", 
                           sep=";", encoding="latin1", index=False)

UFuncTypeError: ufunc 'multiply' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')

### Análise por IDADE

In [None]:
saida_idade = analisa_idade(df_idade)
saida_idade

#  grava arquivo da abordagem por IDADE

saida_genero.to_csv(DIR_BASE / "correlacao" / f"{algoritmo.upper()}_analise_idade.csv", 
                           sep=";", encoding="latin1", index=False)

### Análise por DOENÇA

In [None]:
saida_doencas = analisa_doencas(df_doencas)
saida_doencas

#  grava arquivo da abordagem por PADRÂO DE DOENÇA

saida_genero.to_csv(DIR_BASE / "correlacao" / f"{algoritmo.upper()}_analise_doenca.csv", 
                           sep=";", encoding="latin1", index=False)

### Análise por IMC

In [None]:
saida_imc = analisa_imc(df_imc)
saida_imc

#  grava arquivo da abordagem por IMC

saida_genero.to_csv(DIR_BASE / "correlacao" / f"{algoritmo.upper()}_analise_imc.csv", 
                           sep=";", encoding="latin1", index=False)

In [None]:
# 3 Abordagens

abordagem_cadastro    = list()
abordagem_medidas     = list()
abordagem_estatistica = list()
abordagem_doenca      = list()
abordagem_escolho_doenca = list()
abordagem_categorias  = list()

try:
    grupo_leitura = 1
    while 1:

        abordagem_estatistica.append(pd.read_csv(f"{caminho_estatistica}_grupo_{grupo_leitura}.csv", sep=";", encoding='latin1'))
        grupo_leitura += 1   
except:
    pass

try:
    grupo_leitura = 1
    while 1:

        abordagem_medidas.append(pd.read_csv(f"{caminho_medidas}_grupo_{grupo_leitura}.csv", sep=";", encoding='latin1'))
        grupo_leitura += 1

except:
    pass

try:
    grupo_leitura = 1
    while 1:

        abordagem_doenca.append(pd.read_csv(f"{caminho_doenca}_grupo_{grupo_leitura}.csv", sep=";", encoding='latin1'))
        grupo_leitura += 1
except:
    pass

try:
    grupo_leitura = 1
    while 1:

        abordagem_cadastro.append(pd.read_csv(f"{caminho_cadastro}_grupo_{grupo_leitura}.csv", sep=";", encoding='latin1'))
        grupo_leitura += 1

except:
    pass

try:
    grupo_leitura = 1
    while 1:

        abordagem_escolho_doenca.append(pd.read_csv(f"{caminho_escolho_doenca}_grupo_{grupo_leitura}.csv", sep=";", encoding='latin1'))
        grupo_leitura += 1

except:
    pass

try:
    grupo_leitura = 1
    while 1:

        abordagem_categorias.append(pd.read_csv(f"{caminho_categorias}_grupo_{grupo_leitura}.csv", sep=";", encoding='latin1'))
        grupo_leitura += 1

except:
    pass

Genero
---
>V de Cramer - Todo o conjunto

>Q de Yule - Pares

Idade
---
>Kruskal Gamma - Todo o conjunto


>Kruskal Gamma - Pares

IMC
---
>Kruskal Gamma - Todo o conjunto

>Kruskal Gamma - Pares