### Este script separa o arquivo xlsx com as anotações por região dos avaliadores em dois arquivos CSV: um contendo o consenso dos avaliadores sobre se a região está visível e outro com o consenso sobre se a característica da região está presente.

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

# Do not truncate wide dataframes when they are displayed
pd.set_option('display.max_columns', None)
# Opt in to the future pandas behavior in which operations such as `replace`
# no longer silently downcast the resulting dtype
pd.set_option("future.no_silent_downcasting", True)

In [2]:
# Input spreadsheet with the per-region annotations of every evaluator
XLSX_PATH = r'E:\Pedro\Faculdade\FEI-Projeto_Dor\src\Tatiany-Regioes\data\occlusion\Avaliadores_aparato-Modificado.xlsx'

# Output CSV files: one for the "region is visible" consensus and one for the
# "region characteristic is present" consensus
OUTPUT_VISIBLE = r'E:\Pedro\Faculdade\FEI-Projeto_Dor\src\Tatiany-Regioes\data\occlusion\Avaliadores_aparato-consenso-visivel.csv'
OUTPUT_PRESENT = r'E:\Pedro\Faculdade\FEI-Projeto_Dor\src\Tatiany-Regioes\data\occlusion\Avaliadores_aparato-consenso-presente.csv'

In [3]:
# Load the evaluators' annotation spreadsheet into a dataframe
data = pd.read_excel(XLSX_PATH)

In [4]:
# Preview the first five rows of the sheet as loaded
data.head(5)

Unnamed: 0.1,Unnamed: 0,Punção,Unnamed: 2,BA_R,BE_R,FP_R,FS_R,SN_R,Unnamed: 8,BA_AS,BE_AS,FP_AS,FS_AS,SN_AS,Unnamed: 14,BA_F,BE_F,FP_F,FS_F,SN_F,Unnamed: 20,BA_E,BE_E,FP_E,FS_E,SN_E
0,1_20190322_152828,Sem DOR,,VA,VA,NV,VA,NV,,VA,VP,NV,VA,VA,,VA,VA,NV,VA,VA,,VA,VA,NV,VA,VA
1,1_20190322_152912,Sem DOR,,VA,VA,NV,VP,NV,,VA,VA,NV,VA,NV,,VA,VA,NV,VA,VA,,VA,VA,NV,VA,VA
2,1_20190322_152946,Com DOR,,VP,VP,NV,VP,NV,,VP,VP,NV,VP,NV,,VP,VP,NV,VA,NV,,VP,VP,NV,VP,VP
3,1_20190322_152950,Com DOR,,VP,VP,NV,VP,NV,,VP,VP,NV,VP,NV,,VP,VA,NV,VA,NV,,VP,VP,NV,VP,VP
4,2_20190327_061654,Sem DOR,,NV,NV,VA,VA,VA,,NV,NV,VA,VA,NV,,NV,NV,VA,VA,NV,,VA,VA,VA,VA,VP


In [5]:
# Use the image identifier column as the row index
data = data.set_index('Unnamed: 0')
data.index.name = 'image_id'
# Discard the empty spacer columns that separate the evaluators in the sheet
spacer_columns = ['Unnamed: 2', 'Unnamed: 8', 'Unnamed: 14', 'Unnamed: 20']
data = data.drop(columns=spacer_columns)
# Encode the 'Punção' labels as integers: 'Sem DOR' -> 0, 'Com DOR' -> 1
data['Punção'] = data['Punção'].replace({'Sem DOR': 0, 'Com DOR': 1})

In [6]:
# Preview the first five rows after re-indexing and cleaning
data.head(5)

Unnamed: 0_level_0,Punção,BA_R,BE_R,FP_R,FS_R,SN_R,BA_AS,BE_AS,FP_AS,FS_AS,SN_AS,BA_F,BE_F,FP_F,FS_F,SN_F,BA_E,BE_E,FP_E,FS_E,SN_E
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_20190322_152828,0,VA,VA,NV,VA,NV,VA,VP,NV,VA,VA,VA,VA,NV,VA,VA,VA,VA,NV,VA,VA
1_20190322_152912,0,VA,VA,NV,VP,NV,VA,VA,NV,VA,NV,VA,VA,NV,VA,VA,VA,VA,NV,VA,VA
1_20190322_152946,1,VP,VP,NV,VP,NV,VP,VP,NV,VP,NV,VP,VP,NV,VA,NV,VP,VP,NV,VP,VP
1_20190322_152950,1,VP,VP,NV,VP,NV,VP,VP,NV,VP,NV,VP,VA,NV,VA,NV,VP,VP,NV,VP,VP
2_20190327_061654,0,NV,NV,VA,VA,VA,NV,NV,VA,VA,NV,NV,NV,VA,VA,NV,VA,VA,VA,VA,VP


In [7]:
# Detach the target column from the annotations: `pop` returns the column
# and removes it from `data` in a single step
dor_series = data.pop('Punção')

In [8]:
# Split the main dataframe into sub dataframes, one for each region, each
# containing the opinion of every evaluator for that region. Columns are
# picked by their '<REGION>_' name prefix instead of by position, so the split
# does not silently mix regions if the column order of the sheet changes.
cols = data.columns.to_numpy()


def _region_columns(columns, region):
    """Return the entries of ``columns`` whose name starts with ``'<region>_'``.

    ``columns`` is a 1-D numpy array of column labels; the result is a numpy
    array with the matching labels in their original order.
    """
    prefix = region + '_'
    mask = np.array([str(name).startswith(prefix) for name in columns], dtype=bool)
    return columns[mask]


BA_cols = _region_columns(cols, 'BA')
BE_cols = _region_columns(cols, 'BE')
FP_cols = _region_columns(cols, 'FP')
FS_cols = _region_columns(cols, 'FS')
SN_cols = _region_columns(cols, 'SN')

BA_df = data[BA_cols]
BE_df = data[BE_cols]
FP_df = data[FP_cols]
FS_df = data[FS_cols]
SN_df = data[SN_cols]

In [9]:
def replace_str_visible(x):
    """Convert an evaluator label into a visibility flag.

    Returns 0 when the label is 'NV' and 1 for any other value.
    """
    return 1 if x != 'NV' else 0
def replace_str_present(x):
    """Convert an evaluator label into its integer code.

    'NV' maps to -1, 'VA' maps to 0 and 'VP' maps to 1. Any other label
    raises ``KeyError``.
    """
    label_codes = {'VP': 1, 'VA': 0, 'NV': -1}
    return label_codes[x]
def get_most_frequent(arr):
    """Return the most frequent integer value of a numpy array.

    The array is cast to ``int`` before counting. When several values share
    the highest count, the tie is resolved by priority: -1 first, then 1,
    and otherwise the smallest of the tied values.
    """
    values, freqs = np.unique(arr.astype(int), return_counts=True)
    winners = values[freqs == freqs.max()]
    if len(winners) == 1:
        return winners[0]
    # Tie: check the prioritized values in order
    for preferred in (-1, 1):
        if preferred in winners:
            return preferred
    # np.unique returns sorted values, so this is the smallest tied value
    return winners[0]

# Element-wise versions of the label converters. `otypes` is set explicitly so
# the results are plain integer arrays (instead of object arrays) and the
# output dtype does not have to be inferred from the first element.
replace_str_visible_vect = np.vectorize(replace_str_visible, otypes=[int])
replace_str_present_vect = np.vectorize(replace_str_present, otypes=[int])

# Minimum number of agreeing evaluators required for a positive consensus
MIN_VOTES = 2

# Per-region consensus arrays, keyed by region name (one entry per image)
visible_consensus = {}
present_consensus = {}

indexes = []

for region_df, region_name in [(BA_df, 'BA'), (BE_df, 'BE'), (FP_df, 'FP'), (FS_df, 'FS'), (SN_df, 'SN')]:
    # Rows are images, columns are the evaluators' labels for this region
    labels = region_df.to_numpy()
    visible_matrix = replace_str_visible_vect(labels)  # 0 for 'NV', 1 otherwise
    present_matrix = replace_str_present_vect(labels)  # -1 'NV', 0 'VA', 1 'VP'

    # Visible when at least MIN_VOTES evaluators did not answer 'NV'
    visible_votes = visible_matrix.sum(axis=1)
    visible_consensus[region_name] = (visible_votes >= MIN_VOTES).astype(int)

    # Present when at least MIN_VOTES evaluators answered 'VP'
    present_votes = (present_matrix == 1).sum(axis=1)
    region_present = (present_votes >= MIN_VOTES).astype(int)
    # A region without a "visible" consensus is marked -1 instead of 0/1
    region_present[visible_consensus[region_name] == 0] = -1
    present_consensus[region_name] = region_present

    indexes.append(region_df.index)

# Sanity check: every region dataframe must describe the same images in the same order
assert all(np.array_equal(idx_arr, indexes[0]) for idx_arr in indexes), \
    "region dataframes do not share the same index"

# Use the validated shared index rather than the loop variable left over from the loop
consensus_index = indexes[0]
visible_consensus_df = pd.DataFrame(visible_consensus, index=consensus_index)
present_consensus_df = pd.DataFrame(present_consensus, index=consensus_index)

visible_consensus_df.to_csv(OUTPUT_VISIBLE)
present_consensus_df.to_csv(OUTPUT_PRESENT)

In [10]:
# Print, for each region, how many images have a "visible" consensus
for region in visible_consensus_df.columns:
    visible_total = visible_consensus_df[region].sum()
    print(region, visible_total)

BA 39
BE 38
FP 44
FS 44
SN 29


In [11]:
# Print, for each region, how many images are not marked -1 (not visible) in
# the presence consensus; this should match the visible totals. Counting the
# entries directly avoids the label lookup `value_counts()[-1]`, which raises
# KeyError when a region has no -1 entry at all.
for col in present_consensus_df:
    print(col, (present_consensus_df[col] != -1).sum())

BA 39
BE 38
FP 44
FS 44
SN 29


In [12]:
# Number of images where region SN has a "visible" consensus
print(visible_consensus['SN'].sum())
# Number of images where region SN has a "present" consensus (value 1)
print((present_consensus['SN'] > 0).sum())

29
11


In [13]:
# Boolean mask of the images where region SN is not marked -1 (not visible)
present_consensus['SN'] > -1

array([ True,  True, False, False,  True,  True,  True,  True,  True,
        True,  True,  True, False, False, False, False,  True,  True,
        True,  True,  True,  True,  True,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True,  True])