## This script converts the UNIFESP dataset labels from 2 files: 
- RN.csv (photo id x RN id containing img file names);
- GT.csv (photo_region x rn id containing the actual label);

## To a single file (Region x Image id containing labels)

In [1]:
import pandas as pd

from pathlib import Path

In [2]:
# Face region name -> label code(s) used as column suffixes in GT.csv.
# Not referenced by the other cells visible in this notebook.
regions_map = dict(
    nasolabial_fold=['SN'],
    forehead=['FS'],
    palpebral_fissure=['FP'],
    mouth=['BA', 'BE'],
)

# Input tables and the destination of the merged label file.
rn_ids_file_path = Path('/home/phdomingues/masters/data/UNIFESP/RN.csv')
rn_labels_file_path = Path('/home/phdomingues/masters/data/UNIFESP/GT.csv')
output_file_path = Path('/home/phdomingues/masters/data/UNIFESP/NEW_GT.csv')

# Both tables are indexed by the 'RN' column. RN.csv is ';'-separated,
# while GT.csv is read with the default ',' separator.
rn_labels = pd.read_csv(rn_labels_file_path, index_col='RN')
rn_ids = pd.read_csv(rn_ids_file_path, index_col='RN', sep=';')

In [3]:
# Preview the first 5 rows of the RN-indexed table of image file names.
rn_ids.head(5)

Unnamed: 0_level_0,RH,DATA,Foto 1,Foto 2,Foto 3,Foto 4,Foto 5,Foto 6,Foto 7,Foto 8,Foto 9,Foto 10,Foto 11,Foto 12
RN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,10367366,41432,01_113538,01_113607,01_113730,01_113852,01_114251,01_114118,01_113814,01_113837,01_113719,01_113726,01_114413,01_114056
2,1189051,41435,02_114200,02_114225,02_114142,02_114200_2,02_114225_2,02_114042,02_114200_3,02_114225_3,02_114232,02_114411,02_114041,02_114042_2
3,10375851,41467,03_110950,03_111010,03_111021,03_111028,03_111120,03_111130,03_111159,03_111418,03_111426,03_111434,03_111503,03_111612
4,10364739,41467,04_120103,04_120125,04_120143,04_120159,04_120228,04_120313,04_120336,04_120416,04_113302_1,04_113302_2,04_113302_3,04_115047
5,10376181,41471,05_105631,05_105803,05_110009,05_110045,05_110222,05_110248,05_105759,05_110014,05_110159,05_105829,05_105939,05_110026


In [4]:
# Preview the first 5 rows of the RN-indexed label table
# (one '<photo>_<label code>' column per photo and label).
rn_labels.head(5)

Unnamed: 0_level_0,Foto 1_FS,Foto 1_FP,Foto 1_SN,Foto 1_BA,Foto 1_BE,Foto 2_FS,Foto 2_FP,Foto 2_SN,Foto 2_BA,Foto 2_BE,...,Foto 11_FS,Foto 11_FP,Foto 11_SN,Foto 11_BA,Foto 11_BE,Foto 12_FS,Foto 12_FP,Foto 12_SN,Foto 12_BA,Foto 12_BE
RN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,1,1,1,1,0,0,0,0,0,...,1,1,1,1,1,0,1,1,1,0
2,1,0,0,1,0,1,0,0,1,1,...,1,1,1,1,1,1,1,1,1,1
3,0,1,0,0,0,0,1,0,0,0,...,1,1,1,1,1,1,1,1,1,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,1,1,1,1,...,0,0,0,0,0,0,0,1,1,0


In [5]:
def img2rn(image_id, rn_df):
    """Locate an image id inside the RN table.

    Args:
        image_id: Value to look for in the cells of ``rn_df``.
        rn_df: DataFrame whose index holds the RN ids and whose columns
            hold the photo slots.

    Returns:
        Tuple ``(rn_id, photo_id)``: the index label of the first row and
        the name of the first column in which ``image_id`` appears.

    Raises:
        IndexError: If ``image_id`` is not present in ``rn_df``.
    """
    columns_with_image = rn_df.columns[rn_df.isin([image_id]).any()]
    # Fail with a message naming the id instead of the bare
    # "index 0 is out of bounds" that indexing an empty Index produces.
    if len(columns_with_image) == 0:
        raise IndexError(f'image id {image_id!r} not found in any column of rn_df')
    photo_id = columns_with_image[0]

    matching_rows = rn_df.loc[rn_df[photo_id] == image_id].index
    if len(matching_rows) == 0:
        raise IndexError(f'image id {image_id!r} not found in column {photo_id!r} of rn_df')
    return matching_rows[0], photo_id

In [6]:
# Flattened (row-major) view of every image id; kept for backward
# compatibility with cells that may still reference it.
rn_ids_list = rn_ids.iloc[:,2:].values.reshape(-1)

# Photo columns only: the first two columns of rn_ids are skipped.
photo_columns = rn_ids.columns[2:]

images = []
labels = {
    'SN': [],
    'FS': [],
    'FP': [],
    'BA': [],
    'BE': []}

# Walk the table cell by cell, in the same row-major order as rn_ids_list.
# The RN id and photo column of each image are known from its position, so
# there is no per-image search of the whole table, and an image id that
# appears more than once gets the labels of its own cell rather than those
# of its first occurrence.
for rn_id, photos in rn_ids[photo_columns].iterrows():
    for photo_id, img_id in photos.items():
        # Empty cells have no image to label.
        if pd.isna(img_id):
            continue
        images.append(img_id)
        for label in labels:
            labels[label].append(rn_labels.loc[rn_id, f'{photo_id}_{label}'])

## Add the synthetic data to the output df

In [7]:
import json
import numpy as np

# JSON metadata file holding the labels of the synthetic images.
syntetic_data_labels_path = Path('/home/phdomingues/masters/data/UNIFESP/syntetic/metadataSyn.json')

In [8]:
# Parse the synthetic-data metadata. The encoding is pinned to UTF-8 (the
# JSON interchange encoding) so the result does not depend on the locale.
with syntetic_data_labels_path.open('r', encoding='utf-8') as f:
    syntetic_labels_data = json.load(f)

In [9]:
# Inspect the order of the NFCS face-part names. Each professional's NFCS
# vector is later zipped against this list, so the position of each name
# determines which label column a score lands in.
syntetic_labels_data['info']['NFCS']

['Brow bulge',
 'Eye squeeze',
 'Deepening of nasolabial furrow',
 'Open lips',
 'Mouth stretch (horizontal or vertical)']

In [10]:
# Translation table: NFCS metric name used by the synthetic metadata ->
# label code used by the UNIFESP label columns.
syntetic2unifesp_labels = dict((
    ('Brow bulge', 'FS'),  # FS = "Fronte Saliente" (bulging brow)
    ('Eye squeeze', 'FP'),  # FP = "Fenda Palpebral estreitada" (narrowed palpebral fissure)
    ('Deepening of nasolabial furrow', 'SN'),  # SN = "Sulco Nasolabial aprofundado" (deepened nasolabial furrow)
    ('Open lips', 'BA'),  # BA = "Boca Aberta" (open mouth)
    ('Mouth stretch (horizontal or vertical)', 'BE'),  # BE = "Boca Estirada" (stretched mouth)
))

In [11]:
# Cast every id collected so far to str so ids keep their leading zeros
# (e.g. '000004' must not turn into '4') and compare consistently below.
images = list(map(str, images))
# Set mirror of `images` for O(1) duplicate checks; the list keeps the order.
seen_images = set(images)
# Metric names, in the same order as each professional's NFCS vector.
nfcs_names = syntetic_labels_data['info']['NFCS']

for s_img in syntetic_labels_data['images']:
    # Read the image id, that is also the image file name. Cast to str so it
    # is comparable with the ids above (a no-op when it already is a string).
    img_id = str(s_img['image_id'])
    # Check if the image has already been added
    if img_id in seen_images:
        print(f'Image label already annotated for {img_id}')
        continue

    # Get all evaluations as a matrix (1 row = 1 professional / 1 column = 1 NFCS metric)
    nfcs_evaluation_ = np.array([
        professional_data['NFCS']
        for professional_data in s_img['data']['health_professionals'].values()
    ])
    # Column-wise median, cast to integer (the median is a float).
    # NOTE(review): this is a majority vote only for an odd number of raters
    # with 0/1 scores; a tie would give 0.5, which the uint8 cast truncates
    # to 0 - confirm every image has an odd number of evaluations.
    nfcs_evaluation = np.median(nfcs_evaluation_, axis=0).astype(np.uint8)

    images.append(img_id)
    seen_images.add(img_id)
    for label_syntetic, label in zip(nfcs_names, nfcs_evaluation):
        region = syntetic2unifesp_labels[label_syntetic]
        labels[region].append(label)

## Save the dataframe as CSV

In [16]:
# One row per image id, one column per label code; the index is named
# 'image_id' so it becomes the header of the first CSV column.
output_df = pd.DataFrame(labels, index=images).rename_axis('image_id')

In [17]:
# Write the merged labels to disk; index=True keeps the image ids as the
# first column of the CSV.
output_df.to_csv(output_file_path, index=True)

In [18]:
# Display the final table for a visual sanity check.
output_df

Unnamed: 0_level_0,SN,FS,FP,BA,BE
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
01_113538,1,1,1,1,1
01_113607,0,0,0,0,0
01_113730,1,1,1,1,1
01_113852,1,1,1,1,1
01_114251,1,1,1,0,0
...,...,...,...,...,...
49915,0,0,0,0,0
49971,1,0,1,0,0
49980,1,0,1,1,1
49982,0,0,1,0,0
