In [1]:
from sklearn.metrics import confusion_matrix
from tqdm.notebook import tqdm as tqdm
from datetime import datetime

from Dental_Tool.Data_processing import *
from Dental_Tool.Dental_Model import *
from Dental_Tool.Process_results import *
from Dental_Tool.Dataloader import *

import matplotlib.pyplot as plt
import numpy as np
import keras
import time

Using TensorFlow backend.


In [2]:
# Label files to load: every dataset directory keeps its labels in a
# ``mapping.json`` file directly below the directory root.
directory = [
    f"{dataset_dir}/mapping.json"
    for dataset_dir in (
        "Dental_Data/PBL/10_interdental_20201015_multilabel_max4",
        "Dental_Data/PBL/10_interdental_20201015_multilabel_max4_flip",
        "Dental_Data/PBL/10_interdental_20201015_clahe_multilabel_max4",
        "Dental_Data/PBL/10_interdental_20201015_clahe_multilabel_max4_flip",
    )
]

# Number of rows that belong to one original sample: one per mapping file
# times 20.
# NOTE(review): 20 is presumably the number of rotation angles stored per
# image -- confirm against the augmentation script.
argscale_num = 20 * len(directory)

In [3]:
def load_json(data_list, interdental=False):
    """Load several JSON mapping files and interleave their entries.

    Parameters
    ----------
    data_list : iterable of str
        Paths of the JSON files. Each file must contain a JSON object
        (``path -> label``).
    interdental : bool, default False
        * ``False``: every label is treated as an iterable of int-like values;
          it is replaced by its maximum (as ``int``) and entries whose maximum
          is negative are dropped.
        * ``True``: entries are kept exactly as stored in the file.

    Returns
    -------
    collections.OrderedDict
        Entries in interleaved order: the first key of every file (in
        ``data_list`` order), then the second key of every file, and so on.

    Notes
    -----
    Interleaving uses ``zip``, so it stops at the file with the fewest
    (remaining) entries; surplus entries of longer files are silently ignored.
    If the same key occurs in several files the value seen last wins.
    """
    per_file_entries = []
    for dataset_path in data_list:
        # Context manager so the file handle is closed deterministically
        # (the handle used to be left for the garbage collector).
        with open(dataset_path, "r") as mapping_file:
            mapping_data = json.load(mapping_file)

        if interdental:
            entries = {path: state for path, state in mapping_data.items()}
        else:
            entries = {}
            for path, state in mapping_data.items():
                worst_state = max(map(int, state))  # computed once per entry
                if worst_state >= 0:
                    entries[path] = worst_state
        per_file_entries.append(entries)

    results = collections.OrderedDict()
    # Iterating a dict yields its keys in insertion order, so zip(*dicts)
    # walks the files in lock-step, one key per file at a time.
    for keys in zip(*per_file_entries):
        for key, entries in zip(keys, per_file_entries):
            results[key] = entries[key]
    return results

In [4]:
def json_2_dataframe_PBL_inderdental(data, mode=None):
    """Turn a ``path -> label dict`` mapping into one DataFrame row per image.

    Parameters
    ----------
    data : mapping
        ``path -> info`` where ``info`` is a dict that contains at least the
        key ``"state"``. All keys of ``info`` become columns.
    mode : {"molar", "premolar", "canine", "incisor", "all_molar"} or None
        Keep only rows whose tooth number belongs to that group
        (``"all_molar"`` = molars + premolars). Any other value keeps all rows.

    Returns
    -------
    pandas.DataFrame
        Indexed 0..n-1 with the columns ``Path``, ``State``, ``Class``, the
        keys of ``info``, then ``ID``, ``tooth_num``, ``ori_src``, ``source``,
        ``side``, ``tooth_type`` and ``angle``. Empty input gives an empty
        frame without columns.

    Notes
    -----
    The metadata is parsed from the path split on ``"_"``:

    * ``tooth_num`` is the third-to-last part, ``angle`` the integer before
      the first ``"."`` of the second-to-last part, and the side letter is
      ``path[-5]`` (assumes a one-letter side followed by a 4-character
      extension such as ``.png`` -- TODO confirm).
    * ``source`` / ``ori_src`` start at the part equal to ``"NN"``; when no
      such part exists (or it is the very first part) the 7th-to-last part is
      used as start instead.
    * ``ID`` is the part before ``"NN"`` (or before the first part containing
      ``"Patrica"`` when there is no ``"NN"``); if the second character of
      that part is a space the preceding part is prepended with ``"_"``.
    * ``tooth_type``: 0 molar, 1 premolar, 2 canine, 3 incisor, -99 otherwise.
      NOTE(review): the tooth lists look like Universal Numbering System
      numbers -- confirm.
    """
    molar = [1, 2, 3, 14, 15, 16, 17, 18, 19, 30, 31, 32]
    premolar = [4, 5, 12, 13, 20, 21, 28, 29]
    canine = [6, 11, 22, 27]
    incisor = [7, 8, 9, 10, 23, 24, 25, 26]

    # tooth number -> tooth_type code (position of the group in this tuple).
    tooth_type_of = {
        tooth: type_code
        for type_code, teeth in enumerate((molar, premolar, canine, incisor))
        for tooth in teeth
    }

    teeth_by_mode = {
        "molar": set(molar),
        "premolar": set(premolar),
        "canine": set(canine),
        "incisor": set(incisor),
        "all_molar": set(molar + premolar),
    }
    # None means "no filtering" (unknown modes behave like mode=None).
    wanted_teeth = teeth_by_mode.get(mode) if isinstance(mode, str) else None

    data_dict = collections.OrderedDict()
    for path, info in data.items():
        state = info["state"]
        item = {
            "Path": path,
            "State": state,
            # States 0 and 1 share class 0; higher states shift down by one.
            "Class": state - 1 if state > 1 else 0,
            **info,
        }

        path_split = path.split("_")

        # Index 0 doubles as the "not found" marker, exactly as before.
        nn_idx = next(
            (idx for idx, part in enumerate(path_split) if part == "NN"), 0
        )
        if nn_idx != 0:
            anchor = nn_idx
            source = "_".join(path_split[nn_idx:-2])
            original = "_".join(path_split[nn_idx:-3])
        else:
            anchor = next(
                (idx for idx, part in enumerate(path_split) if "Patrica" in part),
                0,
            )
            source = "_".join(path_split[-7:-2])
            original = "_".join(path_split[-7:-3])

        # A space as second character means the ID itself contained an
        # underscore and was split in two; glue the two parts back together.
        id_part = path_split[anchor - 1]
        if id_part[1] == " ":
            ID = "_".join(path_split[anchor - 2:anchor])
        else:
            ID = id_part

        item["ID"] = ID
        item["tooth_num"] = int(path_split[-3])
        item["ori_src"] = original
        item["source"] = source
        item["side"] = source + "_" + path[-5]
        item["tooth_type"] = tooth_type_of.get(item["tooth_num"], -99)

        if wanted_teeth is not None and item["tooth_num"] not in wanted_teeth:
            continue

        item["angle"] = int(path_split[-2].split(".")[0])

        data_dict[len(data_dict)] = item

    return pd.DataFrame.from_dict(data_dict, orient="index")

In [5]:
data = load_json(directory, interdental=True)
dataset = json_2_dataframe_PBL_inderdental(data)

# Keep only the rows whose ``state`` and ``bone_loss`` labels are both
# non-negative.
# NOTE(review): negative codes (e.g. -99) presumably mean "not annotated" --
# confirm with the labelling specification.
dataset = dataset.loc[dataset["state"].ge(0) & dataset["bone_loss"].ge(0)]
dataset

Unnamed: 0,Path,State,Class,state,bone_loss,furcation,buccal_furcation,endo_lesion,ID,tooth_num,ori_src,source,side,tooth_type,angle
0,Dental_Data/PBL/10_interdental_20201015_multil...,1,0,1,1,-99,-99,-99,000408 102419 x,6,NN_191024_151623_BE78A8,NN_191024_151623_BE78A8_6,NN_191024_151623_BE78A8_6_L,2,-10
1,Dental_Data/PBL/10_interdental_20201015_multil...,1,0,1,1,-99,-99,-99,000408 102419 x,6,NN_191024_151623_BE78A8,NN_191024_151623_BE78A8_6,NN_191024_151623_BE78A8_6_L,2,-10
2,Dental_Data/PBL/10_interdental_20201015_clahe_...,1,0,1,1,-99,-99,-99,000408 102419 x,6,NN_191024_151623_BE78A8,NN_191024_151623_BE78A8_6,NN_191024_151623_BE78A8_6_L,2,-10
3,Dental_Data/PBL/10_interdental_20201015_clahe_...,1,0,1,1,-99,-99,-99,000408 102419 x,6,NN_191024_151623_BE78A8,NN_191024_151623_BE78A8_6,NN_191024_151623_BE78A8_6_L,2,-10
4,Dental_Data/PBL/10_interdental_20201015_multil...,1,0,1,1,0,-999,0,000408 102419 x,6,NN_191024_151623_BE78A8,NN_191024_151623_BE78A8_6,NN_191024_151623_BE78A8_6_R,2,-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776155,Dental_Data/PBL/10_interdental_20201015_clahe_...,2,1,2,1,-99,-99,-99,S594966_2 091718 x,26,NN_180917_113933_C0A0B2,NN_180917_113933_C0A0B2_26,NN_180917_113933_C0A0B2_26_L,3,9
776156,Dental_Data/PBL/10_interdental_20201015_multil...,1,0,1,1,0,0,0,S594966_2 091718 x,26,NN_180917_113933_C0A0B2,NN_180917_113933_C0A0B2_26,NN_180917_113933_C0A0B2_26_R,3,9
776157,Dental_Data/PBL/10_interdental_20201015_multil...,1,0,1,1,0,0,0,S594966_2 091718 x,26,NN_180917_113933_C0A0B2,NN_180917_113933_C0A0B2_26,NN_180917_113933_C0A0B2_26_R,3,9
776158,Dental_Data/PBL/10_interdental_20201015_clahe_...,1,0,1,1,0,0,0,S594966_2 091718 x,26,NN_180917_113933_C0A0B2,NN_180917_113933_C0A0B2_26,NN_180917_113933_C0A0B2_26_R,3,9


In [6]:
# Distinct values of the ID column, to eyeball how well the path parsing worked.
set(dataset["ID"])

{'000408 102419 x',
 '000411 112119 x',
 '001742 082712 x',
 '002456 060517 x',
 '002555 042513 x',
 '003262 103015 x',
 '003615 010816 x',
 '003670 020718 x',
 '004151 091409 x',
 '004359 030716 x',
 '004499 110515 x',
 '005627 120209 x',
 '007274 021016 x',
 '007501 082216 x',
 '008908 090309 x',
 '010801 011019 x',
 '010953 031618 x',
 '019747_1 033011 x',
 '019747_2 061417 x',
 '022198_1 080717 x',
 '022198_2 100919 x',
 '025179_1 082510 x',
 '025179_2 101216 x',
 '026110_1 030513 x',
 '026110_2 061819 x',
 '026118_1 110613 x',
 '026118_2 060718 x',
 '035015_1 022613 x',
 '035015_2 082619 x',
 '043521_1 082113 x',
 '043521_2 082318 x',
 '051282_1 032311 x',
 '051282_2 102115 x',
 '051282_3 102918 x',
 '060011_1 033010 x',
 '060011_2 032817 x',
 '064768_1 022912 x',
 '064768_2 062513 x',
 '068783_1 090309 x',
 '074670_1 020316 x',
 '074670_2 071718 x',
 '077345_1 030719 x',
 '077345_2 082312 x',
 '1028067 062718 x',
 '10689 102418 x',
 '111084_1 071918 x',
 '111084_2 021017 x',
 '13

In [7]:
# Debug aid: list the image paths of all rows whose ID is exactly "Patrick".
# NOTE(review): the path parser searches for "Patrica" -- confirm which
# spelling is the intended one.
for i in dataset.loc[dataset["ID"] == "Patrick", "Path"]:
    print(i)

In [8]:
def split_K_Fold(dataframe, augscale, fold_num):
    """Split ``dataframe`` into ``fold_num`` folds without sharing an ``ID``.

    IDs are shuffled with ``np.random.shuffle`` (seed NumPy for reproducible
    folds) and then cut into consecutive runs so that every fold holds a bit
    less than ``1 / fold_num`` of the stage-3 samples.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have the columns ``ID`` and ``State``.
    augscale : int
        Divisor applied to the per-ID stage-3 row count.
        NOTE(review): presumably the number of augmented rows per original
        sample -- confirm with the caller.
    fold_num : int
        Number of folds to return.

    Returns
    -------
    list of pandas.DataFrame
        ``fold_num`` frames, each with a fresh 0..n-1 index.

    Raises
    ------
    IndexError
        If the data cannot be cut into ``fold_num`` non-empty folds (for
        example when there are no ``State == 3`` rows or too few IDs). The
        type is kept for compatibility with the previous failure mode; only
        the message is new.

    Notes
    -----
    IDs that come after the last fold boundary in the shuffled order are not
    part of any fold.
    NOTE(review): confirm that dropping this tail is intended.
    """
    total_stage_3 = len(dataframe[dataframe.State == 3])
    all_groups = dataframe.groupby("ID")

    # One [ID, weight] pair per ID; weight = stage-3 rows // augscale.
    frequence = [
        [patient_ID, len(table[table.State == 3]) // augscale]
        for patient_ID, table in all_groups
    ]
    np.random.shuffle(frequence)

    # Target stage-3 weight of a single fold.
    fraction = round(total_stage_3 / augscale / fold_num)

    # fold_index[k] is the position in ``frequence`` where fold k starts. The
    # ID that would push a fold to (or past) the target opens the next fold.
    fold_index = [0]
    count = 0
    for idx, (_, freq) in enumerate(frequence):
        if count + freq >= fraction:
            count = 0
            fold_index.append(idx)
        count += freq

    if len(fold_index) < fold_num + 1:
        raise IndexError(
            "split_K_Fold: only %d fold boundaries found, %d folds requested"
            % (len(fold_index) - 1, fold_num)
        )

    K_fold_df = []
    for fold_no in range(fold_num):
        start, stop = fold_index[fold_no], fold_index[fold_no + 1]
        one_partition_ids = [patient_ID for patient_ID, _ in frequence[start:stop]]
        if not one_partition_ids:
            raise IndexError(
                "split_K_Fold: fold %d would be empty (no ID assigned to it)"
                % fold_no
            )
        one_partition_groups = [
            all_groups.get_group(patient_ID) for patient_ID in one_partition_ids
        ]
        K_fold_df.append(pd.concat(one_partition_groups).reset_index(drop=True))

    return K_fold_df

In [9]:
def get_all_dataset(dataframe, augscale, fold_num):
    """Yield ``fold_num`` rotations of (train, valid, test) frames.

    The data is split once with ``split_K_Fold``. The folds are labelled
    ``fold_num - 2`` times ``'train'`` followed by ``'valid'`` and ``'test'``;
    after every yield the labels are rotated by one position.

    Parameters
    ----------
    dataframe, augscale, fold_num
        Passed unchanged to ``split_K_Fold``.

    Yields
    ------
    tuple of pandas.DataFrame
        ``(train_dataset, valid_dataset, test_dataset)``. The train frame is
        the concatenation of the train folds and keeps their row labels; the
        valid and test frames are the fold objects themselves (not copies).
    """
    K_fold_df = split_K_Fold(dataframe, augscale, fold_num)

    order = np.array(["train"] * (fold_num - 2) + ["valid", "test"])

    for _ in range(fold_num):
        train_index = np.where(order == "train")[0]
        valid_index = np.where(order == "valid")[0][0]
        test_index = np.where(order == "test")[0][0]

        # Concatenate once instead of growing an initially empty frame fold by
        # fold (repeated copies, and concatenating with an empty frame is
        # deprecated in recent pandas).
        train_folds = [K_fold_df[idx] for idx in train_index]
        if train_folds:
            train_dataset = pd.concat(train_folds, ignore_index=False)
        else:
            train_dataset = pd.DataFrame()  # fold_num == 2: no train fold

        valid_dataset = K_fold_df[valid_index]
        test_dataset = K_fold_df[test_index]

        order = np.roll(order, 1)

        yield train_dataset, valid_dataset, test_dataset

In [10]:
def K_Fold_print_class_ratio(dataframe):
    """Print how many rows fall into each class and each stage.

    Stages are the values 0-3 of the ``State`` column. For the class line,
    stages 0 and 1 are summed into class 0, stage 2 is class 1 and stage 3 is
    class 2.
    """
    stage_column = dataframe["State"]
    per_stage = [len(dataframe[stage_column == stage]) for stage in range(4)]

    per_class = (per_stage[0] + per_stage[1], per_stage[2], per_stage[3])
    print("Class 0 : %d, Class 1 : %d, Class 2 : %d" % per_class)
    print("Stage 0 : %d, Stage 1 : %d, Stage 2 : %d, Stage 3 : %d" % tuple(per_stage))

In [11]:
def K_Fold_adjust_class_ratio(dataframe, argscale, classes):
    """Down-sample every stage so that the classes end up equally large.

    With ``min_num`` being the row count of the rarest stage (0-3), the number
    of rows drawn per stage is

    * ``classes == 3``: ``min_num // 2`` for stages 0 and 1, ``min_num`` for
      stages 2 and 3;
    * ``classes == 2``: ``min_num`` for every stage.

    Rows of one stage are grouped by ``ID``, the groups are shuffled with
    ``random.shuffle`` and the result is cut into consecutive blocks of
    ``argscale`` rows. One not-yet-used row is drawn at random from every
    block in turn, and the blocks are revisited until the quota is reached.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns ``State`` and ``ID``.
    argscale : int
        Block length.
        NOTE(review): the sampling presumably relies on every block holding
        the ``argscale`` variants of one original sample -- confirm.
    classes : int
        2 or 3, see above.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, stage after stage. All columns have dtype object
        (the rows are assembled via dicts and a transpose) and the row labels
        restart at 0 for every stage.

    Raises
    ------
    ValueError
        If ``classes`` is neither 2 nor 3, or if a stage has no rows.

    Notes
    -----
    At least one row is drawn per stage even when its quota is 0. A trailing
    block shorter than ``argscale`` is printed and may raise ``IndexError``
    when the drawn position lies beyond its end.
    """
    Stages = [0, 1, 2, 3]
    stage_sizes = [len(dataframe[dataframe["State"] == Stage]) for Stage in Stages]
    min_num = min(stage_sizes)

    if classes == 3:
        Class_nums = [min_num // 2, min_num // 2, min_num, min_num]
    elif classes == 2:
        Class_nums = [min_num, min_num, min_num, min_num]
    else:
        # Used to surface later as an UnboundLocalError on Class_nums.
        raise ValueError("classes must be 2 or 3, got %r" % (classes,))

    stage_samples = []
    for Stage, Class_num in zip(Stages, Class_nums):
        stage_dataset = dataframe[dataframe["State"] == Stage].reset_index(drop=True)
        groups = [table for _, table in stage_dataset.groupby("ID")]
        if not groups:
            # pd.concat([]) would raise the same exception type with a
            # message that does not name the actual problem.
            raise ValueError("no rows with State == %d, cannot balance" % Stage)
        random.shuffle(groups)
        stage_dataset_shuffle = pd.concat(groups).reset_index(drop=True)

        sample_dict = collections.OrderedDict()
        appear_dict = {}  # block start -> positions already drawn from it
        count = 0
        get_enough_data_flag = False
        while not get_enough_data_flag:
            for i in range(0, len(stage_dataset), argscale):
                same_images = stage_dataset_shuffle.iloc[i:i + argscale, :].reset_index(drop=True)
                used_positions = appear_dict.setdefault(i, set())

                # Rejection sampling: redraw until an unused position comes up.
                while True:
                    random_idx = random.randint(0, argscale - 1)
                    if random_idx not in used_positions:
                        break

                if len(same_images) != argscale:
                    print(same_images)
                used_positions.add(random_idx)
                sample_dict[count] = same_images.iloc[random_idx, :].to_dict()
                count += 1
                if count >= Class_num:
                    get_enough_data_flag = True
                    break

        stage_samples.append(pd.DataFrame.from_dict(sample_dict).T)

    # Single concat instead of growing an initially empty frame stage by stage.
    return pd.concat(stage_samples)

In [12]:
def K_Fold_balance_data_generator(dataframe, argscale, classes, batch_size=32, k_fold_num=5):
    """Yield class-balanced, shuffled splits plus their batch generators.

    For every (train, valid, test) rotation produced by ``get_all_dataset``
    the three frames are balanced with ``K_Fold_adjust_class_ratio``, shuffled
    with ``shuffle``, re-indexed from 0 and wrapped with ``make_generator``.
    Class ratios before/after balancing and the ID overlap between the
    balanced splits are printed along the way.

    Yields
    ------
    tuple
        ``(train_dataset, valid_dataset, test_dataset,
        train_generator, valid_generator, test_generator)``
    """
    for train, valid, test in get_all_dataset(dataframe, argscale, k_fold_num):
        raw_splits = (train, valid, test)

        print("--------------------Before------------------------")
        for split in raw_splits:
            K_Fold_print_class_ratio(split)

        balanced_splits = [
            K_Fold_adjust_class_ratio(split, argscale, classes)
            for split in raw_splits
        ]

        print("--------------------After------------------------")
        for split in balanced_splits:
            K_Fold_print_class_ratio(split)

        # Any non-empty set printed here means an ID leaked across splits.
        train_ids, valid_ids, test_ids = (set(split.ID) for split in balanced_splits)
        print("train ID & valid ID", train_ids & valid_ids)
        print("test ID  & valid ID", test_ids & valid_ids)
        print("train ID & test  ID", train_ids & test_ids)

        print("-----------------------------------------------")

        # Per split: shuffle first, then build its generator (train, valid,
        # test order, so random state is consumed in the usual sequence).
        final_datasets, generators = [], []
        for split in balanced_splits:
            shuffled = shuffle(split).reset_index(drop=True)
            final_datasets.append(shuffled)
            generators.append(make_generator(shuffled, batch_size))

        yield (*final_datasets, *generators)

In [13]:
# Write the balanced train/valid/test tables of every fold to
# balance_dataset/Class_<class_num>_3P_multitask/Fold_<n>/ (n starts at 1).
fold_num = 1
class_num = 3
for train_dataset, valid_dataset, test_dataset, train_generator, valid_generator, test_generator in K_Fold_balance_data_generator(dataset, argscale=argscale_num, classes=class_num, batch_size=32, k_fold_num=5):
    fold_dir = f"balance_dataset/Class_{class_num}_3P_multitask/Fold_{fold_num}"
    # exist_ok=True replaces the separate isdir() check, so the directory is
    # created without a check-then-create race.
    os.makedirs(fold_dir, exist_ok=True)

    train_dataset.to_csv(f"{fold_dir}/train_dataset.csv", index=True)
    valid_dataset.to_csv(f"{fold_dir}/valid_dataset.csv", index=True)
    test_dataset.to_csv(f"{fold_dir}/test_dataset.csv", index=True)

    fold_num += 1

--------------------Before------------------------
Class 0 : 268800, Class 1 : 50080, Class 2 : 24320
Stage 0 : 79200, Stage 1 : 189600, Stage 2 : 50080, Stage 3 : 24320
Class 0 : 100320, Class 1 : 26400, Class 2 : 7760
Stage 0 : 26400, Stage 1 : 73920, Stage 2 : 26400, Stage 3 : 7760
Class 0 : 97600, Class 1 : 15280, Class 2 : 7840
Stage 0 : 23120, Stage 1 : 74480, Stage 2 : 15280, Stage 3 : 7840
--------------------After------------------------
Class 0 : 24320, Class 1 : 24320, Class 2 : 24320
Stage 0 : 12160, Stage 1 : 12160, Stage 2 : 24320, Stage 3 : 24320
Class 0 : 7760, Class 1 : 7760, Class 2 : 7760
Stage 0 : 3880, Stage 1 : 3880, Stage 2 : 7760, Stage 3 : 7760
Class 0 : 7840, Class 1 : 7840, Class 2 : 7840
Stage 0 : 3920, Stage 1 : 3920, Stage 2 : 7840, Stage 3 : 7840
train ID & valid ID set()
test ID  & valid ID set()
train ID & test  ID set()
-----------------------------------------------
--------------------Before------------------------
Class 0 : 340080, Class 1 : 68480, 