In [1]:
from torch.utils.data import Dataset
import torch
import numpy as np
import time
import pandas as pd
import pydicom
import sklearn.utils
import xgboost as xgb
import tqdm
import ast

import sys
sys.path.append("../")
from utilities import helper_functions, splitting, augmentations, metrics

## Datasets

In [54]:
class BrandsDataset(Dataset):
    '''
    Brands Dataset

    Serves one flattened DICOM image per index, optionally followed by a numeric code for the image view.
    '''

    # Numeric code appended to the feature vector for each supported view
    VIEW_CODES = {"AP": 1, "L": 2}

    def __init__(self, img_files, views, labels, rgb, view_append):
        '''
        Initialize the dataset.

        img_files:   sequence (list, array or pandas Series) of DICOM paths; "../" is prepended when reading
        views:       sequence with the view of each image ("AP" or "L"); only read when view_append is truthy
        labels:      sequence of targets, one per image (stored on the instance, not used by __getitem__)
        rgb:         forwarded to helper_functions.prepare_image as its `rgb` argument
        view_append: when truthy, the numeric view code is appended as the last feature of every item
        '''
        assert len(img_files) == len(labels), "Number of files should match number of targets"
        if view_append:
            # The views are indexed alongside the files, so they have to line up as well
            assert len(img_files) == len(views), "Number of files should match number of views"

        self.img_files = img_files
        self.labels = labels
        self.views = views
        self.rgb = rgb
        self.view_append = view_append

    @staticmethod
    def _row(values, index):
        '''
        Return the element at *position* index. A pandas Series indexed with [] looks the value up by label,
        which breaks (or silently picks another row) when the Series index is not 0..n-1, so .iloc is used
        whenever the container offers it.
        '''
        positional = getattr(values, "iloc", None)
        return values[index] if positional is None else positional[index]

    def load_dicom(self, img_path):
        '''
        This function loads an image from a DICOM path (relative to the parent directory). If the file cannot
        be read, it prints an error and falls back to a 256x256 array of zeros. In both cases the array goes
        through helper_functions.prepare_image and is returned flattened.
        '''
        try:
            image_info = pydicom.dcmread("../"+ img_path)
            actual_image = image_info.pixel_array

        except Exception:
            # Best effort: keep going on unreadable files, but let KeyboardInterrupt/SystemExit propagate
            print(f"Something went wrong with reading file {img_path}")
            actual_image = np.zeros((256,256))

        actual_image = helper_functions.prepare_image(actual_image, rgb=self.rgb, channels_first=True).flatten()

        return actual_image


    def __getitem__(self, index):
        '''
        Get a unique item from the dataset according to index. This is required when building a custom dataloader.
        Returns the flattened image divided by 255, with the view code appended when view_append is set.
        Raises ValueError if the view of the requested row is not one of VIEW_CODES.
        '''
        X = self.load_dicom(self._row(self.img_files, index))
        # assumes prepare_image yields values on a 0-255 scale -- TODO confirm
        X = X/255.
        if self.view_append:
            view_name = self._row(self.views, index)
            if view_name not in self.VIEW_CODES:
                raise ValueError(
                    f"Unsupported view {view_name!r} at index {index}; expected one of {sorted(self.VIEW_CODES)}"
                )
            X = np.append(X, self.VIEW_CODES[view_name])

        return X


    def __len__(self):
        '''
        Length of the dataset. This is required when building a custom dataloader
        '''
        return len(self.img_files)

## Grid Search Function

In [55]:
def runGridSearch(data, suffix, view):
    '''
    Grid-search XGBoost hyperparameters (max_depth, gamma, subsample) with 5-fold cross-validation.

    data:   DataFrame with 'filepath', 'View' and 'Label' columns
    suffix: tag used in the name of the output CSV
    view:   whether the numeric view code is appended as the last feature (also part of the output file name)

    Returns the parameter dict holding the combination with the lowest mean test mlogloss, and writes it to
    results/baselines/tuning/xgboosttuning-{suffix}-view{view}.csv.
    Raises ValueError if no image could be loaded and RuntimeError if no combination produced a finite score.
    '''
    import os  # local import: only needed to create the output directory

    print("Preparing Datasets and Models...")
    train_set = data.copy()

    # Creating dataset
    train_dataset = BrandsDataset(train_set['filepath'], train_set['View'], train_set['Label'], rgb = False, view_append=view)

    # Loading every image as a flat feature vector, remembering which rows were kept so that the labels
    # and weights stay aligned with the feature matrix
    feature_rows = []
    kept_rows = []
    for row in range(len(train_dataset)):
        img_ = train_dataset[row]
        # The appended view code is always non-zero, so it is left out of the "image is empty" check
        pixels = img_[:-1] if view else img_
        if not np.any(pixels):
            print("Moving On...")
            continue
        feature_rows.append(img_)
        kept_rows.append(row)

    if not kept_rows:
        raise ValueError("None of the images in `data` could be loaded")

    # Converting final dataset to NumPy arrays
    X_train_final = np.array(feature_rows)
    y_train = train_set['Label'].to_numpy()[kept_rows]

    # Weighting imbalanced classes: one 'balanced' weight per class, broadcast to every sample
    classes, class_index = np.unique(y_train, return_inverse=True)
    class_weights = sklearn.utils.class_weight.compute_class_weight('balanced', classes=classes, y=y_train)
    weights = class_weights[class_index]

    # XGBoost training matrix
    dtrain = xgb.DMatrix(X_train_final, label=y_train, weight = weights)

    # Base parameters; max_depth, gamma and subsample are overwritten for every grid point
    params = {
        'min_child_weight': 5, 
        'gamma': 0.5, 
        'subsample': 1, 
        'max_depth': 4, 
        'objective': 'multi:softmax',
        'eval_metric': 'mlogloss',
        'num_class': len(train_set['Label'].unique())
        }
    num_boost_round = 999
    gridsearch_params = [
        (max_depth, gamma, subsample)
        for max_depth in [4,6, 8]
        for gamma in [0.5,2]
        for subsample in [0.8, 1]
    ]

    min_mlogloss = float("Inf")
    best_params = None
    for max_depth, gamma,subsample in gridsearch_params:
        print("CV with max_depth={}, min_gamma={}, min_subsample={}".format(
                                 max_depth,
                                 gamma, subsample))
        # Update our parameters
        params['max_depth'] = max_depth
        params['gamma'] = gamma
        params['subsample'] = subsample
        # Run CV
        cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics={'mlogloss'},
            early_stopping_rounds=10
        )
        # Update best mlogloss
        mean_mlogloss = cv_results['test-mlogloss-mean'].min()
        boost_rounds = cv_results['test-mlogloss-mean'].idxmin()
        print("\tMLOGLOSS {} for {} rounds".format(mean_mlogloss, boost_rounds))
        if mean_mlogloss < min_mlogloss:
            min_mlogloss = mean_mlogloss
            best_params = (max_depth,gamma, subsample)

    if best_params is None:
        # Happens only when every CV score was NaN/inf, so nothing ever beat the initial +inf
        raise RuntimeError("Grid search produced no finite mlogloss for any parameter combination")
    print("Best params: {}, {}, {} mlogloss: {}".format(best_params[0], best_params[1], best_params[2], min_mlogloss))

    params['max_depth'] = best_params[0]
    params['gamma'] = best_params[1]
    params['subsample'] = best_params[2]

    # Saving the winning parameters, creating the output folder on first use
    output_path = f"results/baselines/tuning/xgboosttuning-{suffix}-view{view}.csv"
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    pd.DataFrame(params, index=[0]).to_csv(output_path)

    return params

In [57]:
# Inputs for the grid search: the posterior master table, the tag used in the tuning
# file name, and whether the view code is appended to every feature vector.
data = pd.read_csv("../Master_Posterior_HDW.csv")
suffix = 'posterior'
view=True

In [58]:
# Run the cross-validated grid search on the inputs configured in the previous cell; the selected
# parameters are returned and also written to results/baselines/tuning/.
params = runGridSearch(data, suffix, view)

Preparing Datasets and Models...
CV with max_depth=4, min_gamma=0.5, min_subsample=0.8
	MLOGLOSS 1.3995362 for 0 rounds
CV with max_depth=4, min_gamma=0.5, min_subsample=1
	MLOGLOSS 1.3685988000000002 for 1 rounds
CV with max_depth=4, min_gamma=2, min_subsample=0.8
	MLOGLOSS 1.4019492 for 0 rounds
CV with max_depth=4, min_gamma=2, min_subsample=1
	MLOGLOSS 1.3703438000000001 for 1 rounds
CV with max_depth=6, min_gamma=0.5, min_subsample=0.8
	MLOGLOSS 1.4002912 for 0 rounds
CV with max_depth=6, min_gamma=0.5, min_subsample=1
	MLOGLOSS 1.3689008 for 0 rounds
CV with max_depth=6, min_gamma=2, min_subsample=0.8
	MLOGLOSS 1.4019492 for 0 rounds
CV with max_depth=6, min_gamma=2, min_subsample=1
	MLOGLOSS 1.367848 for 1 rounds
CV with max_depth=8, min_gamma=0.5, min_subsample=0.8
	MLOGLOSS 1.4002914 for 0 rounds
CV with max_depth=8, min_gamma=0.5, min_subsample=1
	MLOGLOSS 1.3689008 for 0 rounds
CV with max_depth=8, min_gamma=2, min_subsample=0.8
	MLOGLOSS 1.4019494 for 0 rounds
CV with max_d

## Training Final Models Function

In [101]:
# Inputs for trainWithSignificance. NOTE: this rebinds the `data`, `suffix` and `view` globals that the
# grid-search cell set earlier. trainWithSignificance reads
# results/baselines/tuning/xgboosttuning-{suffix}-view{view}.csv, so that file must exist for these values.
data = pd.read_csv("../Master_Anterior_HDW.csv")
suffix = 'anterior'
view=True
# Number of independent split/train/evaluate repetitions
iterations = 10

In [102]:
def trainWithSignificance(data, suffix, view, iterations):
    '''
    Train and evaluate the tuned XGBoost model on `iterations` fresh data splits and export the compiled metrics.

    data:       DataFrame with 'filepath', 'View' and 'Label' columns
    suffix:     tag identifying the tuning CSV to read and the metrics CSV to write
    view:       whether the numeric view code is appended as the last feature (also part of both file names)
    iterations: number of split/train/evaluate repetitions

    Reads results/baselines/tuning/xgboosttuning-{suffix}-view{view}.csv and writes
    results/baselines/metrics/xgboost-view{view}-{suffix}_numeric.csv. Returns nothing.
    '''
    import os  # local import: only needed to create the output directory
    # Imported explicitly: a plain `import tqdm` is not guaranteed to load the `tqdm.notebook` submodule
    from tqdm.notebook import tqdm as notebook_tqdm

    def _load_split(split_df):
        # Load one split; returns the feature matrix and the labels of the rows that were kept,
        # so features and labels always have the same length
        dataset = BrandsDataset(split_df['filepath'], split_df['View'], split_df['Label'], rgb = False, view_append=view)
        feature_rows = []
        kept_rows = []
        for row in range(len(dataset)):
            img_ = dataset[row]
            # The appended view code is always non-zero, so it is left out of the "image is empty" check
            pixels = img_[:-1] if view else img_
            if not np.any(pixels):
                print("Moving On...")
                continue
            feature_rows.append(img_)
            kept_rows.append(row)
        return np.array(feature_rows), split_df['Label'].iloc[kept_rows]

    # Selecting best hyperparameters
    df = pd.read_csv(f"results/baselines/tuning/xgboosttuning-{suffix}-view{view}.csv")
    best_params = df[['max_depth', 'subsample', 'gamma']].to_dict('records')[0] # Hard coding!
    params = {'min_child_weight': 5, 
                    'objective': 'multi:softprob', 
                    'eval_metric': 'mlogloss', 
                    'num_class': len(data['Label'].unique())}
    params.update(best_params)
    print("Parameters:", params)

    # Initializing storage variables
    f1_scores = []
    precisions = []
    recalls = []
    aucs = []

    # Getting statistical significance
    for _ in notebook_tqdm(range(iterations)):
        print("Setting Up Data...")
        # Splitting dataset into train, val (used for early stopping) and test sets
        train_set, val_set, test_set = splitting.split_data_2(data, suffix, return_data=True, save_data = False)

        # Loading the images of every split into NumPy arrays
        X_train_final, y_train = _load_split(train_set)
        X_val_final, y_val = _load_split(val_set)
        X_test_final, y_test = _load_split(test_set)
        print("Train Shape", X_train_final.shape)
        print("Val Shape", X_val_final.shape)        
        print("Test Shape", X_test_final.shape)

        # Weighting imbalanced classes: one 'balanced' weight per class, broadcast to every training sample
        train_labels = y_train.to_numpy()
        classes, class_index = np.unique(train_labels, return_inverse=True)
        class_weights = sklearn.utils.class_weight.compute_class_weight('balanced', classes=classes, y=train_labels)
        weights = class_weights[class_index]

        # XGBoost train, val and test sets
        dtrain = xgb.DMatrix(X_train_final, label=train_labels, weight = weights)
        dval = xgb.DMatrix(X_val_final, label=y_val)
        dtest = xgb.DMatrix(X_test_final, label=y_test)

        # Training the model
        print("Training Final XGB Model...")
        model = xgb.train(params,
                          dtrain,
                          num_boost_round=999,
                          evals=[(dval, "Val")],
                          early_stopping_rounds=10)
        print("Best MLogloss: {:.2f} with {} rounds".format(model.best_score, model.best_iteration+1))

        # Evaluating the model (single forward pass; the class is the arg-max of the probabilities)
        # NOTE(review): depending on the xgboost version, predict() after early stopping may use every
        # boosted round rather than only the best iteration -- confirm for the installed version
        probabilities_ = model.predict(dtest)
        predictions_ = probabilities_.argmax(axis=1)

        # Calculating metrics
        f1_score, precision, recall, auc, _unused = metrics.metrics_function(y_predicted=predictions_, 
                                                                              y_probs=probabilities_, 
                                                                              y_true=y_test)

        # Appending metrics
        f1_scores.append(f1_score)
        precisions.append(precision)
        recalls.append(recall)
        aucs.append(auc)

    # Compiling results from all trials and exporting to a CSV
    print(f"Compiling Results from {iterations} trials...")
    compiled_numeric = metrics.compile_numeric_results(f1_scores, precisions, recalls, aucs)
    compiled_numeric.index.name = 'Score'
    output_path = f"results/baselines/metrics/xgboost-view{view}-{suffix}"+ "_numeric.csv"
    # Creating the output folder on first use so the finished run is not lost at the very last step
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    compiled_numeric.to_csv(output_path, index=True)

In [103]:
# Repeat split/train/evaluate `iterations` times and export the compiled metrics CSV
trainWithSignificance(data, suffix, view, iterations)

Parameters: {'min_child_weight': 5, 'objective': 'multi:softprob', 'eval_metric': 'mlogloss', 'num_class': 4, 'max_depth': 6, 'subsample': 1, 'gamma': 2}


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

Setting Up Data...
Train Shape (937, 65537)
Val Shape (261, 65537)
Test Shape (445, 65537)
Training Final XGB Model...
[0]	Val-mlogloss:1.30924
Will train until Val-mlogloss hasn't improved in 10 rounds.
[1]	Val-mlogloss:1.27750
[2]	Val-mlogloss:1.23857
[3]	Val-mlogloss:1.20193
[4]	Val-mlogloss:1.19100
[5]	Val-mlogloss:1.15992
[6]	Val-mlogloss:1.13921
[7]	Val-mlogloss:1.11893
[8]	Val-mlogloss:1.09957
[9]	Val-mlogloss:1.09482
[10]	Val-mlogloss:1.08626
[11]	Val-mlogloss:1.08291
[12]	Val-mlogloss:1.07329
[13]	Val-mlogloss:1.06486
[14]	Val-mlogloss:1.06372
[15]	Val-mlogloss:1.06472
[16]	Val-mlogloss:1.06096
[17]	Val-mlogloss:1.05462
[18]	Val-mlogloss:1.05148
[19]	Val-mlogloss:1.05083
[20]	Val-mlogloss:1.04801
[21]	Val-mlogloss:1.04942
[22]	Val-mlogloss:1.05055
[23]	Val-mlogloss:1.05013
[24]	Val-mlogloss:1.05301
[25]	Val-mlogloss:1.05277
[26]	Val-mlogloss:1.05260
[27]	Val-mlogloss:1.05248
[28]	Val-mlogloss:1.05240
[29]	Val-mlogloss:1.05234
[30]	Val-mlogloss:1.05231
Stopping. Best iteration:

Train Shape (992, 65537)
Val Shape (224, 65537)
Test Shape (427, 65537)
Training Final XGB Model...
[0]	Val-mlogloss:1.26286
Will train until Val-mlogloss hasn't improved in 10 rounds.
[1]	Val-mlogloss:1.16115
[2]	Val-mlogloss:1.09050
[3]	Val-mlogloss:1.04857
[4]	Val-mlogloss:1.01770
[5]	Val-mlogloss:0.98913
[6]	Val-mlogloss:0.97107
[7]	Val-mlogloss:0.95565
[8]	Val-mlogloss:0.94514
[9]	Val-mlogloss:0.93301
[10]	Val-mlogloss:0.92695
[11]	Val-mlogloss:0.92088
[12]	Val-mlogloss:0.91991
[13]	Val-mlogloss:0.91757
[14]	Val-mlogloss:0.91993
[15]	Val-mlogloss:0.91743
[16]	Val-mlogloss:0.91656
[17]	Val-mlogloss:0.91813
[18]	Val-mlogloss:0.91769
[19]	Val-mlogloss:0.91799
[20]	Val-mlogloss:0.91768
[21]	Val-mlogloss:0.91681
[22]	Val-mlogloss:0.91658
[23]	Val-mlogloss:0.91639
[24]	Val-mlogloss:0.91623
[25]	Val-mlogloss:0.91609
[26]	Val-mlogloss:0.91597
[27]	Val-mlogloss:0.91542
[28]	Val-mlogloss:0.91358
[29]	Val-mlogloss:0.91350
[30]	Val-mlogloss:0.91343
[31]	Val-mlogloss:0.91338
[32]	Val-mlogloss:

  _warn_prf(average, modifier, msg_start, len(result))


## Inspecting Results

In [14]:
# Select which saved metrics file to inspect (these rebind the `suffix` and `view` globals)
suffix = 'posterior'
view=True

In [15]:
# Load the compiled metrics that trainWithSignificance wrote for this suffix/view
results = pd.read_csv(f"results/baselines/metrics/xgboost-view{view}-{suffix}"+ "_numeric.csv")
# The intervals come back from the CSV as text; parse each one into a NumPy array
results['Confidence Intervals'] =results['Confidence Intervals'].apply(lambda x: np.array(ast.literal_eval(x)))
display(results)

Unnamed: 0,Score,Mean,Confidence Intervals,Support
0,F1,0.408972,"[0.3558041901048561, 0.4621405620657587]",10
1,Precision,0.430244,"[0.3441836171383514, 0.5163037535675206]",10
2,Recall,0.427084,"[0.37072581624311063, 0.48344170108697304]",10
3,AUC,0.482312,"[0.43351183950255046, 0.5311119427621224]",10


In [16]:
# Show each metric's mean next to its interval bounds expressed as offsets from that mean
pd.concat([results[['Score','Mean']], results['Confidence Intervals'] - results['Mean']], axis = 1)

Unnamed: 0,Score,Mean,0
0,F1,0.408972,"[-0.05316818598045131, 0.05316818598045131]"
1,Precision,0.430244,"[-0.08606006821458462, 0.08606006821458456]"
2,Recall,0.427084,"[-0.056357942421931095, 0.05635794242193132]"
3,AUC,0.482312,"[-0.04880005162978596, 0.04880005162978596]"


In [12]:
# Select which saved tuning file to inspect: the posterior run without the view feature
suffix = 'posterior'
view=False

In [13]:
# Rebuild the parameter dict from the saved tuning CSV. These are the same steps as the start of
# trainWithSignificance, except that 'num_class' is not set here.
df = pd.read_csv(f"results/baselines/tuning/xgboosttuning-{suffix}-view{view}.csv")
best_params = df[['max_depth', 'subsample', 'gamma']].to_dict('records')[0] # Hard coding!
params = {'min_child_weight': 5, 
                'objective': 'multi:softprob', 
                'eval_metric': 'mlogloss'}
# The tuned values are layered on top of the fixed settings
params.update(best_params)
print("Parameters:", params)

Parameters: {'min_child_weight': 5, 'objective': 'multi:softprob', 'eval_metric': 'mlogloss', 'max_depth': 8, 'subsample': 1, 'gamma': 2}
