In [1]:
from torch.utils.data import Dataset
import torch
import numpy as np
import time
import pandas as pd
import pydicom
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
import tqdm
import ast

import sys
sys.path.append("../")
from utilities import helper_functions, splitting, augmentations, metrics

## Datasets

In [56]:
class BrandsDataset(Dataset):
    '''
    Dataset of flattened DICOM images used by the logistic-regression baselines.

    Each item is a 1-D NumPy feature vector: the prepared image (as returned by
    helper_functions.prepare_image, flattened) divided by 255, optionally followed
    by one numeric code for the radiographic view.
    '''

    # Numeric code appended to the feature vector for each recognised view name.
    VIEW_CODES = {"AP": 1, "L": 2}

    def __init__(self, img_files, views, labels, rgb, view_append):
        '''
        Store the inputs of the dataset.

        img_files:   sequence (list, array or pandas Series) of DICOM paths; "../" is
                     prepended to each path when the file is read
        views:       sequence of view names, parallel to img_files ("AP" or "L")
        labels:      sequence of targets, parallel to img_files; only stored here,
                     __getitem__ returns the features alone
        rgb:         forwarded to helper_functions.prepare_image
        view_append: if truthy, the numeric view code is appended to every feature vector
        '''
        assert len(img_files) == len(labels), "Number of files should match number of targets"

        self.img_files = img_files
        self.labels = labels
        self.rgb = rgb
        self.views = views
        self.view_append = view_append

    @staticmethod
    def _item_at(sequence, position):
        '''
        Return the element at 0-based `position`.

        pandas objects are read with .iloc so that a Series whose index was not reset
        is still addressed by position rather than by label.
        '''
        if hasattr(sequence, "iloc"):
            return sequence.iloc[position]
        return sequence[position]

    def load_dicom(self, img_path):
        '''
        This function loads an image from a DICOM path. If the file cannot be read, it prints
        the error and falls back to a 256x256 array of zeros. The image is then passed through
        helper_functions.prepare_image and flattened.
        '''
        try:
            image_info = pydicom.dcmread("../"+ img_path)
            actual_image = image_info.pixel_array

        except Exception as exc:
            # Best-effort fallback is kept, but KeyboardInterrupt/SystemExit are no longer
            # swallowed and the underlying reason is reported.
            print(f"Something went wrong with reading file {img_path} ({type(exc).__name__}: {exc})")
            actual_image = np.zeros((256,256))

        actual_image = helper_functions.prepare_image(actual_image, rgb=self.rgb, channels_first=True).flatten()

        return actual_image


    def __getitem__(self, index):
        '''
        Get a unique item from the dataset according to index. This is required when building a custom dataloader

        Raises ValueError when view_append is set and the view at `index` is not in VIEW_CODES.
        '''
        X = self.load_dicom(self._item_at(self.img_files, index))
        X = X/255.
        if self.view_append:
            view_name = self._item_at(self.views, index)
            if view_name not in self.VIEW_CODES:
                # Previously this fell through and failed on an unbound local variable.
                raise ValueError(
                    f"Unsupported view {view_name!r} at index {index}; expected one of {sorted(self.VIEW_CODES)}"
                )
            X = np.append(X, self.VIEW_CODES[view_name])

        return X


    def __len__(self):
        '''
        Length of the dataset. This is required when building a custom dataloader
        '''
        return len(self.img_files)

## Grid Search Function

In [65]:
def runGridSearch(data, suffix, view):
    '''
    Run a 5-fold grid search over C and penalty for the logistic-regression baseline
    and save the cross-validation results to a CSV.

    data:   DataFrame with 'filepath', 'View' and 'Label' columns
    suffix: tag used in the output file name
    view:   passed to BrandsDataset as view_append and also used in the output file name

    Returns nothing; the results are written to
    results/baselines/tuning/logregtuning-{suffix}-view{view}.csv
    '''
    import os

    print("Preparing Datasets and Models...")
    train_set = data.copy()
    # Creating dataset
    train_dataset = BrandsDataset(train_set['filepath'], train_set['View'], train_set['Label'], rgb = False, view_append=view)

    # Loading every image; all-zero feature vectors are skipped, and the positions of the
    # rows that are kept are recorded so the labels can be filtered the same way.
    features = []
    kept_rows = []
    for row in range(len(train_dataset)):
        img_ = train_dataset[row]
        if not np.any(img_):
            print("Moving On...")
            continue
        features.append(img_)
        kept_rows.append(row)

    # Converting final dataset to NumPy array, with labels aligned to the kept rows
    X_train_final = np.array(features)
    y_train_final = train_set['Label'].iloc[kept_rows]

    # Creating a Logistic Regression Model
    lr = LogisticRegression(random_state=0, max_iter=50, class_weight = 'balanced', solver = 'liblinear', multi_class = 'ovr')

    # Selecting hyperparameters to fine tune
    parameter_grid = {
        'C':[0.01, 0.1, 1],
        'penalty': ['l1', 'l2']
    }

    # 5-fold Cross Val Grid Search using weighted F1
    clf = GridSearchCV(lr, parameter_grid, scoring = "f1_weighted", cv = 5, n_jobs=-1)

    print("Doing Grid Search and Selecting Best Params...")
    # Fitting the final model
    since = time.time()
    clf.fit(X_train_final, y_train_final)
    elapsed = time.time() - since
    print("Elapsed:", elapsed)

    # Rows are grouped by penalty and, within each penalty, ordered from best to worst score
    df = pd.DataFrame(clf.cv_results_).sort_values(by=['param_penalty','mean_test_score'], ascending=[True, False]).reset_index(drop=True)
    output_path = f"results/baselines/tuning/logregtuning-{suffix}-view{view}.csv"
    # Make sure the target folder exists so the finished search is not lost on write
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False)

In [71]:
# Inputs for the grid search below; `view` is forwarded to the dataset as view_append.
suffix, view = 'posterior', False
data = pd.read_csv("../Master_Posterior_HDW.csv")

In [72]:
runGridSearch(data=data, suffix=suffix, view=view)

Preparing Datasets and Models...
Doing Grid Search and Selecting Best Params...
Elapsed: 24.347995042800903


## Training Final Model

In [107]:
# Inputs for the repeated train/evaluate run below.
suffix, view = 'posterior', False
regularization_type, iterations = 'l2', 20
data = pd.read_csv("../Master_Posterior_HDW.csv")

In [108]:
def trainWithSignificance(data, suffix, view, regularization_type, iterations):
    '''
    Train and evaluate the final logistic-regression baseline over repeated splits.

    The best C for `regularization_type` is read from the grid-search CSV written by
    runGridSearch. For each of `iterations` trials the data is split again, train and
    validation are merged into one training set, a model is fitted and scored on the
    test set. The per-trial scores are compiled and written to
    results/baselines/metrics/{regularization_type}logreg-view{view}-{suffix}_numeric.csv

    data:                DataFrame handed to splitting.split_data_2
    suffix:              tag used in the input/output file names and passed to the splitter
    view:                passed to BrandsDataset as view_append; also part of the file names
    regularization_type: penalty to use; must appear in the 'param_penalty' column of the
                         grid-search results
    iterations:          number of trials

    Returns the list holding, per trial, the fifth value returned by metrics.metrics_function.
    Raises ValueError if the grid-search results contain no row for `regularization_type`.
    '''
    import os
    # Imported explicitly: a bare `import tqdm` is not guaranteed to load the notebook submodule.
    from tqdm.notebook import tqdm as notebook_tqdm

    # Selecting best hyperparameters: best-scoring row for the requested penalty, instead of
    # relying on fixed row positions (which only hold for one grid size and sort order)
    df = pd.read_csv(f"results/baselines/tuning/logregtuning-{suffix}-view{view}.csv")
    candidates = df[df['param_penalty'] == regularization_type]
    if candidates.empty:
        raise ValueError(f"No grid-search results found for penalty {regularization_type!r}")
    # Stable sort so that, on ties, the row listed first in the file wins
    best_c = candidates.sort_values('mean_test_score', ascending=False, kind='mergesort')['param_C'].iloc[0]
    print("Best Regularization", best_c)

    # Initializing storage variables
    f1_scores = []
    precisions = []
    recalls = []
    aucs = []
    confusion_matrices = []

    # Getting statistical significance
    for trial in notebook_tqdm(range(iterations)):
        print("Setting Up Data...")
        # Splitting dataset, combining train and val into one train set
        train_set, val_set, test_set = splitting.split_data_2(data, suffix, return_data=True, save_data = False)
        train_set = pd.concat([train_set, val_set], axis=0).reset_index(drop=True)

        # Creating a PyTorch dataset
        train_dataset = BrandsDataset(train_set['filepath'], train_set['View'], train_set['Label'], rgb = False, view_append=view)
        test_dataset = BrandsDataset(test_set['filepath'], test_set['View'], test_set['Label'], rgb = False, view_append=view)

        datasets = {'train': train_dataset, 'test': test_dataset}
        tensor_data = {'train': [], 'test': []}
        # Positions of the rows that were kept, so labels can be filtered identically
        kept_rows = {'train': [], 'test': []}

        # Going through Pytorch datasets and appending to a list; all-zero vectors are skipped
        for item, dataset in datasets.items():
            for row in range(len(dataset)):
                img_ = dataset[row]
                if not np.any(img_):
                    print("Moving On...")
                    continue
                tensor_data[item].append(img_)
                kept_rows[item].append(row)

        # Converting final list datasets to NumPy arrays, with labels aligned to the kept rows
        X_train_final = np.array(tensor_data['train'])
        y_train_final = train_set['Label'].iloc[kept_rows['train']]
        print("Train Shape", X_train_final.shape)
        print("Unique Train Brands:", train_set['Label'].unique())
        X_test_final = np.array(tensor_data['test'])
        y_test_final = test_set['Label'].iloc[kept_rows['test']]
        print("Test Shape", X_test_final.shape)
        print("Unique Test Brands:", test_set['Label'].unique())

        # Training the model
        print(f"Training Final {regularization_type} Model...")
        lr = LogisticRegression(penalty = regularization_type, C = best_c, random_state=0, max_iter=10000,
                                   class_weight = 'balanced', solver = 'liblinear', multi_class = 'ovr')
        lr.fit(X_train_final, y_train_final)

        # Evaluating the model
        predictions_, probabilities_ = lr.predict(X_test_final), lr.predict_proba(X_test_final)

        # Calculating metrics
        f1_score, precision, recall, auc, confusion_matrix_ = metrics.metrics_function(y_predicted=predictions_,
                                                                                         y_probs=probabilities_,
                                                                                         y_true=y_test_final)

        # Appending metrics
        f1_scores.append(f1_score)
        precisions.append(precision)
        recalls.append(recall)
        aucs.append(auc)
        confusion_matrices.append(confusion_matrix_)

    # Compiling results from all trials and exporting to a CSV
    print(f"Compiling Results from {iterations} trials...")
    compiled_numeric = metrics.compile_numeric_results(f1_scores, precisions, recalls, aucs)
    compiled_numeric.index.name = 'Score'
    output_path = f"results/baselines/metrics/{regularization_type}logreg-view{view}-{suffix}"+ "_numeric.csv"
    # Make sure the target folder exists so the finished trials are not lost on write
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    compiled_numeric.to_csv(output_path, index=True)

    return confusion_matrices

In [109]:
trainWithSignificance(
    data=data,
    suffix=suffix,
    view=view,
    regularization_type=regularization_type,
    iterations=iterations,
)

Best Regularization 0.1


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))

Setting Up Data...
Train Shape (159, 65536)
Unique Train Brands: [2. 0. 1. 3.]
Test Shape (71, 65536)
Unique Test Brands: [0. 1. 2. 3.]
Training Final l2 Model...
Setting Up Data...
Train Shape (163, 65536)
Unique Train Brands: [2. 0. 3. 1.]
Test Shape (67, 65536)
Unique Test Brands: [0. 1. 2. 3.]
Training Final l2 Model...


  _warn_prf(average, modifier, msg_start, len(result))


Setting Up Data...
Train Shape (178, 65536)
Unique Train Brands: [2. 0. 3. 1.]
Test Shape (52, 65536)
Unique Test Brands: [2. 3. 1. 0.]
Training Final l2 Model...
Setting Up Data...
Train Shape (165, 65536)
Unique Train Brands: [2. 0. 3. 1.]
Test Shape (65, 65536)
Unique Test Brands: [3. 0. 1. 2.]
Training Final l2 Model...
Setting Up Data...
Train Shape (160, 65536)
Unique Train Brands: [2. 0. 1. 3.]
Test Shape (70, 65536)
Unique Test Brands: [1. 0. 3. 2.]
Training Final l2 Model...
Setting Up Data...
Train Shape (165, 65536)
Unique Train Brands: [2. 0. 1. 3.]
Test Shape (65, 65536)
Unique Test Brands: [3. 1. 2. 0.]
Training Final l2 Model...
Setting Up Data...
Train Shape (182, 65536)
Unique Train Brands: [2. 0. 1. 3.]
Test Shape (48, 65536)
Unique Test Brands: [2. 3. 0. 1.]
Training Final l2 Model...
Setting Up Data...
Train Shape (178, 65536)
Unique Train Brands: [2. 0. 1. 3.]
Test Shape (52, 65536)
Unique Test Brands: [0. 2. 3. 1.]
Training Final l2 Model...
Setting Up Data...
Tra

  _warn_prf(average, modifier, msg_start, len(result))


Setting Up Data...
Train Shape (174, 65536)
Unique Train Brands: [0. 1. 3. 2.]
Test Shape (56, 65536)
Unique Test Brands: [0. 2. 1. 3.]
Training Final l2 Model...


  _warn_prf(average, modifier, msg_start, len(result))


Setting Up Data...
Train Shape (163, 65536)
Unique Train Brands: [0. 3. 1. 2.]
Test Shape (67, 65536)
Unique Test Brands: [0. 1. 2. 3.]
Training Final l2 Model...


  _warn_prf(average, modifier, msg_start, len(result))


Setting Up Data...
Train Shape (178, 65536)
Unique Train Brands: [0. 1. 2. 3.]
Test Shape (52, 65536)
Unique Test Brands: [0. 3. 2. 1.]
Training Final l2 Model...
Setting Up Data...
Train Shape (163, 65536)
Unique Train Brands: [0. 3. 1. 2.]
Test Shape (67, 65536)
Unique Test Brands: [2. 0. 3. 1.]
Training Final l2 Model...


  _warn_prf(average, modifier, msg_start, len(result))


Setting Up Data...
Train Shape (168, 65536)
Unique Train Brands: [0. 2. 3. 1.]
Test Shape (62, 65536)
Unique Test Brands: [0. 3. 2. 1.]
Training Final l2 Model...
Setting Up Data...
Train Shape (155, 65536)
Unique Train Brands: [0. 1. 2. 3.]
Test Shape (75, 65536)
Unique Test Brands: [2. 3. 1. 0.]
Training Final l2 Model...
Setting Up Data...
Train Shape (156, 65536)
Unique Train Brands: [0. 2. 3. 1.]
Test Shape (74, 65536)
Unique Test Brands: [2. 0. 1. 3.]
Training Final l2 Model...
Setting Up Data...
Train Shape (181, 65536)
Unique Train Brands: [2. 0. 3. 1.]
Test Shape (49, 65536)
Unique Test Brands: [0. 1. 2. 3.]
Training Final l2 Model...
Setting Up Data...
Train Shape (172, 65536)
Unique Train Brands: [2. 0. 1. 3.]
Test Shape (58, 65536)
Unique Test Brands: [0. 2. 3. 1.]
Training Final l2 Model...
Setting Up Data...
Train Shape (168, 65536)
Unique Train Brands: [0. 2. 3. 1.]
Test Shape (62, 65536)
Unique Test Brands: [2. 3. 0. 1.]
Training Final l2 Model...
Setting Up Data...
Tra

[array([[27, 12,  5,  3],
        [ 6,  1,  0,  0],
        [ 5,  1,  0,  1],
        [ 8,  2,  0,  0]]), array([[32, 10,  4,  0],
        [ 9,  1,  0,  0],
        [ 1,  0,  0,  0],
        [ 9,  1,  0,  0]]), array([[19,  7, 11,  1],
        [ 6,  0,  0,  1],
        [ 0,  0,  1,  0],
        [ 2,  0,  4,  0]]), array([[24,  3,  3,  4],
        [16,  0,  0,  2],
        [ 6,  0,  0,  1],
        [ 2,  0,  4,  0]]), array([[26, 12,  3,  6],
        [ 9,  1,  0,  0],
        [ 7,  0,  0,  0],
        [ 6,  0,  0,  0]]), array([[26,  8,  0,  0],
        [18,  0,  0,  0],
        [ 6,  0,  0,  1],
        [ 3,  0,  3,  0]]), array([[22,  6,  2,  4],
        [ 6,  0,  0,  1],
        [ 1,  0,  0,  0],
        [ 4,  0,  2,  0]]), array([[25, 11,  5,  2],
        [ 1,  0,  1,  0],
        [ 1,  0,  0,  0],
        [ 5,  1,  0,  0]]), array([[30,  2,  0,  0],
        [18,  0,  0,  0],
        [ 7,  0,  0,  0],
        [ 5,  1,  0,  0]]), array([[20,  5,  0,  2],
        [18,  0,  0,  0],
   

## Inspecting Results

In [5]:
# Which saved metrics file to inspect: these three values are interpolated into its name.
suffix, view, regularization_type = 'posterior', True, 'l2'

In [6]:
# Load the compiled metrics and turn the stringified confidence intervals back into arrays.
results = pd.read_csv(
    f"results/baselines/metrics/{regularization_type}logreg-view{view}-{suffix}"+ "_numeric.csv"
)
results = results.assign(**{
    'Confidence Intervals': results['Confidence Intervals'].map(
        lambda raw: np.array(ast.literal_eval(raw))
    )
})
display(results)

Unnamed: 0,Score,Mean,Confidence Intervals,Support
0,F1,0.398056,"[0.3459387916012953, 0.45017255237060044]",20
1,Precision,0.39111,"[0.32463881671766814, 0.4575813366430911]",20
2,Recall,0.42684,"[0.38596863855614316, 0.4677116101473494]",20
3,AUC,0.402446,"[0.357630703735772, 0.44726066622320093]",20


In [7]:
# Show each confidence interval as an offset from its mean.
pd.concat(
    [
        results.loc[:, ['Score', 'Mean']],
        results['Confidence Intervals'].sub(results['Mean']),
    ],
    axis=1,
)

Unnamed: 0,Score,Mean,0
0,F1,0.398056,"[-0.052116880384652564, 0.052116880384652564]"
1,Precision,0.39111,"[-0.06647125996271147, 0.06647125996271147]"
2,Recall,0.42684,"[-0.040871485795603124, 0.040871485795603124]"
3,AUC,0.402446,"[-0.044814981243714525, 0.044814981243714413]"


In [8]:
# Which saved tuning file to inspect: these values are interpolated into its name.
suffix, view, regularization_type = 'anterior', True, 'l2'

In [9]:
# Look up the best C for the chosen penalty in the saved grid-search results.
df = pd.read_csv(f"results/baselines/tuning/logregtuning-{suffix}-view{view}.csv")
# Pick the best-scoring row for this penalty instead of relying on fixed row positions,
# which only hold for one particular grid size and sort order.
if not (df['param_penalty'] == regularization_type).any():
    raise ValueError(f"No grid-search results found for penalty {regularization_type!r}")
# Stable sort so that, on ties, the row listed first in the file wins
best_c = (
    df.loc[df['param_penalty'] == regularization_type]
    .sort_values('mean_test_score', ascending=False, kind='mergesort')['param_C']
    .iloc[0]
)
print("Best Regularization", best_c)

Best Regularization 0.1
