In [2]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import NoduleDataset, SubtypedDataLoader


ModuleNotFoundError: No module named 'datasets'

In [None]:
# Select the compute device once; getImages later moves image tensors onto DEVICE.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Good to go!" if DEVICE.type == "cuda" else "Using cpu")

# Testing SubtypedDataLoader

In [3]:
# Load the nodule -> subtype table and preview its first five rows.
# NOTE(review): the preview shows two 'Unnamed' columns, which look like index
# columns written back into the CSV on save — confirm and drop them if unused.
lidc_subtyped = pd.read_csv('../data/lidc_subtyped.csv')
lidc_subtyped.head()

Unnamed: 0.1,Unnamed: 0,Nodule_id,malignancy,subtype
0,0,1197,benign,0benign
1,1,1208,benign,1benign
2,2,1109,benign,1benign
3,3,1259,benign,0benign
4,4,1050,benign,1benign


In [8]:
def getNormed(this_array, this_min = 0, this_max = 255, set_to_int = True):
    '''
        Linearly rescale a tensor so its values span [this_min, this_max].

        Input:
        this_array: tensor to rescale (the input tensor itself is not modified)
        this_min: value the smallest element is mapped to
        this_max: value the largest element is mapped to
        set_to_int: if True, cast the result to torch.int (fractions are truncated)

        Output:
        the rescaled tensor. A constant input (max == min) has no range to
        stretch, so every element is mapped to this_min instead of the
        inf/NaN a division by zero would produce.
    '''
    low = this_array.min()
    high = this_array.max()

    if high == low:
        # Keep the dtype the regular path would produce: floating inputs keep
        # their dtype, integer inputs are promoted to the default float dtype.
        if this_array.is_floating_point():
            out_dtype = this_array.dtype
        else:
            out_dtype = torch.get_default_dtype()
        normed = torch.full_like(this_array, this_min, dtype=out_dtype)
    else:
        scale = (this_max - this_min) / (high - low)
        normed = this_array * scale
        # Shift so the smallest value lands exactly on this_min.
        normed = normed - normed.min()
        normed = normed + this_min

    if set_to_int:
        return normed.to(dtype=torch.int)
    return normed

In [27]:
def getImages(image_folder, subtype_csv = '../data/lidc_subtyped.csv'):
    '''
        Load every nodule image under image_folder and group it by subtype.

        Input:
        image_folder: directory containing sub-directories of image files
                      (the 'Malignancy_3' sub-directory is skipped). Each
                      file name must start with the integer Nodule_id
                      followed by '.', and is read with np.loadtxt.
        subtype_csv: CSV with 'Nodule_id' and 'subtype' columns

        Output:
        M_benign, L_benign, M_malignant, L_malignant: four lists of tensors
                      on DEVICE. Each tensor is the image stacked three
                      times along dim 0, rescaled with getNormed and
                      divided by 255.
                      Subtype '0benign' goes to M_benign, '1benign' to
                      L_benign, '1malignant' to M_malignant and any other
                      subtype value to L_malignant.

        Raises:
        IndexError: a file's Nodule_id has no row in subtype_csv
    '''

    M_benign, L_benign = [], []
    M_malignant, L_malignant = [], []

    # Subtype label -> destination list; anything else falls through to L_malignant.
    buckets = {'0benign': M_benign,
               '1benign': L_benign,
               '1malignant': M_malignant}

    lidc = pd.read_csv(subtype_csv)
    # Build the id -> subtype lookup once instead of scanning the frame per file.
    # The first row per id is kept, matching a first-match lookup.
    first_rows = lidc.drop_duplicates(subset='Nodule_id')
    subtype_of = dict(zip(first_rows['Nodule_id'], first_rows['subtype']))

    for dir1 in os.listdir(image_folder):

        if dir1 == 'Malignancy_3':
            continue

        for file in os.listdir(os.path.join(image_folder, dir1)):
            nodule_id = int(file.split('.')[0])
            if nodule_id not in subtype_of:
                raise IndexError(f'Nodule_id {nodule_id} ({file}) not found in {subtype_csv}')

            image_array = buckets.get(subtype_of[nodule_id], L_malignant)

            image = np.loadtxt(os.path.join(image_folder, dir1, file))
            image = torch.from_numpy(image).to(DEVICE)
            # Replicate the single channel three times along a new leading dim.
            rgb_image = torch.stack((image, image, image), dim = 0)
            # Quantise to integer 0..255 levels, then scale to [0, 1].
            rgb_image = getNormed(rgb_image)
            rgb_image = rgb_image / 255

            image_array.append(rgb_image)

    return M_benign, L_benign, M_malignant, L_malignant


In [28]:
M_benign, L_benign, M_malignant, L_malignant = getImages('../data/LIDC(MaxSlices)_Nodules')

In [32]:
# Report how many images ended up in each subtype list (two output lines).
print(
    f"Size of M_benign: {len(M_benign)}, Size of L_benign: {len(L_benign)}, \n"
    f"Size of M_malignant: {len(M_malignant)}, Size of L_malignant: {len(L_malignant)}"
)

Size of M_benign: 519, Size of L_benign: 512, 
Size of M_malignant: 430, Size of L_malignant: 202


In [56]:
# Pair each subtype's image list with a label vector of the same length
# (zeros for the benign lists, ones for the malignant lists). Lengths come
# from the lists themselves so the labels cannot drift out of sync with the data.
subclass_data = {'M_benign': (M_benign, torch.zeros(len(M_benign))),
                 'L_benign': (L_benign, torch.zeros(len(L_benign))),
                 'M_malignant': (M_malignant, torch.ones(len(M_malignant))),
                 'L_malignant': (L_malignant, torch.ones(len(L_malignant)))}

In [162]:
a = SubtypedDataLoader(subclass_data, 100)

In [164]:
for idx, b in enumerate(a):
    print(idx)

0
1
2


In [92]:
from itertools import cycle
from torch.utils.data import DataLoader

def _repeat_forever(loader):
    """Yield batches from ``loader`` endlessly, starting a fresh pass each time it runs out.

    Unlike ``itertools.cycle`` this does not cache the first pass, so a
    shuffling loader reshuffles on every pass and batches are not kept in
    memory. Stops if a full pass produces no batches at all.
    """
    while True:
        produced = False
        for batch in loader:
            produced = True
            yield batch
        if not produced:
            return


subclass_Dataset = NoduleDataset(*subclass_data['L_malignant'])
subclass_iterLoader = _repeat_forever(DataLoader(subclass_Dataset, 200, shuffle=True))

In [158]:
next(a)

StopIteration: 

# Testing DomainBed's dataloader

In [172]:
from fast_data_loader import InfiniteDataLoader

In [165]:
class MultipleDomainDataset:
    """Collection of datasets addressed by position.

    Indexing and ``len`` are forwarded to ``self.datasets``, which this base
    class does not create; subclasses are expected to set it.
    """

    # Defaults; subclasses may override.
    N_STEPS = 5001
    CHECKPOINT_FREQ = 100
    N_WORKERS = 1  # 8

    # Placeholders; subclasses should override.
    ENVIRONMENTS = None
    INPUT_SHAPE = None

    def __len__(self):
        """Return how many datasets are held."""
        return len(self.datasets)

    def __getitem__(self, index):
        """Return the dataset stored at position ``index``."""
        return self.datasets[index]


In [170]:
class ClusterdLIDC(MultipleDomainDataset):
    """One NoduleDataset per requested environment, built from getImages(root).

    Parameters
    ----------
    root : directory passed to getImages; must not be None.
    environments : sequence of environment names. When every name is one of
        'M_benign', 'L_benign', 'M_malignant', 'L_malignant', each dataset is
        built from the image list of that name, in the order given. Otherwise
        the names are ignored and the i-th environment takes the i-th list
        returned by getImages.
    input_shape, num_classes : stored unchanged on the instance.

    Raises
    ------
    ValueError : root is None.
    """

    # Order in which getImages returns its four image lists.
    _SUBTYPE_ORDER = ('M_benign', 'L_benign', 'M_malignant', 'L_malignant')

    def __init__(self, root, environments, input_shape,
                 num_classes):
        super().__init__()
        if root is None:
            raise ValueError('Data directory not specified!')

        all_images = getImages(root)

        # Select by name only when all names are recognised, so that a
        # reordered or partial list still gets the right images and labels.
        select_by_name = all(name in self._SUBTYPE_ORDER for name in environments)

        self.datasets = []
        for position, name in enumerate(environments):
            source = self._SUBTYPE_ORDER.index(name) if select_by_name else position
            images = all_images[source]
            # The first two lists are the benign ones (label 0), the last two malignant (label 1).
            labels = torch.zeros(len(images)) if source < 2 else torch.ones(len(images))
            self.datasets.append(NoduleDataset(images, labels))

        self.input_shape = input_shape
        self.num_classes = num_classes


In [171]:
# Build the four per-subtype datasets from the image folder.
# NOTE(review): this root has no '../data/' prefix, unlike the path passed to
# getImages in the earlier cell — confirm the working directory this cell runs from.
# NOTE(review): num_classes=4 is passed although ClusterdLIDC only creates labels 0 and 1 — confirm.
datasets = ClusterdLIDC('LIDC(MaxSlices)_Nodules',['M_benign', 'L_benign', 'M_malignant', 'L_malignant'], None, 4)

In [211]:
    # One InfiniteDataLoader per environment dataset, all with the same settings.
    train_loaders = [
        InfiniteDataLoader(dataset=env, weights=None, batch_size=200, num_workers=1)
        for env in datasets
    ]

In [277]:
train_loaders

[<fast_data_loader.InfiniteDataLoader at 0x24ab8fa6830>,
 <fast_data_loader.InfiniteDataLoader at 0x24ab8fe3f40>,
 <fast_data_loader.InfiniteDataLoader at 0x24ab8f361d0>,
 <fast_data_loader.InfiniteDataLoader at 0x24ab8f371f0>]

In [212]:
train_minibatches_iterator = zip(*train_loaders)

In [278]:
train_minibatches_iterator

<zip at 0x24ab4c97700>

In [280]:
x = next(train_minibatches_iterator)
x[0][0]

tensor([[[[0.1020, 0.0784, 0.0549,  ..., 0.4431, 0.4510, 0.4314],
          [0.0941, 0.0745, 0.0706,  ..., 0.4510, 0.4275, 0.4314],
          [0.0784, 0.0627, 0.0627,  ..., 0.4549, 0.4471, 0.4549],
          ...,
          [0.1294, 0.1216, 0.0706,  ..., 0.4745, 0.4784, 0.4941],
          [0.1529, 0.1216, 0.0784,  ..., 0.4902, 0.4941, 0.4980],
          [0.1804, 0.1137, 0.0627,  ..., 0.4980, 0.5098, 0.5137]],

         [[0.1020, 0.0784, 0.0549,  ..., 0.4431, 0.4510, 0.4314],
          [0.0941, 0.0745, 0.0706,  ..., 0.4510, 0.4275, 0.4314],
          [0.0784, 0.0627, 0.0627,  ..., 0.4549, 0.4471, 0.4549],
          ...,
          [0.1294, 0.1216, 0.0706,  ..., 0.4745, 0.4784, 0.4941],
          [0.1529, 0.1216, 0.0784,  ..., 0.4902, 0.4941, 0.4980],
          [0.1804, 0.1137, 0.0627,  ..., 0.4980, 0.5098, 0.5137]],

         [[0.1020, 0.0784, 0.0549,  ..., 0.4431, 0.4510, 0.4314],
          [0.0941, 0.0745, 0.0706,  ..., 0.4510, 0.4275, 0.4314],
          [0.0784, 0.0627, 0.0627,  ..., 0

# Now we look at the 'debugging' model (main.py)

In [1]:
import loss
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler
import models
import train
from datasets import NoduleDataset, SubtypedDataLoader
from fast_data_loader import InfiniteDataLoader

id_name = 'noduleID'
feature_names = ['Area', 'ConvexArea', 'Perimeter', 'ConvexPerimeter', 'EquivDiameter',
                 'MajorAxisLength', 'MinorAxisLength', 'SuperscribedDiameter',
                 'Elongation', 'Compactness', 'Eccentricity', 'Solidity', 'Extent',
                 'Circularity', 'RadialDistanceSD', 'SecondMoment', 'Roughness',
                 'MaxIntensity', 'MeanIntensity', 'SDIntensity', 'MinIntensityBG',
                 'MaxIntensityBG', 'MeanIntensityBG', 'SDIntensityBG',
                 'IntensityDifference', 'markov1', 'markov2', 'markov3', 'markov4',
                 'markov5', 'gabormean_0_0', 'gaborSD_0_0', 'gabormean_0_1',
                 'gaborSD_0_1', 'gabormean_0_2', 'gaborSD_0_2', 'gabormean_1_0',
                 'gaborSD_1_0', 'gabormean_1_1', 'gaborSD_1_1', 'gabormean_1_2',
                 'gaborSD_1_2', 'gabormean_2_0', 'gaborSD_2_0', 'gabormean_2_1',
                 'gaborSD_2_1', 'gabormean_2_2', 'gaborSD_2_2', 'gabormean_3_0',
                 'gaborSD_3_0', 'gabormean_3_1', 'gaborSD_3_1', 'gabormean_3_2',
                 'gaborSD_3_2', 'Contrast', 'Correlation', 'Energy', 'Homogeneity',
                 'Entropy', 'x_3rdordermoment', 'Inversevariance', 'Sumaverage',
                 'Variance', 'Clustertendency']
# Column holding the rating that preprocess_data binarizes into the class label.
label_name = 'malignancy'
# Device string that split_to_tensors moves every tensor onto.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Fraction of rows sampled into the training split; the remainder becomes the test split.
training_fraction = 0.8
# Batch size handed to every dataloader built in the cells below.
batch_size = 4
# NOTE(review): not referenced in the visible cells — presumably consumed by the training code; confirm.
epoch_size = 23

# When True, subtype-aware loaders (create_subtyped_dataloaders) are built instead of plain ones.
is_gdro = False

# NOTE(review): not referenced in the visible cells — presumably GroupDRO hyperparameters; confirm.
groupdro_hparams = {"groupdro_eta": 0}

In [2]:
def preprocess_data(df):
    """Select, filter, binarize and standardize the nodule feature table.

    Keeps only the id, feature and label columns, drops rows whose label is 3,
    maps the remaining labels to 0 (label <= 3) or 1 (label > 3), and
    standardizes every feature column with StandardScaler.

    Parameters
    ----------
    df : DataFrame containing ``id_name``, every column in ``feature_names``
        and ``label_name``. It is not modified.

    Returns
    -------
    A new DataFrame with the columns ``[id_name, *feature_names, label_name]``.
    """
    # select features and labels
    selected = df.loc[:, [id_name, *feature_names, label_name]]

    # remove malignancy = 3; take an explicit copy so the assignments below
    # write to an independent frame rather than to a filtered slice
    kept = selected[selected[label_name] != 3].copy()

    # binarize the remaining malignancy [1,2] -> 0, [4,5] -> 1
    kept[label_name] = (kept[label_name] > 3).astype("int64")

    # normalize numeric features; the columns are replaced outright so integer
    # feature columns become float without an in-place dtype conflict
    # NOTE(review): the scaler is fitted on all rows, before any train/test
    # split is made — confirm this leakage is acceptable for the experiment.
    kept[feature_names] = StandardScaler().fit_transform(kept[feature_names].values)

    return kept


def split_to_tensors(df, frac, random_state=None):
    """Randomly split ``df`` into training and test tensors.

    Parameters
    ----------
    df : DataFrame with every column in ``feature_names`` plus ``label_name``.
    frac : fraction of rows sampled into the training split; the rows not
        sampled form the test split.
    random_state : optional seed forwarded to ``DataFrame.sample`` so the
        split can be reproduced. The default (None) keeps the unseeded split.

    Returns
    -------
    (training_data, training_labels, test_data, test_labels): float tensors of
    the feature columns and long tensors of the label column, all on ``device``.
    """
    # separate into training and test sets
    training_df = df.sample(frac=frac, random_state=random_state)
    test_df = df.drop(training_df.index)

    def _tensorify(part):
        # features -> FloatTensor, labels -> LongTensor, both moved to the configured device
        data = torch.FloatTensor(part.loc[:, feature_names].values).to(device)
        labels = torch.LongTensor(part.loc[:, label_name].values).to(device)
        return data, labels

    training_data, training_labels = _tensorify(training_df)
    test_data, test_labels = _tensorify(test_df)

    return training_data, training_labels, test_data, test_labels


def create_dataloaders(df):
    """Split ``df`` and return (train, test) iterators over InfiniteDataLoader batches.

    The split uses ``training_fraction`` and both loaders use ``batch_size``.
    """
    train_x, train_y, test_x, test_y = split_to_tensors(df, training_fraction)

    def _as_batch_iterator(features, labels):
        # dataset -> InfiniteDataLoader -> iterator
        return iter(InfiniteDataLoader(NoduleDataset(features, labels), batch_size=batch_size))

    train_dataloader = _as_batch_iterator(train_x, train_y)
    test_dataloader = _as_batch_iterator(test_x, test_y)
    return train_dataloader, test_dataloader


def create_subtyped_dataloaders(df, subtype_df):
    """Build (train, test) SubtypedDataLoaders with one split per subtype.

    Parameters
    ----------
    df : preprocessed feature table containing ``id_name``, the feature
        columns and ``label_name``.
    subtype_df : table with 'Nodule_id' and 'subtype' columns. Rows are
        matched through the 'Nodule_id' column, so no particular index is
        required. Nodules in ``df`` without a subtype row are left out.

    Returns
    -------
    (train_dataloader, test_dataloader), each built from a dict mapping
    subtype name -> (data tensor, label tensor).
    """
    # Look up each nodule's subtype once; ids missing from subtype_df map to NaN
    # and therefore never match a subtype name below.
    subtype_by_id = dict(zip(subtype_df["Nodule_id"], subtype_df["subtype"]))
    nodule_subtypes = df[id_name].map(subtype_by_id)

    def get_subtype_data(subtype_name):
        # Positional boolean mask, so the selection does not depend on df's index.
        return df.loc[(nodule_subtypes == subtype_name).to_numpy(), :]

    subtype_names = subtype_df["subtype"].unique()
    subtype_dfs = {name: get_subtype_data(name) for name in subtype_names}

    # separate into training and test sets
    # Two distinct dicts: sharing one dict would let the test tensors
    # overwrite the training tensors stored under the same subtype name.
    training_subtype_data = {}
    test_subtype_data = {}
    for name, subtype_frame in subtype_dfs.items():
        training_data, training_labels, test_data, test_labels = split_to_tensors(subtype_frame, training_fraction)

        training_subtype_data[name] = (training_data, training_labels)
        test_subtype_data[name] = (test_data, test_labels)

    # wrap with datasets and dataloaders
    train_dataloader = SubtypedDataLoader(training_subtype_data, batch_size=batch_size)
    test_dataloader = SubtypedDataLoader(test_subtype_data, batch_size=batch_size)

    return train_dataloader, test_dataloader

In [6]:
 # import data
df = pd.read_csv("../data/LIDC_20130817_AllFeatures2D_MaxSlicePerNodule_inLineRatings.csv")
subtype_df = pd.read_csv("../data/lidc_subtyped.csv")

# Preprocess the feature table and key the subtype table by nodule id.
df = preprocess_data(df)
subtype_df.index = subtype_df["Nodule_id"].values

# Subtype-aware loaders when is_gdro is set, plain loaders otherwise.
train_dataloader, test_dataloader = (
    create_subtyped_dataloaders(df, subtype_df) if is_gdro else create_dataloaders(df)
)

In [233]:
next(train_dataloader)[1]

tensor([1, 1, 1, 1])

In [247]:
a = torch.arange(4).unsqueeze(1)
a

tensor([[0],
        [1],
        [2],
        [3]])

In [249]:
b = torch.tensor([[0,0,-1,-1,-1], [1,-1,-1,-1,-1],[2,2,2,2,2],[3,3,-1,-1,-1]])

In [251]:
(b==a).sum(1)

tensor([2, 1, 5, 2])