In [1]:
import os
import torch
import pandas as pd
import numpy as np
from datasets import NoduleDataset
from dataloaders import SubtypedDataLoader

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Good to go!" if DEVICE.type == "cuda" else "Using cpu")

Using cpu


In [3]:
# Load the per-nodule label table (first CSV column becomes the index) and
# preview its first five rows.
lidc_subtyped = pd.read_csv('./data/lidc_spic_subgrouped.csv', index_col=0)
lidc_subtyped.head()

Unnamed: 0,noduleID,malignancy,spiculation,malignancy_b,spiculation_b,subgroup
0,1,5.0,5.0,malignant,marked,marked_malignant
1,2,5.0,1.0,malignant,unmarked,unmarked_malignant
2,3,5.0,2.0,malignant,marked,marked_malignant
3,4,4.0,2.0,malignant,marked,marked_malignant
4,5,5.0,1.0,malignant,unmarked,unmarked_malignant


In [7]:
def getNormed(this_array, this_min = 0, this_max = 255, set_to_int = True):
    """Linearly rescale a tensor so that its values span [this_min, this_max].

    Args:
        this_array: torch tensor to rescale. It is not modified; a new tensor
            is returned.
        this_min: value the smallest element is mapped to.
        this_max: value the largest element is mapped to.
        set_to_int: when True the result is cast to ``torch.int`` (the cast
            drops the fractional part rather than rounding).

    Returns:
        A tensor with the same shape as ``this_array``. A constant input
        (max == min) has no range to stretch, so every element is set to
        ``this_min`` instead of dividing by zero.
    """
    lowest = this_array.min()
    value_range = this_array.max() - lowest

    if value_range == 0:
        # A zero range would make the scale factor inf and the result NaN,
        # which turns into garbage once cast to int. Map everything to the
        # lower bound instead (multiplying by 1.0 yields a float tensor, the
        # same kind of result the normal path produces).
        scaled = (this_array - lowest) * 1.0 + this_min
    else:
        rat = (this_max - this_min) / value_range
        scaled = this_array * rat      # new tensor; the caller's data is untouched
        scaled -= scaled.min()         # shift so the minimum sits at 0 ...
        scaled += this_min             # ... then at this_min

    if set_to_int:
        return scaled.to(dtype= torch.int)
    return scaled

In [9]:
def getImages(image_folder):
    '''
        Load every nodule image under image_folder and sort it into the
        training set or into one of four held-out subgroup lists.

        Labels and the train/test assignment are read from
        './data/lidc_spic_subgrouped.csv' and
        './data/lidc_train_test_split_stratified.csv' (relative to the
        current working directory), matched on the 'noduleID' column.

        Input:
        image_folder: directory containing one sub-directory per malignancy
            rating; each image file is a text matrix readable by np.loadtxt
            and is named '<noduleID>.<extension>'. The 'Malignancy_3'
            sub-directory is skipped, as are non-directory entries and
            hidden files (names starting with '.').

        Output:
        train_img: list of image tensors whose 'dataset' value is 'train'.
            Each tensor is the grey image stacked three times along dim 0,
            rescaled to integer levels 0..255 and divided by 255, and lives
            on DEVICE.
        train_label: list of scalar tensors aligned with train_img
            (1 if malignancy > 3, else 0)
        marked_benign, unmarked_benign, marked_malignant, unmarked_malignant:
            lists of image tensors for all non-train nodules, bucketed by the
            'subgroup' column. Any subgroup value other than the first three
            lands in unmarked_malignant.

        Directory and file names are visited in sorted order so the returned
        lists have the same order on every run.

    '''

    train_img = []
    train_label = []

    marked_benign = []
    unmarked_benign = []

    marked_malignant = []
    unmarked_malignant = []

    # Non-train images are routed by subgroup name; names missing from this
    # table fall through to unmarked_malignant (see the .get() below).
    subgroup_buckets = {
        'marked_benign': marked_benign,
        'unmarked_benign': unmarked_benign,
        'marked_malignant': marked_malignant,
    }

    lidc = pd.read_csv('./data/lidc_spic_subgrouped.csv')
    train_test = pd.read_csv('./data/lidc_train_test_split_stratified.csv')

    # os.listdir returns entries in arbitrary order; sorting makes the output
    # order reproducible across runs and machines.
    for dir1 in sorted(os.listdir(image_folder)):
        class_dir = os.path.join(image_folder, dir1)

        # Skip the excluded rating as well as stray files sitting next to the
        # rating directories.
        if dir1 == 'Malignancy_3' or not os.path.isdir(class_dir):
            continue

        for file in sorted(os.listdir(class_dir)):

            # Hidden files (e.g. .DS_Store) have no nodule ID in their name.
            if file.startswith('.'):
                continue

            nodule_id = int(file.split('.')[0])

            # Filter the label table once and read both fields from the first
            # matching row.
            nodule_row = lidc[lidc['noduleID'] == nodule_id].iloc[0]
            subtype = nodule_row['subgroup']
            malignancy = nodule_row['malignancy']

            train_type = train_test[train_test['noduleID'] == nodule_id]['dataset'].iloc[0]

            image = np.loadtxt(os.path.join(class_dir, file))
            image = torch.from_numpy(image).to(DEVICE)
            # Replicate the single grey channel three times along dim 0.
            rgb_image = torch.stack((image,image,image), dim = 0)
            # Quantise to integer levels 0..255, then scale down to [0, 1].
            rgb_image = getNormed(rgb_image)
            rgb_image = rgb_image / 255

            if train_type == 'train':
                train_img.append(rgb_image)
                train_label.append(torch.tensor(1) if malignancy > 3 else torch.tensor(0))
                continue

            subgroup_buckets.get(subtype, unmarked_malignant).append(rgb_image)

    return train_img, train_label, marked_benign, unmarked_benign, marked_malignant, unmarked_malignant


In [10]:
# Load all nodule images: the training images with their labels, plus the
# non-train images split into the four subgroup lists.
(
    train_img,
    train_label,
    marked_benign,
    unmarked_benign,
    marked_malignant,
    unmarked_malignant,
) = getImages('./LIDC(MaxSlices)_Nodules(fixed)')

In [15]:
# Number of images assigned to the 'train' split.
len(train_img)

1219

In [19]:
# Number of non-train images whose subgroup is 'marked_benign'.
len(marked_benign)

17

In [20]:
# Number of non-train images whose subgroup is 'unmarked_benign'.
len(unmarked_benign)

149

In [21]:
# Number of non-train images whose subgroup is 'marked_malignant'.
len(marked_malignant)

83

In [22]:
# Number of non-train images in the remaining bucket (subgroup
# 'unmarked_malignant', or any value getImages did not recognise).
len(unmarked_malignant)

47

80/20 train/test split

In [11]:
import random

In [12]:
# Seed every random number generator the notebook imports, not only Python's
# `random`, so that runs are repeatable after Restart & Run All.
# `random.seed` comes last because it returns None, which keeps the cell
# output empty.
torch.manual_seed(95)
np.random.seed(95)
random.seed(95)

In [13]:
# Scratch check of the seeded generator: draw 2 of the 4 values without
# replacement.
random.sample([1,2,3,4], k=2)

[2, 4]

# Model training

In [109]:
from models import VGGNet
from loss import ERMLoss
from datasets import NoduleDataset
from dataloaders import InfiniteDataLoader

import train

In [111]:
# The image tensors are created on DEVICE, so the model's parameters must
# live on the same device; otherwise the forward pass fails whenever DEVICE
# is a GPU. `.to` is a no-op when the model is already there.
model = VGGNet().to(DEVICE)

In [112]:
# Build the project's ERMLoss from the model and a cross-entropy criterion.
# NOTE(review): the third argument is an empty dict; its meaning is defined in
# loss.py and is not visible here — confirm.
loss_fn = ERMLoss(model,torch.nn.CrossEntropyLoss(),{})

In [113]:
# Adam over all model parameters, with weight decay.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.001,
    weight_decay=0.005,
)


In [114]:
# Wrap the training images and their labels in the project's NoduleDataset.
train_dataset = NoduleDataset(train_img, train_label)

In [115]:
# Feed the training dataset through the project's InfiniteDataLoader.
# NOTE(review): the second argument (40) is presumably the batch size —
# confirm against dataloaders.py.
train_loader = InfiniteDataLoader(train_dataset, 40)

In [None]:
# Call the project's train.train once per epoch, 40 times, printing a
# 1-based progress line before each call.
epochs = 40
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    train.train(train_loader, model, loss_fn, optimizer)
                # NOTE(review): evaluation is disabled; `test_dataloader` is not
                # defined in any visible cell, so the line below cannot simply
                # be uncommented.
                # train.test(test_dataloader, model)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
