# This notebook uses the LUNA16 dataset to train a classifier that labels a region as nodule or no nodule

The LUNA16 dataset is distributed as several subsets; this notebook requires the path to one subset (`subset<num>`) and the location of the annotations file.

## Getting the training data

In [1]:
from numpy.random import randint
import numpy as np

### Utility functions used by `create_data`, which creates the training data

In [2]:
def check_less(num):
    """Return ``num - 18``, or ``num`` unchanged when that difference would be negative.

    Not called anywhere in the visible part of this notebook.
    """
    shifted = num - 18
    return num if shifted < 0 else shifted
    
def get_patch_from_list(lung_img, coords, window_size = 10):
    """Cut one cubic patch out of ``lung_img`` around every centre in ``coords``.

    Parameters
    ----------
    lung_img : numpy array indexed by three axes.
        1024 is added to every value before the patches are cut.
    coords : sequence of 3-element index triples, in the same axis order as
        ``lung_img``.
    window_size : unused; kept only so existing callers keep working. The patch
        half-width is fixed at 18, i.e. patches are at most 36 wide per axis.

    Returns
    -------
    list of numpy arrays, one per entry of ``coords`` and in the same order.
    A patch whose window reaches past either edge of the image is returned
    truncated (possibly empty), so callers must check the patch shape.
    """
    half_width = 18
    shifted_img = lung_img + 1024
    output = []
    for center in coords:
        # Clamp both bounds at 0. A negative slice bound is interpreted relative
        # to the END of the axis, so an unclamped window around a negative (or
        # small) centre would silently wrap around and could even return a
        # full-size patch taken from the wrong side of the image.
        window = tuple(
            slice(max(center[axis] - half_width, 0), max(center[axis] + half_width, 0))
            for axis in range(3)
        )
        output.append(shifted_img[window])
    return output

def get_point(shape):
    """Sample a random point from the image and return the coordinates.

    ``shape`` is indexed as (z, y, x). The point keeps a margin of 64 from both
    ends of the x and y axes and a margin of 20 from both ends of the z axis.
    Returns a numpy array ``[z, y, x]``.
    """
    # Draw order is x, then y, then z, so seeded runs produce the same points.
    axis_margins = ((2, 64), (1, 64), (0, 20))
    x, y, z = (randint(margin, shape[axis] - margin) for axis, margin in axis_margins)
    return np.asarray([z, y, x])

In [8]:
import SimpleITK as sitk
import os
import pickle

# NOTE(review): `much_data` is not referenced anywhere in the visible notebook.
much_data = []
def create_data(path, train_csv_path,
                out_dir='C:\\Users\\dc\\Documents\\riya\\Lung-Cancer-Detection-master\\nodule_2\\'):
    """Create pickled training patches and labels, one file pair per annotation row.

    For every row of the annotation CSV whose scan ``path + row[0] + '.mhd'``
    exists, the scan is read and 108 candidate centres are generated:
    48 positives on a 4 x 4 x 3 grid of offsets around the annotated point and
    60 negatives sampled with ``get_point``. Patches are cut with
    ``get_patch_from_list``; only patches of exactly 36 x 36 x 36 are kept.

    Parameters
    ----------
    path : directory prefix of the ``.mhd`` scans. It is concatenated verbatim,
        so it must end with a path separator.
    train_csv_path : annotation CSV; column 0 is the scan id and columns 1-3
        are read as x, y, z.
    out_dir : prefix prepended verbatim to the output file names
        (``traindata_<n>_Xtrain.p`` and ``traindata_<n>_Ytrain.p``). It must
        end with a path separator. Defaults to the directory that was
        previously hard-coded.

    Rows whose scan file is missing (including the CSV header row) are skipped
    and do not consume an output number.
    """
    counter = 0
    with open(train_csv_path, 'rb') as f:
        for line in f:
            row = line.decode().split(',')

            scan_path = path + row[0] + '.mhd'
            if not os.path.isfile(scan_path):
                continue

            lung_img = sitk.GetArrayFromImage(sitk.ReadImage(scan_path))

            # NOTE(review): the stock LUNA16 annotations.csv stores nodule
            # centres as world coordinates in mm, while the values below are
            # used directly as voxel indices. Confirm this CSV has already been
            # converted; otherwise convert with the image origin and spacing.
            center_z = int(float(row[3]))
            center_y = int(float(row[2]))
            center_x = int(float(row[1]))

            coords, trainY = [], []
            # Positive samples: small grid of offsets around the annotated point.
            for i in range(-5, 5, 3):
                for j in range(-5, 5, 3):
                    for k in range(-2, 3, 2):
                        coords.append([center_z + k, center_y + j, center_x + i])
                        trainY.append(True)

            # Negative samples: random points (not checked against the nodule).
            for _ in range(60):
                coords.append(get_point(lung_img.shape))
                trainY.append(False)

            trainX = get_patch_from_list(lung_img, coords)

            # Drop patches that were truncated at the image border.
            all_images, all_labels = [], []
            for elem, label in zip(trainX, trainY):
                if elem.shape[:3] == (36, 36, 36):
                    all_images.append(elem)
                    all_labels.append(label)

            # `with` guarantees the pickle files are flushed and closed.
            out_prefix = out_dir + 'traindata_' + str(counter)
            with open(out_prefix + '_Xtrain.p', 'wb') as x_file:
                pickle.dump(np.asarray(all_images), x_file)
            with open(out_prefix + '_Ytrain.p', 'wb') as y_file:
                pickle.dump(np.asarray(all_labels, dtype = bool), y_file)

            counter = counter + 1
    

In [9]:
# Generate the patch/label pickles: first argument is the scan directory prefix,
# second is the annotation CSV. Both are machine-specific absolute paths.
create_data('D:\\riya\\DL-MP\\data\\', 'D:\\riya\\DL-MP\\CSVFILES\\CSVFILES\\annotations.csv')

### The data is written to the `nodule_2` folder; divide it into `train` and `val` subfolders. After this, the network is trained. There are 2 classes: nodule or no-nodule

In [3]:
from numpy.random import randint
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution3D, MaxPooling3D
from keras.utils import np_utils
from keras import backend as K

def classifier(input_shape, kernel_size, pool_size):
    """Creates a keras model with 3D CNNs and returns the model.

    Architecture: three Convolution3D -> ReLU -> MaxPooling3D stages with 16,
    32 and 64 filters (channels-first, 'valid' padding), Dropout(0.25),
    Flatten, then Dense(512) and Dense(128), each followed by ReLU and
    Dropout(0.5), and a final Dense(2) with softmax.

    Parameters
    ----------
    input_shape : shape of one sample, channels first.
    kernel_size : sequence of three ints, the convolution kernel size per axis.
    pool_size : pool size passed to every MaxPooling3D layer.

    Returns
    -------
    The uncompiled ``Sequential`` model.
    """
    # Keras 2 call form: the kernel is one tuple and the padding argument is
    # `padding`. The file already relies on Keras 2 (`data_format`), so the
    # Keras 1 style (three positional kernel dims, `border_mode`) only worked
    # through the deprecated legacy-argument conversion.
    kernel = (kernel_size[0], kernel_size[1], kernel_size[2])

    model = Sequential()
    for stage, filters in enumerate((16, 32, 64)):
        conv_kwargs = {'padding': 'valid', 'data_format': 'channels_first'}
        if stage == 0:
            # Only the first layer of a Sequential model declares the input shape.
            conv_kwargs['input_shape'] = input_shape
        model.add(Convolution3D(filters, kernel, **conv_kwargs))
        model.add(Activation('relu'))
        model.add(MaxPooling3D(pool_size=pool_size, data_format='channels_first'))
    model.add(Dropout(0.25))

    model.add(Flatten())
    for units in (512, 128):
        model.add(Dense(units))
        model.add(Activation('relu'))
        model.add(Dropout(0.5))
    model.add(Dense(2))
    model.add(Activation('softmax'))

    return model

In [4]:
import pickle,sys
import numpy as np
from keras.layers.core import Activation, Reshape

def _load_samples(x_path, y_path, sample_shape):
    """Load one pickled patch/label file pair as (list of inputs, list of one-hot targets)."""
    # pickle.load can execute arbitrary code: only load files this notebook produced.
    with open(x_path, 'rb') as x_file:
        patches = pickle.load(x_file)
    with open(y_path, 'rb') as y_file:
        labels = pickle.load(y_file)

    samples = [patches[j].reshape(sample_shape) for j in range(len(patches))]
    # One-hot targets: [1, 0] for a nodule patch, [0, 1] otherwise.
    targets = [[1, 0] if labels[j] else [0, 1] for j in range(len(patches))]
    return samples, targets


def train_classifier(input_shape,
                     data_dir='C:\\Users\\dc\\Documents\\riya\\Lung-Cancer-Detection-master\\nodule_2\\',
                     model_path='C:\\Users\\dc\\Documents\\riya\\Lung-Cancer-Detection-master\\Output\\model.h5'):
    """Build, train and save the nodule / no-nodule classifier.

    Validation files 801..1185 are read from ``data_dir + 'val\\'`` and
    training files 224..234 from ``data_dir + 'train\\'``. Each training file
    is fed to the network as one batch, once. After every batch the model is
    evaluated on the first 108 validation samples and the result of
    ``test_on_batch`` is printed. The trained model is saved to ``model_path``.

    Parameters
    ----------
    input_shape : shape of one sample; every patch is reshaped to it.
    data_dir : prefix (ending with a path separator) of the folder holding the
        ``train`` and ``val`` subfolders. Defaults to the previously
        hard-coded location.
    model_path : where the model is saved. Defaults to the previously
        hard-coded location.
    """
    model = classifier(input_shape, (3, 3, 3), (2, 2, 2))
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    val_x, val_y = [], []
    for i in range(801,1186):
        prefix = data_dir + 'val\\traindata_' + str(i)
        samples, targets = _load_samples(prefix + '_Xtrain.p', prefix + '_Ytrain.p', input_shape)
        val_x.extend(samples)
        val_y.extend(targets)

    print(np.array(val_x).shape)
    print(np.array(val_y).shape)

    # Only the first 108 validation samples are used for the per-batch check.
    # Build that subset once, for inputs and targets alike, and release the
    # full validation lists.
    eval_x = np.array(val_x[:108])
    eval_y = np.array(val_y[:108])
    del val_x, val_y

    for i in range(224, 235):
        prefix = data_dir + 'train\\traindata_' + str(i)
        train_x, train_y = _load_samples(prefix + '_Xtrain.p', prefix + '_Ytrain.p', input_shape)

        model.train_on_batch(np.array(train_x), np.array(train_y), sample_weight=None)
        print('network trained')

        if len(eval_x) > 0:
            print('accuracy for test is ')
            print (model.test_on_batch(eval_x, eval_y, sample_weight=None))

    model.save(model_path)



In [5]:
# Train on single-channel 36x36x36 patches (channels-first input shape).
train_classifier((1,36,36,36))


  app.launch_new_instance()


(31390, 1, 36, 36, 36)
(31390, 2)
network trained
accuracy for test is 
[8.954497, 0.44444445]
network trained
accuracy for test is 
[8.799113, 0.44444445]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
network trained
accuracy for test is 
[7.163598, 0.5555556]
