# Baseline Model for cs231n/bmi260 Project

Housekeeping: load the DICOM scans and their labels, and preprocess them into NumPy arrays.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
from skimage import transform
import pandas as pd
import torchvision.datasets as dset
import torchvision.transforms as T
import pydicom
import os
import numpy as np
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
from tqdm import tnrange, tqdm_notebook

In [2]:

# Change to where your dataset is stored.
scans_path = "../../Data/Clean_train_test_dataset/Dataset/"

# One entry per item in the dataset folder. The listing is in arbitrary
# filesystem order and also contains non-scan entries (e.g. '.DS_Store',
# '.listing'), so a position in this list is not a scan id.
list_of_scans = os.listdir(scans_path)
print(list_of_scans)

# To sanity-check the DICOM reader, open the entry at position 10 inside the
# scan folder found at position 3 of the directory listing.
scan_num = 3
scan_path = os.path.join(scans_path, list_of_scans[scan_num])
list_of_slices = os.listdir(scan_path)
slice_num = 10
slice_path = os.path.join(scan_path, list_of_slices[slice_num])
print(slice_path)

# Read the DICOM file at the full path into `ds`. `dcmread` is the supported
# pydicom reader; `read_file` is a deprecated alias that newer pydicom
# releases no longer provide.
ds = pydicom.dcmread(slice_path)

['135', '132', '59', '92', '66', '57', '168', '157', '150', '159', '166', '35', '56', '105', '58', '134', '94', '160', '167', '169', '.DS_Store', '174', '173', '9', '142', '7', '89', '116', '45', '127', '80', '74', '120', '143', '17', '144', '172', '8', '181', '175', '21', '121', '119', '126', '128', '153', '154', '36', '165', '131', '65', '136', '62', '109', '54', '107', '138', '53', '163', '155', '152', '137', '108', '130', '90', '46', '112', '83', '77', '123', '48', '70', '84', '124', '184', '177', '183', '23', '3', '12', '179', '76', '82', '47', '78', '147', '182', '.listing', '149', '171', '185']
../../Data/Clean_train_test_dataset/Dataset/92/CT-1298-0031.dcm


In [3]:
# Display the stored (unscaled) pixel data of the sample slice, then report
# the array's container type, mean value and dtype.
rawimg = ds.pixel_array
plt.imshow(rawimg, cmap='viridis')
plt.show()
raw_summary = (type(rawimg), np.mean(rawimg), rawimg.dtype)
print(*raw_summary)

<class 'numpy.ndarray'> 530.0752830505371 uint16


In [4]:
# Survey the dataset: record the largest number of directory entries found in
# any scan folder. Every entry is counted, not only '.dcm' files.
num_scans = len(list_of_scans)
max_num_slices = 0

for scan_num in range(num_scans):
    scan_path = os.path.join(scans_path, list_of_scans[scan_num])
    # Only real scan folders are inspected; loose files and the
    # "HRCT_pilot" folder are ignored.
    is_scan_dir = os.path.isdir(scan_path) and list_of_scans[scan_num] != "HRCT_pilot"
    if is_scan_dir:
        list_of_slices = os.listdir(scan_path)
        num_slices = len(list_of_slices)
        max_num_slices = max(max_num_slices, num_slices)

In [5]:
# Depth of the slice axis of X.
# NOTE(review): hard-coded rather than taken from the directory survey. Every
# InstanceNumber used as an index below must be smaller than this value --
# confirm against the dataset.
max_num_slices = 62
print(list_of_scans, len(list_of_scans))

# Keep only real scan folders. Sizing X from this list (instead of a
# hard-coded "num_scans - 2") stays correct no matter how many stray entries
# such as '.DS_Store' or '.listing' the dataset folder contains.
valid_scans = [
    name for name in list_of_scans
    if os.path.isdir(os.path.join(scans_path, name)) and name != "HRCT_pilot"
]

# X[scan, row, col, slice]: one 512x512 image per slice position, zero where a
# scan has no slice at that position.
X = np.zeros((len(valid_scans), 512, 512, max_num_slices))
counter = 0
for scan_name in tqdm_notebook(valid_scans):
    scan_path = os.path.join(scans_path, scan_name)

    for slice_name in os.listdir(scan_path):
        # Ignore anything in the scan folder that is not a DICOM file.
        if not slice_name.endswith(".dcm"):
            continue

        slice_path = os.path.join(scan_path, slice_name)
        ds = pydicom.dcmread(slice_path)
        # Apply the DICOM rescale slope/intercept to the stored pixel values.
        hu_img = ds.pixel_array * ds.RescaleSlope + ds.RescaleIntercept
        # Bring off-size slices to the common 512x512 grid.
        if hu_img.shape != (512, 512):
            hu_img = transform.resize(hu_img, (512, 512), mode='constant')

        # The slice is placed at the position given by its DICOM
        # InstanceNumber, not by its position in the directory listing.
        slice_idx_num = int(ds.InstanceNumber)
        X[counter, :, :, slice_idx_num] = hu_img
    counter += 1

print(X.shape)
np.save("X", X)

['135', '132', '59', '92', '66', '57', '168', '157', '150', '159', '166', '35', '56', '105', '58', '134', '94', '160', '167', '169', '.DS_Store', '174', '173', '9', '142', '7', '89', '116', '45', '127', '80', '74', '120', '143', '17', '144', '172', '8', '181', '175', '21', '121', '119', '126', '128', '153', '154', '36', '165', '131', '65', '136', '62', '109', '54', '107', '138', '53', '163', '155', '152', '137', '108', '130', '90', '46', '112', '83', '77', '123', '48', '70', '84', '124', '184', '177', '183', '23', '3', '12', '179', '76', '82', '47', '78', '147', '182', '.listing', '149', '171', '185'] 91



(89, 512, 512, 62)


In [6]:
# Label table as a 2-D array: column 0 holds the scan id, column 1 its label.
df = pd.read_csv('../../Data/Clean_train_test_dataset/Labels.csv', sep=',', header=None).values
print(df)
y = np.zeros((X.shape[0], 1))

# Walk the scan listing in the same order (and with the same skip rule) used
# to fill X, so that y[i] is the label of the scan stored in X[i].
counter = 0
for scan_num in tnrange(num_scans):

    # Skip directory entries that are not scan folders.
    scan_path = os.path.join(scans_path, list_of_scans[scan_num])
    if (not os.path.isdir(scan_path)) or (list_of_scans[scan_num] == "HRCT_pilot"):
        continue

    scan_id = int(list_of_scans[scan_num])
    # Match against the id column only. Comparing against the whole table
    # would also hit the label column, so a scan whose id equals a label
    # value (0, 1 or 2) could be resolved to the wrong row.
    matches = np.flatnonzero(df[:, 0] == scan_id)
    if matches.size == 0:
        raise KeyError("No label found for scan %d in Labels.csv" % scan_id)
    idx = matches[0]
    y[counter] = df[idx, 1]
    counter += 1
np.save("y", y)
print(y)

[[  3   0]
 [  7   2]
 [  8   0]
 [  9   0]
 [ 12   2]
 [ 17   1]
 [ 21   1]
 [ 23   2]
 [ 35   2]
 [ 36   2]
 [ 45   1]
 [ 46   1]
 [ 47   1]
 [ 48   1]
 [ 53   0]
 [ 54   0]
 [ 56   0]
 [ 57   0]
 [ 58   0]
 [ 59   0]
 [ 60   2]
 [ 62   0]
 [ 65   2]
 [ 66   2]
 [ 70   2]
 [ 74   0]
 [ 76   0]
 [ 77   0]
 [ 78   0]
 [ 80   0]
 [ 82   0]
 [ 83   2]
 [ 84   0]
 [ 89   0]
 [ 90   0]
 [ 92   0]
 [ 94   0]
 [105   2]
 [107   2]
 [108   2]
 [109   2]
 [112   1]
 [116   2]
 [119   2]
 [120   0]
 [121   0]
 [123   2]
 [124   2]
 [126   2]
 [127   2]
 [128   0]
 [130   2]
 [131   2]
 [132   2]
 [134   2]
 [135   2]
 [136   0]
 [137   2]
 [138   2]
 [142   0]
 [143   2]
 [144   0]
 [147   2]
 [149   2]
 [150   2]
 [152   1]
 [153   1]
 [154   1]
 [155   1]
 [157   1]
 [159   2]
 [160   0]
 [163   1]
 [165   0]
 [166   1]
 [167   0]
 [168   0]
 [169   2]
 [171   2]
 [172   2]
 [173   2]
 [174   2]
 [175   0]
 [177   2]
 [179   2]
 [181   0]
 [182   2]
 [183   2]
 [184   0]
 [185   0]]



[[2.]
 [2.]
 [0.]
 [0.]
 [2.]
 [0.]
 [0.]
 [1.]
 [2.]
 [2.]
 [1.]
 [2.]
 [0.]
 [2.]
 [0.]
 [2.]
 [0.]
 [0.]
 [0.]
 [2.]
 [2.]
 [2.]
 [0.]
 [0.]
 [2.]
 [0.]
 [2.]
 [1.]
 [2.]
 [0.]
 [0.]
 [0.]
 [2.]
 [1.]
 [0.]
 [2.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [2.]
 [2.]
 [0.]
 [1.]
 [1.]
 [2.]
 [0.]
 [2.]
 [2.]
 [0.]
 [0.]
 [2.]
 [0.]
 [2.]
 [2.]
 [0.]
 [1.]
 [1.]
 [1.]
 [2.]
 [2.]
 [2.]
 [0.]
 [1.]
 [1.]
 [2.]
 [0.]
 [2.]
 [1.]
 [2.]
 [0.]
 [2.]
 [0.]
 [2.]
 [2.]
 [2.]
 [0.]
 [2.]
 [2.]
 [0.]
 [0.]
 [1.]
 [0.]
 [2.]
 [2.]
 [2.]
 [2.]
 [0.]]


In [7]:
# Sanity check: the image array and the label array must agree on the number
# of scans (first axis).
print(*(arr.shape for arr in (X, y)))

(89, 512, 512, 62) (89, 1)


In [15]:
# Build a per-slice label table: every DICOM slice of a scan inherits the
# scan-level label. Rows are written to './slice_labels_clean.csv' as
# (scan id, slice counter, label).
# Label table as a 2-D array: column 0 holds the scan id, column 1 its label.
df = pd.read_csv('../../Data/Clean_train_test_dataset/Labels.csv', sep=',', header=None).values
# print(df)

# NOTE: this rebinds `y` from the per-scan label array to a plain list of
# rows; cells that expect the array form of `y` must be run before this one.
y = []
cntr_entry = 0
for scan_num in tnrange(num_scans):
    # Skip directory entries that are not scan folders.
    scan_path = os.path.join(scans_path, list_of_scans[scan_num])
    if (not os.path.isdir(scan_path)) or (list_of_scans[scan_num] == "HRCT_pilot"):
        continue

    scan_idx = int(list_of_scans[scan_num])
    # Match against the id column only. Comparing against the whole table
    # would also hit the label column, so a scan whose id equals a label
    # value (0, 1 or 2) could be resolved to the wrong row.
    matches = np.flatnonzero(df[:, 0] == scan_idx)
    if matches.size == 0:
        raise KeyError("No label found for scan %d in Labels.csv" % scan_idx)
    slice_label = df[matches[0], 1]

    # NOTE(review): the slice counter follows directory-listing order and is
    # not derived from the DICOM InstanceNumber, so it may not line up with
    # the slice axis of X -- confirm before joining the two.
    cntr_slice = 0
    for slice_name in os.listdir(scan_path):
        # Only DICOM files count as slices.
        if not slice_name.endswith(".dcm"):
            continue

        y.append([scan_idx, cntr_slice, slice_label])
        cntr_slice += 1
        cntr_entry += 1

df_new = pd.DataFrame(y)
df_new.to_csv('./slice_labels_clean.csv', header=False, index=False)




In [16]:
# Preview the per-slice label table (columns: scan id, slice counter, label).
print(df_new)

        0   1  2
0     135   0  2
1     135   1  2
2     135   2  2
3     135   3  2
4     135   4  2
5     135   5  2
6     135   6  2
7     135   7  2
8     135   8  2
9     135   9  2
10    135  10  2
11    135  11  2
12    135  12  2
13    135  13  2
14    135  14  2
15    135  15  2
16    135  16  2
17    135  17  2
18    135  18  2
19    135  19  2
20    135  20  2
21    135  21  2
22    135  22  2
23    135  23  2
24    135  24  2
25    135  25  2
26    132   0  2
27    132   1  2
28    132   2  2
29    132   3  2
...   ...  .. ..
2327  171  15  2
2328  171  16  2
2329  171  17  2
2330  171  18  2
2331  171  19  2
2332  171  20  2
2333  171  21  2
2334  171  22  2
2335  185   0  0
2336  185   1  0
2337  185   2  0
2338  185   3  0
2339  185   4  0
2340  185   5  0
2341  185   6  0
2342  185   7  0
2343  185   8  0
2344  185   9  0
2345  185  10  0
2346  185  11  0
2347  185  12  0
2348  185  13  0
2349  185  14  0
2350  185  15  0
2351  185  16  0
2352  185  17  0
2353  185  18 