In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from skimage.io import imread # for importing tiff images
import pydicom as dicom # for handling dicom files
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import cv2

Anyway, let's read in the overview file and take a look at some of the variables.

In [None]:
# Load the metadata table (overview.csv) from the dataset directory.
images = pd.read_csv('/kaggle/input/siim-medical-images/overview.csv')
# Only the last expression of a cell is rendered automatically, so the preview
# has to be printed explicitly; a bare `images.head(10)` here is discarded.
print(images.head(10))
# Column names, dtypes and non-null counts (info() prints its own report).
images.info()

From here, we'll build the full file paths so that we can import the images and stack them into a numpy array.

In [None]:
# Build the full location of every image file by prepending the dataset
# directory to the file names stored in the table (radd computes prefix + name).
images["tiff_path"] = images["tiff_name"].radd('/kaggle/input/siim-medical-images/tiff_images/')
images["dicom_path"] = images["dicom_name"].radd('/kaggle/input/siim-medical-images/dicom_dir/')

The first step to preprocess the data is to take all of the images, and stack them into one numpy array.

In [None]:
# Read every TIFF file listed in the table and stack the results into a single
# float64 array (one entry along the first axis per file).
tiff_images = np.array(
    [imread(tiff_file) for tiff_file in images["tiff_path"]],
    dtype = 'float64',
)
print(tiff_images.shape)

Let's go ahead and take a look at the images with their respective ages, ids and Contrasts.

In [None]:
# Preview grid: the first 16 TIFF images, each captioned with the Age, id and
# Contrast values found in the same-numbered row of the table.
fig, ax = plt.subplots(4,4, figsize = (16,20))
for idx, picture in enumerate(tiff_images[:16]):
    panel = ax.flat[idx]  # row-major position, i.e. the same cell as ax[idx//4, idx%4]
    caption = 'Age: {}\nID: {} Contrast: {}'.format(images.loc[idx, 'Age'], images.loc[idx, 'id'], images.loc[idx, 'Contrast'])
    panel.matshow(picture, cmap = 'gray')
    panel.axis('off')
    panel.set_title(caption)
plt.show()

Here, we'll take the DICOM image data and put it into a numpy array.

In [None]:
# Read each DICOM file listed in the table and collect its pixel data.
# `dcmread` is used instead of `read_file`: the latter is a deprecated alias
# that newer pydicom releases no longer provide.
dicom_images = []

for path in images["dicom_path"]:
    dataset = dicom.dcmread(path)
    dicom_images.append(dataset.pixel_array)

# Stack the per-file pixel arrays into one float64 array.
dicom_images = np.array(dicom_images, dtype = 'float64')

Let's go ahead and take a look at the images with their respective ages, ids and Contrasts.

Let's also take a look at some of the histograms for the data.

In [None]:
# Show 16 DICOM images on a 4x4 grid: table rows 0-7 fill the first eight
# panels and table rows 50-57 fill the last eight.
fig, ax = plt.subplots(4,4, figsize = (16,20))

# (panel slot, table row, image) for both groups of eight.
selection = [(k, k, img) for k, img in enumerate(dicom_images[:8])]
selection += [(k + 8, k + 50, img) for k, img in enumerate(dicom_images[50:58])]

for slot, row, img in selection:
    panel = ax.flat[slot]  # row-major position, same cell as ax[slot//4, slot%4]
    panel.imshow(img, cmap = plt.cm.bone)
    panel.set_title('Age: {}\nID: {} Contrast: {}'.format(images.loc[row, 'Age'], images.loc[row, 'id'], images.loc[row, 'Contrast']))
plt.show()

In [None]:
# Same 4x4 layout as a plain preview (table rows 0-7, then 50-57), but every
# image is first rescaled by its own maximum and converted to 8-bit.
fig, ax = plt.subplots(4,4, figsize = (16,20))

# (panel slot, table row, image) for both groups of eight.
selection = [(k, k, img) for k, img in enumerate(dicom_images[:8])]
selection += [(k + 8, k + 50, img) for k, img in enumerate(dicom_images[50:58])]

for slot, row, img in selection:
    peak = np.amax(img)
    scaled = np.uint8(img / peak * 255)  # brightest pixel of this image maps to 255
    panel = ax.flat[slot]  # row-major position, same cell as ax[slot//4, slot%4]
    panel.imshow(scaled, cmap = plt.cm.bone)
    panel.set_title('Age: {}\nID: {} Contrast: {}'.format(images.loc[row, 'Age'], images.loc[row, 'id'], images.loc[row, 'Contrast']))
plt.show()

In [None]:
# Intensity histograms for the same 16 DICOM images (table rows 0-7, then
# 50-57), one histogram per panel of a 4x4 grid.
fig, ax = plt.subplots(4,4, figsize = (16,20))

# (panel slot, table row, image) for both groups of eight.
selection = [(k, k, img) for k, img in enumerate(dicom_images[:8])]
selection += [(k + 8, k + 50, img) for k, img in enumerate(dicom_images[50:58])]

for slot, row, img in selection:
    # Rescale by the image's own maximum and convert to 8-bit before binning.
    as_bytes = (img / np.amax(img) * 255).astype('uint8')
    # 256 bins over [0, 256): one bin per possible 8-bit value.
    hist = cv2.calcHist([as_bytes], [0], None, [256], [0, 256])
    panel = ax.flat[slot]  # row-major position, same cell as ax[slot//4, slot%4]
    panel.set_xlim([0, 256])
    panel.plot(hist)
    panel.set_title('Age: {}\nID: {} Contrast: {}'.format(images.loc[row, 'Age'], images.loc[row, 'id'], images.loc[row, 'Contrast']))
plt.show()

Let's note here that we want the ratio of True and False for the contrasts to be approximately equal, so let's see if we can guarantee that.

Let's implement a Random Forest sort of as a litmus test to see how accurate the classification is with the raw pixel data. As preparation, let's split the data into training data and testing data with the test set being 25%.

In [None]:
# Scale the pixel values by dividing by the largest pixel value in the whole
# set, so that the maximum becomes 1.
std_dicom_images = dicom_images/np.amax(dicom_images)
contrast_labels = np.array(images['Contrast'])
# Split the data into training and test data (25% held out). Stratifying on the
# label keeps the True/False contrast ratio the same in both subsets instead of
# leaving it to chance.
train_data, test_data, train_labels, test_labels = train_test_split(
    std_dicom_images,
    contrast_labels,
    test_size = 0.25,
    random_state = 245325,
    stratify = contrast_labels,
)
# Flatten every image into one feature vector: (n_images, dx, dy) -> (n_images, dx*dy).
dz,dx,dy = train_data.shape
flat_train_data = train_data.reshape((dz, dx*dy))
dz,dx,dy = test_data.shape
flat_test_data = test_data.reshape((dz, dx*dy))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

# Baseline classifier on the flattened pixel data. The forest is seeded so that
# the cross-validation scores and the test predictions are the same on every
# run; an unseeded forest gives different numbers each time the cell runs.
rf = RandomForestClassifier(random_state = 245325)
# 5-fold cross-validation on the training portion only.
print("Cross Validation Scores: {}".format(cross_val_score(rf, flat_train_data, train_labels, cv = 5)))
print()
# Fit on the full training set, then evaluate on the held-out test set.
rf.fit(flat_train_data, train_labels)
predictions = rf.predict(flat_test_data)
print(classification_report(test_labels,predictions))

In [None]:
# Show every test image together with its predicted and actual label.
# The grid is sized from the number of test images rather than fixed at 5x5, so
# a larger test set no longer indexes past the grid and a smaller one does not
# leave empty framed panels behind. With 25 test images the layout and figure
# size are exactly the original 5x5 / (25, 33).
n_cols = 5
n_rows = max(1, -(-len(test_data) // n_cols))  # ceiling division, at least one row
fig, ax = plt.subplots(n_rows, n_cols, figsize = (25, 33 * n_rows / 5), squeeze = False)

for i, im in enumerate(test_data):
    ax[i//n_cols, i%n_cols].imshow(im, cmap = plt.cm.bone)
    ax[i//n_cols, i%n_cols].set_title("Predicted: {}\nActual Val: {}".format(predictions[i], test_labels[i]))

# Blank out any panels left over in the last row.
for spare in ax.flat[len(test_data):]:
    spare.axis('off')

plt.show()

Let's test out a CNN network on 