In [None]:
#Unzipping very large files: https://stackoverflow.com/questions/339053/how-do-you-unzip-very-large-files-in-python

## Note: The code below extracts the Stanford CheXpert dataset, which must be downloaded through the Stanford ML Group website.

To run the code below, ensure the CheXpert-v1.0-small.zip file is in the same directory as this notebook. Note that the output produced by this code is very large. Use with caution.


In [None]:
import torch
import torchvision
import os
import matplotlib.pyplot as plt
import glob
import pandas as pd



In [None]:
# Base input folder.  The empty last component makes os.path.join append the
# platform's own separator, instead of hard-coding a Windows backslash into
# the folder name (the resulting string is unchanged on Windows).
base_dir = os.path.join('.', 'input', 'pulmonary-chest-xray-abnormalities', '')

base_dir

In [None]:
# Recursively collect every entry found below the Montgomery set folder.
mont_dir = ".\\input\\Montgomery\\MontgomerySet\\"
mont_paths = list(glob.iglob(mont_dir + "**/*", recursive=True))

In [None]:
# Recursively collect every entry found below the China (Shenzhen) set folder.
shen_dir = ".\\input\\ChinaSet_AllFiles\\ChinaSet_AllFiles\\"
shen_paths = list(glob.iglob(shen_dir + "**/*", recursive=True))

In [None]:
# Report how many entries were discovered for each data set.
print(f"Montgomery Files {len(mont_paths)}")
print(f"Shenzhen Files {len(shen_paths)}")

In [None]:
#### Now combine all the files into a dataframe: all_paths_df

In [None]:
# One row per entry discovered in the Montgomery and Shenzhen sets.
all_paths_df = pd.DataFrame(dict(path = mont_paths + shen_paths))

# The data-set folder is the 4th path component.  Separators are normalised
# first so the lookup works for both "\\" and "/" delimited paths.
all_paths_df['source'] = all_paths_df['path'].map(lambda x: x.replace('\\', '/').split('/')[3])
# File name without directory and extension.
all_paths_df['file_id'] = all_paths_df['path'].map(lambda x: os.path.splitext(os.path.basename(x))[0])
# Text before the first underscore of the file id.
all_paths_df['patient_group']  = all_paths_df['file_id'].map(lambda x: x.split('_')[0])

# Extension without the leading dot.
all_paths_df['file_ext'] = all_paths_df['path'].map(lambda x: os.path.splitext(x)[1][1:])

# Keep only images and text reports.  .copy() detaches the filtered frame so
# the column assignment below does not write into a slice of the unfiltered
# frame (which triggers pandas' SettingWithCopyWarning).
all_paths_df = all_paths_df[all_paths_df.file_ext.isin(['png', 'txt'])].copy()
# The text after the last underscore of the file id is parsed as an integer.
# NOTE(review): presumably 0 = normal and 1 = abnormal - confirm against the
# data-set documentation.
all_paths_df['pulm_state']  = all_paths_df['file_id'].map(lambda x: int(x.split('_')[-1]))
all_paths_df.sample(5)

## Create Report DF

In [None]:
# Reshape to one row per (patient_group, pulm_state, file_id), with one column
# per file extension holding the first matching path.
clean_patients_df = (
    all_paths_df
    .pivot_table(
        index=['patient_group', 'pulm_state', 'file_id'],
        columns=['file_ext'],
        values='path',
        aggfunc='first',
    )
    .reset_index()
)
clean_patients_df.sample(5)
from warnings import warn
def report_to_dict(in_path):
    """Parse one clinical-reading text file into a flat dictionary.

    Two layouts are recognised:

    * The first line contains "Patient's Sex": line 1 carries the sex,
      line 2 the age (e.g. "031Y") and the remaining lines the report.
    * Otherwise the first line is read as "<gender> ... <age>yrs" and the
      remaining lines are the report.

    Args:
        in_path: path of the text file to read.

    Returns:
        A dict with 'gender' (first letter of the upper-cased gender, e.g.
        'F' or 'M') and 'report' (remaining lines joined by spaces), plus
        'age' as a float when the age is given in years.  Ages expressed in
        months or days, and empty ages, are dropped while the other fields
        are kept.  If parsing fails, the lines are printed, a RuntimeWarning
        is issued and an empty dict is returned.
    """
    with open(in_path, 'r') as f:
        all_lines = [x.strip() for x in f.read().split('\n')]
    info_dict = {}
    try:
        if "Patient's Sex" in all_lines[0]:
            info_dict['age'] = all_lines[1].split(':')[-1].strip().replace('Y', '')
            info_dict['gender'] = all_lines[0].split(':')[-1].strip()
            info_dict['report'] = ' '.join(all_lines[2:]).strip()
        else:
            info_dict['age'] = all_lines[0].split(' ')[-1].replace('yrs', '').replace('yr', '')
            info_dict['gender'] = all_lines[0].split(' ')[0].strip()
            info_dict['report'] = ' '.join(all_lines[1:]).strip()

        # Collapse the spelling variants to a single leading letter.
        info_dict['gender'] = info_dict['gender'].upper().replace('FEMALE', 'F').replace('MALE', 'M').replace('FEMAL', 'F')[0:1]

        # Decide once what to do with the age.  The previous code removed a
        # "month" age and then fell through to a second, unconditional pop,
        # whose KeyError discarded the whole record.
        age_text = info_dict.get('age', '')
        if 'month' in age_text or 'day' in age_text or len(age_text) == 0:
            info_dict.pop('age', None) # not expressed in years (or missing)
        else:
            info_dict['age'] = float(age_text)
        return info_dict
    except Exception as e:
        # Best effort: show the offending content and keep going.
        print(all_lines)
        warn(str(e), RuntimeWarning)
        return {}
def _report_record(patient_row):
    """Combine the parsed report fields with the remaining columns of one row."""
    parsed = report_to_dict(patient_row.pop('txt'))
    return dict(**parsed, **patient_row)

# One record per patient row: parsed report fields plus the row's other columns.
report_df = pd.DataFrame([_report_record(row) for _, row in clean_patients_df.iterrows()])
report_df.sample(5)

In [None]:
## Segmentation

In [None]:
# Folder that holds the lung-segmentation masks, and its directory listing.
mask_path = os.path.join('.', 'input', 'masks')
masks = os.listdir(mask_path)

# Derive ids comparable with the image names: first cut everything from
# ".png" onwards, then cut everything from "_mask" onwards (China masks).
mask_ids_temp = list(map(lambda name: name.split(".png")[0], masks))
mask_ids = list(map(lambda stem: stem.split("_mask")[0], mask_ids_temp))

# Full path of every mask, in the same order as `masks`.
mask_file_names = list(map(lambda name: os.path.join(mask_path, name), masks))

# Count the masks whose file name contains "mask" (the renamed China masks).
check = list(filter(lambda name: "mask" in name, masks))
print("Total mask that has modified name:",len(check))

# TODO: open question from the author - there seem to be 704 masks before
# modification; confirm.

In [None]:
# Folder that holds the chest x-ray PNG images, and its directory listing.
image_path = os.path.join('.', 'input',"CXR_png")
images = os.listdir(image_path)

# Cut everything from ".png" onwards so the ids can be compared with mask ids.
image_ids = list(map(lambda name: name.split(".png")[0], images))

# Full path of every image, in the same order as `images`.
image_file_names = list(map(lambda name: os.path.join(image_path, name), images))

# The total number of images.
print('Total X-ray images: ', len(image_file_names))

In [None]:
# Put all the names into a dataframe for convenience.
images_df = pd.DataFrame({'xrays': image_file_names, 'file_id': image_ids})
images_df['has_mask'] = images_df['file_id'].isin(mask_ids)

# Pair every x-ray with ITS OWN mask by looking the mask up through the file
# id.  Attaching the mask list by position relied on two independent
# os.listdir() results happening to share the same order and length, which
# is not guaranteed and can silently pair an x-ray with the wrong mask.
mask_path_by_id = dict(zip(mask_ids, mask_file_names))
# .copy() so the new column is written to an independent frame rather than a
# slice of images_df (avoids pandas' SettingWithCopyWarning).
images_with_masks_df = images_df[images_df['has_mask']].copy()
images_with_masks_df['masks'] = images_with_masks_df['file_id'].map(mask_path_by_id)

print("There are {} x-rays with masks".format(len(images_with_masks_df)))
print("True indicates the x-ray has a mask:")
images_df['has_mask'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split
# Train/test split for segmentation: the x-ray paths are the inputs and the
# mask paths are the targets to be predicted.
# test_size=0.1 holds out 10% of the pairs for testing, leaving 90% for
# training; random_state=42 makes the shuffle repeatable.
train_x,test_x,train_y,test_y = train_test_split(images_with_masks_df['xrays'],
                                                   images_with_masks_df['masks'],test_size    = 0.1,
                                                   random_state = 42)

# Author's sanity check: the training set should hold 90% of the 704 pairs.
#len(train_x)

In [None]:
# Carve a validation set out of the training portion: 10% of the training
# pairs go to validation, with the same fixed seed for repeatability.
trainx,validationx,trainy,validationy = train_test_split(train_x,train_y,test_size = 0.1,random_state = 42)

# Author's sanity check of the resulting training-set size.
#len(trainx)

In [None]:
def _paired_frame(xray_paths, mask_paths):
    """Return a frame with 'xrays' and 'masks' columns on the index of xray_paths."""
    paired = pd.DataFrame(index=xray_paths.index)
    paired['xrays'] = xray_paths
    paired['masks'] = mask_paths
    return paired

# One data frame per split, each pairing x-ray paths with mask paths.
train_df = _paired_frame(trainx, trainy)
test_df = _paired_frame(test_x, test_y)
validation_df = _paired_frame(validationx, validationy)

In [None]:
## Now that we have a dataframe of training and test examples, can we mask them?

## Need a train info dataframe

In [None]:
# Folder of the extracted CheXpert-v1.0-small data set, expected next to this
# notebook.  The data set must be downloaded independently through Stanford ML.
train_info_loc = os.path.join(".", "CheXpert-v1.0-small")
train_file_name = "train.csv"
# Load the training metadata table from <train_info_loc>/train.csv.
train_info = pd.read_csv(os.path.join(train_info_loc, train_file_name))


In [None]:
# Preview the first rows of the table loaded from train.csv.
train_info.head()

In [None]:
# Replace every missing value with 0, modifying train_info in place.
# NOTE(review): presumably a missing label is meant to count as "not present"
# - confirm this is the intended treatment.
train_info.fillna(0, inplace=True)
# Preview the table again after filling.
train_info.head()

## create a new dataframe with a column for complete path and diagnostic columns of interest:


In [None]:
# NOTE(review): this rebinds `train_df` - which an earlier cell filled with
# the segmentation x-ray/mask pairs - to a frame that has an index but no
# columns.  Nothing in this cell adds columns to it; confirm this is intended.
train_df = pd.DataFrame(index=train_info.index)
# Keep every column from position 5 onwards (presumably the diagnostic label
# columns of train.csv - verify against the CSV header).
data_df = train_info.iloc[:, 5:].copy()
# Prefix each 'Path' entry with the current directory to get the image path.
data_df['xrays'] = [os.path.join('.', x) for x in train_info['Path'].values]

In [None]:
# Preview the first rows of the label/path frame.
data_df.head()

In [None]:
# NOTE(review): `pickle` is imported but not used; DataFrame.to_pickle below
# is a pandas method.
import pickle

# Write train_df to "train_df.pkl" in the working directory.
# NOTE(review): the most recent assignment to train_df visible in this
# notebook creates a frame with an index but no columns - confirm whether
# data_df was meant to be saved here instead.
train_df.to_pickle("train_df.pkl")

## get and view file from the data_df (checking understanding of file formats)

In [None]:
# Pick one x-ray path to inspect, looked up by index label 354
# (presumably an arbitrary choice).
rnd_xray = data_df['xrays'][354]

In [None]:
from PIL import Image

# Open the selected file with Pillow; the result is a PIL Image object.
img = Image.open(rnd_xray)

# Last expression of the cell, so the notebook renders the image.
img

In [None]:
## Let's convert it to a tensor

from torchvision import transforms

# ToTensor turns a PIL image into a float tensor laid out as
# (channels, height, width).
convert_tensor = transforms.ToTensor()

img_t = convert_tensor(img)


print(img_t.shape)

# Move the first axis to the end: (channels, height, width) becomes
# (height, width, channels).
shifted = img_t.permute(1, 2, 0)

print(shifted.shape)

In [None]:
#Create test train split

from sklearn.model_selection import GroupShuffleSplit

# Group rows by patient so that all images of one patient land on the same
# side of the split.  Grouping by the row index (as before) puts every row in
# its own group, which is no grouping at all.
# Assumes the 'xrays' paths contain a "patient<digits>" folder, as in the
# CheXpert layout - TODO confirm.  Rows without such a component fall back to
# a group of their own (their row label), i.e. the previous behaviour.
patient_ids = data_df['xrays'].str.extract(r'(patient\d+)', expand=False)
split_groups = patient_ids.fillna(data_df.index.to_series().astype(str))

# Initialize the GroupShuffleSplit; the fixed seed makes the split repeatable
# (same seed as the other splits in this notebook).
gss = GroupShuffleSplit(n_splits=1, test_size=0.01, random_state=42)

# Get the indexers for the split.
idx1, idx2 = next(gss.split(data_df, groups=split_groups))

# Get the split DataFrames.
df1, df2 = data_df.iloc[idx1], data_df.iloc[idx2]

# Just use a slice of the images for now.  The sample size is capped at the
# number of available rows, because DataFrame.sample raises when asked for
# more rows than exist; the seed keeps the slice repeatable.
train_temp_df = df1.sample(min(100000, len(df1)), random_state=42)
test_temp_df = df2.copy()

In [None]:
# Spot-check one path from the training sample.  Positional access is used
# because the frame is a random sample: a fixed row label such as 432 is not
# guaranteed to be present and would raise KeyError.
train_temp_df['xrays'].iloc[0]

In [None]:
# Number of rows in the training sample.
len(train_temp_df)

## Now is the time to regroup and think about what you are doing. What do you need to accomplish and how will you get there?

## Plan
1) Load one pretrained ResNet model
2) Apply this model to the data (what is the input and what is the output?)

In [None]:
import torch.nn as nn
## Try implementing a Resnet from scratch  (tutorial here: https://www.youtube.com/watch?v=DkNIBBBvcPs)


class resblock(nn.Module):
    """Bottleneck residual block: 1x1 reduce -> 3x3 -> 1x1 expand, plus a skip connection.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: width of the bottleneck; the block emits
            ``out_channels * 4`` channels.
        identity_downsample: optional module applied to the skip path so its
            shape matches the main path (needed whenever the stride or the
            channel count changes).
        stride: stride of the middle 3x3 convolution.
    """
    def __init__(self, in_channels, out_channels, identity_downsample = None, stride = 1):
        super(resblock, self).__init__()
        self.expansion = 4 # "number of channels after a block is 4x what it was when it entered"
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size = 1, stride =1, padding = 0)
        self.bn1 = nn.BatchNorm2d(out_channels)
        # conv2 consumes the output of conv1, so its input width is
        # out_channels (not in_channels).  A 3x3 kernel with padding 1 keeps
        # the spatial size at stride 1; the former 1x1 kernel with padding 1
        # enlarged the feature map and broke the residual addition.
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size = 3, stride = stride, padding =1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels *self.expansion, kernel_size =1, stride=1, padding = 0)
        self.bn3 = nn.BatchNorm2d(out_channels*self.expansion)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample

    def forward(self, x):
        """Run the three conv/BN stages and add the (optionally projected) input."""
        identity = x

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.relu(x)
        x = self.conv3(x)
        x = self.bn3(x)

        # Project the skip path when its shape differs from the main path.
        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x += identity
        x = self.relu(x)
        return x
    

class ResNet(nn.Module): # note, the layers argument corresponds to the number of resnet blocks
    """ResNet backbone built from bottleneck blocks, ending in a linear classifier.

    Args:
        resblock: block class; it is called as
            ``resblock(in_channels, out_channels, identity_downsample, stride)``
            and must emit ``out_channels * 4`` channels.
        layers: four integers, the number of blocks in each of the four stages.
        image_channels: number of channels of the input images.
        num_classes: size of the output vector.
    """
    def __init__(self, resblock, layers, image_channels, num_classes):
        super(ResNet, self).__init__()
        self.expansion = 4  # channel multiplier applied by every block
        self.in_channels = 64
        self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size = 3, stride=2, padding=1)

        #ResNet layers

        self.layer1 = self._make_layer(resblock, layers[0], out_channels=64, stride=1)
        self.layer2 = self._make_layer(resblock, layers[1], out_channels=128, stride=2)
        self.layer3 = self._make_layer(resblock, layers[2], out_channels=256, stride=2)
        self.layer4 = self._make_layer(resblock, layers[3], out_channels=512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512 * self.expansion, num_classes)

    def forward(self, x):
        """Map a batch of images (N, image_channels, H, W) to scores (N, num_classes)."""
        # Stem: 7x7 conv, batch norm, ReLU, max pool.
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Global average pool, flatten, classify.
        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)
        return x


    def _make_layer(self, resblock, num_residual_blocks, out_channels, stride):
        """Build one stage of ``num_residual_blocks`` blocks.

        The first block may change the channel count and the resolution, so it
        gets a 1x1 projection on its skip path when needed; the remaining
        blocks keep the shape.  ``self.in_channels`` is updated to the width
        the stage emits.
        """
        identity_downsample = None
        layers = []
        stage_width = out_channels * self.expansion

        if stride != 1 or self.in_channels != stage_width:
            identity_downsample = nn.Sequential(nn.Conv2d(self.in_channels, stage_width, kernel_size = 1,
                                                         stride = stride),
                                               nn.BatchNorm2d(stage_width))

        layers.append(resblock(self.in_channels, out_channels, identity_downsample, stride)) #changes the number of channels
        self.in_channels = stage_width

        for i in range(num_residual_blocks - 1):
            layers.append(resblock(self.in_channels, out_channels))

        return nn.Sequential(*layers)
        
        

In [None]:
#initialize resnet 50 with our parameters, 1 channel for grayscale images, 14 classes.

def ResNet50(img_channels=1, num_classes=14):
    """Build a ResNet with 3, 4, 6 and 3 blocks in its four stages.

    Defaults: 1 input channel (grayscale images) and 14 output classes.
    """
    blocks_per_stage = [3, 4, 6, 3]
    return ResNet(resblock, blocks_per_stage, img_channels, num_classes)

In [None]:
def test():
    """Smoke-test ResNet50 on a random batch and print the output shape.

    The model and the input are placed on the same device: CUDA when it is
    available, otherwise the CPU.  (Moving only the output to 'cuda', as
    before, did not run the model on the GPU and crashed on machines
    without one.)
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = ResNet50().to(device)
    x = torch.randn(2, 1, 224, 224, device=device)
    y = net(x)
    print(y.shape)

test()