inspired from https://www.kaggle.com/code/paultimothymooney/predict-idc-in-breast-cancer-histology-images and https://www.kaggle.com/code/zfturbo/mnist-with-mobilenet-pytorch-gpu

*Step 1: Import Modules*

In [2]:
import pandas as pd
import numpy as np
import os
from glob import glob
import itertools
import fnmatch
import random
import matplotlib.pylab as plt
import seaborn as sns
import cv2
#from scipy.misc import imresize, imread
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, learning_curve, GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#import keras
#from keras import backend as K
#from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
#from keras.preprocessing.image import ImageDataGenerator
#from keras.utils.np_utils import to_categorical
#from keras.models import Sequential, model_from_json
#from keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta
#from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D
%matplotlib inline

*Step 2: Explore Data*

In [3]:
imagePatches = glob('./archive/IDC_regular_ps50_idx5/**/*.png', recursive=True)
for filename in imagePatches[0:10]:
    print(filename)

./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x3101_y1001_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2901_y1251_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x1401_y1001_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2801_y1151_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2901_y301_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2951_y1201_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2701_y901_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2951_y1301_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2451_y651_class1.png
./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x1501_y851_class1.png


*Step 3: Preprocess Data*

In [4]:
patternZero = '*class0.png'
patternOne = '*class1.png'
classZero = fnmatch.filter(imagePatches, patternZero)
classOne = fnmatch.filter(imagePatches, patternOne)
print("IDC(-)\n\n",classZero[0:5],'\n')
print("IDC(+)\n\n",classOne[0:5])

IDC(-)

 ['./archive/IDC_regular_ps50_idx5/9255/0/9255_idx5_x2551_y1001_class0.png', './archive/IDC_regular_ps50_idx5/9255/0/9255_idx5_x2301_y851_class0.png', './archive/IDC_regular_ps50_idx5/9255/0/9255_idx5_x1951_y1401_class0.png', './archive/IDC_regular_ps50_idx5/9255/0/9255_idx5_x2151_y1451_class0.png', './archive/IDC_regular_ps50_idx5/9255/0/9255_idx5_x1851_y801_class0.png'] 

IDC(+)

 ['./archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x3101_y1001_class1.png', './archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2901_y1251_class1.png', './archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x1401_y1001_class1.png', './archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2801_y1151_class1.png', './archive/IDC_regular_ps50_idx5/9255/1/9255_idx5_x2901_y301_class1.png']


In [5]:
def proc_images(lowerIndex,upperIndex):
    """
    Returns two arrays: 
        x is an array of resized images
        y is an array of labels
    """ 
    x = []
    y = []
    WIDTH = 50
    HEIGHT = 50
    for img in imagePatches[lowerIndex:upperIndex]:
        full_size_image = cv2.imread(img)
        x.append(cv2.resize(full_size_image, (WIDTH,HEIGHT), interpolation=cv2.INTER_CUBIC))
        if img in classZero:
            y.append(0)
        elif img in classOne:
            y.append(1)
        else:
            return
    return x,y

In [6]:
X,Y = proc_images(0,9000)
df = pd.DataFrame()
df["images"]=X
df["labels"]=Y
X2=df["images"]
Y2=df["labels"]
X2=np.array(X2)
imgs0=[]
imgs1=[]
imgs0 = X2[Y2==0] # (0 = no IDC, 1 = IDC)
imgs1 = X2[Y2==1] 

In [7]:
def describeData(a,b):
    print('Total number of images: {}'.format(len(a)))
    print('Number of IDC(-) Images: {}'.format(np.sum(b==0)))
    print('Number of IDC(+) Images: {}'.format(np.sum(b==1)))
    print('Percentage of positive images: {:.2f}%'.format(100*np.mean(b)))
    print('Image shape (Width, Height, Channels): {}'.format(a[0].shape))
describeData(X2,Y2)

Total number of images: 9000
Number of IDC(-) Images: 5763
Number of IDC(+) Images: 3237
Percentage of positive images: 35.97%
Image shape (Width, Height, Channels): (50, 50, 3)


In [8]:
dict_characters = {0: 'IDC(-)', 1: 'IDC(+)'}
print(df.head(10))
print("")
print(dict_characters)

                                              images  labels
0  [[[173, 135, 195], [166, 122, 175], [145, 94, ...       1
1  [[[163, 117, 174], [152, 107, 161], [176, 137,...       1
2  [[[141, 96, 184], [159, 112, 184], [149, 107, ...       1
3  [[[124, 71, 132], [139, 95, 162], [133, 84, 15...       1
4  [[[132, 77, 127], [151, 101, 160], [140, 94, 1...       1
5  [[[160, 122, 189], [164, 125, 172], [170, 131,...       1
6  [[[113, 61, 111], [110, 57, 101], [90, 38, 81]...       1
7  [[[130, 83, 149], [157, 114, 179], [154, 113, ...       1
8  [[[132, 96, 134], [217, 200, 224], [170, 135, ...       1
9  [[[146, 101, 155], [145, 101, 150], [184, 149,...       1

{0: 'IDC(-)', 1: 'IDC(+)'}


The data is scaled from 0 to 256 but we want it to be scaled from 0 to 1. This will make the data compatible with a wide variety of different classification algorithms.  We also want to set aside 20% of the data for testing. This will make the trained model less prone to overfitting.  And finally, we will use an oversampling strategy to deal with the imbalanced class sizes.

In [7]:
X=np.array(X)
X2=X/255.0

X_train, X_test, Y_train, Y_test = train_test_split(X2, Y, test_size=0.2)
print(X.shape, X_train.shape)
# Reduce Sample Size for DeBugging
X_train2 = X_train[0:300000] 
Y_train2 = Y_train[0:300000]
X_test2 = X_test[0:300000] 
Y_test2 = Y_test[0:300000]

print("Training Data Shape:", X_train2.shape)
print("Testing Data Shape:", X_test2.shape)


(9000, 50, 50, 3) (7200, 50, 50, 3)
Training Data Shape: (7200, 50, 50, 3)
Testing Data Shape: (1800, 50, 50, 3)


In [8]:
X_trainShape = X_train2.shape[1]*X_train2.shape[2]*X_train2.shape[3]
X_testShape = X_test2.shape[1]*X_test2.shape[2]*X_test2.shape[3]
X_trainFlat = X_train2.reshape(X_train2.shape[0], X_trainShape)
X_testFlat = X_test2.reshape(X_test2.shape[0], X_testShape)

In [9]:
for i in range(len(X_trainFlat)):
    height, width, channels = 50,50,3
    X_trainFlat2 = X_trainFlat.reshape(len(X_trainFlat),channels,height,width)

In [10]:
for i in range(len(X_testFlat)):
    height, width, channels = 50,50,3
    X_testFlat2 = X_testFlat.reshape(len(X_testFlat),channels,height,width)

In [12]:
print(X_trainFlat2.shape)

(7200, 3, 50, 50)


In [11]:
from torch.utils.data import Dataset, DataLoader

In [12]:
class BreastDataset(Dataset):
    def __init__(self,data,labels,transform=None, target_transform=None) -> None:
        super().__init__()
        self.data=data
        self.labels=labels
        self.transform = transform
        self.target_transform = target_transform
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, index):
        image,label= self.data[index],self.labels[index]
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image,label

In [13]:
train_dataset = BreastDataset(X_trainFlat2,Y_train2)
test_dataset = BreastDataset(X_testFlat2,Y_test2)
train_loader = DataLoader(train_dataset, batch_size=32)#, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)#, shuffle=True)

In [14]:
from torchvision.models.mobilenet import mobilenet_v2
import torch
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from torch.nn import CrossEntropyLoss

In [15]:
import torch.nn as nn

In [29]:
def train(model, device, train_loader,labels, optimizer, epoch):
    log_interval = 10
    loss_func = CrossEntropyLoss()
    model.train()
    for batch_idx, (data,target) in enumerate(train_loader):
        optimizer.zero_grad()
        output=model(data)
        loss = loss_func(output, target)
        loss.backward()
        model.optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))


In [25]:
def tst(model, device, test_loader,labels):
    model.eval()
    test_loss = 0
    correct = 0
    loss_func = CrossEntropyLoss()
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)
            test_loss += loss_func(output, target)
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [28]:
class MobileNet(nn.Module):
    def __init__(self, optimizer, scheduler,num_classes=2):
        super(MobileNet, self).__init__()
        self.layers = mobilenet_v2(pretrained=True,)
        self.layers.classifier[1] = torch.nn.Linear(in_features=self.layers.classifier[1].in_features, out_features=10)
        self.optimizer = optimizer
        self.scheduler = scheduler
    def forward(self, x):
        return self.layers(x)

In [19]:
for i in train_loader:
    a,b=i
    print(a.shape)
    break

torch.Size([32, 3, 50, 50])


In [27]:
batch_size = 1000
learning_rate = 1.0
reduce_lr_gamma = 0.7
epochs = 1
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('Device: {} Epochs: {} Batch size: {}'.format(device, epochs, batch_size))

kwargs = {'batch_size': batch_size}
if torch.cuda.is_available():
    kwargs.update({'num_workers': 1, 'pin_memory': True})
optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)

scheduler = StepLR(optimizer, step_size=1, gamma=reduce_lr_gamma)

model = MobileNet(scheduler)
model.double()
model.to(device)
for epoch in range(1, epochs + 1):
    train(model, device, train_loader,Y_train2, optimizer, epoch)
    tst(model, device, test_loader,Y_test2)
    scheduler.step()


Device: cpu Epochs: 1 Batch size: 1000






KeyboardInterrupt: 