<a href="https://colab.research.google.com/github/moh53/Graduation-Project-FMS/blob/main/Method_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Load Kaggle breast cancer dataset***

In [None]:
from google.colab import files

# Opens Colab's upload widget; pick the kaggle.json API token here.
# The result is bound to a name instead of being left as the cell's last
# expression: files.upload() returns the uploaded files' contents, and echoing
# that into the cell output would store the Kaggle API key in the saved notebook.
uploaded = files.upload()

In [None]:
# Put the uploaded Kaggle API token under ~/.kaggle for the kaggle CLI.
# -p keeps the cell re-runnable: plain mkdir fails when ~/.kaggle already exists.
! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
# Make the token readable/writable by its owner only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the breast-histopathology-images dataset with the Kaggle CLI.
# The extraction cell below expects breast-histopathology-images.zip in the working directory.
!kaggle datasets download -d paultimothymooney/breast-histopathology-images

In [None]:
from zipfile import ZipFile

file_name = "breast-histopathology-images.zip"

# Unpack the downloaded archive into the current working directory.
# The handle is named `archive` so the built-in zip() is not shadowed for the
# rest of the session; extractall() returns None, so its result is not stored.
with ZipFile(file_name, 'r') as archive:
    archive.extractall()
print('Done')

# ***Import libraries***

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tarfile

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision.datasets.utils import download_url
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import torchvision.transforms as tt
from torch.utils.data import random_split
from torchvision.utils import make_grid
from torchvision import transforms
from collections import Counter

import torchvision .transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import WeightedRandomSampler

In [None]:
!pip install torchstain
import torchstain

In [None]:
# Load all image data
data_dir = os.getcwd()
folder_name = "IDC_regular_ps50_idx5"
image_folders = os.path.join(data_dir, folder_name)

# Per-image preprocessing.
#  * ToTensor() produces floats in [0, 1], which is the scale the mean/std given
#    to Normalize() are defined for. The former `x * 255` rescale between the two
#    steps put the inputs on a 0-255 scale before normalisation, far outside the
#    range a pretrained backbone expects, so it has been removed.
#  * The former trailing ColorJitter(brightness=0, contrast=0, saturation=0, hue=0)
#    was dropped: with every factor at zero it does not alter the image.
# NOTE(review): RandomRotation is an augmentation, but this one transform is shared
# by the train, validation and test splits created later -- confirm that is intended.
transform = transforms.Compose([
    transforms.RandomRotation(15),
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])

# NOTE(review): this stain normalizer is constructed but never fitted or applied in
# the visible notebook. It presumably expects 0-255 inputs (the likely origin of the
# removed `x * 255`) -- confirm before wiring it into `transform`.
torch_normalizer = torchstain.MacenkoNormalizer(backend='torch')

# One ImageFolder per sub-directory of IDC_regular_ps50_idx5, merged into a single
# dataset. Entries are sorted so the concatenation order -- and therefore the seeded
# random_split below -- is reproducible; non-directory entries are skipped because
# ImageFolder needs a directory.
# NOTE: the name `datasets` shadows the `torchvision.datasets` alias imported above;
# it is kept because later cells refer to the dataset by this name.
images = []
for file in sorted(os.listdir(image_folders)):
    sub_dir = os.path.join(image_folders, file)
    if os.path.isdir(sub_dir):
        images.append(ImageFolder(sub_dir, transform=transform))
datasets = torch.utils.data.ConcatDataset(images)

# ***Prepare data for training, validation and test***

In [None]:
# Seeded train / validation / test split with fixed-size hold-out sets.
random_seed = 42
torch.manual_seed(random_seed)

test_size = 38000
val_size = 38000

# Step 1: peel the test set off the full dataset.
remaining_ds, test_ds = random_split(datasets, [len(datasets) - test_size, test_size])

# Step 2: peel the validation set off what is left; the rest is the training set.
train_size = len(remaining_ds) - val_size
train_ds, val_ds = random_split(remaining_ds, [train_size, val_size])

(len(train_ds), len(val_ds), len(test_ds))

In [None]:
# Count the samples per class label across every ImageFolder in the concatenation.
per_folder_counts = [Counter(part.targets) for part in datasets.datasets]
result = per_folder_counts[0]
for folder_count in per_folder_counts[1:]:
    result += folder_count

# Plain dict so later cells can index the totals by class id.
result = dict(result)
print("""Total Number of Images for each Class:
    Class 0 (No Breast Cancer): {}
    Class 1 (Breast Cancer present): {}""".format(result[0], result[1]))

In [None]:
# Pie chart (Number of Samples For Each Class)
labels = "0: benign (No Cancer)", "1: malignant (Have Cancer)"
total = result[0] + result[1]
sizes = [result[0]/total, result[1]/total]
explode = (0, 0.1)  # pull only the 2nd slice (class 1) out of the pie

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title("Number of Samples by Class")
# Save BEFORE showing, and through the figure handle: once plt.show() has displayed
# the figure, a following plt.savefig() can write an empty image.
fig1.savefig("number_of_samples_breakdown")
plt.show()

# ***Training without oversampling technique***

In [None]:
# Every split uses the same loader configuration, so it is spelled out once.
loader_kwargs = dict(shuffle=True, num_workers=4, pin_memory=True, batch_size=200)

train_data = DataLoader(train_ds, **loader_kwargs)
val_data = DataLoader(val_ds, **loader_kwargs)
test_data = DataLoader(test_ds, **loader_kwargs)

## ***ResNet50 without OverSampling***

In [None]:
import torchvision.models as models

# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Pretrained ResNet-50; its untouched weights are written to model_weights.pth.
model = models.resnet50(pretrained=True)
pretrained_state = model.state_dict()
torch.save(pretrained_state, 'model_weights.pth')

In [None]:
from torch import optim

# Freeze every pretrained parameter; only the new head created below is trained.
for param in model.parameters():
    param.requires_grad = False

# Replace the final layer with a fresh two-class head (new layers are trainable by
# default). The head ends in LogSoftmax, i.e. it outputs log-probabilities.
model.fc = nn.Sequential(nn.Linear(2048, 512),
                         nn.ReLU(),
                         nn.Dropout(0.2),
                         nn.Linear(512, 2),
                         nn.LogSoftmax(dim=1))

# NLLLoss is the loss that consumes log-probabilities. CrossEntropyLoss expects raw
# logits and applies log-softmax itself, so pairing it with the LogSoftmax head
# normalised the outputs twice.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.003)
model.to(device)

In [None]:
# Train `model` for n_epochs on train_data, validating on val_data after each epoch.
# Per-epoch metrics are collected in train_loss / train_acc / val_loss / val_acc
# (read by the plotting cells), and the weights with the lowest summed validation
# loss are checkpointed to 'resnet.pt'.
n_epochs = 10
print_every = 200  # log the current batch loss every N batches (was hard-coded in the loop)
valid_loss_min = np.inf  # np.Inf alias no longer exists in NumPy 2
val_loss = []
val_acc = []
train_loss = []
train_acc = []
total_step = len(train_data)

for epoch in range(1, n_epochs + 1):
    # Set train mode at the top of every epoch so the state never depends on what
    # ran before this cell (the validation pass below switches to eval mode).
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    print(f'Epoch {epoch}\n')
    for batch_idx, (data_, target_) in enumerate(train_data):
        data_ = data_.to(device)
        target_ = target_.to(device)

        optimizer.zero_grad()
        outputs = model(data_)
        loss = criterion(outputs, target_)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pred = outputs.argmax(dim=1)
        correct += torch.sum(pred == target_).item()
        total += target_.size(0)
        if batch_idx % print_every == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch, n_epochs, batch_idx, total_step, loss.item()))

    train_acc.append(100 * correct / total)
    train_loss.append(running_loss / total_step)
    # Report THIS epoch's loss; the previous np.mean(train_loss) averaged over all
    # epochs so far and hid the per-epoch trend.
    print(f'\ntrain-loss: {train_loss[-1]:.4f}, train-acc: {train_acc[-1]:.4f}')

    # ---- validation pass (no gradients, dropout/batch-norm in inference mode) ----
    batch_loss = 0
    total_t = 0
    correct_t = 0
    model.eval()
    with torch.no_grad():
        for data_t, target_t in val_data:
            data_t, target_t = data_t.to(device), target_t.to(device)
            outputs_t = model(data_t)
            batch_loss += criterion(outputs_t, target_t).item()
            pred_t = outputs_t.argmax(dim=1)
            correct_t += torch.sum(pred_t == target_t).item()
            total_t += target_t.size(0)

    val_acc.append(100 * correct_t / total_t)
    val_loss.append(batch_loss / len(val_data))
    print(f'validation loss: {val_loss[-1]:.4f}, validation acc: {val_acc[-1]:.4f}\n')

    # Checkpoint whenever the summed validation loss improves.
    # NOTE(review): the ResNet18 and VGG16 training cells write to the same
    # 'resnet.pt' file, so a later run overwrites this checkpoint.
    if batch_loss < valid_loss_min:
        valid_loss_min = batch_loss
        torch.save(model.state_dict(), 'resnet.pt')
        print('Improvement-Detected, save-model')

### **ResNet50 Performance**

In [None]:
from sklearn.metrics import (f1_score, accuracy_score, confusion_matrix,
                             classification_report, precision_score, recall_score)
import seaborn as sn
import pandas as pd

y_pred = []
y_true = []

# Evaluate in inference mode. The training cell above ends each epoch with
# model.train(), so without eval() dropout stays active and batch-norm statistics
# keep being updated by the test data; no_grad() avoids building autograd graphs.
# NOTE(review): this scores the in-memory (last-epoch) weights, not the best
# checkpoint saved to 'resnet.pt' -- load it first if that is what should be reported.
model.eval()
with torch.no_grad():
    # iterate over test data
    for inputs, labels in test_data:
        inputs = inputs.to(device)
        output = model(inputs)  # Feed Network

        # Predicted class = index of the largest output; exp() is not needed
        # because it does not change which entry is largest.
        y_pred.extend(output.argmax(dim=1).cpu().numpy())  # Save Prediction
        y_true.extend(labels.cpu().numpy())  # Save Truth

# constant for classes
classes = ('negative', 'posative')

print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))

# Build confusion matrix, expressed as a percentage of all test samples.
# (It was previously scaled by 10, which yields neither fractions nor percentages.)
cf_matrix = confusion_matrix(y_true, y_pred)
df_cm = pd.DataFrame(cf_matrix / np.sum(cf_matrix) * 100,
                     index=list(classes), columns=list(classes))
plt.figure(figsize=(12, 7))
sn.heatmap(df_cm, annot=True, fmt='.1f')
# NOTE(review): the ResNet18 evaluation cell also saves to 'output.png'.
plt.savefig('output.png')

In [None]:
# Per-epoch accuracy curves (training vs. validation), saved to accuracy.png.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_acc, color='green', label='train accuracy')
ax.plot(val_acc, color='blue', label='validataion accuracy')
ax.legend()
fig.savefig('accuracy.png')
plt.show()


In [None]:
# Per-epoch loss curves (training vs. validation), saved to loss.png.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_loss, color='green', label='train loss')
ax.plot(val_loss, color='blue', label='validataion loss')
ax.legend()
fig.savefig('loss.png')
plt.show()


## **ResNet18 without oversampling**

In [None]:
import torchvision.models as models

# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Pretrained ResNet-18; its untouched weights are written to model_weights.pth.
model = models.resnet18(pretrained=True)
pretrained_state = model.state_dict()
torch.save(pretrained_state, 'model_weights.pth')

In [None]:
from torch import optim

# Freeze every pretrained parameter; only the new head created below is trained.
for param in model.parameters():
    param.requires_grad = False

# Replace the final layer with a fresh two-class head (new layers are trainable by
# default). The head ends in LogSoftmax, i.e. it outputs log-probabilities.
model.fc = nn.Sequential(nn.Linear(512, 512),
                         nn.ReLU(),
                         nn.Dropout(0.2),
                         nn.Linear(512, 2),
                         nn.LogSoftmax(dim=1))

# NLLLoss is the loss that consumes log-probabilities. CrossEntropyLoss expects raw
# logits and applies log-softmax itself, so pairing it with the LogSoftmax head
# normalised the outputs twice.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.fc.parameters(), lr=0.003)
model.to(device)

In [None]:
# Train `model` for n_epochs on train_data, validating on val_data after each epoch.
# Per-epoch metrics are collected in train_loss / train_acc / val_loss / val_acc
# (read by the plotting cells), and the weights with the lowest summed validation
# loss are checkpointed to 'resnet.pt'.
n_epochs = 10
print_every = 200  # log the current batch loss every N batches (was hard-coded in the loop)
valid_loss_min = np.inf  # np.Inf alias no longer exists in NumPy 2
val_loss = []
val_acc = []
train_loss = []
train_acc = []
total_step = len(train_data)

for epoch in range(1, n_epochs + 1):
    # Set train mode at the top of every epoch so the state never depends on what
    # ran before this cell (the validation pass below switches to eval mode).
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    print(f'Epoch {epoch}\n')
    for batch_idx, (data_, target_) in enumerate(train_data):
        data_ = data_.to(device)
        target_ = target_.to(device)

        optimizer.zero_grad()
        outputs = model(data_)
        loss = criterion(outputs, target_)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pred = outputs.argmax(dim=1)
        correct += torch.sum(pred == target_).item()
        total += target_.size(0)
        if batch_idx % print_every == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch, n_epochs, batch_idx, total_step, loss.item()))

    train_acc.append(100 * correct / total)
    train_loss.append(running_loss / total_step)
    # Report THIS epoch's loss; the previous np.mean(train_loss) averaged over all
    # epochs so far and hid the per-epoch trend.
    print(f'\ntrain-loss: {train_loss[-1]:.4f}, train-acc: {train_acc[-1]:.4f}')

    # ---- validation pass (no gradients, dropout/batch-norm in inference mode) ----
    batch_loss = 0
    total_t = 0
    correct_t = 0
    model.eval()
    with torch.no_grad():
        for data_t, target_t in val_data:
            data_t, target_t = data_t.to(device), target_t.to(device)
            outputs_t = model(data_t)
            batch_loss += criterion(outputs_t, target_t).item()
            pred_t = outputs_t.argmax(dim=1)
            correct_t += torch.sum(pred_t == target_t).item()
            total_t += target_t.size(0)

    val_acc.append(100 * correct_t / total_t)
    val_loss.append(batch_loss / len(val_data))
    print(f'validation loss: {val_loss[-1]:.4f}, validation acc: {val_acc[-1]:.4f}\n')

    # Checkpoint whenever the summed validation loss improves.
    # NOTE(review): the ResNet50 training cell writes to the same 'resnet.pt' file,
    # so this run overwrites that checkpoint.
    if batch_loss < valid_loss_min:
        valid_loss_min = batch_loss
        torch.save(model.state_dict(), 'resnet.pt')
        print('Improvement-Detected, save-model')

### **ResNet18 Performance**

In [None]:
from sklearn.metrics import (f1_score, accuracy_score, confusion_matrix,
                             classification_report, precision_score, recall_score)
import seaborn as sn
import pandas as pd

y_pred = []
y_true = []

# Evaluate in inference mode. The training cell above ends each epoch with
# model.train(), so without eval() dropout stays active and batch-norm statistics
# keep being updated by the test data; no_grad() avoids building autograd graphs.
# NOTE(review): this scores the in-memory (last-epoch) weights, not the best
# checkpoint saved to 'resnet.pt' -- load it first if that is what should be reported.
model.eval()
with torch.no_grad():
    # iterate over test data
    for inputs, labels in test_data:
        inputs = inputs.to(device)
        output = model(inputs)  # Feed Network

        # Predicted class = index of the largest output; exp() is not needed
        # because it does not change which entry is largest.
        y_pred.extend(output.argmax(dim=1).cpu().numpy())  # Save Prediction
        y_true.extend(labels.cpu().numpy())  # Save Truth

# constant for classes
classes = ('negative', 'posative')

print(precision_score(y_true, y_pred))
print(recall_score(y_true, y_pred))

# Build confusion matrix, expressed as a percentage of all test samples.
# (It was previously scaled by 10, which yields neither fractions nor percentages.)
cf_matrix = confusion_matrix(y_true, y_pred)
df_cm = pd.DataFrame(cf_matrix / np.sum(cf_matrix) * 100,
                     index=list(classes), columns=list(classes))
plt.figure(figsize=(12, 7))
sn.heatmap(df_cm, annot=True, fmt='.1f')
# NOTE(review): the ResNet50 evaluation cell also saves to 'output.png'.
plt.savefig('output.png')

In [None]:
# Per-epoch accuracy curves (training vs. validation), saved to accuracy.png.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_acc, color='green', label='train accuracy')
ax.plot(val_acc, color='blue', label='validataion accuracy')
ax.legend()
fig.savefig('accuracy.png')
plt.show()


In [None]:
# Per-epoch loss curves (training vs. validation), saved to loss.png.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_loss, color='green', label='train loss')
ax.plot(val_loss, color='blue', label='validataion loss')
ax.legend()
fig.savefig('loss.png')
plt.show()


## **VGG16 without oversampling**

In [None]:
import torch.onnx as onnx
import torchvision.models as models

# Run on the GPU when CUDA is available, otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Pretrained VGG-16; its untouched weights are written to model_weights.pth.
vgg16 = models.vgg16(pretrained=True)
pretrained_state = vgg16.state_dict()
torch.save(pretrained_state, 'model_weights.pth')

In [None]:
# change the number of classes
# Assigning to `out_features` only rewrites an attribute: the layer's weight and
# bias tensors keep their original shape, so the network would still emit the
# original number of logits. Replace the last classifier layer with a new
# two-output Linear layer instead (new layers are trainable by default).
vgg16.classifier[6] = nn.Linear(vgg16.classifier[6].in_features, 2)
# freeze convolution weights
for param in vgg16.features.parameters():
    param.requires_grad = False

In [None]:
from torch import optim

# Place the network on the target device; the training loop refers to it as `model`.
model = vgg16.to(device)

# loss function
criterion = nn.CrossEntropyLoss()

# optimizer -- only the classifier parameters are handed to Adam
classifier_params = vgg16.classifier.parameters()
optimizer = optim.Adam(classifier_params, lr=0.001)

In [None]:
# Train `model` for n_epochs on train_data, validating on val_data after each epoch.
# Per-epoch metrics are collected in train_loss / train_acc / val_loss / val_acc
# (read by the plotting cells), and the weights with the lowest summed validation
# loss are checkpointed to 'resnet.pt'.
n_epochs = 10
print_every = 200  # log the current batch loss every N batches (was hard-coded in the loop)
valid_loss_min = np.inf  # np.Inf alias no longer exists in NumPy 2
val_loss = []
val_acc = []
train_loss = []
train_acc = []
total_step = len(train_data)

for epoch in range(1, n_epochs + 1):
    # Set train mode at the top of every epoch so the state never depends on what
    # ran before this cell (the validation pass below switches to eval mode).
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    print(f'Epoch {epoch}\n')
    for batch_idx, (data_, target_) in enumerate(train_data):
        data_ = data_.to(device)
        target_ = target_.to(device)

        optimizer.zero_grad()
        outputs = model(data_)
        loss = criterion(outputs, target_)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        pred = outputs.argmax(dim=1)
        correct += torch.sum(pred == target_).item()
        total += target_.size(0)
        if batch_idx % print_every == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                  .format(epoch, n_epochs, batch_idx, total_step, loss.item()))

    train_acc.append(100 * correct / total)
    train_loss.append(running_loss / total_step)
    # Report THIS epoch's loss; the previous np.mean(train_loss) averaged over all
    # epochs so far and hid the per-epoch trend.
    print(f'\ntrain-loss: {train_loss[-1]:.4f}, train-acc: {train_acc[-1]:.4f}')

    # ---- validation pass (no gradients, dropout in inference mode) ----
    batch_loss = 0
    total_t = 0
    correct_t = 0
    model.eval()
    with torch.no_grad():
        for data_t, target_t in val_data:
            data_t, target_t = data_t.to(device), target_t.to(device)
            outputs_t = model(data_t)
            batch_loss += criterion(outputs_t, target_t).item()
            pred_t = outputs_t.argmax(dim=1)
            correct_t += torch.sum(pred_t == target_t).item()
            total_t += target_t.size(0)

    val_acc.append(100 * correct_t / total_t)
    val_loss.append(batch_loss / len(val_data))
    print(f'validation loss: {val_loss[-1]:.4f}, validation acc: {val_acc[-1]:.4f}\n')

    # Checkpoint whenever the summed validation loss improves.
    # NOTE(review): this VGG16 checkpoint is written to 'resnet.pt', the same file
    # the ResNet training cells use -- the name is misleading and the file gets
    # overwritten. Left unchanged in case something outside this notebook reads it.
    if batch_loss < valid_loss_min:
        valid_loss_min = batch_loss
        torch.save(model.state_dict(), 'resnet.pt')
        print('Improvement-Detected, save-model')

### **VGG Performance**

In [None]:
# Per-epoch accuracy curves (training vs. validation), saved to accuracy.png.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_acc, color='green', label='train accuracy')
ax.plot(val_acc, color='blue', label='validataion accuracy')
ax.legend()
fig.savefig('accuracy.png')
plt.show()


In [None]:
# Per-epoch loss curves (training vs. validation), saved to loss.png.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_loss, color='green', label='train loss')
ax.plot(val_loss, color='blue', label='validataion loss')
ax.legend()
fig.savefig('loss.png')
plt.show()


# ***Training on Oversampled preprocessing***

In [None]:
# Oversample the training split: each sample is drawn (with replacement) with a
# weight of 1 for class 0 and 3 for class 1.
random_seed = 42
torch.manual_seed(random_seed)
class_weights = [1, 3]

# Look the labels up from the stored ImageFolder targets instead of indexing
# train_ds: indexing the dataset loads and transforms every image just to read
# its label. train_ds comes from nested random_split calls, so its indices are
# mapped back through each Subset level to positions in the concatenated dataset.
indices = train_ds.indices
base_ds = train_ds.dataset
while isinstance(base_ds, torch.utils.data.Subset):
    indices = [base_ds.indices[i] for i in indices]
    base_ds = base_ds.dataset
all_targets = [t for part in base_ds.datasets for t in part.targets]
sample_weights = [class_weights[all_targets[i]] for i in indices]

sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
train_data = DataLoader(train_ds, batch_size=200, sampler=sampler)

In [None]:
# Tally the class labels seen in one full pass over the resampled training loader.
num_negative = 0
num_posative = 0
for _batch_images, batch_labels in train_data:
    num_posative += (batch_labels == 1).sum()
    num_negative += (batch_labels == 0).sum()
print(num_negative)
print(num_posative)

In [None]:
# Resample the validation split with class weights 1 (class 0) and 3 (class 1).
# NOTE(review): sampling evaluation data with replacement repeats some samples,
# omits others and changes the class balance the metrics are computed on --
# confirm this is intended for validation.
random_seed = 42
torch.manual_seed(random_seed)
class_weights = [1, 3]

# Look the labels up from the stored ImageFolder targets instead of indexing
# val_ds: indexing the dataset loads and transforms every image just to read
# its label. val_ds comes from nested random_split calls, so its indices are
# mapped back through each Subset level to positions in the concatenated dataset.
indices = val_ds.indices
base_ds = val_ds.dataset
while isinstance(base_ds, torch.utils.data.Subset):
    indices = [base_ds.indices[i] for i in indices]
    base_ds = base_ds.dataset
all_targets = [t for part in base_ds.datasets for t in part.targets]
sample_weights = [class_weights[all_targets[i]] for i in indices]

sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
val_data = DataLoader(val_ds, batch_size=200, sampler=sampler)

In [None]:
# Tally the class labels seen in one full pass over the resampled validation loader.
num_negative = 0
num_posative = 0
for _batch_images, batch_labels in val_data:
    num_posative += (batch_labels == 1).sum()
    num_negative += (batch_labels == 0).sum()
print(num_negative)
print(num_posative)

In [None]:
# Resample the test split with class weights 1 (class 0) and 3 (class 1).
# NOTE(review): sampling evaluation data with replacement repeats some samples,
# omits others and changes the class balance the metrics are computed on --
# confirm this is intended for the test set.
random_seed = 42
torch.manual_seed(random_seed)
class_weights = [1, 3]

# Look the labels up from the stored ImageFolder targets instead of indexing
# test_ds: indexing the dataset loads and transforms every image just to read
# its label. The loop maps indices back through any nested Subset level to
# positions in the concatenated dataset.
indices = test_ds.indices
base_ds = test_ds.dataset
while isinstance(base_ds, torch.utils.data.Subset):
    indices = [base_ds.indices[i] for i in indices]
    base_ds = base_ds.dataset
all_targets = [t for part in base_ds.datasets for t in part.targets]
sample_weights = [class_weights[all_targets[i]] for i in indices]

sampler = WeightedRandomSampler(sample_weights, num_samples=len(sample_weights), replacement=True)
test_data = DataLoader(test_ds, batch_size=200, sampler=sampler)

In [None]:
# Tally the class labels seen in one full pass over the resampled test loader.
num_negative = 0
num_posative = 0
for _batch_images, batch_labels in test_data:
    num_posative += (batch_labels == 1).sum()
    num_negative += (batch_labels == 0).sum()
print(num_negative)
print(num_posative)

## ***ResNet50 OVERSAMPLING***

In [None]:
# Loss curves (training vs. validation), saved to loss.png.
# NOTE(review): no training cell is visible in this section, so train_loss and
# val_loss still hold whatever the most recently executed training loop produced.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_loss, color='green', label='train loss')
ax.plot(val_loss, color='blue', label='validataion loss')
ax.legend()
fig.savefig('loss.png')
plt.show()


## ***ResNet18 OVERSAMPLING***

In [None]:
# Loss curves (training vs. validation), saved to loss.png.
# NOTE(review): no training cell is visible in this section, so train_loss and
# val_loss still hold whatever the most recently executed training loop produced.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_loss, color='green', label='train loss')
ax.plot(val_loss, color='blue', label='validataion loss')
ax.legend()
fig.savefig('loss.png')
plt.show()


## **VGG16 OVERSAMPLING**

In [None]:
# Loss curves (training vs. validation), saved to loss.png.
# NOTE(review): no training cell is visible in this section, so train_loss and
# val_loss still hold whatever the most recently executed training loop produced.
fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(train_loss, color='green', label='train loss')
ax.plot(val_loss, color='blue', label='validataion loss')
ax.legend()
fig.savefig('loss.png')
plt.show()
