# Corrected Dataset

The corrected dataset can be found [here](https://drive.google.com/drive/folders/10TXXa6B_D4AKuBV085tX7UudH1hINBRJ).

This dataset is organized into the following folders: 
* False Parasitized
* False Uninfected
* True Parasitized
* True Uninfected

## Definitions

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_datasets as tfds
from IPython.display import display
from PIL import Image
import random
import pickle
import pathlib

def show_samples(samples):
    x = int(len(samples) / 2)
    y = len(samples) - x
    for i in range(len(samples)):
        plt.subplot(x, y, i + 1)
        plt.subplots_adjust(hspace=1)
        plt.imshow(samples[i][0], cmap='gray')
        plt.title("Uninfected" if samples[i][1] else "Infected", loc='center')
        plt.ylabel(f"Height ({samples[i][0].shape[0]})")
        plt.xlabel(f"Width ({samples[i][0].shape[1]})")
    plt.show()

class MalariaDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe
        self.transform = transform
        self.n_samples = len(dataframe)

    def __len__(self):
        return self.n_samples

    def __getitem__(self, index):
        image, label = self.dataframe[index]
        if self.transform: 
            image = self.transform(image)
        return image, label

class CNN(nn.Module):
    def __init__(self): 
        super().__init__()
        self.conv_layers = nn.Sequential(
            nn.Conv2d(3, 16, 3), 
            nn.ReLU(),
            nn.MaxPool2d(2, 1),
            nn.Conv2d(16, 12, 3), 
            nn.ReLU(),
            nn.MaxPool2d(2, 1),
            nn.Conv2d(12, 8, 3), 
            nn.ReLU(),
            nn.MaxPool2d(2, 1),
            nn.Dropout(0.1),
            nn.Flatten()
        )
        self.fc_layers = nn.Sequential(
            nn.Linear(8 * 19 * 19, 512), 
            nn.ReLU(),
            nn.Linear(512, 16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )
    
    def forward(self, x): 
        output = self.conv_layers(x)
        output = self.fc_layers(output)
        return output

def evaluate(model, loader): 
    total_err = 0
    total_samples = 0
    for images, labels in loader: 
        total_err += int(((model(images) > 0.0).squeeze().long() != labels).sum())
        total_samples += len(labels)
    return float(total_err) / total_samples

## Loading Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
input_size = 64
data_dir = f"/content/drive/MyDrive/APS360/Data/corrected_{input_size}.pickle"
df = pd.read_pickle(data_dir)

In [None]:
# def changeToInt(image): 
#   image = image * 255
#   return image.astype(np.uint8)

# def resize(image, size): 
#   return np.array(Image.fromarray(image).resize(size, resample=Image.BOX))
  
# df["image"] = df["image"].apply(lambda image: changeToInt(image))
# df["image"] = df["image"].apply(lambda x: resize(x, (28, 28)))

## Model Setup

In [14]:
np.random.seed(1000)
np_df = df.to_numpy()
np.random.shuffle(np_df)
df_trn = np_df[:int(len(np_df) * 0.8)]
df_val = np_df[int(len(np_df) * 0.8):]

batch_size = 32

train_dataset = MalariaDataset(df_trn, transform=transforms.ToTensor())
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
val_dataset = MalariaDataset(df_val, transform=transforms.ToTensor())
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

num_steps = len(train_loader)
learning_rate = 0.001

model = CNN()
loss = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

model_path = pathlib.Path(f"/content/drive/MyDrive/APS360/Models/malaria_corrected_bs{batch_size}_lr{learning_rate}")
model_path.mkdir(parents=True, exist_ok=True)

## Model Training

In [None]:
torch.manual_seed(1000)
num_epochs = 300

train_loss = np.zeros(num_epochs)
val_error = np.zeros(num_epochs)

for epoch in range(num_epochs):
    running_loss = 0 
    for images, labels in train_loader: 
        # ensure labels are same size and type
        labels = labels.unsqueeze(1).float()
        # forward pass
        outputs = model(images)
        loss_value = loss(outputs, labels)
        # backward pass
        optimizer.zero_grad()
        loss_value.backward()
        optimizer.step()
        # for statistics
        running_loss += loss_value.item()
    # compute average loss 
    train_loss[epoch] = running_loss / num_steps
    # evaluate on validation set
    val_error[epoch] = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss_value.item():.6f}, Validation Error: {val_error[epoch]:.6f}")
    # save model after each epoch
    torch.save(model.state_dict(), model_path / f"epoch_{epoch+1}")

# show total loss
plt.title("Total Loss")
plt.plot(range(1, len(train_loss) + 1), train_loss)
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()
plt.title("Validation Error")
plt.plot(range(1, len(val_error) + 1), val_error)
plt.xlabel("Epochs")
plt.ylabel("Error")
plt.show()

Epoch 1/300, Loss: 0.081750, Validation Error: 0.061139
Epoch 2/300, Loss: 0.024062, Validation Error: 0.028302
Epoch 3/300, Loss: 0.054863, Validation Error: 0.026125
Epoch 4/300, Loss: 0.066525, Validation Error: 0.027758
Epoch 5/300, Loss: 0.071794, Validation Error: 0.027213
Epoch 6/300, Loss: 0.054551, Validation Error: 0.023403
Epoch 7/300, Loss: 0.046548, Validation Error: 0.022678
Epoch 8/300, Loss: 0.004271, Validation Error: 0.023222
Epoch 9/300, Loss: 0.018773, Validation Error: 0.023041
Epoch 10/300, Loss: 0.004302, Validation Error: 0.024673
Epoch 11/300, Loss: 0.003260, Validation Error: 0.024855
Epoch 12/300, Loss: 0.008499, Validation Error: 0.023222


## Model Evaluation

In [None]:
net = CNN()
state = torch.load(model_path / 'epoch_number')
net.load_state_dict(state)

incorrect = []
total_correct = 0
for images, labels in val_loader: 
    output = (net(images) > 0.0).squeeze().long()
    for pred, label, image in zip(output, labels, images): 
        if pred != label: incorrect.append((image, label))
        else: total_correct += 1
accuracy = float(total_correct) / len(df_val)
print(f"Validation Accuracy: {accuracy:.4f}")

incorrect = [[np.transpose(image.numpy(),(2, 1, 0)), label] for (image, label) in incorrect]
print(f"Number of Incorrect Images: {len(incorrect)}")

for i in range(0, 18, 6): 
    show_samples(incorrect[i:i+6])