# BreakHis Image Classification with 🤗 Vision Transformers and `TensorFlow`

## Installation

In [None]:
# !pip install transformers datasets "tensorflow==2.6.0" tensorflow-addons --upgrade

## Setup & Configuration

In this step, we will define global configurations and parameters, which are used across the whole end-to-end fine-tuning process, e.g. `feature extractor` and `model` we will use. 

In this example we are going to fine-tune [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k), a Vision Transformer (ViT) pre-trained on ImageNet-21k (14 million images, 21,843 classes) at resolution 224x224.
There are also [large](https://huggingface.co/google/vit-large-patch16-224-in21k) and [huge](https://huggingface.co/google/vit-huge-patch14-224-in21k) flavors of the original ViT.

In [None]:
from pathlib import Path
# Number of cross-validation folds; one train_metrics_<idx>.csv is read per fold.
# NOTE(review): a later cell uses best_model_info["n_splits"] instead of this
# constant — confirm the two always agree.
n_splits = 5

# Directory the notebook is executed from; results and data are resolved
# relative to it.
cwd = Path().absolute()
# Root directory that holds one sub-directory per training run.
results_path = cwd / 'results_100x'


# Display the resolved results directory.
results_path

In [None]:
from datasets import load_dataset
import json
from keras.utils import to_categorical
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import tensorflow_addons as tfa
from transformers import create_optimizer, DefaultDataCollator, ViTImageProcessor, TFViTForImageClassification


## Dataset & Pre-processing

- **Data Source:** https://www.kaggle.com/code/nasrulhakim86/breast-cancer-histopathology-images-classification/data
- The Breast Cancer Histopathological Image Classification (BreakHis) dataset is composed of 7,909 microscopic images of breast tumor tissue collected from 82 patients.
- The images are collected using different magnifying factors (40X, 100X, 200X, and 400X). 
- To date, it contains 2,480 benign and 5,429 malignant samples (700X460 pixels, 3-channel RGB, 8-bit depth in each channel, PNG format).
- This database has been built in collaboration with the P&D Laboratory – Pathological Anatomy and Cytopathology, Parana, Brazil (http://www.prevencaoediagnose.com.br).
- Each image filename stores information about the image itself: method of procedure biopsy, tumor class, tumor type, patient identification, and magnification factor. 
- For example, `SOB_B_TA-14-4659-40-001.png` is image 1, at magnification factor 40X, of a benign tumor of type tubular adenoma, originating from slide 14-4659, which was collected by procedure SOB.

`BreakHis` is not yet available as a dataset in the `datasets` library. To be able to create a `Dataset` instance we need to write a small helper function, which will load our `Dataset` from the filesystem and create the instance to use later for training.

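This notebook assumes that the dataset is available in a directory next to this file named `breakhis_<zoom>x` (for example `breakhis_400x`), where `<zoom>` is the magnification stored in the selected model's `model_info` JSON file.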

In [None]:
# Names of the sub-directories found directly under results_path
# (one per training run).
output_paths = [entry.name for entry in os.scandir(results_path) if entry.is_dir()]

output_paths

#### Find the best model

In [None]:
import pandas as pd

def find_best_model_idx_and_acc(output_path):
    """Pick the cross-validation fold with the highest final validation accuracy.

    Reads ``train_metrics_<idx>.csv`` for every ``idx`` in ``range(n_splits)``
    from ``results_path / output_path`` and compares the ``val_accuracy``
    value stored in the last row of each file.

    Args:
        output_path: Name of the run directory inside ``results_path``.

    Returns:
        Tuple ``(best_model_index, best_val_accuracy)``. On ties the lowest
        fold index wins. The index is ``None`` (with accuracy ``0.0``) only
        when no fold has a non-NaN final ``val_accuracy``.
    """
    run_dir = results_path / output_path
    csv_files = [run_dir / f'train_metrics_{idx}.csv' for idx in range(n_splits)]
    dataframes = [pd.read_csv(file) for file in csv_files]

    best_model_index = None
    best_val_accuracy = 0.0

    for i, df in enumerate(dataframes):
        val_accuracy = df.iloc[-1]['val_accuracy']
        # A NaN never compares greater than anything; skip it explicitly so it
        # can neither win nor block later folds.
        if pd.isna(val_accuracy):
            continue
        # Accept the first valid fold unconditionally: starting from 0.0 with a
        # strict ">" would leave the index at None when every fold scores 0.0.
        if best_model_index is None or val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            best_model_index = i

    print(f"Best model index: {best_model_index}, val_accuracy: {best_val_accuracy}")
    return best_model_index, best_val_accuracy

In [None]:
# Map every run directory to its (best fold index, final validation accuracy).
best_models = {
    output_path: find_best_model_idx_and_acc(output_path)
    for output_path in output_paths
}

print(best_models)

In [None]:
def compute_mean(x):
    """Average a list-valued metric cell read back from a metrics CSV.

    Accepted inputs:
      * a JSON string that decodes to another string such as ``[0.9, 0.8]``
        (the only form the previous implementation handled);
      * a JSON array string such as ``"[0.9, 0.8]"`` or a JSON number string;
      * a value that is already numeric, which is returned as a float so that
        applying this function twice to the same column is harmless.

    Args:
        x: Cell value to convert.

    Returns:
        The mean of the parsed values, or ``None`` (after printing a message)
        when ``x`` cannot be converted to a list of floats.
    """
    # Already numeric: nothing to parse. json.loads would raise TypeError here.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return float(x)
    try:
        parsed = json.loads(x)
        if isinstance(parsed, str):
            # Doubly-encoded list: strip the brackets and split on commas
            # (float() tolerates the surrounding whitespace).
            parsed = parsed.replace("[", "").replace("]", "").split(",")
        elif isinstance(parsed, (int, float)):
            parsed = [parsed]
        # Convert every element to float before averaging.
        values = [float(item) for item in parsed]
        return np.mean(values)
    except (ValueError, TypeError) as e:
        print(f"Cannot convert {x} to list of floats: {e}")
        return None


def calculate_mean_metrics(output_path):
    """Aggregate the last-row metrics of all folds of one run.

    For every fold CSV in ``results_path / output_path`` the ``f1_score`` and
    ``val_f1_score`` columns are reduced with ``compute_mean``; the last row of
    each fold is then combined into a per-metric mean and standard deviation.
    The result is written to ``train_metrics_mean_with_std.json`` in the run
    directory.

    Args:
        output_path: Name of the run directory inside ``results_path``.

    Returns:
        Dict mapping each metric name to ``{"mean": ..., "std": ...}`` plus an
        ``"output_path"`` entry (the latter is not part of the JSON file).
    """
    run_dir = results_path / output_path
    numeric_columns = [
        'accuracy', 'auc', 'loss', 'precision', 'recall', 'f1_score',
        'val_accuracy', 'val_auc', 'val_loss', 'val_precision', 'val_recall', 'val_f1_score',
    ]

    fold_frames = [pd.read_csv(run_dir / f'train_metrics_{idx}.csv') for idx in range(n_splits)]

    for frame in fold_frames:
        for f1_column in ('f1_score', 'val_f1_score'):
            frame[f1_column] = frame[f1_column].apply(compute_mean)

    # One column per fold, one row per metric.
    final_rows = pd.concat([frame[numeric_columns].iloc[-1] for frame in fold_frames], axis=1)
    mean_metrics = final_rows.mean(axis=1)
    std_metrics = final_rows.std(axis=1)

    metrics = {}
    for metric_name in mean_metrics.index:
        metrics[metric_name] = {
            "mean": mean_metrics[metric_name],
            "std": std_metrics[metric_name],
        }

    with open(run_dir / 'train_metrics_mean_with_std.json', 'w') as f:
        json.dump(metrics, f, indent=4)

    # Added after the dump so the run name is returned but not persisted.
    metrics["output_path"] = output_path

    return metrics

In [None]:
# Aggregated fold metrics for every run directory.
mean_metrics = [calculate_mean_metrics(output_path) for output_path in output_paths]

# Run with the highest mean validation accuracy (first one wins on ties).
max_val_acc = max(mean_metrics, key=lambda run: run['val_accuracy']['mean'])
max_val_acc

In [None]:
# One line per run: directory name and mean validation accuracy.
for run_metrics in mean_metrics:
    print(run_metrics['output_path'], run_metrics['val_accuracy']['mean'])

In [None]:
# Select the run with the highest mean validation accuracy, then the best fold
# inside that run, and load the JSON describing that fold's model.
best_model_output_path = max_val_acc["output_path"]
best_model_index = best_models[best_model_output_path][0]
with (results_path / best_model_output_path / f'model_info_{best_model_index}.json').open('r') as f:
    best_model_info = json.load(f)

best_model_info


In [None]:

# Show the name of the run directory that was selected as best.
print(best_model_output_path)

In [None]:
# Dataset directory, named after the "zoom" value recorded in the selected
# model's info file (e.g. breakhis_100x).
input_path = cwd / f'breakhis_{best_model_info["zoom"]}x'

In [None]:
import pandas as pd

# Per-fold training logs of the selected run, one DataFrame per fold.
csv_files = [
    results_path.joinpath(best_model_output_path, f'train_metrics_{fold}.csv')
    for fold in range(best_model_info["n_splits"])
]
dataframes = list(map(pd.read_csv, csv_files))


In [None]:

# Reduce the list-valued F1 columns to one mean value per row.
# Columns that are already numeric are left alone, so re-running this cell
# does not feed floats into compute_mean (which previously raised a TypeError
# on the second run).
for df in dataframes:
    for f1_column in ('f1_score', 'val_f1_score'):
        if not pd.api.types.is_numeric_dtype(df[f1_column]):
            df[f1_column] = df[f1_column].apply(compute_mean)

In [None]:
# Training and validation metrics are listed separately, in matching order:
# entry k of one list is plotted next to entry k of the other.
train_metrics = ['accuracy', 'loss', 'auc', 'precision', 'recall', 'f1_score']
val_metrics = ['val_accuracy', 'val_loss', 'val_auc', 'val_precision', 'val_recall', 'val_f1_score']

assert len(train_metrics) == len(val_metrics), "Liczba metryk treningowych musi być taka sama jak liczba metryk walidacyjnych"

import ast
import numpy as np

num_metrics = len(train_metrics)

# One row per metric: training curves on the left, validation curves on the
# right. The figure height scales with the number of rows so the panels stay
# roughly square; squeeze=False keeps `axes` two-dimensional even for one row.
fig, axes = plt.subplots(num_metrics, 2, figsize=(12, num_metrics * 3), squeeze=False)

# Polish figure texts (restored from their mis-encoded form so the saved
# figure shows proper diacritics).
title_postfix = f'(powiększenie {best_model_info["zoom"]}x)'
train_title_midfix = 'na zbiorze treningowym'
val_title_midfix = 'na zbiorze walidacyjnym'
metrics_mapping = {'accuracy': 'Skuteczność', 'auc': 'AUC', 'loss': 'Strata', 'precision': 'Precyzja', 'recall': 'Czułość', 'f1_score': 'F1-score',
                   'val_accuracy': 'Skuteczność', 'val_auc': 'AUC', 'val_loss': 'Strata', 'val_precision': 'Precyzja', 'val_recall': 'Czułość', 'val_f1_score': 'F1-score'}

for i, (train_metric, val_metric) in enumerate(zip(train_metrics, val_metrics)):
    # One curve per cross-validation fold in each panel.
    for j, df in enumerate(dataframes):
        axes[i, 0].plot(df[train_metric], label=f'Fold {j}')
        axes[i, 1].plot(df[val_metric], label=f'Fold {j}')

    train_title = f'{metrics_mapping[train_metric]} {train_title_midfix} {title_postfix}'
    val_title = f'{metrics_mapping[val_metric]} {val_title_midfix} {title_postfix}'

    axes[i, 0].set_title(train_title)
    axes[i, 0].set_ylabel(metrics_mapping[train_metric])
    # Fixed: the x-label was previously set on the right-hand axes twice,
    # leaving the training panel without one.
    axes[i, 0].set_xlabel('Epoka')
    axes[i, 0].legend()

    axes[i, 1].set_title(val_title)
    axes[i, 1].set_ylabel(metrics_mapping[val_metric])
    axes[i, 1].set_xlabel('Epoka')
    axes[i, 1].legend()

plt.tight_layout()
# Save before show(): the figure is written next to the selected run's logs.
plt.savefig(results_path / best_model_output_path / f'train_metrics_{best_model_info["zoom"]}.png')
plt.show()


#### Final evaluation on test dataset

In [None]:
# Turn off TensorFlow's traceback filtering so errors show full stack traces.
tf.debugging.disable_traceback_filtering()


# Image processor loaded from the identifier stored under 'model_id' in the
# selected model's info file.
image_processor = ViTImageProcessor.from_pretrained(best_model_info['model_id'])


def remove_extra_dim(example):
    """Drop the leading length-1 axis of ``example['pixel_values']``.

    The example is modified in place and also returned.
    """
    example.update(pixel_values=np.squeeze(example['pixel_values'], axis=0))
    return example


def process_example(image):
    """Run ``image`` through the module-level ``image_processor`` and return
    the ``pixel_values`` entry of its output (TensorFlow tensors requested)."""
    return image_processor(image, return_tensors='tf')['pixel_values']


def process_dataset(example):
    """Attach model inputs and a two-class categorical label to one example.

    The image at ``example['file_loc']`` is opened, converted to RGB and passed
    to ``process_example``; the result is stored under ``'pixel_values'``.
    ``example['label']`` is replaced by ``to_categorical(label, num_classes=2)``.
    The example is modified in place and returned.
    """
    rgb_image = Image.open(example['file_loc']).convert("RGB")
    # The extra leading axis of pixel_values is not removed here.
    example['pixel_values'] = process_example(rgb_image)
    example['label'] = to_categorical(example['label'], num_classes=2)
    return example


def load_test_data():
    """Load ``test.csv`` from ``input_path`` and prepare it for the model.

    The CSV is loaded as the ``'test'`` split, every row is passed through
    ``process_dataset`` and then through ``remove_extra_dim``. The number of
    loaded samples is printed between the two passes.

    Returns:
        The mapped dataset object containing the ``'test'`` split.
    """
    csv_location = str(input_path / 'test.csv')
    splits = load_dataset('csv', data_files={'test': csv_location})

    processed = splits.map(process_dataset, with_indices=False, num_proc=1)

    sample_count = len(processed['test'])
    print(f"Loaded test dataset: {sample_count} samples")

    return processed.map(remove_extra_dim)


# Load and preprocess the test split (runs the image processor on every row).
test_dataset = load_test_data()

# Collator configured to return TensorFlow tensors when batching.
data_collator = DefaultDataCollator(return_tensors="tf")

# Batched tf.data view of the test split: pixel_values as inputs, label as
# target. shuffle=False keeps the batches in dataset order; later cells pair
# predictions with file_loc and label by position.
test_dataset_tf = test_dataset['test'].to_tf_dataset(
    columns=['pixel_values'],
    label_cols=['label'],
    shuffle=False,
    batch_size=best_model_info['batch_size'],
    collate_fn=data_collator)


In [None]:
# Load the saved model of the selected fold from the selected run directory.
best_model = TFViTForImageClassification.from_pretrained(results_path / best_model_output_path / f'model_{best_model_index}')

# Binary cross-entropy computed directly on logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# NOTE(review): only the two AUC metrics are told the inputs are logits. The
# accuracy, recall, precision and F1 metrics carry no such flag, so if they
# receive raw logits their 0.5 thresholds apply to logits rather than to
# probabilities — confirm this matches the training setup before comparing.
metrics = [
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.AUC(name='auc', from_logits=True),
    tf.keras.metrics.AUC(name='auc_multi', from_logits=True, num_labels=2, multi_label=True),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.Precision(name='precision'),
    tfa.metrics.F1Score(name='f1_score', num_classes=2, threshold=0.5),
]

# The optimizer is rebuilt from the hyper-parameters stored in the model info
# file so the model can be compiled; only evaluate() is called on it below.
# The second return value of create_optimizer is not used.
optimizer, _ = create_optimizer(
    init_lr=best_model_info['learning_rate'],
    num_train_steps=best_model_info['num_train_steps'],
    weight_decay_rate=best_model_info['weight_decay_rate'],
    num_warmup_steps=best_model_info['num_warmup_steps'],
)
best_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Loss and metric values on the test set, paired with
# best_model.metrics_names in the next cell.
results = best_model.evaluate(test_dataset_tf)

In [None]:
print("Test dataset evaluation results:")
test_metrics = {}
for metric, value in zip(best_model.metrics_names, results):
    print(metric, value)
    # Everything is stored as text: arrays become lists of strings, scalars a
    # single string.
    value = [str(v) for v in value] if isinstance(value, np.ndarray) else str(value)
    test_metrics[metric] = value

with (results_path / best_model_output_path / 'test_metrics.json').open('w') as test_metrics_json:
    json.dump(test_metrics, test_metrics_json, indent=4)

#### Let's take a look at the details of the predictions

In [None]:
# Run the model over the whole test set; the output exposes raw logits.
preds = best_model.predict(test_dataset_tf)
# Normalise the logits over the last axis so each row sums to 1.
# NOTE(review): the model was compiled with a binary cross-entropy loss on
# logits, while softmax is used here — the argmax taken later is the same
# either way, but the probability values themselves may not match the
# training-time interpretation; confirm if they are reported anywhere.
probabilities = tf.nn.softmax(preds.logits, axis=-1)


In [None]:
# Display the raw prediction output.
preds

In [None]:
# Predicted class per sample: index of the largest probability on the last axis.
labels_pred = np.argmax(probabilities, axis=-1)
labels_pred


In [None]:
# Display the stored test labels (the categorical vectors produced by
# process_dataset) for comparison with labels_pred.
np.array(test_dataset['test']['label'])


In [None]:
# Extract file locations and real labels from the test dataset
file_locs = [example['file_loc'] for example in test_dataset['test']]
labels = [np.argmax(example['label']) for example in test_dataset['test']]

# Create a DataFrame
results_df = pd.DataFrame(
    {'file_loc': file_locs, 'label': labels, 'label_pred': labels_pred})

# Save the DataFrame as a CSV file
results_df.to_csv(results_path / best_model_output_path / f'test_results.csv', index=False)
