# BreakHis Image Classification with 🤗 Vision Transformers and `TensorFlow`

## Setup & Configuration

In this step, we define the global configuration and parameters that are used across the whole end-to-end fine-tuning process, e.g. the `feature extractor` and the `model` we will use.

In this example we are going to fine-tune [google/vit-base-patch16-224-in21k](https://huggingface.co/google/vit-base-patch16-224-in21k), a Vision Transformer (ViT) pre-trained on ImageNet-21k (14 million images, 21,843 classes) at a resolution of 224x224.
There are also [large](https://huggingface.co/google/vit-large-patch16-224-in21k) and [huge](https://huggingface.co/google/vit-huge-patch14-224-in21k) flavors of the original ViT.

In [1]:
from pathlib import Path
# Number of data splits in the experiment
# (presumably one `model_<idx>` folder per split — TODO confirm).
n_splits = 5

# Everything is resolved against the directory the notebook was started from;
# the run's artefacts live under ./results/<run name>.
cwd = Path.cwd()
results_path = cwd.joinpath('results', '400x_ConvNext_PT_patches224')

results_path

PosixPath('/home/miki/repos/uz/breakhis/vcs/results/400x_ConvNext_PT_patches224')

In [2]:
from datasets import load_dataset
import json
# from keras.utils import to_categorical
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import os
# import tensorflow as tf
# import tensorflow_addons as tfa
from transformers import create_optimizer, DefaultDataCollator, AutoImageProcessor, AutoModelForImageClassification


  from .autonotebook import tqdm as notebook_tqdm


#### List all models

In [28]:
# Every sub-directory of the results folder counts as one model output;
# sorting gives a stable order that the later index-based lookups rely on.
output_paths = sorted(entry.name for entry in os.scandir(results_path) if entry.is_dir())

output_paths

['model_0', 'model_1', 'model_2', 'model_3', 'model_4']

#### Find the best model

In [29]:
import pandas as pd

def find_best_model_idx_and_acc(output_paths, base_path=None):
    """Print each model's evaluation metrics and pick the one with the highest F1.

    Despite the ``acc`` in the name, the selection is driven by ``eval_f1``;
    accuracy and loss are only printed.

    Args:
        output_paths: Names of the per-model sub-directories. Each one must
            contain an ``all_results.json`` with the keys ``eval_accuracy``,
            ``eval_loss`` and ``eval_f1``.
        base_path: Directory that holds those sub-directories. Defaults to the
            notebook-level ``results_path``.

    Returns:
        ``(best_model_index, best_f1_score)``, where the index is a position in
        ``output_paths``. Ties keep the earliest model. For an empty
        ``output_paths`` the result is ``(None, 0.0)``.

    Raises:
        FileNotFoundError: If a model directory has no ``all_results.json``.
        KeyError: If a results file lacks one of the expected keys.
    """
    root = results_path if base_path is None else Path(base_path)

    best_model_index = None
    best_f1_score = 0.0

    for i, output_path in enumerate(output_paths):
        with open(root / output_path / 'all_results.json') as json_file:
            data = json.load(json_file)

        print(f"Model {i}: e_acc: {data['eval_accuracy']:.3f}, e_loss: {data['eval_loss']:.3f}, e_f1: {data['eval_f1']:.3f}")
        f1_score = data["eval_f1"]
        # The `is None` check lets the first model win even when its F1 is 0.0,
        # so a non-empty input never ends up with a None index.
        if best_model_index is None or f1_score > best_f1_score:
            best_f1_score = f1_score
            best_model_index = i

    print(f"Best model index: {best_model_index}, f1_score: {best_f1_score}")
    return best_model_index, best_f1_score

In [30]:
# Holds the (index, f1) tuple returned by the helper, not a model object.
best_model = find_best_model_idx_and_acc(output_paths)


Model 0: e_acc: 0.783, e_loss: 0.546, e_f1: 0.848
Model 1: e_acc: 0.742, e_loss: 1.186, e_f1: 0.841
Model 2: e_acc: 0.895, e_loss: 0.524, e_f1: 0.927
Model 3: e_acc: 0.808, e_loss: 0.891, e_f1: 0.841
Model 4: e_acc: 0.845, e_loss: 0.508, e_f1: 0.878
Best model index: 2, f1_score: 0.9266187050359712


In [32]:
def calculate_mean_metrics(n_splits=5, base_path=None):
    """Aggregate the per-split results into a mean and standard deviation per metric.

    Reads ``model_<idx>/all_results.json`` for ``idx`` in ``range(n_splits)``,
    keeps only the known numeric metrics, prints the per-split table and writes
    the summary to ``train_metrics_mean_with_std.json`` in the base directory.

    Args:
        n_splits: Number of ``model_<idx>`` folders to read. Defaults to 5, the
            value that used to be hard-coded in this function.
        base_path: Directory containing the ``model_<idx>`` folders; the summary
            file is written there too. Defaults to the notebook-level
            ``results_path``.

    Returns:
        A dict mapping each metric name to ``{"mean": ..., "std": ...}``, plus an
        extra ``"output_path"`` entry (a string) pointing at the written file.
        ``std`` is whatever ``DataFrame.std()`` yields with its defaults.
    """
    numeric_columns = ['eval_accuracy', 'eval_auc', 'eval_f1', 'eval_loss', 'eval_precision', 'eval_recall', 'train_loss']
    root = results_path if base_path is None else Path(base_path)

    data_list = []
    for idx in range(n_splits):
        with open(root / f'model_{idx}' / 'all_results.json') as f:
            data = json.load(f)
        # Keep only the metrics of interest; other keys in the file are ignored.
        data_list.append({k: v for k, v in data.items() if k in numeric_columns})

    df = pd.DataFrame(data_list)

    mean_metrics = df.mean()
    std_metrics = df.std()

    print(df)

    # Plain floats keep the JSON dump independent of NumPy scalar types.
    metrics = {
        metric_name: {
            "mean": float(mean_metrics[metric_name]),
            "std": float(std_metrics[metric_name]),
        }
        for metric_name in mean_metrics.index
    }

    output_file = root / 'train_metrics_mean_with_std.json'
    with open(output_file, 'w') as f:
        json.dump(metrics, f, indent=4)

    # Added after the dump on purpose: the path is returned but not stored in the file.
    metrics["output_path"] = str(output_file)

    return metrics


In [33]:
# Aggregate the per-split results; the helper returns a single dict and also
# writes the summary JSON to disk.
mean_metrics = calculate_mean_metrics()

# NOTE(review): the two commented lines below treat `mean_metrics` as a list of
# summaries keyed by `val_accuracy`; that does not match the dict returned above.
# max_val_acc = max(mean_metrics, key=lambda x: x['val_accuracy']['mean'])
# max_val_acc

   eval_accuracy  eval_auc   eval_f1  eval_loss  eval_precision  eval_recall  \
0       0.783499  0.868032  0.848391   0.545576        0.825660     0.872408   
1       0.742038  0.765488  0.841280   1.185728        0.931983     0.766667   
2       0.894737  0.874649  0.926619   0.523717        0.864430     0.998450   
3       0.808282  0.816104  0.841438   0.891135        0.845370     0.837542   
4       0.845238  0.908499  0.877909   0.507963        0.895087     0.861378   

   train_loss  
0    0.103908  
1    0.091258  
2    0.101322  
3    0.088616  
4    0.092490  


In [None]:
# `calculate_mean_metrics` returns one dict (metric name -> {"mean", "std"}, plus an
# "output_path" entry). Iterating it yields its string keys, so the old
# `mm['output_path']` raised TypeError. The results files carry `eval_*` metrics,
# hence `eval_accuracy` rather than `val_accuracy`.
print(mean_metrics['output_path'], mean_metrics['eval_accuracy']['mean'])

In [None]:
# `best_model` is the (index, f1) tuple returned by `find_best_model_idx_and_acc`,
# and the index is a position in `output_paths`. Run this cell before the evaluation
# cell further down, which rebinds `best_model` to the loaded model.
best_model_index = best_model[0]
best_model_output_path = output_paths[best_model_index]

# NOTE(review): assumes training stored `model_info_<idx>.json` inside the model's own
# output folder — confirm the location against the training script.
with open(results_path / best_model_output_path / f'model_info_{best_model_index}.json', 'r') as f:
    best_model_info = json.load(f)

best_model_info


In [None]:

# Show which output folder the selected model is read from.
print(best_model_output_path)

In [None]:
# The dataset folder name is derived from the `zoom` entry of the model info;
# `test.csv` is read from this folder later on.
input_path = cwd.joinpath(f'breakhis_{best_model_info["zoom"]}x')

In [None]:
import pandas as pd

# One training-history CSV per split, all stored next to the selected model.
metrics_dir = results_path / best_model_output_path
csv_files = [metrics_dir / f'train_metrics_{idx}.csv' for idx in range(best_model_info["n_splits"])]
dataframes = list(map(pd.read_csv, csv_files))


In [None]:

def compute_mean(value):
    """Collapse one F1 cell to a single float by averaging its entries.

    `compute_mean` was called here without being defined anywhere in the
    notebook, so the cell raised NameError. This is a reconstruction.

    NOTE(review): assumes per-class scores were serialised as a bracketed string
    such as "[0.91 0.88]" or "[0.91, 0.88]" — confirm against the CSV files.

    Args:
        value: A string holding bracketed numbers, or an already numeric value
            (e.g. when the cell is re-run on converted columns).

    Returns:
        The mean as a float; NaN when a string contains no numbers.
    """
    if isinstance(value, str):
        cleaned = value.strip().strip('"\'').strip('[]')
        tokens = cleaned.replace(',', ' ').split()
        if not tokens:
            return float('nan')
        return float(np.mean([float(token) for token in tokens]))
    # Already numeric: averaging a scalar returns it unchanged, so re-running is safe.
    return float(np.mean(value))


for df in dataframes:
    df['f1_score'] = df['f1_score'].apply(compute_mean)
    df['val_f1_score'] = df['val_f1_score'].apply(compute_mean)

In [None]:
import ast
import numpy as np

# Training metric columns and their validation counterparts, paired by position.
train_metrics = ['accuracy', 'loss', 'auc', 'precision', 'recall', 'f1_score']
val_metrics = ['val_accuracy', 'val_loss', 'val_auc', 'val_precision', 'val_recall', 'val_f1_score']

assert len(train_metrics) == len(val_metrics), "Number of train and validation metrics must be equal!"

num_metrics = len(train_metrics)

# One row per metric: training curves on the left, validation curves on the right.
fig, axes = plt.subplots(num_metrics, 2, figsize=(12, num_metrics * 3))

title_postfix = f'(Zoom {best_model_info["zoom"]}x)'
train_title_midfix = 'on training set'
val_title_midfix = 'on validation set' 
# Display names for the column names used above.
metrics_mapping = {'accuracy': 'Accuracy', 'auc': 'AUC', 'loss': 'Loss', 'precision': 'Precision', 'recall': 'Recall', 'f1_score': 'F1-score',
                   'val_accuracy': 'Accuracy', 'val_auc': 'AUC', 'val_loss': 'Loss', 'val_precision': 'Precision', 'val_recall': 'Recall', 'val_f1_score': 'F1-score'}

for i, (train_metric, val_metric) in enumerate(zip(train_metrics, val_metrics)):
    # Overlay every fold on the same axes so the folds can be compared directly.
    for j, df in enumerate(dataframes):
        axes[i, 0].plot(df[train_metric], label=f'Fold {j}')
        axes[i, 1].plot(df[val_metric], label=f'Fold {j}')

    train_title = f'{metrics_mapping[train_metric]} {train_title_midfix} {title_postfix}'
    val_title = f'{metrics_mapping[val_metric]} {val_title_midfix} {title_postfix}'

    axes[i, 0].set_title(train_title)
    axes[i, 0].set_ylabel(metrics_mapping[train_metric])
    # Fix: the x-label used to be set on column 1 twice, leaving the training
    # (left) panels without one.
    axes[i, 0].set_xlabel('Epoka')
    axes[i, 0].legend()

    axes[i, 1].set_title(val_title)
    axes[i, 1].set_ylabel(metrics_mapping[val_metric])
    axes[i, 1].set_xlabel('Epoka')
    axes[i, 1].legend()

plt.tight_layout()
# Save before showing so the written file contains the rendered figure.
plt.savefig(results_path / best_model_output_path / f'train_metrics_{best_model_info["zoom"]}.png')
plt.show()


#### Final evaluation on test dataset

In [None]:
# NOTE(review): `tf` is used here, but `import tensorflow as tf` is commented out in
# the imports cell; it has to be restored for this call to run.
tf.debugging.disable_traceback_filtering()


# `ViTImageProcessor` was never imported in this notebook, so the call raised
# NameError. `AutoImageProcessor` is already imported from `transformers` and picks
# the processor class from the checkpoint's own configuration.
image_processor = AutoImageProcessor.from_pretrained(best_model_info['model_id'])


def remove_extra_dim(example):
    """Drop the leading axis of ``example['pixel_values']``.

    The example is updated in place and the same object is returned, which is
    the shape of callback expected by ``Dataset.map``.
    """
    pixel_values = example['pixel_values']
    example['pixel_values'] = np.squeeze(pixel_values, axis=0)
    return example


def process_example(image):
    """Run the notebook-level ``image_processor`` on one image.

    Returns only the ``pixel_values`` entry of the processor output, requested
    as TensorFlow tensors.
    """
    return image_processor(image, return_tensors='tf')['pixel_values']


def process_dataset(example):
    """Attach model inputs and a one-hot label to one row of the test CSV.

    Args:
        example: A row with ``file_loc`` (path of the image file) and ``label``
            (integer class index, 0 or 1).

    Returns:
        The same ``example`` with ``pixel_values`` set to the processed image and
        ``label`` replaced by a length-2 one-hot float32 vector. The processor's
        leading axis is left in place; ``remove_extra_dim`` strips it afterwards.
    """
    # The context manager closes the image file once the RGB copy has been processed,
    # instead of leaving the handle to the garbage collector.
    with Image.open(example['file_loc']) as raw_image:
        example['pixel_values'] = process_example(raw_image.convert("RGB"))

    # One-hot encode with NumPy: `to_categorical` is not imported in this notebook
    # (its import is commented out), so calling it raised NameError.
    example['label'] = np.eye(2, dtype=np.float32)[int(example['label'])]
    return example


def load_test_data():
    """Load ``test.csv`` from ``input_path`` and turn every row into model inputs.

    Each row first goes through ``process_dataset`` and then through
    ``remove_extra_dim``. The number of loaded samples is printed in between.

    Returns:
        The mapped dataset dictionary; the data is under its ``'test'`` split.
    """
    csv_location = str(input_path / 'test.csv')
    raw_dataset = load_dataset('csv', data_files={'test': csv_location})

    processed = raw_dataset.map(process_dataset, with_indices=False, num_proc=1)

    print(f"Loaded test dataset: {len(processed['test'])} samples")

    return processed.map(remove_extra_dim)


test_dataset = load_test_data()

# Collator configured to emit TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")

# Unshuffled pipeline over the test split, so the prediction order matches the row
# order of the dataset when results are joined back later.
test_split = test_dataset['test']
test_dataset_tf = test_split.to_tf_dataset(
    columns=['pixel_values'],
    label_cols=['label'],
    batch_size=best_model_info['batch_size'],
    shuffle=False,
    collate_fn=data_collator,
)


In [None]:
# Load the selected model from its output folder and evaluate it on the test set.
# NOTE(review): `TFViTForImageClassification`, `tf` and `tfa` are not imported in the
# visible cells (the tensorflow / tensorflow_addons imports are commented out) — confirm
# and restore the imports before running.
# Gotcha: this rebinds `best_model`, which earlier held the (index, f1) tuple.
best_model = TFViTForImageClassification.from_pretrained(results_path / best_model_output_path / f'model_{best_model_index}')

# The loss is told that the model output is logits.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# NOTE(review): only the two AUC metrics are told about logits. The accuracy, recall,
# precision and F1 metrics use their own thresholds on the model output as-is — verify
# this is intended if the model really returns raw logits.
metrics = [
    tf.keras.metrics.BinaryAccuracy(name="accuracy"),
    tf.keras.metrics.AUC(name='auc', from_logits=True),
    tf.keras.metrics.AUC(name='auc_multi', from_logits=True, num_labels=2, multi_label=True),
    tf.keras.metrics.Recall(name='recall'),
    tf.keras.metrics.Precision(name='precision'),
    tfa.metrics.F1Score(name='f1_score', num_classes=2, threshold=0.5),
]

# The optimizer is rebuilt from the stored training hyper-parameters; the second value
# returned by `create_optimizer` is discarded.
optimizer, _ = create_optimizer(
    init_lr=best_model_info['learning_rate'],
    num_train_steps=best_model_info['num_train_steps'],
    weight_decay_rate=best_model_info['weight_decay_rate'],
    num_warmup_steps=best_model_info['num_warmup_steps'],
)
best_model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Evaluation results; the next cell pairs them with `best_model.metrics_names`.
results = best_model.evaluate(test_dataset_tf)

In [None]:
print("Test dataset evaluation results:")

# Everything is stored as text: array-valued metrics become a list of strings,
# scalar metrics a single string.
test_metrics = {}
for metric, value in zip(best_model.metrics_names, results):
    print(metric, value)
    test_metrics[metric] = [str(v) for v in value] if isinstance(value, np.ndarray) else str(value)

test_metrics_file = results_path / best_model_output_path / 'test_metrics.json'
with open(test_metrics_file, 'w') as test_metrics_json:
    json.dump(test_metrics, test_metrics_json, indent = 4)

#### Let's take a look at the details of the predictions

In [None]:
# Run inference on the (unshuffled) test pipeline.
preds = best_model.predict(test_dataset_tf)
# Softmax over the last axis turns the logits into per-class probabilities; it does
# not change which class has the largest score.
# NOTE(review): `tf` is not imported in the visible cells — confirm.
probabilities = tf.nn.softmax(preds.logits, axis=-1)


In [None]:
# Inspect the raw prediction output (its `.logits` attribute feeds the softmax above).
preds

In [None]:
# The predicted class is the index of the largest probability along the last axis.
labels_pred = np.asarray(probabilities).argmax(axis=-1)
labels_pred


In [None]:
# Ground-truth labels of the test split as an array, for comparison with `labels_pred`.
np.array(test_dataset['test']['label'])


In [None]:
# Read the two needed columns directly. Iterating whole rows (as before, twice) also
# materialises every example's `pixel_values` only to throw it away.
test_split = test_dataset['test']
file_locs = list(test_split['file_loc'])
# Labels are stored one-hot by `process_dataset`, so argmax recovers the class index.
labels = [np.argmax(label) for label in test_split['label']]

# One row per test image: where it came from, its true class and the predicted class.
results_df = pd.DataFrame(
    {'file_loc': file_locs, 'label': labels, 'label_pred': labels_pred})

# Persist next to the selected model's other artefacts.
results_df.to_csv(results_path / best_model_output_path / 'test_results.csv', index=False)
