# OOD Detection using the Intel® Geti™ SDK

This notebook demonstrates out-of-distribution (OOD) detection for an image classification task using the [kNN-based OOD detection](https://arxiv.org/abs/2204.06507) method. In this example, a classifier trained on the CUB-200-2011 dataset is used.

## 1: Preparing the dataset for training the classifier

### 1.1 Downloading and extracting the CUB-200-2011 dataset

The [CUB-200-2011](https://www.vision.caltech.edu/datasets/cub_200_2011/) dataset contains images of 200 classes of birds. In this notebook, we use 90% of the dataset for training the classifier and the remaining 10% as the in-distribution test set. The same test images with a corruption applied (e.g. motion blur) are used as the out-of-distribution dataset.

In [None]:
%load_ext autoreload
%autoreload 2

import os
import tarfile
from urllib import request
from tqdm import tqdm
import splitfolders


# Path where the dataset is (or will be) stored. If the archive is not found
# here, it is downloaded and extracted by this cell.
data_dir = "./use_cases/data/ood_detection/cub200"

# CUB-200-2011 example
cub200_tar = os.path.join(data_dir, "CUB_200_2011.tgz")
# If the dataset is not downloaded, download it
if not os.path.exists(cub200_tar):
    cub200_url = "https://data.caltech.edu/records/65de6-vp158/files/CUB_200_2011.tgz"
    os.makedirs(data_dir, exist_ok=True)
    print(f"Downloading dataset to {cub200_tar}")
    request.urlretrieve(cub200_url, cub200_tar)
    print("Download complete.")

# Extract the dataset unless it is already extracted.
extracted_dir = os.path.join(data_dir, "CUB_200_2011")
if not os.path.exists(extracted_dir):
    print(f"Extracting dataset to {extracted_dir}")
    with tarfile.open(cub200_tar, "r:gz") as tar:
        # Read the member list once; it is needed both for iteration and for
        # the progress-bar total.
        members = tar.getmembers()
        for member in tqdm(iterable=members, total=len(members)):
            tar.extract(member=member, path=data_dir)

# Split the dataset for training and test purposes - the split used is 90:10 and can be changed.
# The trainset will be further split into train, val and test automatically on the Geti instance.
dataset_dir = os.path.join(data_dir, "CUB_200_2011_split")
if not os.path.exists(dataset_dir):
    print(f"Splitting dataset into train and test at {dataset_dir}")
    splitfolders.ratio(
        os.path.join(extracted_dir, "images"),
        output=dataset_dir,
        seed=117,  # fixed seed so that the split is reproducible
        ratio=(.9, .1),
        group_prefix=None,
        move=False,
    )
    # The held-out 10% ends up in a 'val' folder; rename it because it is used
    # as the in-distribution test set in the rest of this notebook.
    os.rename(os.path.join(dataset_dir, "val"), os.path.join(dataset_dir, "id_test"))

print("Dataset ready to be used")


##  2: Train a classifier on the Intel® Geti™ instance.

### 2.1: Creating a Geti object and authenticating it.
For authentication, you need a `.env` configuration file placed in the same directory as this notebook. More details [here](https://github.com/openvinotoolkit/geti-sdk/tree/main/notebooks#authentication).

In [None]:
from geti_sdk import Geti
from geti_sdk.utils import get_server_details_from_env

# Load the server details from the environment configuration and use them to
# create the Geti object that all later cells talk to.
geti_server_configuration = get_server_details_from_env()
geti = Geti(server_config=geti_server_configuration)

### 2.2 : Creating a project and uploading the dataset

In [None]:
from geti_sdk.annotation_readers import DirectoryTreeAnnotationReader
# This name is used both to create the project and to look it up again in the
# later cells, so change it here only.
PROJECT_NAME = 'CUB200-910' # Name of the project on the Geti instance

Skip the next step if the project is already created on the Geti instance.

In [None]:
# The 'train' split serves both as the image folder to upload and as the source
# of the annotations (presumably one sub-folder per class - verify against the
# DirectoryTreeAnnotationReader documentation).
train_dir = os.path.join(dataset_dir, 'train')
annotation_reader = DirectoryTreeAnnotationReader(base_data_folder=train_dir)

n_train_images = len(annotation_reader.get_data_filenames())
print(f"# of images for training the classifier : {n_train_images}")
n_classes = len(annotation_reader.get_all_label_names())
print(f"# of classes : {n_classes}")

# Auto-training is disabled here; training is started explicitly in a later cell.
project = geti.create_single_task_project_from_dataset(
    project_name=PROJECT_NAME,
    project_type="classification",
    path_to_images=train_dir,
    annotation_reader=annotation_reader,
    enable_auto_train=False,
)
print(project.summary)

### 2.3 Train the classifier
We choose the EfficientNet-V2-S algorithm for training the classifier. The list of available algorithms can be obtained using the following code snippet.

In [None]:
from geti_sdk.rest_clients import ProjectClient, TrainingClient

# Both clients are bound to the same session and workspace.
client_kwargs = dict(session=geti.session, workspace_id=geti.workspace_id)

# Look the project up by name so that this cell also works when the project
# creation step above was skipped.
project_client = ProjectClient(**client_kwargs)
project = project_client.get_project_by_name(project_name=PROJECT_NAME)
training_client = TrainingClient(project=project, **client_kwargs)

# The project has a single task; list the algorithms available for it.
task = project.get_trainable_tasks()[0]
available_algorithms = training_client.get_algorithms_for_task(task=task)
print(available_algorithms.summary)

Skip this step if the classifier is already trained.

In [None]:
# Pick the algorithm by its name as shown in the summary printed above.
algorithm = available_algorithms.get_by_name(name="EfficientNet-V2-S")

# Show the current training status before submitting a new job.
status = training_client.get_status()
print(status.summary)

# Submit the training job and monitor it.
job = training_client.train_task(algorithm=algorithm, task=task)
training_client.monitor_jobs([job])

### 2.4 Downloading the trained classifier model for inference

In [None]:
from geti_sdk.demos.demo_projects.utils import ensure_project_is_trained
from geti_sdk.rest_clients.model_client import ModelClient

# Confirm that the model is trained before trying to download it.
# The return value is not needed here, so it is discarded.
_= ensure_project_is_trained(geti=geti, project=project)

In [None]:
model_client = ModelClient(session=geti.session, workspace_id=geti.workspace_id, project=project)
models = model_client.get_all_active_models()

# Fail with a clear message instead of an IndexError/AttributeError when the
# project does not have an active model yet.
if not models or models[0] is None:
    raise RuntimeError("No active model found! Please check if the project has a trained model on the Geti instance")

# We need the model which has xai enabled - this allows us to get the feature vector from the model.
optimized_models = models[0].optimized_models
model_index = next((index for index, model in enumerate(optimized_models) if model.has_xai_head), None)
if model_index is None:
    raise RuntimeError("No model with XAI head found! Please check if the project has such a model on the Geti instance")

model_for_deployment = optimized_models[model_index]
model_accuracy = model_for_deployment.performance.score

print(f"Model for deployment : {model_for_deployment.name} (accuracy : {model_accuracy*100:.2f} %)")
# Deploy only the selected model and load it for local inference on the CPU.
deployment = geti.deploy_project(project_name=PROJECT_NAME, models=[model_for_deployment])
deployment.load_inference_models(device="CPU")

## 3 : Out-of-distribution dataset

### 3.1: Generating the out-of-distribution dataset
We create the out-of-distribution dataset by applying corruptions (e.g. motion blur) to the in-distribution test set. The strength of the corruption is tuned until the classification accuracy on the corrupted test set drops to approximately 50%, i.e., about half of the test set is classified incorrectly.

The possible corruptions are: `gaussian_blur`, `motion_blur`, `fake_snow`, `cut_out` and `poisson_noise`.

You can set the `generate_ood_images` flag to `False` and set the `ood_images_path` to the path of the out-of-distribution images if you want to use a different set of images as OOD.

In [None]:
# Folder with the in-distribution test images ('id_test' split of the dataset).
id_images_path: str = os.path.join(dataset_dir,'id_test')
# Folder for the out-of-distribution images; when the OOD images are generated
# in the next cell, this is passed as their destination path.
ood_images_path: str =  os.path.join(dataset_dir,'ood_test')  # Set this to the path of the OOD images if you want to use a different set of images as OOD.

In [None]:
# Set to False to use an existing folder of OOD images (ood_images_path) as is.
generate_ood_images: bool = True

if generate_ood_images:
    from notebooks.use_cases.utils import TransformImages

    transform_images = TransformImages(corruption_type='motion_blur')
    # The call returns the path of the generated OOD dataset, which replaces
    # the configured destination path from here on.
    ood_images_path = transform_images.generate_ood_dataset_by_corruption(
        geti_deployment=deployment,
        source_path=id_images_path,
        dest_path=ood_images_path,
        desired_accuracy=50,
        desired_accuracy_tol=3.0,
        show_progress=True,
    )

In [None]:
from notebooks.use_cases.utils import display_sample_images_in_folder

# Show a few samples of each set, in-distribution first.
for sample_dir, sample_title in (
    (id_images_path, "In-distribution images"),
    (ood_images_path, "Out-of-distribution images"),
):
    display_sample_images_in_folder(sample_dir, n_images=10, title=sample_title)

## 4: OOD Detection

We are using a simple [kNN-based OOD detection method](https://arxiv.org/abs/2204.06507). The OOD score is calculated as the distance to the kth nearest neighbour in the feature space (of known in-distribution images).

### 4.1 : Calibration - Calculating the OOD score threshold

In [None]:
import numpy as np
from geti_sdk.rest_clients import ImageClient
from notebooks.use_cases.utils import extract_features_from_imageclient

# The images that were used for training the classifier are also used for calibration.
image_client = ImageClient(session=geti.session, workspace_id=geti.workspace_id, project=project)

# Number of images used for calculating the OOD score threshold (-1 uses all images).
# More images --> more accurate OOD score threshold, but extracting their features takes longer.
n_images_for_calib = -1

features_id = extract_features_from_imageclient(
    deployment=deployment,
    image_client=image_client,
    geti_session=geti.session,
    n_images=n_images_for_calib,
    normalise_feats=True,
)


In [None]:
import faiss

# Number of nearest neighbours used for the OOD score - a hyperparameter of the
# DkNN OOD detection method. A number in the range of 4-10 has given good results
# in our experiments. Should be lower than the number of classes in the dataset.
dknn_k = 6

# Cast the features of the ID images to float32 once and index them.
calib_feats = features_id.astype(np.float32)
index_flat = faiss.IndexFlatL2(calib_feats.shape[1])
index_flat.add(calib_feats)

# The query images are part of the index themselves, so dknn_k+1 neighbours are
# requested (presumably the closest hit is the image itself - TODO confirm).
dists, nns = index_flat.search(calib_feats, dknn_k + 1)

# Percentile of the calibration distances that defines the OOD score threshold.
# The threshold is set such that at least n_percentile% of the known ID images are
# classified as ID. A higher percentile gives a larger distance threshold: more ID
# images are accepted (more true positives), but more OOD images are accepted as
# well (more false positives).
n_percentile = 99.9
kth_neighbour_dists = dists[:, dknn_k]
ood_score_threshold = np.percentile(kth_neighbour_dists, n_percentile)
print(f"OOD Threshold distance : {ood_score_threshold:.2f}")

### 4.2 : OOD Detection - Calculating the OOD scores for the ID and OOD test images

In [None]:
from notebooks.use_cases.utils import extract_features_from_img_folder

# Test images for ID; the OOD folder was configured/generated in section 3.
id_images_path = os.path.join(dataset_dir, 'id_test')

# Extract the features of both test sets with identical settings, ID first.
id_features, ood_features = [
    extract_features_from_img_folder(
        deployment=deployment,
        images_folder_path=test_folder,
        normalise_feats=True,
    )
    for test_folder in (id_images_path, ood_images_path)
]

In [None]:
# Performing the knn search with k = dknn_k
scores_id, _ = index_flat.search(id_features.astype(np.float32), k=dknn_k)
scores_ood, _ = index_flat.search(ood_features.astype(np.float32), k=dknn_k)

# Take the highest distance --> this is the distance to the kth neighbour.
# Taking a negative of the scores as we would want the ID images to have higher value while plotting the results
scores_id = -scores_id[:, -1]
scores_ood = -scores_ood[:,-1]

scores_concat = np.concatenate((scores_id, scores_ood))
ground_truth_id = np.ones(scores_id.shape[0])
ground_truth_ood = np.zeros(scores_ood.shape[0])
ground_truth = np.concatenate((ground_truth_id, ground_truth_ood))

## 5 : OOD Detection - Plotting the results

### 5.1 : Results - ROC curve

In [None]:
from matplotlib import pyplot as plt
from sklearn import metrics

# ROC and precision-recall curves with the ID images as the positive class.
fpr, tpr, thresholds_roc = metrics.roc_curve(ground_truth, scores_concat)
precision, recall, thresholds_pr = metrics.precision_recall_curve(ground_truth, scores_concat)
auroc = metrics.auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(5, 5))
ax.plot(fpr, tpr, color="#003e6d")
# Annotate the area under the curve inside the plot.
ax.text(0.8, 0.3, f"AUROC = {auroc:.4f}", fontsize=12, ha='center')
ax.set_title("ROC \n(ID images as positive examples)")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
plt.show()

### 5.2 : Results - Confusion Matrix

In [None]:
import seaborn as sns

# Scores are negated distances, so an image is predicted as ID when its score
# lies above the negated distance threshold.
predictions = scores_concat > -ood_score_threshold
confusion_matrix = metrics.confusion_matrix(ground_truth, predictions)

# Label 0 is OOD and label 1 is ID on both axes.
class_names = ['(OOD)', '(ID)']
ax = sns.heatmap(confusion_matrix, annot=True, fmt='d')
ax.set_title("Confusion Matrix - ID v OOD", fontsize=14, pad=20)
ax.set_xlabel("Predicted", fontsize=14, labelpad=20)
ax.set_ylabel("Ground Truth", fontsize=14, labelpad=20)
ax.xaxis.set_ticklabels(class_names)
ax.yaxis.set_ticklabels(class_names)
plt.show()

### 5.3 Results - Displaying mis-classified examples

In [None]:
from notebooks.use_cases.utils import show_top_n_misclassifications
# The following plot shows the overlap in scores between in- and out-of-distribution images.

fig = plt.figure(figsize=(5,5))
sns.kdeplot(scores_id, fill=True, color="#0068b5", label="ID")
sns.kdeplot(scores_ood, fill=True, color="#e96115", label="OOD")
# Scores are negated distances, so the threshold is drawn at the negated distance
# threshold; it is labelled so that the dashed line is explained in the legend.
plt.axvline(x=-ood_score_threshold, color="#001220", linestyle='--', label="OOD threshold")
plt.xlabel("OOD Score")
plt.ylabel("Density")
plt.title("OOD Score Distribution")
plt.legend()
# Render the figure explicitly, like the other plotting cells, instead of
# echoing the Legend object as the cell output.
plt.show()

In [None]:
# The following figures show the top n misclassified images, first for the ID
# test set and then for the OOD test set.
for misc_images_dir, misc_scores, misc_sample_type in (
    (id_images_path, scores_id, "id"),
    (ood_images_path, scores_ood, "ood"),
):
    show_top_n_misclassifications(
        images_dir=misc_images_dir,
        scores=misc_scores,
        type_of_samples=misc_sample_type,
        n_images=9,
    )

In [None]:
# delete the project on the Geti instance if required (this can not be undone)
# project_client.delete_project(project=project, requires_confirmation=False)