In [None]:
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import DataLoader
import torch.nn as nn
import torch
import sys
sys.path.append('../prepare_dataset')
sys.path.append('../models')
sys.path.append('../analyze_results')
import data_aug
import prepare_binary_dataset
import data_aug
import models_classification
import utils
import near_duplicates
import seaborn


## Load metadata

In [None]:
# Root directory of the image files; later cells prepend it to each image path.
DATA_DIR = '../../../data/eardrum_public_data'

# Per-image metadata table; later cells read its `source`, `class` and
# `relative_file_path` columns.
img_df = pd.read_csv('../metadata/metadata.csv')

## Save embeddings
Modify the following code if you want to use feature embeddings from your own model instead of the trained models.

In [None]:
# Embed every image listed in img_df with the trained model of each
# cross-validation fold and write one embedding table per fold to
# ../metadata/embedding{fold}.csv.
save_dir='../experiment/vit_b_16_384_False_0.0_False_32_1234_100_True_False_0.05_0.01_0_0.9'
param_dict=utils.read_param_file(os.path.join(save_dir,'parameters.txt'))
model_state_name='model.pt'
cudaID=0
# Combined "<source>_<class>" label, used as the stratification target below.
img_df['source_class']=img_df['source']+'_'+img_df['class']
device=torch.device(f'cuda:{cudaID}')
num_class=2
batch_size=32
skf = StratifiedKFold(n_splits=5, random_state=param_dict['seed'], shuffle=True)
# The split indices are not needed here because *all* images are embedded for
# every fold; the splitter is only iterated so that the fold ids (0..4) match
# the per-fold checkpoint sub-folders under save_dir.
for test_fold_id, _ in enumerate(skf.split(img_df, img_df['source_class'])):
    experiment_folder=os.path.join(save_dir,str(test_fold_id))
    model = models_classification.model_classification(param_dict['model_name'])
    model.to(device)
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a different GPU index (or host) still loads.
    state_dict=torch.load(os.path.join(experiment_folder,model_state_name),
                          map_location=device)
    model.load_state_dict(state_dict)
    # Put dropout / normalisation layers in inference mode before extracting
    # embeddings. NOTE(review): near_duplicates.return_embeddings is not visible
    # here and may already do this; eval() is idempotent, so it is safe either way.
    model.eval()
    # Only the deterministic test-time transform is used; the training
    # transform returned alongside it is discarded.
    _,test_tf=data_aug.derive_transform(model.size,model.mean,model.std,
                                        scale=param_dict['scale'],
                                        add_gauss_noise=param_dict['add_gauss_noise'],
                                        elastic_tf=param_dict['elastic_tf'],
                                        color_hue=param_dict['colorhue'])
    all_data=prepare_binary_dataset.OtoDataset_Binary(img_df,transform=test_tf,
                                                      data_dir=DATA_DIR,
                                                      eclipse=param_dict['eclipse'],
                                                      eclipse_extent=param_dict['eclipse_extent'])
    # shuffle=False keeps the loader in img_df row order.
    all_data_loader=DataLoader(all_data,batch_size=batch_size,num_workers=4,shuffle=False)
    embedding_list=near_duplicates.return_embeddings(model,all_data_loader,device)
    embedding_df=pd.DataFrame(embedding_list)
    embedding_df.to_csv(f'../metadata/embedding{test_fold_id}.csv',index=False)
    print(f'fold {test_fold_id} done')

## Near duplicate image detection on Chile dataset
Alpha is a crucial hyperparameter: a larger alpha value leads to more images being identified as near-duplicates. We recommend increasing alpha incrementally until the top 20 near-duplicate sets start to contain images that are not immediately apparent as near-duplicates. We picked alpha = 1.9 for the Chile dataset.

In [None]:
# Near-duplicate grouping of the Chile images at alpha = 1.9, followed by a
# display of the top 20 sets.
merged_duplicate_sets_chile, df_chile = near_duplicates.return_near_duplicate_set_list(
    data_origin='Chile',
    img_df=img_df,
    alpha=1.9,
    merge_agressive=True,
)
near_duplicates.show_top_near_duplicate_set(df_chile, img_df, DATA_DIR, top_n=20)

In [None]:
# Redundant images = distinct members across all near-duplicate sets minus the
# number of sets (i.e. one representative per set is not counted).
merged_duplicate_sets_chile_flatten = {
    member
    for dup_set in merged_duplicate_sets_chile
    for member in dup_set
}
cnt_torm = len(merged_duplicate_sets_chile_flatten) - len(merged_duplicate_sets_chile)
print(f'number of redundant images: {cnt_torm}')

### Alpha = 2.0
By increasing alpha from 1.9 to 2.0, the algorithm captures more near-duplicate images, even when they differ slightly in angle or field of view. However, for the second-largest near-duplicate set, we could not tell whether the images showed the same eardrum from two very different angles or were merely similar images rather than near-duplicates. We therefore decided to stick with alpha = 1.9.

In [None]:
# Exploratory run at alpha = 2.0, kept for comparison only. The results are
# bound to separate *_alpha2 names so that the alpha = 1.9 results held in
# merged_duplicate_sets_chile / df_chile (the setting that was retained) are
# not silently overwritten by this cell.
merged_duplicate_sets_chile_alpha2,df_chile_alpha2=near_duplicates.return_near_duplicate_set_list(data_origin='Chile',img_df=img_df,alpha=2,merge_agressive=True)
near_duplicates.show_top_near_duplicate_set(df_chile_alpha2,img_df,DATA_DIR,top_n=20)

## Near duplicate image detection on Turkey dataset
alpha = 0.3

In [None]:
# Near-duplicate grouping of the Turkey images at alpha = 0.3, followed by a
# display of the top 20 sets.
merged_duplicate_sets_turkey, df_turkey = near_duplicates.return_near_duplicate_set_list(
    data_origin='Turkey',
    img_df=img_df,
    alpha=0.3,
    merge_agressive=True,
)
near_duplicates.show_top_near_duplicate_set(df_turkey, img_df, DATA_DIR, top_n=20)

In [None]:
# Redundant images = distinct members across all near-duplicate sets minus the
# number of sets (i.e. one representative per set is not counted).
merged_duplicate_sets_turkey_flatten = {
    member
    for dup_set in merged_duplicate_sets_turkey
    for member in dup_set
}
cnt_torm = len(merged_duplicate_sets_turkey_flatten) - len(merged_duplicate_sets_turkey)
print(f'number of redundant images: {cnt_torm}')

## Near duplicate image detection on Ohio dataset
alpha = 0.4

In [None]:
# Near-duplicate grouping of the Ohio images at alpha = 0.4, followed by a
# display of the top 20 sets.
merged_duplicate_sets_ohio, df_ohio = near_duplicates.return_near_duplicate_set_list(
    data_origin='Ohio',
    img_df=img_df,
    alpha=0.4,
    merge_agressive=True,
)
near_duplicates.show_top_near_duplicate_set(df_ohio, img_df, DATA_DIR, top_n=20)

In [None]:
# Redundant images = distinct members across all near-duplicate sets minus the
# number of sets (i.e. one representative per set is not counted).
merged_duplicate_sets_ohio_flatten = {
    member
    for dup_set in merged_duplicate_sets_ohio
    for member in dup_set
}
cnt_torm = len(merged_duplicate_sets_ohio_flatten) - len(merged_duplicate_sets_ohio)
print(f'number of redundant images: {cnt_torm}')

## Similar-styled image detection on Ohio dataset
alpha = 0.9

In [None]:
# Same grouping routine on the Ohio images with a looser alpha = 0.9, used here
# to find similar-styled images; the top 5 sets are shown together with their
# class distribution.
merged_style_sets_ohio, df_style_ohio = near_duplicates.return_near_duplicate_set_list(
    data_origin='Ohio',
    img_df=img_df,
    alpha=0.9,
    merge_agressive=True,
)
near_duplicates.show_top_near_duplicate_set(
    df_style_ohio,
    img_df,
    DATA_DIR,
    top_n=5,
    print_class_distribution=True,
)

### Visualize a randomly selected set of 40 examples from Style II set

In [None]:
# Display a random sample (at most 40 images) from row `ii` of df_style_ohio.
import numpy as np
ii=0
n_show=40
near_duplicate_set_df=df_style_ohio
# Each 'set' entry is a comma-separated string of integers, used below as
# positional indices into the img_df columns.
neardup_set=[int(x) for x in near_duplicate_set_df['set'].values[ii].split(',')]
relative_path=img_df['relative_file_path'].values[neardup_set]
subfolder_name=img_df['source'].values[neardup_set]
# NOTE(review): no separator is inserted between the source folder and the
# relative path; presumably relative_file_path starts with '/' - confirm.
img_paths=DATA_DIR+'/'+subfolder_name+relative_path
# Sampling without replacement raises ValueError when more items are requested
# than exist, so cap the sample size at the set size. A locally seeded
# generator makes the displayed sample reproducible across runs.
rng=np.random.RandomState(0)
sample_img_paths=rng.choice(img_paths, size=min(n_show,len(img_paths)), replace=False)
utils.display_image_ingrid(sample_img_paths,ncol=10)

### Visualize a randomly selected set of 40 examples from the Style I set

In [None]:
# Display a random sample (at most 40 images) from row `ii` of df_style_ohio.
import numpy as np
ii=1
n_show=40
near_duplicate_set_df=df_style_ohio
# Each 'set' entry is a comma-separated string of integers, used below as
# positional indices into the img_df columns.
neardup_set=[int(x) for x in near_duplicate_set_df['set'].values[ii].split(',')]
relative_path=img_df['relative_file_path'].values[neardup_set]
subfolder_name=img_df['source'].values[neardup_set]
# NOTE(review): no separator is inserted between the source folder and the
# relative path; presumably relative_file_path starts with '/' - confirm.
img_paths=DATA_DIR+'/'+subfolder_name+relative_path
# Sampling without replacement raises ValueError when more items are requested
# than exist, so cap the sample size at the set size. A locally seeded
# generator makes the displayed sample reproducible across runs.
rng=np.random.RandomState(0)
sample_img_paths=rng.choice(img_paths, size=min(n_show,len(img_paths)), replace=False)
utils.display_image_ingrid(sample_img_paths,ncol=10)