In [None]:
# Import libraries and helper functions
import torch
import torchvision.transforms as transforms
import torchvision.models as models
import torch.backends.cudnn as cudnn
import torch.nn as nn
import torch.nn.functional as nnf

import matplotlib.pyplot as plt
import random
import pickle
import pandas as pd

from utils import *
from compute_metrics import *

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import time
import scipy.spatial.distance
from scipy.spatial.distance import hamming
from scipy.spatial import distance
from sklearn.mixture import GaussianMixture
from scipy.stats import norm
import seaborn as sns

# Global variables
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


class Args:
    """Experiment settings shared by the cells of this notebook.

    The values are plain class attributes; the dataset, learning rate, noise
    rate and epoch count are used below to locate and process the saved results.
    """
    batchsize = 100
    model = "mobilenet_v2"
    lr = 0.001
    epochs = 200
    dataset = "tinyimagenet"
    max_noise_rate = 0.4


args = Args()

In [None]:
# Path of the pickled results to analyse; the file could come from any hardness type.
# The model name and epoch count are read from `args` (instead of being repeated
# as literals) so the path cannot drift away from the configured experiment.
file_name = './results/values/imbalance/hardness_via_imbalance_{}_{}_epochs_{}_lr_{}_noise_{}_deployment.pkl'.format(
    args.dataset, args.model, args.epochs, args.lr, args.max_noise_rate)


In [None]:
# Compute different metrics for each sample of the dataset
samples_df = compute_metrics(file_name, args.epochs)

# Use the configured epoch count for every helper instead of a repeated literal
# 200, so all of them stay consistent with the compute_metrics call above.
n_epochs = args.epochs
mid_training = compute_epoch_of_middle_training(file_name, n_epochs)

# Every derived column only reads the per-sample 'epochs-info' entry, so apply
# the helpers to that column directly rather than building a full row per sample.
epochs_info = samples_df['epochs-info']
samples_df['mid_feature_dist'] = epochs_info.apply(lambda info: find_mid_training_scd(info, n_epochs, mid_training))
samples_df['end_feature_acd'] = epochs_info.apply(lambda info: find_end_training_acd(info, n_epochs))
samples_df['end_feature_wjsd'] = epochs_info.apply(lambda info: find_end_training_wjsd(info, n_epochs))
samples_df['end_loss'] = epochs_info.apply(lambda info: find_end_training_loss(info, n_epochs))
samples_df['end_conf'] = epochs_info.apply(lambda info: find_end_training_conf(info, n_epochs))

# Constant column; passed as the second feature to compute_2d_GMM by the
# "1-d GMM" baselines further down so that only one dimension varies.
samples_df['gray'] = 1


In [None]:
#### Final results of data partitioning for different methods: ####

# Threshold baselines: each metric is split at its median into a "noisy" and a
# "clean" set of sample ids.

# 'end_loss': strictly above the median -> noisy, at or below -> clean.
easy_threshold = np.median(samples_df['end_loss'])
loss_above = samples_df['end_loss'] > easy_threshold
loss_at_or_below = samples_df['end_loss'] <= easy_threshold
Dnoisyids_thresh_loss = np.array(samples_df.loc[loss_above, 'id'])
Dcleanids_thresh_loss = np.array(samples_df.loc[loss_at_or_below, 'id'])

# 'acc_over_train': strictly below the median -> noisy, at or above -> clean.
easy_threshold = np.median(samples_df['acc_over_train'])
acc_below = samples_df['acc_over_train'] < easy_threshold
acc_at_or_above = samples_df['acc_over_train'] >= easy_threshold
Dnoisyids_thresh_acc = np.array(samples_df.loc[acc_below, 'id'])
Dcleanids_thresh_acc = np.array(samples_df.loc[acc_at_or_above, 'id'])

# 'area_under_margin': strictly below the median -> noisy, at or above -> clean.
easy_threshold = np.median(samples_df['area_under_margin'])
aum_below = samples_df['area_under_margin'] < easy_threshold
aum_at_or_above = samples_df['area_under_margin'] >= easy_threshold
Dnoisyids_thresh_aum = np.array(samples_df.loc[aum_below, 'id'])
Dcleanids_thresh_aum = np.array(samples_df.loc[aum_at_or_above, 'id'])


In [None]:
# baselines based on 1-d GMM:
# compute_2d_GMM is handed the constant 'gray' column as its second feature,
# so 'end_loss' is the only dimension that actually varies.
loss_gmm_features = ('end_loss', 'gray')
clusters_here = compute_2d_GMM(samples_df, *loss_gmm_features)
samples_df['clusters_1dgmm_loss'] = clusters_here

In [None]:
# Same 1-d GMM construction on 'area_under_loss', again paired with the
# constant 'gray' column as the second feature.
aul_gmm_features = ('area_under_loss', 'gray')
clusters_here = compute_2d_GMM(samples_df, *aul_gmm_features)
samples_df['clusters_1dgmm_aul'] = clusters_here

In [None]:
# Turn the 1-d GMM cluster assignments into noisy / clean id arrays:
# cluster label 1 is treated as noisy, every other label as clean.
# NOTE(review): assumes compute_2d_GMM gives label 1 to the noisy component — confirm.

# 'end_loss' based clustering
loss_cluster_is_one = samples_df['clusters_1dgmm_loss'] == 1
loss_cluster_not_one = samples_df['clusters_1dgmm_loss'] != 1
Dnoisyids_gmm_loss = np.array(samples_df.loc[loss_cluster_is_one, 'id'])
Dcleanids_gmm_loss = np.array(samples_df.loc[loss_cluster_not_one, 'id'])

# 'area_under_loss' based clustering
aul_cluster_is_one = samples_df['clusters_1dgmm_aul'] == 1
aul_cluster_not_one = samples_df['clusters_1dgmm_aul'] != 1
Dnoisyids_gmm_aul = np.array(samples_df.loc[aul_cluster_is_one, 'id'])
Dcleanids_gmm_aul = np.array(samples_df.loc[aul_cluster_not_one, 'id'])


In [None]:
# baselines based on 2-d GMM:
# cluster the samples on the two end-of-training feature columns.
wjsd_acd_features = ('end_feature_wjsd', 'end_feature_acd')
clusters_here = compute_2d_GMM(samples_df, *wjsd_acd_features)

In [None]:
# Store the 2-d GMM assignment and split the ids: label 1 -> noisy, label 0 -> clean.
# NOTE(review): assumes compute_2d_GMM gives label 1 to the noisy component — confirm.
samples_df['clusters_gmm_wjsd_acd'] = clusters_here
wjsd_acd_is_one = samples_df['clusters_gmm_wjsd_acd'] == 1
wjsd_acd_is_zero = samples_df['clusters_gmm_wjsd_acd'] == 0
Dnoisyids_gmm_wjsd_acd = np.array(samples_df.loc[wjsd_acd_is_one, 'id'])
Dcleanids_gmm_wjsd_acd = np.array(samples_df.loc[wjsd_acd_is_zero, 'id'])


In [None]:
# 3-cluster GMM on training accuracy and the mid-training feature column.
# This notebook stores the output of find_mid_training_scd under
# 'mid_feature_dist', while this call asked for 'mid_feature_scd'. Use
# 'mid_feature_scd' when the frame provides it and otherwise fall back to
# 'mid_feature_dist', so the cell does not fail on a missing column.
scd_column = 'mid_feature_scd' if 'mid_feature_scd' in samples_df.columns else 'mid_feature_dist'
clusters_here = compute_2d_GMM_3clusters(samples_df, 'acc_over_train', scd_column)

In [None]:
# Store the 3-cluster assignment (original note: higher acc and lower auf is better)
# and split the ids: cluster 0 -> noisy, every other cluster -> clean.
# NOTE(review): assumes compute_2d_GMM_3clusters gives label 0 to the noisy component — confirm.
samples_df['clusters_gmm_acc_scd'] = clusters_here
acc_scd_is_zero = samples_df['clusters_gmm_acc_scd'] == 0
acc_scd_not_zero = samples_df['clusters_gmm_acc_scd'] != 0
Dnoisyids_gmm_acc_scd = np.array(samples_df.loc[acc_scd_is_zero, 'id'])
Dcleanids_gmm_acc_scd = np.array(samples_df.loc[acc_scd_not_zero, 'id'])


In [None]:
# Which method do you want to analyze? Point BOTH names at the clean / noisy id
# arrays of one partitioning method computed above, e.g.
# Dcleanids_thresh_loss / Dnoisyids_thresh_loss or Dcleanids_gmm_loss / Dnoisyids_gmm_loss.
# A concrete default is set so the notebook runs top to bottom; the noisy set
# must be defined too because the precision / recall figures below read it.
Dcleanids_here = Dcleanids_gmm_acc_scd
Dnoisyids_here = Dnoisyids_gmm_acc_scd


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


# Boolean masks over the rows of samples_df. They are combined with `&` so each
# count is taken with a mask aligned to the full frame (filtering first and then
# indexing with a full-length mask makes pandas reindex the mask and warn).
label_unchanged = samples_df['label'] == samples_df['original_label']
label_changed = samples_df['label'] != samples_df['original_label']
is_hard = samples_df['hardness'] > 3
in_clean_set = samples_df['id'].isin(Dcleanids_here)
in_noisy_set = samples_df['id'].isin(Dnoisyids_here)

# results of the method
print('length:', len(Dcleanids_here))
print('correct percentage:',
      _safe_ratio(int((in_clean_set & label_unchanged).sum()), len(Dcleanids_here)))

# Noisy-label detection: rows placed in the noisy set whose label differs from
# the original label count as hits.
noisy_hits = int((in_noisy_set & label_changed).sum())
prec_n = _safe_ratio(noisy_hits, len(Dnoisyids_here))
print('precision n:', prec_n)

recal_n = _safe_ratio(noisy_hits, int(label_changed.sum()))
print('recall n:', recal_n)

print('f1-score n:', _safe_ratio(2*prec_n*recal_n, prec_n + recal_n))

# Hard samples: rows in the clean set with hardness > 3 and an unchanged label.
hard_clean_hits = int((in_clean_set & is_hard & label_unchanged).sum())
prec_h = _safe_ratio(hard_clean_hits, len(Dcleanids_here))  # computed for reference; not printed by this cell
recal_h = _safe_ratio(hard_clean_hits, int(is_hard.sum()))
print('recall h:', recal_h)
